Python 使用 requests 进行网络抓取 - 响应中只获得了一小部分数据
Python web scraping with requests - Got only a small part of data in the response
我正在尝试从以下 URL 获取一些财务数据:
http://www.casablanca-bourse.com/bourseweb/en/Negociation-History.aspx?Cat=24&IdLink=225
我的代码仅适用于非常小的日期间隔(少于 19 天),但在网站上我们可以获取 3 年的数据!
我的代码如下:
import requests
import string
import csv
from bs4 import BeautifulSoup
# a simple helper function
def formatIt(s):
    """Return *s* with every character not in ``string.printable`` removed.

    ``string.printable`` only contains ASCII characters, so accented or
    other non-ASCII characters are dropped rather than transliterated.
    An empty input yields an empty string.
    """
    # Build the result in one pass instead of repeated ``+=`` concatenation.
    return ''.join(ch for ch in s if ch in string.printable)
# Default URL of the "Negociation History" page; the functions below use it
# for both the initial GET and the search-form POST.
uri = "http://www.casablanca-bourse.com/bourseweb/en/Negociation-History.aspx?Cat=24&IdLink=225"
def get_viewState_and_symVal(symbolName, session):
    """Fetch the history page and extract the form state needed for a search.

    Args:
        symbolName: Text of the ``<option>`` to look for in the symbol
            drop-down; compared for exact equality.
        session: Object with a ``get(url)`` method whose result exposes
            ``.content`` (``MainFun`` passes a ``requests.Session``).

    Returns:
        A tuple ``(viewstate_val, symbol_val)``: the ``value`` of the
        ``__VIEWSTATE`` input and the ``value`` of the matching option.

    Raises:
        NameError: If no option text equals ``symbolName``.
    """
    r = session.get(uri)
    # NOTE(review): no parser is named here, so BeautifulSoup chooses one
    # from whatever is installed; parsing may differ between machines --
    # confirm which parser is intended before pinning one.
    soup = BeautifulSoup(r.content)

    # let's get the viewstate value
    viewstate_val = soup.find('input', attrs={"id": "__VIEWSTATE"})['value']

    # let's get the symbol value
    selectSymb = soup.find(
        'select',
        attrs={"name": "HistoriqueNegociation1$HistValeur1$DDValeur"},
    )
    symbol_val = None
    for option in selectSymb.find_all('option'):
        if option.text == symbolName:
            # No break: if several options share the text, the last one wins.
            symbol_val = option['value']

    # Explicit sentinel check instead of probing an unbound local with a
    # bare try/except.
    if symbol_val is None:
        raise NameError("Symbol Name not found !!!")
    return (viewstate_val, symbol_val)
def MainFun(symbolName, dateFrom, dateTo):
    """Post a date-range search for one symbol and dump the result table to CSV.

    The function opens a new ``requests.Session``, reads the page state via
    ``get_viewState_and_symVal``, posts the search form to ``uri``, and writes
    the non-empty cells of the result table to ``output.csv`` (``;`` as the
    delimiter), overwriting any existing file.

    Args:
        symbolName: Option text of the symbol to search for.
        dateFrom: Start date string placed in the first calendar field
            (the examples in this file use ``d/m/yyyy`` -- TODO confirm).
        dateTo: End date string placed in the second calendar field.

    Returns:
        The ``csv.writer`` used for the output. Its underlying file is
        already closed when this function returns.

    Raises:
        NameError: Propagated from ``get_viewState_and_symVal`` when the
            symbol is not found.
        LookupError: If the response holds no ``arial11bleu`` table without
            a ``class`` attribute.
    """
    import sys  # local import: only needed to pick the CSV file mode below

    session = requests.Session()
    viewstate, symbol = get_viewState_and_symVal(symbolName, session)
    payload = {
        'TopControl1$ScriptManager1' : r'HistoriqueNegociation1$UpdatePanel1|HistoriqueNegociation1$HistValeur1$Image1',
        '__VIEWSTATE' : viewstate,
        'HistoriqueNegociation1$HistValeur1$DDValeur' : symbol,
        'HistoriqueNegociation1$HistValeur1$historique' : r'RBSearchDate',
        'HistoriqueNegociation1$HistValeur1$DateTimeControl1$TBCalendar' : dateFrom,
        'HistoriqueNegociation1$HistValeur1$DateTimeControl2$TBCalendar' : dateTo,
        'HistoriqueNegociation1$HistValeur1$DDuree' : r'6',
        'hiddenInputToUpdateATBuffer_CommonToolkitScripts' : r'1',
        'HistoriqueNegociation1$HistValeur1$Image1.x' : r'27',
        'HistoriqueNegociation1$HistValeur1$Image1.y' : r'8'
    }
    request2 = session.post(uri, data=payload)
    soup2 = BeautifulSoup(request2.content)

    # Several tables share the id "arial11bleu"; take the first one that
    # carries no class attribute.
    rslt = None
    for table in soup2.find_all('table', id="arial11bleu"):
        if 'class' not in table.attrs:
            rslt = table
            break
    if rslt is None:
        # Previously this surfaced as an unbound-local error further down.
        raise LookupError("Result table not found in the response")

    output = []
    # Skip the first row (presumably the column headers).
    for row in rslt.find_all('tr')[1:]:
        cells = []
        for cell in row.find_all('td'):
            text = cell.text.strip()
            # After strip() only the empty string can be whitespace-only.
            if text:
                cells.append(formatIt(text))
        if cells:
            output.append(cells)

    # csv needs a binary file on Python 2 but a text file opened with
    # newline='' on Python 3; "wb" alone raises TypeError on Python 3.
    if sys.version_info[0] >= 3:
        f = open("output.csv", "w", newline="")
    else:
        f = open("output.csv", "wb")
    with f:
        writer = csv.writer(f, delimiter=';')
        writer.writerows(output)
    return writer
# working example (shorter date range)
# NOTE: every call writes to the same "output.csv", so a later call
# overwrites the file produced by an earlier one.
MainFun ("ATLANTA", "1/1/2014", "30/01/2014")
# not working example (longer date range; per the question, only part of
# the data comes back)
MainFun ("ATLANTA", "1/1/2014", "30/03/2014")
可能是站点自动检测抓取程序并阻止了您。尝试在某处添加一个小的 sleep
语句,让他们的服务器有时间喘息。无论如何,这通常是一种礼貌的做法。
# Suggested throttle: pause between requests so the server is not hit in
# rapid succession.
from time import sleep
sleep(1) # pauses 1 second
我的 Windows 环境好像有问题。该代码在基于 Debian 的虚拟机和 Python virtualenv 下运行良好。
我正在尝试从以下 URL 获取一些财务数据:
http://www.casablanca-bourse.com/bourseweb/en/Negociation-History.aspx?Cat=24&IdLink=225
我的代码仅适用于非常小的日期间隔(少于 19 天),但在网站上我们可以获取 3 年的数据!
我的代码如下:
import requests
import string
import csv
from bs4 import BeautifulSoup
# a simple helper function
def formatIt(s):
    """Return *s* with every character not in ``string.printable`` removed.

    ``string.printable`` only contains ASCII characters, so accented or
    other non-ASCII characters are dropped rather than transliterated.
    An empty input yields an empty string.
    """
    # Build the result in one pass instead of repeated ``+=`` concatenation.
    return ''.join(ch for ch in s if ch in string.printable)
# Default URL of the "Negociation History" page; the functions below use it
# for both the initial GET and the search-form POST.
uri = "http://www.casablanca-bourse.com/bourseweb/en/Negociation-History.aspx?Cat=24&IdLink=225"
def get_viewState_and_symVal(symbolName, session):
    """Fetch the history page and extract the form state needed for a search.

    Args:
        symbolName: Text of the ``<option>`` to look for in the symbol
            drop-down; compared for exact equality.
        session: Object with a ``get(url)`` method whose result exposes
            ``.content`` (``MainFun`` passes a ``requests.Session``).

    Returns:
        A tuple ``(viewstate_val, symbol_val)``: the ``value`` of the
        ``__VIEWSTATE`` input and the ``value`` of the matching option.

    Raises:
        NameError: If no option text equals ``symbolName``.
    """
    r = session.get(uri)
    # NOTE(review): no parser is named here, so BeautifulSoup chooses one
    # from whatever is installed; parsing may differ between machines --
    # confirm which parser is intended before pinning one.
    soup = BeautifulSoup(r.content)

    # let's get the viewstate value
    viewstate_val = soup.find('input', attrs={"id": "__VIEWSTATE"})['value']

    # let's get the symbol value
    selectSymb = soup.find(
        'select',
        attrs={"name": "HistoriqueNegociation1$HistValeur1$DDValeur"},
    )
    symbol_val = None
    for option in selectSymb.find_all('option'):
        if option.text == symbolName:
            # No break: if several options share the text, the last one wins.
            symbol_val = option['value']

    # Explicit sentinel check instead of probing an unbound local with a
    # bare try/except.
    if symbol_val is None:
        raise NameError("Symbol Name not found !!!")
    return (viewstate_val, symbol_val)
def MainFun(symbolName, dateFrom, dateTo):
    """Post a date-range search for one symbol and dump the result table to CSV.

    The function opens a new ``requests.Session``, reads the page state via
    ``get_viewState_and_symVal``, posts the search form to ``uri``, and writes
    the non-empty cells of the result table to ``output.csv`` (``;`` as the
    delimiter), overwriting any existing file.

    Args:
        symbolName: Option text of the symbol to search for.
        dateFrom: Start date string placed in the first calendar field
            (the examples in this file use ``d/m/yyyy`` -- TODO confirm).
        dateTo: End date string placed in the second calendar field.

    Returns:
        The ``csv.writer`` used for the output. Its underlying file is
        already closed when this function returns.

    Raises:
        NameError: Propagated from ``get_viewState_and_symVal`` when the
            symbol is not found.
        LookupError: If the response holds no ``arial11bleu`` table without
            a ``class`` attribute.
    """
    import sys  # local import: only needed to pick the CSV file mode below

    session = requests.Session()
    viewstate, symbol = get_viewState_and_symVal(symbolName, session)
    payload = {
        'TopControl1$ScriptManager1' : r'HistoriqueNegociation1$UpdatePanel1|HistoriqueNegociation1$HistValeur1$Image1',
        '__VIEWSTATE' : viewstate,
        'HistoriqueNegociation1$HistValeur1$DDValeur' : symbol,
        'HistoriqueNegociation1$HistValeur1$historique' : r'RBSearchDate',
        'HistoriqueNegociation1$HistValeur1$DateTimeControl1$TBCalendar' : dateFrom,
        'HistoriqueNegociation1$HistValeur1$DateTimeControl2$TBCalendar' : dateTo,
        'HistoriqueNegociation1$HistValeur1$DDuree' : r'6',
        'hiddenInputToUpdateATBuffer_CommonToolkitScripts' : r'1',
        'HistoriqueNegociation1$HistValeur1$Image1.x' : r'27',
        'HistoriqueNegociation1$HistValeur1$Image1.y' : r'8'
    }
    request2 = session.post(uri, data=payload)
    soup2 = BeautifulSoup(request2.content)

    # Several tables share the id "arial11bleu"; take the first one that
    # carries no class attribute.
    rslt = None
    for table in soup2.find_all('table', id="arial11bleu"):
        if 'class' not in table.attrs:
            rslt = table
            break
    if rslt is None:
        # Previously this surfaced as an unbound-local error further down.
        raise LookupError("Result table not found in the response")

    output = []
    # Skip the first row (presumably the column headers).
    for row in rslt.find_all('tr')[1:]:
        cells = []
        for cell in row.find_all('td'):
            text = cell.text.strip()
            # After strip() only the empty string can be whitespace-only.
            if text:
                cells.append(formatIt(text))
        if cells:
            output.append(cells)

    # csv needs a binary file on Python 2 but a text file opened with
    # newline='' on Python 3; "wb" alone raises TypeError on Python 3.
    if sys.version_info[0] >= 3:
        f = open("output.csv", "w", newline="")
    else:
        f = open("output.csv", "wb")
    with f:
        writer = csv.writer(f, delimiter=';')
        writer.writerows(output)
    return writer
# working example (shorter date range)
# NOTE: every call writes to the same "output.csv", so a later call
# overwrites the file produced by an earlier one.
MainFun ("ATLANTA", "1/1/2014", "30/01/2014")
# not working example (longer date range; per the question, only part of
# the data comes back)
MainFun ("ATLANTA", "1/1/2014", "30/03/2014")
可能是站点自动检测抓取程序并阻止了您。尝试在某处添加一个小的 sleep
语句,让他们的服务器有时间喘息。无论如何,这通常是一种礼貌的做法。
# Suggested throttle: pause between requests so the server is not hit in
# rapid succession.
from time import sleep
sleep(1) # pauses 1 second
我的 Windows 环境好像有问题。该代码在基于 Debian 的虚拟机和 Python virtualenv 下运行良好。