如何在 Web 抓取大量数据时摆脱连接错误?
How to get rid of ConnectionError during web scraping for huge data?
我正在尝试从 link 的搜索结果中抓取数据,目标是大约 2099 个项目。但是当我运行脚本时会遇到 ConnectionError,只能抓到大约 130 个项目。我知道有些网站在一段时间内只允许少量请求,但这通常可以用下面的方式解决:
完成
try:
code
except:
time.sleep(60)
continue
但在我的情况下它不起作用。
这是我使用 JSON 技术的所有代码。
import pandas as pd
import requests
import time

# Identify as a regular browser; some servers reject requests without a UA.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',}

# BUG FIX: `limit` must equal the offset step. The original URL had limit=1
# while the loop advanced offset by 10, so 9 of every 10 results were
# silently skipped (~210 of 2099 fetched at best).
url = "https://www2.daad.de/deutschland/studienangebote/international-programmes/api/solr/en/search.json?cert=&admReq=&scholarshipLC=&scholarshipSC=&langDeAvailable=&langEnAvailable=&sort=4&q=&limit=10&offset={}&display=list&isSep="

# One list per output column; rows are appended in lock-step.
listofanchor = []
courses = []
academy = []
city = []
languages = []
applicationDeadline = []
programmeDuration = []
beginning = []
subject = []
tuitionFees = []
image = []
dateString = []

# Reuse a single TCP connection for every page -- far fewer ConnectionErrors
# than opening a fresh connection per request.
session = requests.Session()
session.headers.update(headers)

pagenum = 0
while pagenum < 2100:
    finalUrl = url.format(pagenum)
    try:
        res = session.get(finalUrl, timeout=30).json()
    except requests.exceptions.ConnectionError:
        # Server is throttling: back off for a minute, then retry the SAME
        # offset instead of losing this page.
        time.sleep(60)
        continue
    for item in res['courses']:
        listofanchor.append(item['link'])
        courses.append(item['courseName'])
        academy.append(item['academy'])
        city.append(item['city'])
        languages.append(item['languages'])
        applicationDeadline.append(item['applicationDeadline'])
        programmeDuration.append(item['programmeDuration'])
        beginning.append(item['beginning'])
        subject.append(item['subject'])
        tuitionFees.append(item['tuitionFees'])
        image.append(item['image'])
        dateString.append(item['dateString'])
    pagenum = pagenum + 10

# BUG FIX: applicationDeadline was collected but omitted from the frame.
df = pd.DataFrame({"link": listofanchor, "courseName": courses, "academy": academy,
                   "city": city, "languages": languages,
                   "applicationDeadline": applicationDeadline,
                   "programmeDuration": programmeDuration, "beginning": beginning,
                   "subject": subject, "tuitionFees": tuitionFees,
                   "image": image, "dateString": dateString})
或者还有其他机会从 this 搜索结果中抓取数据吗?
import requests
import pandas as pd

# Ask the API for all 2099 results in ONE request (limit=2099, empty offset)
# instead of paging -- one connection, no throttling window to hit.
r = requests.get("https://www2.daad.de/deutschland/studienangebote/international-programmes/api/solr/en/search.json?cert=&admReq=&scholarshipLC=&scholarshipSC=&fos=&langDeAvailable=&langEnAvailable=&sort=4&q=&limit=2099&offset=&display=list&isSep=").json()

# Column names double as the JSON keys read from each course record.
columns = ["link", "courseName", "academy",
           "city", "languages", "programmeDuration", "beginning", "subject", "tuitionFees", "image", "dateString"]

# One row (list of field values) per course, in column order.
rows = [[item[key] for key in columns] for item in r["courses"]]

# FIX: the original `pd.DataFrame(*[d], columns=data)` unpacked a one-element
# list -- identical to passing the list directly, just harder to read.
df = pd.DataFrame(rows, columns=columns)
df.to_csv("data.csv", index=False)
我正在尝试从 link 的搜索结果中抓取数据,目标是大约 2099 个项目。但是当我运行脚本时会遇到 ConnectionError,只能抓到大约 130 个项目。我知道有些网站在一段时间内只允许少量请求,但这通常可以用下面的方式解决:
完成
try:
code
except:
time.sleep(60)
continue
但在我的情况下它不起作用。
这是我使用 JSON 技术的所有代码。
import pandas as pd
import requests
import time

# Identify as a regular browser; some servers reject requests without a UA.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',}

# BUG FIX: `limit` must equal the offset step. The original URL had limit=1
# while the loop advanced offset by 10, so 9 of every 10 results were
# silently skipped (~210 of 2099 fetched at best).
url = "https://www2.daad.de/deutschland/studienangebote/international-programmes/api/solr/en/search.json?cert=&admReq=&scholarshipLC=&scholarshipSC=&langDeAvailable=&langEnAvailable=&sort=4&q=&limit=10&offset={}&display=list&isSep="

# One list per output column; rows are appended in lock-step.
listofanchor = []
courses = []
academy = []
city = []
languages = []
applicationDeadline = []
programmeDuration = []
beginning = []
subject = []
tuitionFees = []
image = []
dateString = []

# Reuse a single TCP connection for every page -- far fewer ConnectionErrors
# than opening a fresh connection per request.
session = requests.Session()
session.headers.update(headers)

pagenum = 0
while pagenum < 2100:
    finalUrl = url.format(pagenum)
    try:
        res = session.get(finalUrl, timeout=30).json()
    except requests.exceptions.ConnectionError:
        # Server is throttling: back off for a minute, then retry the SAME
        # offset instead of losing this page.
        time.sleep(60)
        continue
    for item in res['courses']:
        listofanchor.append(item['link'])
        courses.append(item['courseName'])
        academy.append(item['academy'])
        city.append(item['city'])
        languages.append(item['languages'])
        applicationDeadline.append(item['applicationDeadline'])
        programmeDuration.append(item['programmeDuration'])
        beginning.append(item['beginning'])
        subject.append(item['subject'])
        tuitionFees.append(item['tuitionFees'])
        image.append(item['image'])
        dateString.append(item['dateString'])
    pagenum = pagenum + 10

# BUG FIX: applicationDeadline was collected but omitted from the frame.
df = pd.DataFrame({"link": listofanchor, "courseName": courses, "academy": academy,
                   "city": city, "languages": languages,
                   "applicationDeadline": applicationDeadline,
                   "programmeDuration": programmeDuration, "beginning": beginning,
                   "subject": subject, "tuitionFees": tuitionFees,
                   "image": image, "dateString": dateString})
或者还有其他机会从 this 搜索结果中抓取数据吗?
import requests
import pandas as pd

# Ask the API for all 2099 results in ONE request (limit=2099, empty offset)
# instead of paging -- one connection, no throttling window to hit.
r = requests.get("https://www2.daad.de/deutschland/studienangebote/international-programmes/api/solr/en/search.json?cert=&admReq=&scholarshipLC=&scholarshipSC=&fos=&langDeAvailable=&langEnAvailable=&sort=4&q=&limit=2099&offset=&display=list&isSep=").json()

# Column names double as the JSON keys read from each course record.
columns = ["link", "courseName", "academy",
           "city", "languages", "programmeDuration", "beginning", "subject", "tuitionFees", "image", "dateString"]

# One row (list of field values) per course, in column order.
rows = [[item[key] for key in columns] for item in r["courses"]]

# FIX: the original `pd.DataFrame(*[d], columns=data)` unpacked a one-element
# list -- identical to passing the list directly, just harder to read.
df = pd.DataFrame(rows, columns=columns)
df.to_csv("data.csv", index=False)