Python 网络爬虫抓取 cubetutor(万智牌网站)时出现 500 状态错误
500 status error on python web crawler for cubetutor (magic the gathering site)
这是我的代码:
import requests
from bs4 import BeautifulSoup as bs4
# Session cookie captured by hand from a browser session on cubetutor.com.
cookies = {
    'JSESSIONID': '15EA1C17E103E8206BAFFF73FA157231',
}

# Browser-like headers sent with the form submission (marks it as an XHR).
headers = {
    'Pragma': 'no-cache',
    'Origin': 'http://www.cubetutor.com',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-US,en;q=0.9',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Accept': '*/*',
    'Cache-Control': 'no-cache',
    'X-Requested-With': 'XMLHttpRequest',
    'Connection': 'keep-alive',
    'Referer': 'http://www.cubetutor.com/topcardsbyset/1;jsessionid=15EA1C17E103E8206BAFFF73FA157231',
}

# Form fields for the "top cards by set" form; 'setSelect' picks the set code.
data = {
    't:ac': '1',
    't:formdata': 'byjWim5rtLJcD8P4BWZe94Mn6II=:H4sIAAAAAAAAAJWOPQ4BQRSAH4lCdBIRPe1oaKiQqEQkywHezj5rZOxM5j1/l3ECcQmFzh0cQKtS2DiARPsV3/edn1DaN6A+d36EIeHhMSLpMQmTJS0coOtCqtCjXpES9MQSjl2lXSBrYhUjkxrEOUQtY0M2aeaCrW8trpVH7fYuQmECFe0yCc5OcUMC1ckad9i2mKXtSILJ0v7BC5TzaPSN/v4Z/PszC04Tc7SNN4bZuOx6STrL1+leBDj4DxyMK0MBAQAA',
    'setSelect': '10E',
    't:zoneid': 'topCardsZone',
}

# Submit the form. Fail loudly on an HTTP error status instead of letting a
# non-JSON error page surface later as a confusing JSON decode error.
reply = requests.post('http://www.cubetutor.com/topcardsbyset.topcardsform', headers=headers, cookies=cookies, data=data)
reply.raise_for_status()

# The JSON reply carries an HTML fragment under 'content'; parse it with lxml.
response = bs4(reply.json()['content'], 'lxml')

# The first text line is used as the top-level key; the remaining lines are
# paired positionally with the <a> tags found in the fragment.
text = response.text.split("\n")
file = text.pop(0)
link = response.find_all("a")
arr = {file: {}}
for i, anchor in enumerate(link):
    arr[file][text[i]] = anchor

print(file)
for label, anchor in arr[file].items():
    print(label, " : ", anchor)
从 text = response.text.split("\n") 这一行开始及其之后的内容都不重要(尚未完成,也不是问题所在)。上面的代码一切正常,但它要求我每次都去网站查看网络请求和请求头(requests/headers),并手动获取所有这些信息。我正在尝试把这个过程稍微自动化一些,但我一尝试自己创建 jsession id(这样就不必反复访问该站点),它就会返回 500 状态错误。起初我认为这可能是因为 jsession id 和 headers 不匹配,所以我把 headers 也改成了 session.headers(如下所示),但这并没有解决任何问题。如果您想查看其他提交,代码位于 https://github.com/icarus612/spiderPY-magicTG/。任何帮助都会很棒。
import requests
from bs4 import BeautifulSoup as soup
# Form fields for the "top cards by set" form; 'setSelect' picks the set code.
data = {
    't:ac': '1',
    't:formdata': 'byjWim5rtLJcD8P4BWZe94Mn6II=:H4sIAAAAAAAAAJWOPQ4BQRSAH4lCdBIRPe1oaKiQqEQkywHezj5rZOxM5j1/l3ECcQmFzh0cQKtS2DiARPsV3/edn1DaN6A+d36EIeHhMSLpMQmTJS0coOtCqtCjXpES9MQSjl2lXSBrYhUjkxrEOUQtY0M2aeaCrW8trpVH7fYuQmECFe0yCc5OcUMC1ckad9i2mKXtSILJ0v7BC5TzaPSN/v4Z/PszC04Tc7SNN4bZuOx6STrL1+leBDj4DxyMK0MBAQAA',
    'setSelect': '10E',
    't:zoneid': 'topCardsZone',
}

# Visit the landing page first so the session picks up a JSESSIONID cookie.
session = requests.Session()
r = session.get('http://www.cubetutor.com')
cookies = session.cookies.get_dict()

# NOTE: this is the session's own header mapping, so the additions below
# become defaults for every later request made through `session`.
headers = session.headers
headers['Referer'] = f"http://www.cubetutor.com/topcardsbyset/1;jsessionid={cookies['JSESSIONID']}"
headers['Pragma'] = 'no-cache'
headers['Origin'] = 'http://www.cubetutor.com'
headers['X-Requested-With'] = 'XMLHttpRequest'
print(r)

# The form must be submitted with POST. The previous session.get(..., data=...)
# sent the fields in the body of a GET request, which is the request that came
# back with HTTP 500.
response = session.post('http://www.cubetutor.com/topcardsbyset.topcardsform', headers=headers, data=data, cookies=cookies)
print(response)
#text = response.find_all(class_='compareCubeColumn').text
#link = response.find_all("a")
这似乎有效(不包括 bs4 解析):
import requests
def get_session_cookie():
    """Fetch the cubetutor landing page and return the cookies it sets.

    Returns:
        A plain dict of cookie names to values, or None when the response
        status is not OK or the response carries no cookies.
    """
    landing = requests.get("https://www.cubetutor.com")
    if not (landing.ok and landing.cookies):
        return None
    return landing.cookies.get_dict()
def post_query():
    """Submit the "top cards by set" form to cubetutor and return its JSON reply.

    The request is sent with browser-like headers, the captured form fields,
    and the cookies returned by get_session_cookie().

    Returns:
        The decoded JSON body of the response.
    """
    # NOTE: the jsessionid embedded in 'Referer' is a fixed captured value,
    # not the cookie fetched for this call.
    request_headers = {
        'Pragma': 'no-cache',
        'Origin': 'https://www.cubetutor.com',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': '*/*',
        'Cache-Control': 'no-cache',
        'X-Requested-With': 'XMLHttpRequest',
        'Connection': 'keep-alive',
        'Referer': 'https://www.cubetutor.com/topcardsbyset/1;jsessionid=15EA1C17E103E8206BAFFF73FA157231',
    }
    form_fields = {
        't:ac': '1',
        't:formdata': 'byjWim5rtLJcD8P4BWZe94Mn6II=:H4sIAAAAAAAAAJWOPQ4BQRSAH4lCdBIRPe1oaKiQqEQkywHezj5rZOxM5j1/l3ECcQmFzh0cQKtS2DiARPsV3/edn1DaN6A+d36EIeHhMSLpMQmTJS0coOtCqtCjXpES9MQSjl2lXSBrYhUjkxrEOUQtY0M2aeaCrW8trpVH7fYuQmECFe0yCc5OcUMC1ckad9i2mKXtSILJ0v7BC5TzaPSN/v4Z/PszC04Tc7SNN4bZuOx6STrL1+leBDj4DxyMK0MBAQAA',
        'setSelect': '10E',
        't:zoneid': 'topCardsZone',
    }
    fresh_cookies = get_session_cookie()
    reply = requests.post(
        'https://www.cubetutor.com/topcardsbyset.topcardsform',
        headers=request_headers,
        cookies=fresh_cookies,
        data=form_fields,
    )
    return reply.json()
# resp = post_query()
# print(resp)
# {u'content': u"<div class='centeredContainer'><div class='compa ...
这是我的代码:
import requests
from bs4 import BeautifulSoup as bs4
# Session cookie captured by hand from a browser session on cubetutor.com.
cookies = {
    'JSESSIONID': '15EA1C17E103E8206BAFFF73FA157231',
}

# Browser-like headers sent with the form submission (marks it as an XHR).
headers = {
    'Pragma': 'no-cache',
    'Origin': 'http://www.cubetutor.com',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-US,en;q=0.9',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Accept': '*/*',
    'Cache-Control': 'no-cache',
    'X-Requested-With': 'XMLHttpRequest',
    'Connection': 'keep-alive',
    'Referer': 'http://www.cubetutor.com/topcardsbyset/1;jsessionid=15EA1C17E103E8206BAFFF73FA157231',
}

# Form fields for the "top cards by set" form; 'setSelect' picks the set code.
data = {
    't:ac': '1',
    't:formdata': 'byjWim5rtLJcD8P4BWZe94Mn6II=:H4sIAAAAAAAAAJWOPQ4BQRSAH4lCdBIRPe1oaKiQqEQkywHezj5rZOxM5j1/l3ECcQmFzh0cQKtS2DiARPsV3/edn1DaN6A+d36EIeHhMSLpMQmTJS0coOtCqtCjXpES9MQSjl2lXSBrYhUjkxrEOUQtY0M2aeaCrW8trpVH7fYuQmECFe0yCc5OcUMC1ckad9i2mKXtSILJ0v7BC5TzaPSN/v4Z/PszC04Tc7SNN4bZuOx6STrL1+leBDj4DxyMK0MBAQAA',
    'setSelect': '10E',
    't:zoneid': 'topCardsZone',
}

# Submit the form. Fail loudly on an HTTP error status instead of letting a
# non-JSON error page surface later as a confusing JSON decode error.
reply = requests.post('http://www.cubetutor.com/topcardsbyset.topcardsform', headers=headers, cookies=cookies, data=data)
reply.raise_for_status()

# The JSON reply carries an HTML fragment under 'content'; parse it with lxml.
response = bs4(reply.json()['content'], 'lxml')

# The first text line is used as the top-level key; the remaining lines are
# paired positionally with the <a> tags found in the fragment.
text = response.text.split("\n")
file = text.pop(0)
link = response.find_all("a")
arr = {file: {}}
for i, anchor in enumerate(link):
    arr[file][text[i]] = anchor

print(file)
for label, anchor in arr[file].items():
    print(label, " : ", anchor)
从 text = response.text.split("\n") 这一行开始及其之后的内容都不重要(尚未完成,也不是问题所在)。上面的代码一切正常,但它要求我每次都去网站查看网络请求和请求头(requests/headers),并手动获取所有这些信息。我正在尝试把这个过程稍微自动化一些,但我一尝试自己创建 jsession id(这样就不必反复访问该站点),它就会返回 500 状态错误。起初我认为这可能是因为 jsession id 和 headers 不匹配,所以我把 headers 也改成了 session.headers(如下所示),但这并没有解决任何问题。如果您想查看其他提交,代码位于 https://github.com/icarus612/spiderPY-magicTG/。任何帮助都会很棒。
import requests
from bs4 import BeautifulSoup as soup
# Form fields for the "top cards by set" form; 'setSelect' picks the set code.
data = {
    't:ac': '1',
    't:formdata': 'byjWim5rtLJcD8P4BWZe94Mn6II=:H4sIAAAAAAAAAJWOPQ4BQRSAH4lCdBIRPe1oaKiQqEQkywHezj5rZOxM5j1/l3ECcQmFzh0cQKtS2DiARPsV3/edn1DaN6A+d36EIeHhMSLpMQmTJS0coOtCqtCjXpES9MQSjl2lXSBrYhUjkxrEOUQtY0M2aeaCrW8trpVH7fYuQmECFe0yCc5OcUMC1ckad9i2mKXtSILJ0v7BC5TzaPSN/v4Z/PszC04Tc7SNN4bZuOx6STrL1+leBDj4DxyMK0MBAQAA',
    'setSelect': '10E',
    't:zoneid': 'topCardsZone',
}

# Visit the landing page first so the session picks up a JSESSIONID cookie.
session = requests.Session()
r = session.get('http://www.cubetutor.com')
cookies = session.cookies.get_dict()

# NOTE: this is the session's own header mapping, so the additions below
# become defaults for every later request made through `session`.
headers = session.headers
headers['Referer'] = f"http://www.cubetutor.com/topcardsbyset/1;jsessionid={cookies['JSESSIONID']}"
headers['Pragma'] = 'no-cache'
headers['Origin'] = 'http://www.cubetutor.com'
headers['X-Requested-With'] = 'XMLHttpRequest'
print(r)

# The form must be submitted with POST. The previous session.get(..., data=...)
# sent the fields in the body of a GET request, which is the request that came
# back with HTTP 500.
response = session.post('http://www.cubetutor.com/topcardsbyset.topcardsform', headers=headers, data=data, cookies=cookies)
print(response)
#text = response.find_all(class_='compareCubeColumn').text
#link = response.find_all("a")
这似乎有效(不包括 bs4 解析):
import requests
def get_session_cookie():
    """Fetch the cubetutor landing page and return the cookies it sets.

    Returns:
        A plain dict of cookie names to values, or None when the response
        status is not OK or the response carries no cookies.
    """
    landing = requests.get("https://www.cubetutor.com")
    if not (landing.ok and landing.cookies):
        return None
    return landing.cookies.get_dict()
def post_query():
    """Submit the "top cards by set" form to cubetutor and return its JSON reply.

    The request is sent with browser-like headers, the captured form fields,
    and the cookies returned by get_session_cookie().

    Returns:
        The decoded JSON body of the response.
    """
    # NOTE: the jsessionid embedded in 'Referer' is a fixed captured value,
    # not the cookie fetched for this call.
    request_headers = {
        'Pragma': 'no-cache',
        'Origin': 'https://www.cubetutor.com',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': '*/*',
        'Cache-Control': 'no-cache',
        'X-Requested-With': 'XMLHttpRequest',
        'Connection': 'keep-alive',
        'Referer': 'https://www.cubetutor.com/topcardsbyset/1;jsessionid=15EA1C17E103E8206BAFFF73FA157231',
    }
    form_fields = {
        't:ac': '1',
        't:formdata': 'byjWim5rtLJcD8P4BWZe94Mn6II=:H4sIAAAAAAAAAJWOPQ4BQRSAH4lCdBIRPe1oaKiQqEQkywHezj5rZOxM5j1/l3ECcQmFzh0cQKtS2DiARPsV3/edn1DaN6A+d36EIeHhMSLpMQmTJS0coOtCqtCjXpES9MQSjl2lXSBrYhUjkxrEOUQtY0M2aeaCrW8trpVH7fYuQmECFe0yCc5OcUMC1ckad9i2mKXtSILJ0v7BC5TzaPSN/v4Z/PszC04Tc7SNN4bZuOx6STrL1+leBDj4DxyMK0MBAQAA',
        'setSelect': '10E',
        't:zoneid': 'topCardsZone',
    }
    fresh_cookies = get_session_cookie()
    reply = requests.post(
        'https://www.cubetutor.com/topcardsbyset.topcardsform',
        headers=request_headers,
        cookies=fresh_cookies,
        data=form_fields,
    )
    return reply.json()
# resp = post_query()
# print(resp)
# {u'content': u"<div class='centeredContainer'><div class='compa ...