为什么我无法在 Python 中抓取此 link?
Why can't I crawl this link in Python?
我正在尝试抓取网页内容,但我不明白为什么会出现此错误:http.client.IncompleteRead: IncompleteRead(2268 bytes read, 612 more expected)
这是我正在尝试抓取的 link:www.rc2.vd.ch
这是我用来抓取的 Python 代码:
import requests
from bs4 import BeautifulSoup


def spider_list():
    """Fetch the company-search results page and print every <a class="hoverable"> tag.

    Improvements over the failing version: a browser-like User-Agent and an
    explicit timeout (some servers truncate chunked responses for bot-looking
    clients, which surfaces as http.client.IncompleteRead), plus an HTTP
    status check so an error page is not silently parsed.
    """
    url = 'http://www.rc2.vd.ch/registres/hrcintapp-pub/companySearch.action?lang=FR&init=false&advancedMode=false&printMode=false&ofpCriteria=N&actualDate=18.08.2015&rowMin=0&rowMax=0&listSize=0&go=none&showHeader=false&companyName=&companyNameSearchType=CONTAIN&companyOfsUid=&companyOfrcId13Part1=&companyOfrcId13Part2=&companyOfrcId13Part3=&limitResultCompanyActive=ACTIVE&searchRows=51&resultFormat=STD_COMP_NAME&display=Rechercher#result'
    # NOTE(review): presumably the server misbehaves on the default
    # python-requests User-Agent — a browser-like one is a common workaround.
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()  # fail loudly on 4xx/5xx instead of parsing an error page
    soup = BeautifulSoup(response.text, 'html.parser')
    for link in soup.find_all('a', {'class': 'hoverable'}):
        print(link)


spider_list()
我试过另一个网站 link,它工作正常,但为什么我不能抓取这个网站?
如果无法使用此代码执行此操作,那我该怎么做?
------------编辑------------
这是完整的错误信息:
Traceback (most recent call last):
File "C:/Users/Nuriddin/PycharmProjects/project/a.py", line 19, in <module>
spider_list()
File "C:/Users/Nuriddin/PycharmProjects/project/a.py", line 12, in spider_list
source_code = requests.get(url)
File "C:\Python34\lib\site-packages\requests\api.py", line 69, in get
return request('get', url, params=params, **kwargs)
File "C:\Python34\lib\site-packages\requests\api.py", line 50, in request
response = session.request(method=method, url=url, **kwargs)
File "C:\Python34\lib\site-packages\requests\sessions.py", line 465, in request
resp = self.send(prep, **send_kwargs)
File "C:\Python34\lib\site-packages\requests\sessions.py", line 605, in send
r.content
File "C:\Python34\lib\site-packages\requests\models.py", line 750, in content
self._content = bytes().join(self.iter_content(CONTENT_CHUNK_SIZE)) or bytes()
File "C:\Python34\lib\site-packages\requests\models.py", line 673, in generate
for chunk in self.raw.stream(chunk_size, decode_content=True):
File "C:\Python34\lib\site-packages\requests\packages\urllib3\response.py", line 303, in stream
for line in self.read_chunked(amt, decode_content=decode_content):
File "C:\Python34\lib\site-packages\requests\packages\urllib3\response.py", line 450, in read_chunked
chunk = self._handle_chunk(amt)
File "C:\Python34\lib\site-packages\requests\packages\urllib3\response.py", line 420, in _handle_chunk
returned_chunk = self._fp._safe_read(self.chunk_left)
File "C:\Python34\lib\http\client.py", line 664, in _safe_read
raise IncompleteRead(b''.join(s), amt)
http.client.IncompleteRead: IncompleteRead(4485 bytes read, 628 more expected)
您的编辑器可能有问题。
我在 Python 3 的 IDLE 中运行你的代码,得到了正确的结果。
下面附上图片供参考-
我唯一能想到的就是以某种方式绕过错误:
import requests
from bs4 import BeautifulSoup


def spider_list():
    """Fetch the search-results page and print every <a class="hoverable"> tag.

    Network and protocol failures (requests re-raises the underlying
    http.client.IncompleteRead as a ChunkedEncodingError, a subclass of
    RequestException) are reported rather than crashing the script.
    """
    url = 'http://www.rc2.vd.ch/registres/hrcintapp-pub/companySearch.action?lang=FR&init=false&advancedMode=false&printMode=false&ofpCriteria=N&actualDate=18.08.2015&rowMin=0&rowMax=0&listSize=0&go=none&showHeader=false&companyName=&companyNameSearchType=CONTAIN&companyOfsUid=&companyOfrcId13Part1=&companyOfrcId13Part2=&companyOfrcId13Part3=&limitResultCompanyActive=ACTIVE&searchRows=51&resultFormat=STD_COMP_NAME&display=Rechercher#result'
    try:
        # Keep the try body minimal: only the network call can raise here.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as exc:
        # Narrow handler: catch only network/HTTP errors, never a bare
        # `except: pass`, which would also hide programming bugs.
        print('request failed:', exc)
        return
    soup = BeautifulSoup(response.text, 'html.parser')
    for link in soup.find_all('a', {'class': 'hoverable'}):
        print(link)


spider_list()
如果有帮助请告诉我。
这个怎么样!!
import requests
from lxml.html import fromstring

url = 'https://www.rc2.vd.ch/registres/hrcintapp-pub/companySearch.action?lang=FR&init=false&advancedMode=false&printMode=false&ofpCriteria=N&actualDate=18.08.2015&rowMin=0&rowMax=0&listSize=0&go=none&showHeader=false&companyName=&companyNameSearchType=CONTAIN&companyOfsUid=&companyOfrcId13Part1=&companyOfrcId13Part2=&companyOfrcId13Part3=&limitResultCompanyActive=ACTIVE&searchRows=51&resultFormat=STD_COMP_NAME&display=Rechercher#result'


def spider_list(link):
    """Download *link* and print the href of every <a class="hoverable">."""
    response = requests.get(link)
    document = fromstring(response.text)
    hrefs = document.xpath('//a[@class="hoverable"]/@href')
    print(hrefs)


if __name__ == '__main__':
    spider_list(url)
我正在尝试抓取网页内容,但我不明白为什么会出现此错误:http.client.IncompleteRead: IncompleteRead(2268 bytes read, 612 more expected)
这是我正在尝试抓取的 link:www.rc2.vd.ch
这是我用来抓取的 Python 代码:
import requests
from bs4 import BeautifulSoup


def spider_list():
    """Fetch the company-search results page and print every <a class="hoverable"> tag.

    Improvements over the failing version: a browser-like User-Agent and an
    explicit timeout (some servers truncate chunked responses for bot-looking
    clients, which surfaces as http.client.IncompleteRead), plus an HTTP
    status check so an error page is not silently parsed.
    """
    url = 'http://www.rc2.vd.ch/registres/hrcintapp-pub/companySearch.action?lang=FR&init=false&advancedMode=false&printMode=false&ofpCriteria=N&actualDate=18.08.2015&rowMin=0&rowMax=0&listSize=0&go=none&showHeader=false&companyName=&companyNameSearchType=CONTAIN&companyOfsUid=&companyOfrcId13Part1=&companyOfrcId13Part2=&companyOfrcId13Part3=&limitResultCompanyActive=ACTIVE&searchRows=51&resultFormat=STD_COMP_NAME&display=Rechercher#result'
    # NOTE(review): presumably the server misbehaves on the default
    # python-requests User-Agent — a browser-like one is a common workaround.
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()  # fail loudly on 4xx/5xx instead of parsing an error page
    soup = BeautifulSoup(response.text, 'html.parser')
    for link in soup.find_all('a', {'class': 'hoverable'}):
        print(link)


spider_list()
我试过另一个网站 link,它工作正常,但为什么我不能抓取这个网站?
如果无法使用此代码执行此操作,那我该怎么做?
------------编辑------------
这是完整的错误信息:
Traceback (most recent call last):
File "C:/Users/Nuriddin/PycharmProjects/project/a.py", line 19, in <module>
spider_list()
File "C:/Users/Nuriddin/PycharmProjects/project/a.py", line 12, in spider_list
source_code = requests.get(url)
File "C:\Python34\lib\site-packages\requests\api.py", line 69, in get
return request('get', url, params=params, **kwargs)
File "C:\Python34\lib\site-packages\requests\api.py", line 50, in request
response = session.request(method=method, url=url, **kwargs)
File "C:\Python34\lib\site-packages\requests\sessions.py", line 465, in request
resp = self.send(prep, **send_kwargs)
File "C:\Python34\lib\site-packages\requests\sessions.py", line 605, in send
r.content
File "C:\Python34\lib\site-packages\requests\models.py", line 750, in content
self._content = bytes().join(self.iter_content(CONTENT_CHUNK_SIZE)) or bytes()
File "C:\Python34\lib\site-packages\requests\models.py", line 673, in generate
for chunk in self.raw.stream(chunk_size, decode_content=True):
File "C:\Python34\lib\site-packages\requests\packages\urllib3\response.py", line 303, in stream
for line in self.read_chunked(amt, decode_content=decode_content):
File "C:\Python34\lib\site-packages\requests\packages\urllib3\response.py", line 450, in read_chunked
chunk = self._handle_chunk(amt)
File "C:\Python34\lib\site-packages\requests\packages\urllib3\response.py", line 420, in _handle_chunk
returned_chunk = self._fp._safe_read(self.chunk_left)
File "C:\Python34\lib\http\client.py", line 664, in _safe_read
raise IncompleteRead(b''.join(s), amt)
http.client.IncompleteRead: IncompleteRead(4485 bytes read, 628 more expected)
您的编辑器可能有问题。
我在 Python 3 的 IDLE 中运行你的代码,得到了正确的结果。
下面附上图片供参考-
我唯一能想到的就是以某种方式绕过错误:
import requests
from bs4 import BeautifulSoup


def spider_list():
    """Fetch the search-results page and print every <a class="hoverable"> tag.

    Network and protocol failures (requests re-raises the underlying
    http.client.IncompleteRead as a ChunkedEncodingError, a subclass of
    RequestException) are reported rather than crashing the script.
    """
    url = 'http://www.rc2.vd.ch/registres/hrcintapp-pub/companySearch.action?lang=FR&init=false&advancedMode=false&printMode=false&ofpCriteria=N&actualDate=18.08.2015&rowMin=0&rowMax=0&listSize=0&go=none&showHeader=false&companyName=&companyNameSearchType=CONTAIN&companyOfsUid=&companyOfrcId13Part1=&companyOfrcId13Part2=&companyOfrcId13Part3=&limitResultCompanyActive=ACTIVE&searchRows=51&resultFormat=STD_COMP_NAME&display=Rechercher#result'
    try:
        # Keep the try body minimal: only the network call can raise here.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as exc:
        # Narrow handler: catch only network/HTTP errors, never a bare
        # `except: pass`, which would also hide programming bugs.
        print('request failed:', exc)
        return
    soup = BeautifulSoup(response.text, 'html.parser')
    for link in soup.find_all('a', {'class': 'hoverable'}):
        print(link)


spider_list()
如果有帮助请告诉我。
这个怎么样!!
import requests
from lxml.html import fromstring

url = 'https://www.rc2.vd.ch/registres/hrcintapp-pub/companySearch.action?lang=FR&init=false&advancedMode=false&printMode=false&ofpCriteria=N&actualDate=18.08.2015&rowMin=0&rowMax=0&listSize=0&go=none&showHeader=false&companyName=&companyNameSearchType=CONTAIN&companyOfsUid=&companyOfrcId13Part1=&companyOfrcId13Part2=&companyOfrcId13Part3=&limitResultCompanyActive=ACTIVE&searchRows=51&resultFormat=STD_COMP_NAME&display=Rechercher#result'


def spider_list(link):
    """Download *link* and print the href of every <a class="hoverable">."""
    response = requests.get(link)
    document = fromstring(response.text)
    hrefs = document.xpath('//a[@class="hoverable"]/@href')
    print(hrefs)


if __name__ == '__main__':
    spider_list(url)