Unable to crawl multiple URLs
I have two functions that scrape a web page, look for a specific class, and pull the href attribute out of the matching tags.
url="https://www.poynter.org/ifcn-covid-19-misinformation/page/220/"
def url_parse(site):
hdr = {'User-Agent': 'Mozilla/5.0'}
req = Request(site,headers=hdr)
page = urlopen(req)
soup = BeautifulSoup(page)
return soup
def article_link(URL):
try:
soup=url_parse(URL)
for i in soup.find_all("a", class_="button entry-content__button entry-content__button--smaller"):
link=i['href']
except:
pass
return link
data['article_source']=""
for i, rows in data.iterrows():
rows['article_source']= article_link(rows['url'])
Problem
The functions url_parse and article_link work fine on their own, but when I use article_link to fill the cells of the dataframe, it stops working after roughly 1000 to 1500 URLs. I suspect my laptop's IP address is being blocked, but there is no error message, so I don't know how to confirm or fix it.
Expected
The function article_link should parse every URL in the dataframe.
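Before looking at a full rewrite, one way to narrow down the silent stop is to drop the bare except so a blocked request actually reports something. Below is a minimal diagnostic sketch, not the original poster's code: it reuses url_parse and data from above, and the article_link_debug name, the 0.5-second delay, and the use of DataFrame.at are illustrative choices.

import time
from urllib.error import HTTPError, URLError

def article_link_debug(page_url):
    # Same lookup as article_link, but failures are reported instead of hidden.
    try:
        soup = url_parse(page_url)
        links = [a['href'] for a in soup.find_all(
            "a", class_="button entry-content__button entry-content__button--smaller")]
        return links[-1] if links else None
    except HTTPError as e:
        # A 403 or 429 here is a strong hint the site is rate-limiting or blocking the IP.
        print(f"HTTP {e.code} for {page_url}")
    except URLError as e:
        print(f"Network error for {page_url}: {e.reason}")
    return None

data['article_source'] = ""
for i, row in data.iterrows():
    data.at[i, 'article_source'] = article_link_debug(row['url'])
    time.sleep(0.5)  # small pause between requests; the exact value is a guess

If the printed errors turn out to be 403 or 429 responses, slowing down or spacing out the requests is the direction to take. The answer below goes a different route: it switches to requests and fetches all listing pages concurrently with a thread pool.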
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

url = "https://www.poynter.org/ifcn-covid-19-misinformation/page/{}/"

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0'
}

def main(url, num):
    # Fetch one listing page and collect every matching button href on it.
    with requests.Session() as req:
        print(f"Extracting Page# {num}")
        r = req.get(url.format(num), headers=headers)
        soup = BeautifulSoup(r.content, 'html.parser')
        links = [item.get("href") for item in soup.find_all(
            "a", class_="button entry-content__button entry-content__button--smaller")]
        return links

# Crawl pages 1-237 concurrently and print each page's links as it completes.
with ThreadPoolExecutor(max_workers=50) as executor:
    futures = [executor.submit(main, url, num) for num in range(1, 238)]
    for future in futures:
        print(future.result())
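The answer only prints each page's list of links, while the question wants them in a dataframe column. Here is a short sketch of one way to collect the results, building on main() and url from the answer above; the pandas usage, the articles frame name, and the lower worker count are assumptions, not part of the answer.

import pandas as pd
from concurrent.futures import ThreadPoolExecutor

# Reuses main() and url defined in the answer above.
with ThreadPoolExecutor(max_workers=10) as executor:  # fewer workers is gentler on the site
    futures = {executor.submit(main, url, num): num for num in range(1, 238)}
    records = []
    for future, num in futures.items():
        for href in future.result():  # blocks until that page has been fetched
            records.append({"page": num, "article_source": href})

articles = pd.DataFrame(records)
print(articles.head())

Fifty workers hitting one site can itself trigger the kind of blocking described in the question, so if pages start coming back empty it is worth lowering max_workers further or adding a short delay inside main().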