Use multiprocessing or multithreading to improve scraping speed in Python
I have a crawler whose code looks like this:
import requests
import pandas as pd
from bs4 import BeautifulSoup

def crawl(id):
    try:
        url = 'https://www.china0001.com.cn/project/{0:06d}.html'.format(id)
        print(url)
        content = requests.get(url).text
        soup = BeautifulSoup(content, 'lxml')
        # the data sits in the tbody with class "jg" inside the table "mse_new"
        tbody = soup.find("table", attrs={"id": "mse_new"}).find("tbody", attrs={"class": "jg"})
        tr = tbody.find_all("tr")
        rows = []
        for i in tr[1:]:  # skip the header row
            rows.append([j.text.strip() for j in i.find_all("td")])
        # each cell holds a "key:value" pair; collect them all into one dict
        out = dict([map(str.strip, y.split(':')) for x in rows for y in x])
        return out
    except AttributeError:
        # the page has no such table (e.g. the id does not exist)
        return False
data = list()
for id in range(699998, 700010):
    print(id)
    res = crawl(id)
    if res:
        data.append(res)

if len(data) > 0:
    df = pd.DataFrame(data)
    df.to_excel('test.xlsx', index=False)
It works, but it takes a very long time once I widen the range, so I think I may need to use multithreading or multiprocessing, or split range() into several chunks, to speed up the scraping. I just don't know how to do that. Can anyone help? Thanks in advance.
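(For reference, the chunk-splitting idea could look something like the sketch below. It assumes the crawl function defined above; CHUNK_SIZE, the process count, and the crawl_chunk helper are illustrative choices, not tested code.)

from multiprocessing import Pool

CHUNK_SIZE = 100  # arbitrary: each worker task handles one chunk of ids

def crawl_chunk(chunk):
    # crawl each id in the chunk sequentially, keeping only the successes
    return [res for res in map(crawl, chunk) if res]

if __name__ == '__main__':
    ids = list(range(699998, 700050))
    chunks = [ids[i:i + CHUNK_SIZE] for i in range(0, len(ids), CHUNK_SIZE)]
    with Pool(processes=4) as pool:  # 4 worker processes, also arbitrary
        data = [row for rows in pool.map(crawl_chunk, chunks) for row in rows]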
Update:
from concurrent import futures

MAX_WORKERS = 20  # play with it to get an optimal value
ids = list(range(699998, 700050))
workers = min(MAX_WORKERS, len(ids))

data = list()
with futures.ThreadPoolExecutor(workers) as executor:
    # map() runs crawl concurrently over all ids and yields the
    # results in the same order as ids
    for res in executor.map(crawl, ids):
        if res:
            data.append(res)
Here is how I would handle it:
from concurrent import futures

MAX_WORKERS = 20  # play with it to get an optimal value
ids = list(range(0, 10000))
workers = min(MAX_WORKERS, len(ids))

with futures.ThreadPoolExecutor(workers) as executor:
    # map() blocks until every crawl() call has finished
    results = list(executor.map(crawl, ids))
data = [res for res in results if res]  # drop the False entries
I haven't tested it yet, but I hope it gives you a direction to explore.
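One caveat with executor.map: its result iterator re-raises the first exception a worker hit, which aborts the loop over the remaining results. If individual ids may fail with network errors, futures.as_completed lets you handle each one separately. A rough, untested sketch along the same lines (worker count as above; note that results arrive in completion order, not id order):

import requests
from concurrent import futures

MAX_WORKERS = 20
ids = list(range(699998, 700050))
workers = min(MAX_WORKERS, len(ids))

data = list()
with futures.ThreadPoolExecutor(workers) as executor:
    # one future per id, so each request can fail independently
    future_to_id = {executor.submit(crawl, i): i for i in ids}
    for future in futures.as_completed(future_to_id):
        try:
            res = future.result()
        except requests.RequestException:
            # a single failed request no longer aborts the whole run
            print('request failed for id', future_to_id[future])
            continue
        if res:
            data.append(res)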