使用线程池下载多个 url
Downloading multiple urls with threadpool
我在下载多个 url 时遇到了问题。
我的代码每次仍然只下载 1 个 url,必须等第一个下载完成后才会开始下载下一个。
我想同时下载 3 个 url。
这是我的代码:
# Header mapping holding a desktop Firefox User-Agent string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) "
        "Gecko/20100101 Firefox/67.0"
    ),
}
def download(path, video_url, bar: tqdm):
    """Stream ``video_url`` into the file at ``path``, advancing ``bar``.

    Args:
        path: Destination file path; opened in binary write mode.
        video_url: URL to fetch.
        bar: tqdm progress bar, advanced by the size of each chunk written.
    """
    # ``headers`` must be passed by keyword: the second positional parameter
    # of requests.get() is ``params`` (the query string), not the headers.
    # The ``with`` block releases the streamed connection when done.
    with requests.get(video_url, headers=headers, stream=True) as res, \
            open(path, 'wb') as f:
        for b in res.iter_content(1024):
            f.write(b)
            bar.update(len(b))
def get_length(video_url):
    """Return the ``Content-Length`` reported for ``video_url`` as an int.

    Args:
        video_url: URL to query.

    Raises:
        KeyError: If the response has no ``Content-Length`` header.
    """
    # Pass ``headers`` by keyword (the second positional slot is ``params``)
    # and close the streamed response so the connection is not left open;
    # only the response headers are needed here, never the body.
    with requests.get(video_url, headers=headers, stream=True) as res:
        return int(res.headers['Content-Length'])
# Download every URL in `urls`, showing one tqdm progress bar per file.
# `thread` is the worker count handed to ThreadPool; its default,
# cpu_count(), is evaluated once, when this def statement runs.
def download_all(urls: list, thread: int = cpu_count()):
    total = len(urls)
    count = 0
    pool = ThreadPool(thread)  # worker threads intended to run the downloads
    for url in urls:
        output_file = get_url_path(url)
        count += 1
        # get_length() issues its own GET just to read Content-Length;
        # download() then requests the same URL a second time.
        content_length = get_length(video_url=url)
        with tqdm(total=content_length, unit='B', ncols=(150-1), desc=f'Downloading {count} of {total}', unit_divisor=1024, ascii=True, unit_scale=True) as bar:
            # NOTE(review): download(...) is *called* right here, in the
            # calling thread, and only its return value (None) is passed to
            # apply_async -- so the files are fetched one after another.
            pool.apply_async(download(output_file, url, bar))
    # No further tasks are submitted; wait for the workers to finish.
    pool.close()
    pool.join()
# Script entry: load the URL list from 'urls.txt' and download everything.
# read_lines() is defined elsewhere; presumably it returns one URL per
# line -- TODO confirm.
urls = read_lines('urls.txt')
download_all(urls)
这一行
pool.apply_async(download(output_file, url, bar))
必须是
pool.apply_async(download, (output_file, url, bar))
否则您会在当前线程中立即调用 download
函数,而不是将它(及其参数)交给线程池去执行。
编辑
使用 starmap
将 url 映射到执行下载的 func
(顺便说一句:这样可以省去重复的 get 请求)。另外为 tqdm 添加 position
参数。
老实说,进度条的显示不是很流畅,但我确实没有使用 tqdm
或 ThreadPool
的经验。但总的来说,下载似乎可以正常工作。
def download_all(urls: list, thread: int = cpu_count()):
    """Download every URL in ``urls`` concurrently on a thread pool.

    Each URL is streamed to the path returned by ``get_url_path(url)`` and
    gets its own tqdm progress bar, placed on its own terminal row.

    Args:
        urls: URLs to download.
        thread: Number of worker threads in the pool.
    """
    if not urls:
        return  # nothing to download; do not spin up a pool

    total = len(urls)

    def func(count, url):
        """Stream one URL to disk, reporting progress on bar row ``count``."""
        output_file = get_url_path(url)
        # A single GET serves both the size lookup and the body; the
        # ``with`` block releases the connection even if writing fails.
        with requests.get(url, headers=headers, stream=True) as req:
            # Content-Length can be absent (e.g. chunked responses); tqdm
            # accepts total=None and then shows only basic statistics.
            length = req.headers.get('Content-Length')
            content_length = int(length) if length is not None else None
            with tqdm(total=content_length, unit='B', desc=f'Downloading {count + 1} of {total}',
                      unit_divisor=1024, ascii=True, unit_scale=True, position=count, file=sys.stdout) as bar:
                with open(output_file, 'wb') as f:
                    for b in req.iter_content(1024):
                        f.write(b)
                        bar.update(len(b))

    pool = ThreadPool(thread)
    try:
        # starmap unpacks each (index, url) pair into func(count, url) and
        # blocks until every download has finished.
        pool.starmap(func, enumerate(urls))
    finally:
        # Always shut the pool down, even when a download raised.
        pool.close()
        pool.join()
我在下载多个 url 时遇到了问题。我的代码每次仍然只下载 1 个 url,必须等第一个下载完成后才会开始下载下一个。
我想同时下载 3 个 url。
这是我的代码:
# Header mapping holding a desktop Firefox User-Agent string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) "
        "Gecko/20100101 Firefox/67.0"
    ),
}
def download(path, video_url, bar: tqdm):
    """Stream ``video_url`` into the file at ``path``, advancing ``bar``.

    Args:
        path: Destination file path; opened in binary write mode.
        video_url: URL to fetch.
        bar: tqdm progress bar, advanced by the size of each chunk written.
    """
    # ``headers`` must be passed by keyword: the second positional parameter
    # of requests.get() is ``params`` (the query string), not the headers.
    # The ``with`` block releases the streamed connection when done.
    with requests.get(video_url, headers=headers, stream=True) as res, \
            open(path, 'wb') as f:
        for b in res.iter_content(1024):
            f.write(b)
            bar.update(len(b))
def get_length(video_url):
    """Return the ``Content-Length`` reported for ``video_url`` as an int.

    Args:
        video_url: URL to query.

    Raises:
        KeyError: If the response has no ``Content-Length`` header.
    """
    # Pass ``headers`` by keyword (the second positional slot is ``params``)
    # and close the streamed response so the connection is not left open;
    # only the response headers are needed here, never the body.
    with requests.get(video_url, headers=headers, stream=True) as res:
        return int(res.headers['Content-Length'])
# Download every URL in `urls`, showing one tqdm progress bar per file.
# `thread` is the worker count handed to ThreadPool; its default,
# cpu_count(), is evaluated once, when this def statement runs.
def download_all(urls: list, thread: int = cpu_count()):
    total = len(urls)
    count = 0
    pool = ThreadPool(thread)  # worker threads intended to run the downloads
    for url in urls:
        output_file = get_url_path(url)
        count += 1
        # get_length() issues its own GET just to read Content-Length;
        # download() then requests the same URL a second time.
        content_length = get_length(video_url=url)
        with tqdm(total=content_length, unit='B', ncols=(150-1), desc=f'Downloading {count} of {total}', unit_divisor=1024, ascii=True, unit_scale=True) as bar:
            # NOTE(review): download(...) is *called* right here, in the
            # calling thread, and only its return value (None) is passed to
            # apply_async -- so the files are fetched one after another.
            pool.apply_async(download(output_file, url, bar))
    # No further tasks are submitted; wait for the workers to finish.
    pool.close()
    pool.join()
# Script entry: load the URL list from 'urls.txt' and download everything.
# read_lines() is defined elsewhere; presumably it returns one URL per
# line -- TODO confirm.
urls = read_lines('urls.txt')
download_all(urls)
这一行
pool.apply_async(download(output_file, url, bar))
必须是
pool.apply_async(download, (output_file, url, bar))
否则您会在当前线程中立即调用 download
函数,而不是将它(及其参数)交给线程池去执行。
编辑
使用 starmap
将 url 映射到执行下载的 func
(顺便说一句:这样可以省去重复的 get 请求)。另外为 tqdm 添加 position
参数。
老实说,进度条的显示不是很流畅,但我确实没有使用 tqdm
或 ThreadPool
的经验。但总的来说,下载似乎可以正常工作。
def download_all(urls: list, thread: int = cpu_count()):
    """Download every URL in ``urls`` concurrently on a thread pool.

    Each URL is streamed to the path returned by ``get_url_path(url)`` and
    gets its own tqdm progress bar, placed on its own terminal row.

    Args:
        urls: URLs to download.
        thread: Number of worker threads in the pool.
    """
    if not urls:
        return  # nothing to download; do not spin up a pool

    total = len(urls)

    def func(count, url):
        """Stream one URL to disk, reporting progress on bar row ``count``."""
        output_file = get_url_path(url)
        # A single GET serves both the size lookup and the body; the
        # ``with`` block releases the connection even if writing fails.
        with requests.get(url, headers=headers, stream=True) as req:
            # Content-Length can be absent (e.g. chunked responses); tqdm
            # accepts total=None and then shows only basic statistics.
            length = req.headers.get('Content-Length')
            content_length = int(length) if length is not None else None
            with tqdm(total=content_length, unit='B', desc=f'Downloading {count + 1} of {total}',
                      unit_divisor=1024, ascii=True, unit_scale=True, position=count, file=sys.stdout) as bar:
                with open(output_file, 'wb') as f:
                    for b in req.iter_content(1024):
                        f.write(b)
                        bar.update(len(b))

    pool = ThreadPool(thread)
    try:
        # starmap unpacks each (index, url) pair into func(count, url) and
        # blocks until every download has finished.
        pool.starmap(func, enumerate(urls))
    finally:
        # Always shut the pool down, even when a download raised.
        pool.close()
        pool.join()