Scraping files from Google Drive - automated queries prevented by Google
I want to scrape some PDF files from a great history crash course I read a long time ago. Sadly, the old website is down, and I only managed to recover the old HTML code from archive.org
(the links I extracted work fine, e.g.: https://drive.google.com/file/d/0BzRJiIvdbSoKcHpGUWJBUDZ2WDA/edit?usp=sharing).
The script below downloads HTML files instead of the PDFs, saying:
"We're sorry, but your computer or network may be sending automated queries. To protect our users, we can't process your request right now."
Is there a way around this? I did add some random delays to the code, so maybe that wasn't enough, or maybe I'm on Google's blacklist by now.
(The text.txt file can be found here: https://filebin.net/k2qw09embamx05ey)
import requests
import time
import random


def download_file_from_google_drive(id, destination):
    URL = "https://docs.google.com/uc?export=download"
    session = requests.Session()
    response = session.get(URL, params={'id': id}, stream=True)
    token = get_confirm_token(response)
    time.sleep(random.randrange(1, 2))
    if token:
        # Large files need a second request carrying the confirm token
        params = {'id': id, 'confirm': token}
        response = session.get(URL, params=params, stream=True)
    save_response_content(response, destination)


def get_confirm_token(response):
    # Google sets a download_warning cookie for files too large to
    # virus-scan, which require an explicit confirmation step
    for key, value in response.cookies.items():
        if key.startswith('download_warning'):
            return value
    return None


def save_response_content(response, destination):
    CHUNK_SIZE = 32768
    with open(destination, "wb") as f:
        for chunk in response.iter_content(CHUNK_SIZE):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)


# Pull every line of the archived HTML that mentions a Drive link
with open('text.txt') as f:
    long_string = f.readlines()

interesting_strings = []
for item in long_string:
    if 'drive.google' in item:
        interesting_strings.append(item)
print(interesting_strings)

# All links sit on one line; split on the archive.org prefix and
# cut each fragment at the closing quote to get the bare Drive URL
interesting_strings = interesting_strings[0]
interesting_strings = interesting_strings.split('https://web.archive.org/web/20161219093036/')
links = []
for item in interesting_strings:
    if 'drive.google' in item:
        idx = item.find('"')
        links.append(item[:idx])

cntr = 1
for link in links:
    print(link)
    fname = './data/History_' + str(cntr)
    file_id = link.split('/')[-2]  # .../file/d/<id>/edit -> <id>
    print('id:', file_id)
    destination = fname
    download_file_from_google_drive(file_id, destination)
    print('Getting file #', str(cntr))
    cntr += 1
    time.sleep(random.randrange(3, 15) + random.random())
Use gdown, which handles Google Drive's confirmation step for you:
import gdown
file_id = '0BzRJiIvdbSoKcHpGUWJBUDZ2WDA'
filename = 'file.pdf'
url = 'https://drive.google.com/uc?id=' + file_id
gdown.download(url, filename, quiet=False)
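For the whole course, the same call can replace the hand-rolled session logic. Here is a minimal sketch of the loop, reusing the link list and ./data/History_ naming from the question (the .pdf extension and the single example link standing in for the full list are assumptions):

import os
import time
import random

import gdown

# One archived link from the question, standing in for the extracted list
links = ['https://drive.google.com/file/d/0BzRJiIvdbSoKcHpGUWJBUDZ2WDA/edit?usp=sharing']

os.makedirs('./data', exist_ok=True)
for cntr, link in enumerate(links, start=1):
    file_id = link.split('/')[-2]  # .../file/d/<id>/edit -> <id>
    url = 'https://drive.google.com/uc?id=' + file_id
    destination = './data/History_' + str(cntr) + '.pdf'
    gdown.download(url, destination, quiet=False)
    time.sleep(random.randrange(3, 15) + random.random())  # keep the polite delay

gdown maintains a session and walks through the download_warning confirmation itself; combined with a delay between files, that is often enough to avoid the "automated queries" block.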