使用 Python 列出 Google 云端硬盘中超过 10000 个文件
List more than 10000 files in google drive with python
我有一个 Google 云端硬盘文件夹,其中包含超过 10000 个子文件夹。我正在尝试使用以下代码列出这些子文件夹:
import pickle
import os.path
import io
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from numpy import cumproduct
import pandas as pd
import gdown
from pyasn1.type.constraint import ContainedSubtypeConstraint
import requests
from googleapiclient.http import MediaIoBaseDownload
import httplib2
# OAuth scope requested when authorising against Google Drive.
SCOPES = ['https://www.googleapis.com/auth/drive']

# Reuse credentials cached by a previous run, if the cache file exists.
# NOTE(review): pickle.load can execute arbitrary code from the file; keep
# token.pickle somewhere untrusted users cannot write to.
creds = None
if os.path.exists('token.pickle'):
    with open('token.pickle', 'rb') as token:
        creds = pickle.load(token)

# Without valid credentials, refresh them when a refresh token is available,
# otherwise run the interactive flow; then cache the result for next time.
if not creds or not creds.valid:
    if creds and creds.expired and creds.refresh_token:
        creds.refresh(Request())
    else:
        flow = InstalledAppFlow.from_client_secrets_file(
            'test.json', SCOPES)
        creds = flow.run_local_server(port=0)
    with open('token.pickle', 'wb') as token:
        pickle.dump(creds, token)

service = build('drive', 'v3', credentials=creds)

# List the items whose parent is the given folder.
folder_id = 'valid folder id'
query = f"parents = '{folder_id}'"
response = service.files().list(q=query).execute()
files = response.get('files')
nextPageToken = response.get('nextPageToken')

# NOTE: this is the code as posted in the question. The request inside the
# loop does not pass pageToken, so every iteration repeats the initial
# request; this is the problem the question asks about.
while nextPageToken:
    response = service.files().list(q=query).execute()
    files.extend(response.get('files'))
    nextPageToken = response.get('nextPageToken')

df = pd.DataFrame(files)
print(df)
调试时我发现它只返回了 100 个子文件夹。我该如何修改此脚本,才能列出全部 10000 多个子文件夹?
您似乎忘了在 while 循环内的请求中,用 nextPageToken 的值来设置 pageToken 参数。
应该是这样的:
# Keep requesting pages until the API stops returning a continuation token.
while nextPageToken:
    # Pass the token from the previous response so the next page is returned.
    response = service.files().list(pageToken=nextPageToken, q=query).execute()
    # Default to an empty list so a page without a 'files' key cannot crash.
    files.extend(response.get('files', []))
    nextPageToken = response.get('nextPageToken')
您可能还需要考虑增大 pageSize 参数。pageSize 是每页返回的最大文件数,可接受的值为 1 到 1000(含),默认值为 100。参见 Files.list() 的参数说明。
加上 pageSize 后的代码:
service = build('drive', 'v3', credentials=creds)
folder_id = 'valid folder id'
query = f"parents = '{folder_id}'"

# Request 1000 items per page (instead of the default 100) to cut down on
# the number of round trips.
response = service.files().list(pageSize=1000, q=query).execute()
# Default to an empty list so a response without 'files' cannot crash below.
files = response.get('files', [])
nextPageToken = response.get('nextPageToken')

# Follow the continuation token until the API stops returning one.
while nextPageToken:
    response = service.files().list(
        pageSize=1000, pageToken=nextPageToken, q=query).execute()
    files.extend(response.get('files', []))
    nextPageToken = response.get('nextPageToken')
另一个示例实现:
service = build('drive', 'v3', credentials=creds)
folder_id = 'valid folder id'
query = f"parents = '{folder_id}'"

# page_token stays None for the first request and is then taken from each
# response until no further token is returned.
page_token = None
my_files = []
while True:
    results = service.files().list(
        pageSize=1000, pageToken=page_token, q=query).execute()
    files = results.get('files', [])
    my_files.extend(files)
    page_token = results.get('nextPageToken')
    if page_token is None:
        break
我有一个 Google 云端硬盘文件夹,其中包含超过 10000 个子文件夹。我正在尝试使用以下代码列出这些子文件夹:
import pickle
import os.path
import io
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from numpy import cumproduct
import pandas as pd
import gdown
from pyasn1.type.constraint import ContainedSubtypeConstraint
import requests
from googleapiclient.http import MediaIoBaseDownload
import httplib2
# OAuth scope requested when authorising against Google Drive.
SCOPES = ['https://www.googleapis.com/auth/drive']

# Reuse credentials cached by a previous run, if the cache file exists.
# NOTE(review): pickle.load can execute arbitrary code from the file; keep
# token.pickle somewhere untrusted users cannot write to.
creds = None
if os.path.exists('token.pickle'):
    with open('token.pickle', 'rb') as token:
        creds = pickle.load(token)

# Without valid credentials, refresh them when a refresh token is available,
# otherwise run the interactive flow; then cache the result for next time.
if not creds or not creds.valid:
    if creds and creds.expired and creds.refresh_token:
        creds.refresh(Request())
    else:
        flow = InstalledAppFlow.from_client_secrets_file(
            'test.json', SCOPES)
        creds = flow.run_local_server(port=0)
    with open('token.pickle', 'wb') as token:
        pickle.dump(creds, token)

service = build('drive', 'v3', credentials=creds)

# List the items whose parent is the given folder.
folder_id = 'valid folder id'
query = f"parents = '{folder_id}'"
response = service.files().list(q=query).execute()
files = response.get('files')
nextPageToken = response.get('nextPageToken')

# NOTE: this is the code as posted in the question. The request inside the
# loop does not pass pageToken, so every iteration repeats the initial
# request; this is the problem the question asks about.
while nextPageToken:
    response = service.files().list(q=query).execute()
    files.extend(response.get('files'))
    nextPageToken = response.get('nextPageToken')

df = pd.DataFrame(files)
print(df)
调试时我发现它只返回了 100 个子文件夹。我该如何修改此脚本,才能列出全部 10000 多个子文件夹?
您似乎忘了在 while 循环内的请求中,用 nextPageToken 的值来设置 pageToken 参数。
应该是这样的:
# Keep requesting pages until the API stops returning a continuation token.
while nextPageToken:
    # Pass the token from the previous response so the next page is returned.
    response = service.files().list(pageToken=nextPageToken, q=query).execute()
    # Default to an empty list so a page without a 'files' key cannot crash.
    files.extend(response.get('files', []))
    nextPageToken = response.get('nextPageToken')
您可能还需要考虑增大 pageSize 参数。pageSize 是每页返回的最大文件数,可接受的值为 1 到 1000(含),默认值为 100。参见 Files.list() 的参数说明。
加上 pageSize 后的代码:
service = build('drive', 'v3', credentials=creds)
folder_id = 'valid folder id'
query = f"parents = '{folder_id}'"

# Request 1000 items per page (instead of the default 100) to cut down on
# the number of round trips.
response = service.files().list(pageSize=1000, q=query).execute()
# Default to an empty list so a response without 'files' cannot crash below.
files = response.get('files', [])
nextPageToken = response.get('nextPageToken')

# Follow the continuation token until the API stops returning one.
while nextPageToken:
    response = service.files().list(
        pageSize=1000, pageToken=nextPageToken, q=query).execute()
    files.extend(response.get('files', []))
    nextPageToken = response.get('nextPageToken')
另一个示例实现:
service = build('drive', 'v3', credentials=creds)
folder_id = 'valid folder id'
query = f"parents = '{folder_id}'"

# page_token stays None for the first request and is then taken from each
# response until no further token is returned.
page_token = None
my_files = []
while True:
    results = service.files().list(
        pageSize=1000, pageToken=page_token, q=query).execute()
    files = results.get('files', [])
    my_files.extend(files)
    page_token = results.get('nextPageToken')
    if page_token is None:
        break