使用 Python 列出 Google Drive 中超过 10000 个文件

List more than 10000 files in google drive with python

我有一个 google 驱动器文件夹,其中包含超过 10000 个子文件夹。我正在尝试使用以下代码列出这些子文件夹:

import pickle
import os.path
import io
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from numpy import cumproduct
import pandas as pd
import gdown
from pyasn1.type.constraint import ContainedSubtypeConstraint
import requests
from googleapiclient.http import MediaIoBaseDownload
import httplib2

# OAuth scope(s) requested when authorizing access to the Drive API.
SCOPES = ['https://www.googleapis.com/auth/drive']

# Reuse credentials cached by an earlier run, if the cache file exists.
creds = None
if os.path.exists('token.pickle'):
    with open('token.pickle', 'rb') as token:
        # NOTE(review): pickle.load can execute arbitrary code embedded in the
        # file; this is only safe while token.pickle is written by this script.
        creds = pickle.load(token)
# No cached credentials, or they are not valid: refresh or re-authorize.
if not creds or not creds.valid:
    if creds and creds.expired and creds.refresh_token:
            # Expired credentials that carry a refresh token are refreshed in place.
            creds.refresh(Request())
    else:
        # Otherwise run the installed-app OAuth flow using the client secrets
        # stored in 'test.json'.
        flow = InstalledAppFlow.from_client_secrets_file(
                'test.json', SCOPES)
        creds = flow.run_local_server(port=0)
    # Cache the refreshed or newly obtained credentials for the next run.
    with open('token.pickle', 'wb') as token:
        pickle.dump(creds, token)

# Drive v3 API client built with the credentials obtained above.
service = build('drive', 'v3', credentials=creds)

# ID of the Drive folder whose children should be listed (placeholder value).
folder_id='valid folder id'
# Search query selecting the items whose parent is that folder.
query=f"parents = '{folder_id}'"

# Fetch the first page of results and remember its continuation token, if any.
response=service.files().list(q=query).execute()
files=response.get('files')
nextPageToken=response.get('nextPageToken')

# Intended to collect the remaining pages while a continuation token is returned.
while nextPageToken:
    # NOTE(review): this request is identical to the first one -- nextPageToken
    # is never passed as pageToken, so it presumably returns the first page
    # again instead of advancing to the next one.
    response=service.files().list(q=query).execute()
    files.extend(response.get('files'))
    nextPageToken=response.get('nextPageToken')

# Tabulate the collected entries and print them.
df = pd.DataFrame(files)
print(df)

在调试时我发现它只返回了 100 个子文件夹。我该如何修改此脚本,才能列出全部 10000 多个子文件夹?

您似乎忘记了在 while 循环中,用上一次响应返回的 nextPageToken 值来设置请求的 pageToken 参数。

应该是这样的:

# Keep requesting pages for as long as the previous response carried a
# continuation token.
while nextPageToken:
    # Pass the token back as pageToken so this request continues from the
    # previous page instead of repeating it.
    response = service.files().list(
        q=query,
        pageToken=nextPageToken,
    ).execute()
    files.extend(response.get('files'))
    nextPageToken = response.get('nextPageToken')

您可能还需要考虑增大 pageSize 参数。pageSize 是每页返回的最大文件数,可接受的值为 1 到 1000(含),默认值为 100。参见 Files: list 的参数说明。

您的代码(加上 pageSize 参数):

# Drive v3 client; `build` and `creds` are defined outside this snippet.
service = build('drive', 'v3', credentials=creds)

folder_id = 'valid folder id'
query = f"parents = '{folder_id}'"

# First page, requesting up to 1000 entries per page.
response = service.files().list(q=query, pageSize=1000).execute()
files = response.get('files')
nextPageToken = response.get('nextPageToken')

# Follow the continuation token, appending each further page to `files`.
while nextPageToken:
    response = service.files().list(
        q=query,
        pageSize=1000,
        pageToken=nextPageToken,
    ).execute()
    files.extend(response.get('files'))
    nextPageToken = response.get('nextPageToken')

另一个示例实现:

# Drive v3 client; `build` and `creds` are defined outside this snippet.
service = build('drive', 'v3', credentials=creds)

folder_id = 'valid folder id'
query = f"parents = '{folder_id}'"

# Accumulates the entries of every page.
my_files = []
# The first request is sent with pageToken=None.
page_token = None
while True:
    results = service.files().list(
        q=query,
        pageSize=1000,
        pageToken=page_token,
    ).execute()
    # Fall back to an empty list when the response has no 'files' entry.
    files = results.get('files', [])
    my_files.extend(files)
    # Stop once the response no longer carries a continuation token.
    page_token = results.get('nextPageToken')
    if page_token is None:
        break