如何使用 google 文档将 PDF 内容提取到 .txt 文件中?
How to extract PDF content into .txt file with google docs?
如何以编程方式使用 Google 文档从 PDF 文件中提取文本?我知道还有其他选择,但我很好奇是否可以将 Google 文档用于此类目的。
当使用python将PDF数据检索为文本数据时,可以使用Drive API v3实现。但是需要2个步骤。
- 将 PDF 文件上传为 Google 文档
- 将 Google 文档下载为 TXT 文件
在此示例中,使用了 Python 快速入门。详细信息为https://developers.google.com/drive/v3/web/quickstart/python。
请阅读 "Step 1: Turn on the Drive API" 和 "Step 2: Install the Google Client Library"。如果您已经了解这些步骤,请见谅。
当您使用下面的示例脚本时,请修改如下。
1。添加导入
请将以下导入添加到快速入门。
import io
from apiclient.http import MediaFileUpload, MediaIoBaseDownload
2。更改范围
请将范围更改为以下内容。
# Drive scope requested during authorization; main() uploads a file and exports it.
SCOPES = "https://www.googleapis.com/auth/drive"
3。更改 main()
请将快速入门(Quickstart)中的 main() 改成如下内容。
示例脚本
示例脚本可以将 PDF 文件转换为 TXT 文件。但是 PDF 文件中的图片无法转换到 TXT 文件中。
def main():
    """Convert a local PDF to a text file by round-tripping it through Drive.

    Step 1 uploads ``pdffile`` with the Google Docs MIME type as the target
    type, so the file is created on Drive as a Google Document. Step 2
    exports that document as ``text/plain`` and writes the bytes to
    ``txtfile``.
    """
    credentials = get_credentials()
    http = credentials.authorize(httplib2.Http())
    service = discovery.build('drive', 'v3', http=http)

    pdffile = 'sample.pdf'  # PDF file
    txtfile = 'sample.txt'  # Text file
    mime = 'application/vnd.google-apps.document'

    # NOTE(review): the media mimetype is also set to the Google Docs type
    # here; 'application/pdf' may be the more accurate source type -- confirm.
    res = service.files().create(
        body={
            'name': pdffile,
            'mimeType': mime
        },
        media_body=MediaFileUpload(pdffile, mimetype=mime, resumable=True)
    ).execute()

    request = service.files().export_media(
        fileId=res['id'], mimeType="text/plain")
    # The context manager closes (and flushes) the output file even when a
    # chunk download raises; the handle was previously never closed.
    with io.FileIO(txtfile, 'wb') as fh:
        dl = MediaIoBaseDownload(fh, request)
        done = False
        while not done:
            _, done = dl.next_chunk()
    print("Done.")


if __name__ == '__main__':
    main()
如果我误解了你的问题,我很抱歉。
补充:修改后的完整快速入门脚本如下:
from __future__ import print_function
import httplib2
import os
import io
from apiclient import discovery
from oauth2client import client
from oauth2client import tools
from oauth2client.file import Storage
from apiclient.http import MediaFileUpload, MediaIoBaseDownload
# Parse the command-line flags understood by oauth2client's flow runner.
# When argparse cannot be imported, fall back to running without flags.
try:
    import argparse
    _flag_parser = argparse.ArgumentParser(parents=[tools.argparser])
    flags = _flag_parser.parse_args()
except ImportError:
    flags = None

# If modifying these scopes, delete your previously saved credentials
# (get_credentials() keeps them in ./drive-python-quickstart.json).
SCOPES = "https://www.googleapis.com/auth/drive"
CLIENT_SECRET_FILE = "client_secret.json"
APPLICATION_NAME = "Drive API Python Quickstart"
def get_credentials():
    """Return stored user credentials, running the OAuth2 flow when needed.

    Credentials are looked up in ``drive-python-quickstart.json`` in the
    current directory. If nothing is stored there, or the stored credentials
    are flagged invalid, a flow is built from CLIENT_SECRET_FILE and SCOPES
    and run with the same store.

    Returns:
        Credentials, the obtained credential.
    """
    credential_path = os.path.join("./", 'drive-python-quickstart.json')
    store = Storage(credential_path)
    cached = store.get()
    if cached and not cached.invalid:
        # Stored credentials are usable; no user interaction required.
        return cached

    flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
    flow.user_agent = APPLICATION_NAME
    if flags:
        fresh = tools.run_flow(flow, store, flags)
    else:  # Needed only for compatibility with Python 2.6
        fresh = tools.run(flow, store)
    print('Storing credentials to ' + credential_path)
    return fresh
def main():
    """Convert a local PDF to a text file by round-tripping it through Drive.

    Step 1 uploads ``pdffile`` with the Google Docs MIME type as the target
    type, so the file is created on Drive as a Google Document. Step 2
    exports that document as ``text/plain`` and writes the bytes to
    ``txtfile``.
    """
    credentials = get_credentials()
    http = credentials.authorize(httplib2.Http())
    service = discovery.build('drive', 'v3', http=http)

    pdffile = '../Downloads/sample.pdf'  # PDF file
    txtfile = '../Downloads/sample.txt'  # Text file
    mime = 'application/vnd.google-apps.document'

    # NOTE(review): the media mimetype is also set to the Google Docs type
    # here; 'application/pdf' may be the more accurate source type -- confirm.
    res = service.files().create(
        body={
            'name': pdffile,
            'mimeType': mime
        },
        media_body=MediaFileUpload(pdffile, mimetype=mime, resumable=True)
    ).execute()

    request = service.files().export_media(
        fileId=res['id'], mimeType="text/plain")
    # The context manager closes (and flushes) the output file even when a
    # chunk download raises; the handle was previously never closed.
    with io.FileIO(txtfile, 'wb') as fh:
        dl = MediaIoBaseDownload(fh, request)
        done = False
        while not done:
            _, done = dl.next_chunk()
    print("Done.")


if __name__ == '__main__':
    main()
如何以编程方式使用 Google 文档从 PDF 文件中提取文本?我知道还有其他选择,但我很好奇是否可以将 Google 文档用于此类目的。
当使用python将PDF数据检索为文本数据时,可以使用Drive API v3实现。但是需要2个步骤。
- 将 PDF 文件上传为 Google 文档
- 将 Google 文档下载为 TXT 文件
在此示例中,使用了 Python 快速入门。详细信息见 https://developers.google.com/drive/v3/web/quickstart/python。 请阅读 "Step 1: Turn on the Drive API" 和 "Step 2: Install the Google Client Library"。如果您已经了解这些步骤,请见谅。
当您使用下面的示例脚本时,请修改如下。
1。添加导入
请将以下导入添加到快速入门。
import io
from apiclient.http import MediaFileUpload, MediaIoBaseDownload
2。更改范围
请将范围更改为以下内容。
# Drive scope requested during authorization; main() uploads a file and exports it.
SCOPES = "https://www.googleapis.com/auth/drive"
3。更改 main()
请将快速入门(Quickstart)中的 main() 改成如下内容。
示例脚本
示例脚本可以将 PDF 文件转换为 TXT 文件。但是 PDF 文件中的图片无法转换到 TXT 文件中。
def main():
    """Convert a local PDF to a text file by round-tripping it through Drive.

    Step 1 uploads ``pdffile`` with the Google Docs MIME type as the target
    type, so the file is created on Drive as a Google Document. Step 2
    exports that document as ``text/plain`` and writes the bytes to
    ``txtfile``.
    """
    credentials = get_credentials()
    http = credentials.authorize(httplib2.Http())
    service = discovery.build('drive', 'v3', http=http)

    pdffile = 'sample.pdf'  # PDF file
    txtfile = 'sample.txt'  # Text file
    mime = 'application/vnd.google-apps.document'

    # NOTE(review): the media mimetype is also set to the Google Docs type
    # here; 'application/pdf' may be the more accurate source type -- confirm.
    res = service.files().create(
        body={
            'name': pdffile,
            'mimeType': mime
        },
        media_body=MediaFileUpload(pdffile, mimetype=mime, resumable=True)
    ).execute()

    request = service.files().export_media(
        fileId=res['id'], mimeType="text/plain")
    # The context manager closes (and flushes) the output file even when a
    # chunk download raises; the handle was previously never closed.
    with io.FileIO(txtfile, 'wb') as fh:
        dl = MediaIoBaseDownload(fh, request)
        done = False
        while not done:
            _, done = dl.next_chunk()
    print("Done.")


if __name__ == '__main__':
    main()
如果我误解了你的问题,我很抱歉。
补充:修改后的完整快速入门脚本如下:
from __future__ import print_function
import httplib2
import os
import io
from apiclient import discovery
from oauth2client import client
from oauth2client import tools
from oauth2client.file import Storage
from apiclient.http import MediaFileUpload, MediaIoBaseDownload
# Parse the command-line flags understood by oauth2client's flow runner.
# When argparse cannot be imported, fall back to running without flags.
try:
    import argparse
    _flag_parser = argparse.ArgumentParser(parents=[tools.argparser])
    flags = _flag_parser.parse_args()
except ImportError:
    flags = None

# If modifying these scopes, delete your previously saved credentials
# (get_credentials() keeps them in ./drive-python-quickstart.json).
SCOPES = "https://www.googleapis.com/auth/drive"
CLIENT_SECRET_FILE = "client_secret.json"
APPLICATION_NAME = "Drive API Python Quickstart"
def get_credentials():
    """Return stored user credentials, running the OAuth2 flow when needed.

    Credentials are looked up in ``drive-python-quickstart.json`` in the
    current directory. If nothing is stored there, or the stored credentials
    are flagged invalid, a flow is built from CLIENT_SECRET_FILE and SCOPES
    and run with the same store.

    Returns:
        Credentials, the obtained credential.
    """
    credential_path = os.path.join("./", 'drive-python-quickstart.json')
    store = Storage(credential_path)
    cached = store.get()
    if cached and not cached.invalid:
        # Stored credentials are usable; no user interaction required.
        return cached

    flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
    flow.user_agent = APPLICATION_NAME
    if flags:
        fresh = tools.run_flow(flow, store, flags)
    else:  # Needed only for compatibility with Python 2.6
        fresh = tools.run(flow, store)
    print('Storing credentials to ' + credential_path)
    return fresh
def main():
    """Convert a local PDF to a text file by round-tripping it through Drive.

    Step 1 uploads ``pdffile`` with the Google Docs MIME type as the target
    type, so the file is created on Drive as a Google Document. Step 2
    exports that document as ``text/plain`` and writes the bytes to
    ``txtfile``.
    """
    credentials = get_credentials()
    http = credentials.authorize(httplib2.Http())
    service = discovery.build('drive', 'v3', http=http)

    pdffile = '../Downloads/sample.pdf'  # PDF file
    txtfile = '../Downloads/sample.txt'  # Text file
    mime = 'application/vnd.google-apps.document'

    # NOTE(review): the media mimetype is also set to the Google Docs type
    # here; 'application/pdf' may be the more accurate source type -- confirm.
    res = service.files().create(
        body={
            'name': pdffile,
            'mimeType': mime
        },
        media_body=MediaFileUpload(pdffile, mimetype=mime, resumable=True)
    ).execute()

    request = service.files().export_media(
        fileId=res['id'], mimeType="text/plain")
    # The context manager closes (and flushes) the output file even when a
    # chunk download raises; the handle was previously never closed.
    with io.FileIO(txtfile, 'wb') as fh:
        dl = MediaIoBaseDownload(fh, request)
        done = False
        while not done:
            _, done = dl.next_chunk()
    print("Done.")


if __name__ == '__main__':
    main()