使用 Google Apps 脚本从多页附件 PDF 中提取文本
Extract Text from Multipage Attachment PDF Using Google Apps Script
我有一个包含多个扫描页面的 Gmail 附件 PDF。当我使用 Google Apps 脚本将附件中的 blob 保存为云端硬盘文件,然后在 Google 云端硬盘中手动打开该 PDF 并选择“使用 Google 文档打开”时,PDF 中的所有文本都会显示在 Google 文档中。但是,当我将 blob 保存为带有 OCR 的 Google 文档时,只有第一页图像中的文本被保存到文档中,无论是手动查看还是通过代码访问都是如此。
获取 blob 并对其进行处理的代码是:
/**
 * Searches Gmail with the given query and passes every PDF attachment found
 * to processPDF.
 *
 * Each PDF is handed over together with a name built from the attachment's own
 * name, a space, and a 1-based counter that restarts for every message.
 *
 * @param {*} desiredLabel - Not used by this function; kept so existing callers still work.
 * @param {*} processedLabel - Not used by this function; kept so existing callers still work.
 * @param {string} emailQuery - Search query forwarded to GmailApp.search.
 */
function getAttachments(desiredLabel, processedLabel, emailQuery) {
  const threads = GmailApp.search(emailQuery);

  // for...of visits array elements only. for...in would also visit any
  // enumerable property added to the array or to Array.prototype.
  for (const thread of threads) {
    for (const message of thread.getMessages()) {
      // Counts the PDFs seen so far in this message.
      let processedAttachments = 0;

      for (const attachment of message.getAttachments()) {
        // Skip anything whose content type does not mention "pdf".
        if (!attachment.getContentType().includes('pdf')) {
          continue;
        }

        processedAttachments += 1;
        const pdfBlob = attachment.copyBlob();
        const filename = attachment.getName() + " " + processedAttachments;
        processPDF(pdfBlob, filename);
      }
    }
  }
}
/**
 * Stores a PDF blob in Drive twice, once as a plain PDF file and once through
 * Drive.Files.insert with OCR requested, then reads the text of the resulting
 * document and returns it.
 *
 * NOTE(review): the original author reports that for a multi-page scanned PDF
 * only the text of the first page's image shows up in the OCR document, while
 * the plain PDF copy shows all pages when opened from Drive. That behaviour is
 * not changed here.
 *
 * @param {*} pdfBlob - Blob holding the PDF content.
 * @param {string} filename - Name given to the PDF file and title given to the OCR upload.
 * @returns {string} Text of the OCR document's body (previously computed but discarded).
 */
function processPDF(pdfBlob, filename) {
  // Keep a plain copy of the PDF in Drive under the requested name.
  const pdfFile = DriveApp.createFile(pdfBlob);
  pdfFile.setName(filename);

  // Upload the same blob again, this time asking for OCR.
  const resources = {
    title: filename,
    mimeType: "application/pdf",
  };
  const options = {
    ocr: true,
    ocrLanguage: "en",
  };
  const file = Drive.Files.insert(resources, pdfBlob, options);

  // Open the uploaded file as a document and hand its text back to the caller.
  const doc = DocumentApp.openById(file.getId());
  return doc.getBody().getText();
}
如果我尝试不经过 OCR 而直接用 Google 文档打开该 PDF,则会得到“参数无效”异常,例如:
DocumentApp.openById(pdfFile.getId());
如何从 PDF 的所有页面中获取文本?
我有一个包含多个扫描页面的 Gmail 附件 PDF。当我使用 Google Apps 脚本将附件中的 blob 保存为云端硬盘文件,然后在 Google 云端硬盘中手动打开该 PDF 并选择“使用 Google 文档打开”时,PDF 中的所有文本都会显示在 Google 文档中。但是,当我将 blob 保存为带有 OCR 的 Google 文档时,只有第一页图像中的文本被保存到文档中,无论是手动查看还是通过代码访问都是如此。
获取 blob 并对其进行处理的代码是:
/**
 * Searches Gmail with the given query and passes every PDF attachment found
 * to processPDF.
 *
 * Each PDF is handed over together with a name built from the attachment's own
 * name, a space, and a 1-based counter that restarts for every message.
 *
 * @param {*} desiredLabel - Not used by this function; kept so existing callers still work.
 * @param {*} processedLabel - Not used by this function; kept so existing callers still work.
 * @param {string} emailQuery - Search query forwarded to GmailApp.search.
 */
function getAttachments(desiredLabel, processedLabel, emailQuery) {
  const threads = GmailApp.search(emailQuery);

  // for...of visits array elements only. for...in would also visit any
  // enumerable property added to the array or to Array.prototype.
  for (const thread of threads) {
    for (const message of thread.getMessages()) {
      // Counts the PDFs seen so far in this message.
      let processedAttachments = 0;

      for (const attachment of message.getAttachments()) {
        // Skip anything whose content type does not mention "pdf".
        if (!attachment.getContentType().includes('pdf')) {
          continue;
        }

        processedAttachments += 1;
        const pdfBlob = attachment.copyBlob();
        const filename = attachment.getName() + " " + processedAttachments;
        processPDF(pdfBlob, filename);
      }
    }
  }
}
/**
 * Stores a PDF blob in Drive twice, once as a plain PDF file and once through
 * Drive.Files.insert with OCR requested, then reads the text of the resulting
 * document and returns it.
 *
 * NOTE(review): the original author reports that for a multi-page scanned PDF
 * only the text of the first page's image shows up in the OCR document, while
 * the plain PDF copy shows all pages when opened from Drive. That behaviour is
 * not changed here.
 *
 * @param {*} pdfBlob - Blob holding the PDF content.
 * @param {string} filename - Name given to the PDF file and title given to the OCR upload.
 * @returns {string} Text of the OCR document's body (previously computed but discarded).
 */
function processPDF(pdfBlob, filename) {
  // Keep a plain copy of the PDF in Drive under the requested name.
  const pdfFile = DriveApp.createFile(pdfBlob);
  pdfFile.setName(filename);

  // Upload the same blob again, this time asking for OCR.
  const resources = {
    title: filename,
    mimeType: "application/pdf",
  };
  const options = {
    ocr: true,
    ocrLanguage: "en",
  };
  const file = Drive.Files.insert(resources, pdfBlob, options);

  // Open the uploaded file as a document and hand its text back to the caller.
  const doc = DocumentApp.openById(file.getId());
  return doc.getBody().getText();
}
如果我尝试不经过 OCR 而直接用 Google 文档打开该 PDF,则会得到“参数无效”异常,例如:
DocumentApp.openById(pdfFile.getId());
如何从 PDF 的所有页面中获取文本?