如何将包含 pdf 的流转换为文本?
How to convert a stream that contains a pdf to text?
我正在使用 Azure 函数(Azure Functions)提取 PDF 文件中的文本。我想把流(从存放该 PDF 的 Azure Blob 存储中读取到的)转换为 PDF 文档,以便可以使用这个问题(this question here)中的如下代码:
/// <summary>
/// Extension methods that pull string fragments out of PDFsharp pages and content objects.
/// </summary>
public static class PdfSharpExtensions
{
    /// <summary>
    /// Reads the content of <paramref name="page"/> with <c>ContentReader.ReadContent</c>
    /// and returns the text fragments extracted from that content.
    /// </summary>
    /// <param name="page">The page whose content is read.</param>
    /// <returns>The strings produced by extracting text from the page content.</returns>
    public static IEnumerable<string> ExtractText(this PdfPage page)
    {
        return ContentReader.ReadContent(page).ExtractText();
    }

    /// <summary>
    /// Lazily walks a content object and yields the string values it contains.
    /// </summary>
    /// <param name="cObject">The content object to inspect.</param>
    /// <returns>
    /// For an operator whose op-code name is <c>Tj</c> or <c>TJ</c>, the text of each operand;
    /// for a sequence, the text of each element; for a string, its value.
    /// Any other object yields nothing.
    /// </returns>
    public static IEnumerable<string> ExtractText(this CObject cObject)
    {
        switch (cObject)
        {
            case COperator op:
            {
                // Only the Tj / TJ operators are inspected; every other operator is ignored.
                bool showsText = op.OpCode.Name == OpCodeName.Tj.ToString()
                                 || op.OpCode.Name == OpCodeName.TJ.ToString();
                if (!showsText)
                {
                    yield break;
                }

                foreach (var operand in op.Operands)
                {
                    foreach (var fragment in ExtractText(operand))
                    {
                        yield return fragment;
                    }
                }

                break;
            }

            case CSequence sequence:
            {
                // Recurse into each element of the sequence.
                foreach (var item in sequence)
                {
                    foreach (var fragment in ExtractText(item))
                    {
                        yield return fragment;
                    }
                }

                break;
            }

            case CString text:
                yield return text.Value;
                break;
        }
    }
}
有办法吗?
据我了解,您需要先从流中创建 PDF 文档,然后再从该文档中读取内容。
所以首先我们需要从 MemoryStream 创建一个 PDF,但是等等,我们只有一个 Stream,所以我们需要像这样将它转换为 MemoryStream:
/// <summary>
/// Copies all bytes from <paramref name="input"/>, starting at its current position,
/// into <paramref name="output"/>.
/// </summary>
/// <param name="input">The stream to read from.</param>
/// <param name="output">The stream that receives the bytes.</param>
/// <exception cref="System.ArgumentNullException">
/// <paramref name="input"/> or <paramref name="output"/> is <c>null</c>.
/// </exception>
public static void CopyStream(Stream input, Stream output)
{
    if (input == null)
    {
        throw new System.ArgumentNullException(nameof(input));
    }

    if (output == null)
    {
        throw new System.ArgumentNullException(nameof(output));
    }

    // Stream.CopyTo runs the same read-until-zero / write loop as a hand-rolled buffer copy.
    input.CopyTo(output);
}
// Buffer the incoming stream in memory.
MemoryStream ms = new MemoryStream();
CopyStream(streamFromDatabase, ms);

// Open the buffered bytes as a PDF document.
var pdf = PdfReader.Open(ms);
现在我们可以像这样从中读取文本:
// Accumulates the text of every page, in page order, with no separator.
var sb = new StringBuilder();
foreach (var page in pdf.Pages)
{
    // Build the whole page text first, then append it in one call.
    var pageText = string.Concat(page.ExtractText());
    sb.Append(pageText);
}
我正在使用 Azure 函数(Azure Functions)提取 PDF 文件中的文本。我想把流(从存放该 PDF 的 Azure Blob 存储中读取到的)转换为 PDF 文档,以便可以使用这个问题(this question here)中的如下代码:
public static class PdfSharpExtensions
{
/// <summary>
/// Reads the content of <paramref name="page"/> with <c>ContentReader.ReadContent</c>
/// and returns the text fragments extracted from that content.
/// </summary>
/// <param name="page">The page whose content is read.</param>
/// <returns>The strings produced by extracting text from the page content.</returns>
public static IEnumerable<string> ExtractText(this PdfPage page)
{
    return ContentReader.ReadContent(page).ExtractText();
}
/// <summary>
/// Lazily walks a content object and yields the string values it contains.
/// </summary>
/// <param name="cObject">The content object to inspect.</param>
/// <returns>
/// For an operator whose op-code name is <c>Tj</c> or <c>TJ</c>, the text of each operand;
/// for a sequence, the text of each element; for a string, its value.
/// Any other object yields nothing.
/// </returns>
public static IEnumerable<string> ExtractText(this CObject cObject)
{
    switch (cObject)
    {
        case COperator op:
        {
            // Only the Tj / TJ operators are inspected; every other operator is ignored.
            bool showsText = op.OpCode.Name == OpCodeName.Tj.ToString()
                             || op.OpCode.Name == OpCodeName.TJ.ToString();
            if (!showsText)
            {
                yield break;
            }

            foreach (var operand in op.Operands)
            {
                foreach (var fragment in ExtractText(operand))
                {
                    yield return fragment;
                }
            }

            break;
        }

        case CSequence sequence:
        {
            // Recurse into each element of the sequence.
            foreach (var item in sequence)
            {
                foreach (var fragment in ExtractText(item))
                {
                    yield return fragment;
                }
            }

            break;
        }

        case CString text:
            yield return text.Value;
            break;
    }
}
}
有办法吗?
据我了解,您需要先从流中创建 PDF 文档,然后再从该文档中读取内容。
所以首先我们需要从 MemoryStream 创建一个 PDF,但是等等,我们只有一个 Stream,所以我们需要像这样将它转换为 MemoryStream:
/// <summary>
/// Copies all bytes from <paramref name="input"/>, starting at its current position,
/// into <paramref name="output"/>.
/// </summary>
/// <param name="input">The stream to read from.</param>
/// <param name="output">The stream that receives the bytes.</param>
/// <exception cref="System.ArgumentNullException">
/// <paramref name="input"/> or <paramref name="output"/> is <c>null</c>.
/// </exception>
public static void CopyStream(Stream input, Stream output)
{
    if (input == null)
    {
        throw new System.ArgumentNullException(nameof(input));
    }

    if (output == null)
    {
        throw new System.ArgumentNullException(nameof(output));
    }

    // Stream.CopyTo runs the same read-until-zero / write loop as a hand-rolled buffer copy.
    input.CopyTo(output);
}
// Buffer the incoming stream in memory.
MemoryStream ms = new MemoryStream();
CopyStream(streamFromDatabase, ms);

// Open the buffered bytes as a PDF document.
var pdf = PdfReader.Open(ms);
现在我们可以像这样从中读取文本:
// Accumulates the text of every page, in page order, with no separator.
var sb = new StringBuilder();
foreach (var page in pdf.Pages)
{
    // Build the whole page text first, then append it in one call.
    var pageText = string.Concat(page.ExtractText());
    sb.Append(pageText);
}