访问 AcroFields 缓慢 (iTextSharp)
Slow access to AcroFields (iTextSharp)
我正在使用 iTextSharp 从 PDF 中提取签名名称。
我在访问体积大、页数多的 PDF(约 40MB、约 5000 页)的 AcroFields 时遇到问题(速度过慢)。
这是我的代码片段:
using iTextSharp.text.pdf;
/// <summary>
/// Collects the original bytes of the CONTENTS entry of every signature
/// dictionary found in the AcroFields of a PDF file.
/// </summary>
/// <param name="filePath">Path of the PDF file to inspect.</param>
/// <returns>
/// One byte array per signature whose dictionary has a CONTENTS string,
/// in the order of the sorted signature names; an empty list when there
/// are no such signatures.
/// </returns>
private static List<byte[]> GetSignsFromPDF(string filePath)
{
    var result = new List<byte[]>();

    // The original never closed the reader. Disposing it here releases
    // whatever it keeps open on the input file once extraction is done.
    using (var reader = new PdfReader(new RandomAccessFileOrArray(filePath), null))
    {
        var fields = reader.AcroFields;
        if (fields == null)
        {
            return result;
        }

        var signatureNames = fields.GetSignatureNames();
        signatureNames.Sort();

        foreach (string name in signatureNames)
        {
            var sigDict = fields.GetSignatureDictionary(name);
            if (sigDict == null)
            {
                // Defensive: skip a name that yields no dictionary instead of
                // failing with a NullReferenceException.
                continue;
            }

            var contents = sigDict.GetAsString(PdfName.CONTENTS);
            if (contents != null)
            {
                // Copy the bytes out while the reader is still open.
                result.Add(contents.GetOriginalBytes());
            }
        }
    }

    return result;
}
有没有更智能/更快的方式来访问 AcroFields,还是我只能等待 iTextSharp 处理完成?
非常感谢。
评论中有人推测,速度过慢的原因在于 iText(Sharp) 初始化 AcroFields
实例的字段集合时,不仅会检查 Catalog -> AcroForm -> Fields 中引用的字段,还会检查所有文档页面的 ANNOTS(实际上这才是最主要的开销)。
幸运的是,这种初始化不会在 AcroFields
构造函数中进行,因此我们可以在不检查所有页面的情况下注入检索到的字段集合。
以下方法是内部AcroFields
方法Fill
(负责延迟初始化)的副本,删除了页面遍历并通过反射启用了对隐藏成员的访问。可以用来验证猜想
/// <summary>
/// Builds a field map from the first-level entries of the catalog's
/// ACROFORM / FIELDS array and injects it into the non-public "fields"
/// member of <paramref name="acroFields"/> via reflection. Document pages
/// are not visited.
/// </summary>
/// <param name="reader">Reader whose catalog supplies the ACROFORM dictionary.</param>
/// <param name="acroFields">Instance that receives the field map and the GenerateAppearances flag.</param>
/// <remarks>
/// Returns without injecting anything when the catalog has no ACROFORM
/// entry or when FIELDS is missing or empty. Only entries that are WIDGET
/// annotations, have no KIDS and carry a T name are registered; the first
/// entry seen for a given name wins.
/// </remarks>
void fill(PdfReader reader, AcroFields acroFields)
{
    const System.Reflection.BindingFlags hiddenInstance =
        System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance;

    IDictionary<string, AcroFields.Item> collected = new LinkedDictionary<string, AcroFields.Item>();

    PdfDictionary acroForm = (PdfDictionary)PdfReader.GetPdfObjectRelease(reader.Catalog.Get(PdfName.ACROFORM));
    if (acroForm == null)
    {
        return;
    }

    // Appearances are generated unless NEEDAPPEARANCES is present and true.
    PdfBoolean needAppearances = acroForm.GetAsBoolean(PdfName.NEEDAPPEARANCES);
    acroFields.GenerateAppearances = needAppearances == null || !needAppearances.BooleanValue;

    PdfArray formFields = (PdfArray)PdfReader.GetPdfObjectRelease(acroForm.Get(PdfName.FIELDS));
    if (formFields == null || formFields.Size == 0)
    {
        return;
    }

    // Handles to the non-public per-item lists of AcroFields.Item.
    var itemType = typeof(AcroFields.Item);
    System.Reflection.FieldInfo valuesInfo = itemType.GetField("values", hiddenInstance);
    System.Reflection.FieldInfo widgetsInfo = itemType.GetField("widgets", hiddenInstance);
    System.Reflection.FieldInfo widgetRefsInfo = itemType.GetField("widget_refs", hiddenInstance);
    System.Reflection.FieldInfo mergedInfo = itemType.GetField("merged", hiddenInstance);
    System.Reflection.FieldInfo pageInfo = itemType.GetField("page", hiddenInstance);
    System.Reflection.FieldInfo tabOrderInfo = itemType.GetField("tabOrder", hiddenInstance);

    for (int index = 0; index < formFields.Size; ++index)
    {
        PdfDictionary entry = formFields.GetAsDict(index);

        // Entries that are not dictionaries, or not WIDGET annotations, are released and skipped.
        if (entry == null || !PdfName.WIDGET.Equals(entry.GetAsName(PdfName.SUBTYPE)))
        {
            PdfReader.ReleaseLastXrefPartial(formFields.GetAsIndirectObject(index));
            continue;
        }

        // Entries with children are skipped; the hierarchy is not descended.
        PdfArray children = (PdfArray)PdfReader.GetPdfObjectRelease(entry.Get(PdfName.KIDS));
        if (children != null)
        {
            continue;
        }

        PdfDictionary copy = new PdfDictionary();
        copy.Merge(entry);

        PdfString title = entry.GetAsString(PdfName.T);
        if (title == null)
        {
            continue;
        }

        string fieldName = title.ToUnicodeString();
        if (fields_already_has(collected, fieldName))
        {
            continue;
        }

        AcroFields.Item created = new AcroFields.Item();
        collected[fieldName] = created;

        // Same data the item's own Add* methods would record; page and tab order are stored as -1.
        ((List<PdfDictionary>)valuesInfo.GetValue(created)).Add(copy);
        ((List<PdfDictionary>)widgetsInfo.GetValue(created)).Add(copy);
        ((List<PdfIndirectReference>)widgetRefsInfo.GetValue(created)).Add(formFields.GetAsIndirectObject(index)); // must be a reference
        ((List<PdfDictionary>)mergedInfo.GetValue(created)).Add(copy);
        ((List<int>)pageInfo.GetValue(created)).Add(-1);
        ((List<int>)tabOrderInfo.GetValue(created)).Add(-1);
    }

    // Replace the lazily initialised field map of the AcroFields instance.
    System.Reflection.FieldInfo fieldsInfo = typeof(AcroFields).GetField("fields", hiddenInstance);
    fieldsInfo.SetValue(acroFields, collected);
}

/// <summary>Reports whether <paramref name="map"/> already holds an item under <paramref name="key"/>.</summary>
private static bool fields_already_has(IDictionary<string, AcroFields.Item> map, string key)
{
    return map.ContainsKey(key);
}
应该尽早为AcroFields
实例调用它,例如:
using (PdfReader reader = new PdfReader(file))
{
AcroFields acroFields = reader.AcroFields;
fill(reader, acroFields);
...
如果使用此方法大大减少时间(同时提供所需的字段),则猜想得到证实。
查看代码,您会发现它没有正确遍历字段结构:字段可能按层次排列,但代码仅考虑第一级元素。不过,对于上述猜想的第一次测试应该足够了。
我正在使用 iTextSharp 从 PDF 中提取签名名称。 我在访问体积大、页数多的 PDF(约 40MB、约 5000 页)的 AcroFields 时遇到问题(速度过慢)。
这是我的代码片段:
using iTextSharp.text.pdf;
/// <summary>
/// Collects the original bytes of the CONTENTS entry of every signature
/// dictionary found in the AcroFields of a PDF file.
/// </summary>
/// <param name="filePath">Path of the PDF file to inspect.</param>
/// <returns>
/// One byte array per signature whose dictionary has a CONTENTS string,
/// in the order of the sorted signature names; an empty list when there
/// are no such signatures.
/// </returns>
private static List<byte[]> GetSignsFromPDF(string filePath)
{
    var result = new List<byte[]>();

    // The original never closed the reader. Disposing it here releases
    // whatever it keeps open on the input file once extraction is done.
    using (var reader = new PdfReader(new RandomAccessFileOrArray(filePath), null))
    {
        var fields = reader.AcroFields;
        if (fields == null)
        {
            return result;
        }

        var signatureNames = fields.GetSignatureNames();
        signatureNames.Sort();

        foreach (string name in signatureNames)
        {
            var sigDict = fields.GetSignatureDictionary(name);
            if (sigDict == null)
            {
                // Defensive: skip a name that yields no dictionary instead of
                // failing with a NullReferenceException.
                continue;
            }

            var contents = sigDict.GetAsString(PdfName.CONTENTS);
            if (contents != null)
            {
                // Copy the bytes out while the reader is still open.
                result.Add(contents.GetOriginalBytes());
            }
        }
    }

    return result;
}
有没有更智能/更快的方式来访问 AcroFields,还是我只能等待 iTextSharp 处理完成?
非常感谢。
评论中有人推测,速度过慢的原因在于 iText(Sharp) 初始化 AcroFields
实例的字段集合时,不仅会检查 Catalog -> AcroForm -> Fields 中引用的字段,还会检查所有文档页面的 ANNOTS(实际上这才是最主要的开销)。
幸运的是,这种初始化不会在 AcroFields
构造函数中进行,因此我们可以在不检查所有页面的情况下注入检索到的字段集合。
以下方法是内部AcroFields
方法Fill
(负责延迟初始化)的副本,删除了页面遍历并通过反射启用了对隐藏成员的访问。可以用来验证猜想
/// <summary>
/// Builds a field map from the first-level entries of the catalog's
/// ACROFORM / FIELDS array and injects it into the non-public "fields"
/// member of <paramref name="acroFields"/> via reflection. Document pages
/// are not visited.
/// </summary>
/// <param name="reader">Reader whose catalog supplies the ACROFORM dictionary.</param>
/// <param name="acroFields">Instance that receives the field map and the GenerateAppearances flag.</param>
/// <remarks>
/// Returns without injecting anything when the catalog has no ACROFORM
/// entry or when FIELDS is missing or empty. Only entries that are WIDGET
/// annotations, have no KIDS and carry a T name are registered; the first
/// entry seen for a given name wins.
/// </remarks>
void fill(PdfReader reader, AcroFields acroFields)
{
    const System.Reflection.BindingFlags hiddenInstance =
        System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance;

    IDictionary<string, AcroFields.Item> collected = new LinkedDictionary<string, AcroFields.Item>();

    PdfDictionary acroForm = (PdfDictionary)PdfReader.GetPdfObjectRelease(reader.Catalog.Get(PdfName.ACROFORM));
    if (acroForm == null)
    {
        return;
    }

    // Appearances are generated unless NEEDAPPEARANCES is present and true.
    PdfBoolean needAppearances = acroForm.GetAsBoolean(PdfName.NEEDAPPEARANCES);
    acroFields.GenerateAppearances = needAppearances == null || !needAppearances.BooleanValue;

    PdfArray formFields = (PdfArray)PdfReader.GetPdfObjectRelease(acroForm.Get(PdfName.FIELDS));
    if (formFields == null || formFields.Size == 0)
    {
        return;
    }

    // Handles to the non-public per-item lists of AcroFields.Item.
    var itemType = typeof(AcroFields.Item);
    System.Reflection.FieldInfo valuesInfo = itemType.GetField("values", hiddenInstance);
    System.Reflection.FieldInfo widgetsInfo = itemType.GetField("widgets", hiddenInstance);
    System.Reflection.FieldInfo widgetRefsInfo = itemType.GetField("widget_refs", hiddenInstance);
    System.Reflection.FieldInfo mergedInfo = itemType.GetField("merged", hiddenInstance);
    System.Reflection.FieldInfo pageInfo = itemType.GetField("page", hiddenInstance);
    System.Reflection.FieldInfo tabOrderInfo = itemType.GetField("tabOrder", hiddenInstance);

    for (int index = 0; index < formFields.Size; ++index)
    {
        PdfDictionary entry = formFields.GetAsDict(index);

        // Entries that are not dictionaries, or not WIDGET annotations, are released and skipped.
        if (entry == null || !PdfName.WIDGET.Equals(entry.GetAsName(PdfName.SUBTYPE)))
        {
            PdfReader.ReleaseLastXrefPartial(formFields.GetAsIndirectObject(index));
            continue;
        }

        // Entries with children are skipped; the hierarchy is not descended.
        PdfArray children = (PdfArray)PdfReader.GetPdfObjectRelease(entry.Get(PdfName.KIDS));
        if (children != null)
        {
            continue;
        }

        PdfDictionary copy = new PdfDictionary();
        copy.Merge(entry);

        PdfString title = entry.GetAsString(PdfName.T);
        if (title == null)
        {
            continue;
        }

        string fieldName = title.ToUnicodeString();
        if (collected.ContainsKey(fieldName))
        {
            continue;
        }

        AcroFields.Item created = new AcroFields.Item();
        collected[fieldName] = created;

        // Same data the item's own Add* methods would record; page and tab order are stored as -1.
        ((List<PdfDictionary>)valuesInfo.GetValue(created)).Add(copy);
        ((List<PdfDictionary>)widgetsInfo.GetValue(created)).Add(copy);
        ((List<PdfIndirectReference>)widgetRefsInfo.GetValue(created)).Add(formFields.GetAsIndirectObject(index)); // must be a reference
        ((List<PdfDictionary>)mergedInfo.GetValue(created)).Add(copy);
        ((List<int>)pageInfo.GetValue(created)).Add(-1);
        ((List<int>)tabOrderInfo.GetValue(created)).Add(-1);
    }

    // Replace the lazily initialised field map of the AcroFields instance.
    System.Reflection.FieldInfo fieldsInfo = typeof(AcroFields).GetField("fields", hiddenInstance);
    fieldsInfo.SetValue(acroFields, collected);
}
应该尽早为AcroFields
实例调用它,例如:
using (PdfReader reader = new PdfReader(file))
{
AcroFields acroFields = reader.AcroFields;
fill(reader, acroFields);
...
如果使用此方法大大减少时间(同时提供所需的字段),则猜想得到证实。
查看代码,您会发现它没有正确遍历字段结构:字段可能按层次排列,但代码仅考虑第一级元素。不过,对于上述猜想的第一次测试应该足够了。