itext7 - 如何在写入期间过滤渲染事件
itext7 - How to filter render events during write
我想在 RENDER_TEXT 事件写入输出文件时对其进行过滤。我有一个 PDF,其中包含一些我想过滤掉的文本。我发现我可以遍历文档一次并确定我要过滤的渲染事件的特征。现在我想复制源文档的页面并跳过一些 RENDER_TEXT 事件,这样文本就不会出现在目标文档中。我有一个 IEventFilter 将接受正确的事件。我只需要知道如何将这个过滤器放在文档编写器上。
我们的目标是获取从 Google 日历以议程(Agenda)格式创建的 PDF,并删除 "Created by:" 和 "Calendar:" 行。这些行通常由 3 个 RENDER_TEXT 事件组成。
我当前的代码如下。我发现所有具有相同基线 y 坐标的 RENDER_TEXT 事件都将标识我要删除的事件。
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.itextpdf.kernel.geom.LineSegment;
import com.itextpdf.kernel.geom.PageSize;
import com.itextpdf.kernel.geom.Rectangle;
import com.itextpdf.kernel.pdf.PdfDocument;
import com.itextpdf.kernel.pdf.PdfPage;
import com.itextpdf.kernel.pdf.PdfReader;
import com.itextpdf.kernel.pdf.PdfWriter;
import com.itextpdf.kernel.pdf.canvas.parser.EventType;
import com.itextpdf.kernel.pdf.canvas.parser.PdfCanvasProcessor;
import com.itextpdf.kernel.pdf.canvas.parser.data.IEventData;
import com.itextpdf.kernel.pdf.canvas.parser.data.TextRenderInfo;
import com.itextpdf.kernel.pdf.canvas.parser.filter.IEventFilter;
import com.itextpdf.kernel.pdf.canvas.parser.listener.IEventListener;
/**
 * Scans every page of a source PDF for the text lines "Calendar:" and "Created by:" and logs the
 * baseline y coordinates on which they are drawn, while creating one blank page per source page in
 * the destination PDF.
 *
 * <p>{@link FilterEventsByBaseline} is provided for a later filtering step, but nothing in this
 * class instantiates it yet: the destination pages are currently added empty.
 */
public class Main {

    private static final Logger LOGGER = LogManager.getLogger();

    /**
     * Runs the tool against the hard-coded source and destination paths.
     *
     * @param args ignored
     * @throws FileNotFoundException declared for compatibility; propagated from the constructor
     * @throws IOException if reading the source or writing the destination fails
     */
    public static void main(String[] args) throws FileNotFoundException, IOException {
        final Path src = Paths.get("calendar_2018-08-04_2018-08-19.pdf");
        final Path dest = Paths.get("/home/jpschewe/Downloads/calendar_clean.pdf");
        // The constructor performs the whole run; the instance is not needed afterwards.
        new Main(src, dest);
    }

    /**
     * Processes {@code src} page by page and writes {@code dest}.
     *
     * @param src the PDF to analyse
     * @param dest the PDF to create
     * @throws FileNotFoundException declared for compatibility with the original signature
     * @throws IOException if either document cannot be opened or closed
     */
    private Main(final Path src, final Path dest) throws FileNotFoundException, IOException {
        try (PdfDocument srcDoc = new PdfDocument(new PdfReader(src.toFile()));
                PdfDocument destDoc = new PdfDocument(new PdfWriter(dest.toFile()))) {
            // Every destination page uses the size of the first source page, so set it once.
            final Rectangle pageSize = srcDoc.getFirstPage().getPageSize();
            destDoc.setDefaultPageSize(new PageSize(pageSize));

            final int pageCount = srcDoc.getNumberOfPages();
            for (int i = 1; i <= pageCount; ++i) {
                final PdfPage page = srcDoc.getPage(i);

                // First pass: collect the baselines of the lines that should be removed.
                final GatherBaselines gatherBaselines = new GatherBaselines();
                final PdfCanvasProcessor processor = new PdfCanvasProcessor(gatherBaselines);
                processor.processPageContent(page);
                LOGGER.info("Filter baselines for page {} -> {}", i, gatherBaselines.baselinesToFilter);

                destDoc.addNewPage();
            }
        }
    }

    /**
     * Event filter that rejects {@code RENDER_TEXT} events whose baseline starts on one of the
     * given y coordinates; every other event is accepted.
     */
    public static class FilterEventsByBaseline implements IEventFilter {

        /** Maximum absolute difference for two baseline y coordinates to count as equal. */
        private static final double BASELINE_TOLERANCE = 1E-6;

        private final List<Float> baselinesToFilter;

        /**
         * @param baselinesToFilter baseline y coordinates to reject; the list is used as-is,
         *        not copied
         */
        public FilterEventsByBaseline(final List<Float> baselinesToFilter) {
            this.baselinesToFilter = baselinesToFilter;
        }

        /**
         * @return {@code false} for a text render event on a filtered baseline, {@code true}
         *         otherwise
         */
        @Override
        public boolean accept(final IEventData data, final EventType type) {
            if (type != EventType.RENDER_TEXT) {
                return true;
            }
            final TextRenderInfo renderInfo = (TextRenderInfo) data;
            // index 1 is the y coordinate
            final float checkY = renderInfo.getBaseline().getStartPoint().get(1);
            final boolean onFilteredBaseline = baselinesToFilter.stream()
                    .anyMatch(f -> Math.abs(checkY - f) < BASELINE_TOLERANCE);
            return !onFilteredBaseline;
        }
    }

    /**
     * Listener that records the baseline y coordinate of every text chunk containing
     * "Calendar:" or "Created by:".
     */
    public static class GatherBaselines implements IEventListener {

        // need to store all baselines that are problems
        // the assumption is that all RENDER_TEXT operations with a baseline in the bad
        // list need to be filtered when copying pages
        private final List<Float> baselinesToFilter = new LinkedList<>();

        @Override
        public void eventOccurred(final IEventData data, final EventType type) {
            if (type != EventType.RENDER_TEXT) {
                return;
            }
            final TextRenderInfo renderInfo = (TextRenderInfo) data;
            final String text = renderInfo.getText();
            if (null != text && (text.contains("Calendar:") || text.contains("Created by:"))) {
                final LineSegment baseline = renderInfo.getBaseline();
                // index 1 is the y coordinate
                baselinesToFilter.add(baseline.getStartPoint().get(1));
            }
        }

        /** Only text render events are of interest. */
        @Override
        public Set<EventType> getSupportedEvents() {
            return Collections.singleton(EventType.RENDER_TEXT);
        }
    }
}
谢谢
根据评论中的建议,您可以使用另一个答案中的 PdfCanvasEditor 按需从内容流中过滤操作。实际上,我对该类稍作了扩展,使其能够正确支持 ' 和 " 这两个文本绘制运算符。您可以在此处找到该类。
与您的方法一样,先要进行第一遍处理,以确定要清除的行:为此我使用了一个 RegexBasedLocationExtractionStrategy 实例。
此后,在 PdfCanvasEditor 步骤中,将在这些行上绘制文本的指令改为仅绘制空字符串。
不过,由于不是您检查的事件导致在此处绘制文本,而是更基本的运算符和操作数结构,因此确切的机制并非源自 IEventFilter
。但机制与您的方法相似。
// Blanks out every text-showing operation that lies on a line matching "Created by:" or
// "Calendar:" inside the form XObjects referenced from each page's resources.
// SOURCE_PDF_READER, TARGET_PDF_WRITER and the PdfCanvasEditor base class are defined outside
// this snippet.
try (PdfDocument pdfDocument = new PdfDocument(SOURCE_PDF_READER, TARGET_PDF_WRITER)) {
// Rectangles of the regex matches for the XObject currently being processed; cleared and
// refilled for every XObject in the loop below and read by the editor's write() override.
List<Rectangle> triggerRectangles = new ArrayList<>();
PdfCanvasEditor editor = new PdfCanvasEditor()
{
{
// Instance initializer: look up PdfCanvasProcessor's declared "textMatrix" field via
// reflection and make it accessible so it can be read in nextOperation().
Field field = PdfCanvasProcessor.class.getDeclaredField("textMatrix");
field.setAccessible(true);
textMatrixField = field;
}
// Snapshots the current value of the reflected text matrix field into recentTextMatrix.
// NOTE(review): presumably PdfCanvasEditor calls this hook before each operation is
// processed — the base class is not shown here; confirm.
@Override
protected void nextOperation(PdfLiteral operator, List<PdfObject> operands) {
try {
recentTextMatrix = (Matrix)textMatrixField.get(this);
} catch (IllegalArgumentException | IllegalAccessException e) {
throw new RuntimeException(e);
}
}
// For text-showing operators whose position falls vertically inside one of the trigger
// rectangles, replaces the text operand with an empty value; the (possibly modified)
// operation is then always passed on to super.write().
@Override
protected void write(PdfCanvasProcessor processor, PdfLiteral operator, List<PdfObject> operands)
{
String operatorString = operator.toString();
if (TEXT_SHOWING_OPERATORS.contains(operatorString))
{
Matrix matrix = null;
try {
// Combine the snapshotted text matrix with the current transformation matrix.
matrix = recentTextMatrix.multiply(getGraphicsState().getCtm());
} catch (IllegalArgumentException e) {
throw new RuntimeException(e);
}
// NOTE(review): Matrix.I32 is presumably the vertical translation component, i.e. the
// y position of the text origin — confirm against the Matrix documentation.
float y = matrix.get(Matrix.I32);
if (triggerRectangles.stream().anyMatch(rect -> rect.getBottom() <= y && y <= rect.getTop())) {
// TJ: the operand at index 0 is swapped for an empty array.
if ("TJ".equals(operatorString))
operands.set(0, new PdfArray());
else
// Tj, ' and ": the second-to-last list element is swapped for an empty string.
// Assumes the last element of operands is the operator itself — TODO confirm.
operands.set(operands.size() - 2, new PdfString(""));
}
}
super.write(processor, operator, operands);
}
// Operators treated as text-showing by write().
final List<String> TEXT_SHOWING_OPERATORS = Arrays.asList("Tj", "'", "\"", "TJ");
// Reflected handle on PdfCanvasProcessor's "textMatrix" field; assigned in the instance initializer above.
final Field textMatrixField;
// Text matrix value captured by the most recent nextOperation() call.
Matrix recentTextMatrix;
};
for (int i = 1; i <= pdfDocument.getNumberOfPages(); i++)
{
PdfPage page = pdfDocument.getPage(i);
// Only the XObjects named directly in the page resources are visited.
Set<PdfName> xobjectNames = page.getResources().getResourceNames(PdfName.XObject);
for (PdfName xobjectName : xobjectNames) {
// NOTE(review): the result of getForm() is used without a null check — confirm every
// XObject resource on these pages is a form XObject.
PdfFormXObject xobject = page.getResources().getForm(xobjectName);
byte[] content = xobject.getPdfObject().getBytes();
PdfResources resources = xobject.getResources();
// First pass: locate the rectangles of all "Created by:" / "Calendar:" matches.
RegexBasedLocationExtractionStrategy regexLocator = new RegexBasedLocationExtractionStrategy("Created by:|Calendar:");
new PdfCanvasProcessor(regexLocator).processContent(content, resources);
triggerRectangles.clear();
// Rectangles are collected into a Set before being copied into the list.
triggerRectangles.addAll(regexLocator.getResultantLocations().stream().map(loc -> loc.getRectangle()).collect(Collectors.toSet()));
// Second pass: rewrite the content through the editor into a fresh stream, then store
// the result as the XObject's data.
PdfCanvas pdfCanvas = new PdfCanvas(new PdfStream(), resources, pdfDocument);
editor.editContent(content, resources, pdfCanvas);
xobject.getPdfObject().setData(pdfCanvas.getContentStream().getBytes());
}
}
}
(EditPageContent 测试 testRemoveSpecificLinesCalendar
)
请注意,这只是一个概念验证,并且专门针对提问者(OP)的用例定制:此处的 PdfCanvasEditor 仅用于检查和编辑每个页面的第一级表单 XObject,因为从 Google 日历以议程格式创建的 PDF 把全部页面内容都放在一个表单 XObject 中,而该 XObject 又在页面内容流中被绘制。此外,这里假定文本与页面顶边平行。
我想在 RENDER_TEXT 事件写入输出文件时对其进行过滤。我有一个 PDF,其中包含一些我想过滤掉的文本。我发现我可以遍历文档一次并确定我要过滤的渲染事件的特征。现在我想复制源文档的页面并跳过一些 RENDER_TEXT 事件,这样文本就不会出现在目标文档中。我有一个 IEventFilter 将接受正确的事件。我只需要知道如何将这个过滤器放在文档编写器上。
我们的目标是获取从 Google 日历以议程(Agenda)格式创建的 PDF,并删除 "Created by:" 和 "Calendar:" 行。这些行通常由 3 个 RENDER_TEXT 事件组成。
我当前的代码如下。我发现所有具有相同基线 y 坐标的 RENDER_TEXT 事件都将标识我要删除的事件。
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.itextpdf.kernel.geom.LineSegment;
import com.itextpdf.kernel.geom.PageSize;
import com.itextpdf.kernel.geom.Rectangle;
import com.itextpdf.kernel.pdf.PdfDocument;
import com.itextpdf.kernel.pdf.PdfPage;
import com.itextpdf.kernel.pdf.PdfReader;
import com.itextpdf.kernel.pdf.PdfWriter;
import com.itextpdf.kernel.pdf.canvas.parser.EventType;
import com.itextpdf.kernel.pdf.canvas.parser.PdfCanvasProcessor;
import com.itextpdf.kernel.pdf.canvas.parser.data.IEventData;
import com.itextpdf.kernel.pdf.canvas.parser.data.TextRenderInfo;
import com.itextpdf.kernel.pdf.canvas.parser.filter.IEventFilter;
import com.itextpdf.kernel.pdf.canvas.parser.listener.IEventListener;
/**
 * Scans every page of a source PDF for the text lines "Calendar:" and "Created by:" and logs the
 * baseline y coordinates on which they are drawn, while creating one blank page per source page in
 * the destination PDF.
 *
 * <p>{@link FilterEventsByBaseline} is provided for a later filtering step, but nothing in this
 * class instantiates it yet: the destination pages are currently added empty.
 */
public class Main {

    private static final Logger LOGGER = LogManager.getLogger();

    /**
     * Runs the tool against the hard-coded source and destination paths.
     *
     * @param args ignored
     * @throws FileNotFoundException declared for compatibility; propagated from the constructor
     * @throws IOException if reading the source or writing the destination fails
     */
    public static void main(String[] args) throws FileNotFoundException, IOException {
        final Path src = Paths.get("calendar_2018-08-04_2018-08-19.pdf");
        final Path dest = Paths.get("/home/jpschewe/Downloads/calendar_clean.pdf");
        // The constructor performs the whole run; the instance is not needed afterwards.
        new Main(src, dest);
    }

    /**
     * Processes {@code src} page by page and writes {@code dest}.
     *
     * @param src the PDF to analyse
     * @param dest the PDF to create
     * @throws FileNotFoundException declared for compatibility with the original signature
     * @throws IOException if either document cannot be opened or closed
     */
    private Main(final Path src, final Path dest) throws FileNotFoundException, IOException {
        try (PdfDocument srcDoc = new PdfDocument(new PdfReader(src.toFile()));
                PdfDocument destDoc = new PdfDocument(new PdfWriter(dest.toFile()))) {
            // Every destination page uses the size of the first source page, so set it once.
            final Rectangle pageSize = srcDoc.getFirstPage().getPageSize();
            destDoc.setDefaultPageSize(new PageSize(pageSize));

            final int pageCount = srcDoc.getNumberOfPages();
            for (int i = 1; i <= pageCount; ++i) {
                final PdfPage page = srcDoc.getPage(i);

                // First pass: collect the baselines of the lines that should be removed.
                final GatherBaselines gatherBaselines = new GatherBaselines();
                final PdfCanvasProcessor processor = new PdfCanvasProcessor(gatherBaselines);
                processor.processPageContent(page);
                LOGGER.info("Filter baselines for page {} -> {}", i, gatherBaselines.baselinesToFilter);

                destDoc.addNewPage();
            }
        }
    }

    /**
     * Event filter that rejects {@code RENDER_TEXT} events whose baseline starts on one of the
     * given y coordinates; every other event is accepted.
     */
    public static class FilterEventsByBaseline implements IEventFilter {

        /** Maximum absolute difference for two baseline y coordinates to count as equal. */
        private static final double BASELINE_TOLERANCE = 1E-6;

        private final List<Float> baselinesToFilter;

        /**
         * @param baselinesToFilter baseline y coordinates to reject; the list is used as-is,
         *        not copied
         */
        public FilterEventsByBaseline(final List<Float> baselinesToFilter) {
            this.baselinesToFilter = baselinesToFilter;
        }

        /**
         * @return {@code false} for a text render event on a filtered baseline, {@code true}
         *         otherwise
         */
        @Override
        public boolean accept(final IEventData data, final EventType type) {
            if (type != EventType.RENDER_TEXT) {
                return true;
            }
            final TextRenderInfo renderInfo = (TextRenderInfo) data;
            // index 1 is the y coordinate
            final float checkY = renderInfo.getBaseline().getStartPoint().get(1);
            final boolean onFilteredBaseline = baselinesToFilter.stream()
                    .anyMatch(f -> Math.abs(checkY - f) < BASELINE_TOLERANCE);
            return !onFilteredBaseline;
        }
    }

    /**
     * Listener that records the baseline y coordinate of every text chunk containing
     * "Calendar:" or "Created by:".
     */
    public static class GatherBaselines implements IEventListener {

        // need to store all baselines that are problems
        // the assumption is that all RENDER_TEXT operations with a baseline in the bad
        // list need to be filtered when copying pages
        private final List<Float> baselinesToFilter = new LinkedList<>();

        @Override
        public void eventOccurred(final IEventData data, final EventType type) {
            if (type != EventType.RENDER_TEXT) {
                return;
            }
            final TextRenderInfo renderInfo = (TextRenderInfo) data;
            final String text = renderInfo.getText();
            if (null != text && (text.contains("Calendar:") || text.contains("Created by:"))) {
                final LineSegment baseline = renderInfo.getBaseline();
                // index 1 is the y coordinate
                baselinesToFilter.add(baseline.getStartPoint().get(1));
            }
        }

        /** Only text render events are of interest. */
        @Override
        public Set<EventType> getSupportedEvents() {
            return Collections.singleton(EventType.RENDER_TEXT);
        }
    }
}
谢谢
根据评论中的建议,您可以使用另一个答案中的 PdfCanvasEditor 按需从内容流中过滤操作。实际上,我对该类稍作了扩展,使其能够正确支持 ' 和 " 这两个文本绘制运算符。您可以在此处找到该类。
与您的方法一样,先要进行第一遍处理,以确定要清除的行:为此我使用了一个 RegexBasedLocationExtractionStrategy 实例。
此后,在 PdfCanvasEditor 步骤中,将在这些行上绘制文本的指令改为仅绘制空字符串。
不过,由于不是您检查的事件导致在此处绘制文本,而是更基本的运算符和操作数结构,因此确切的机制并非源自 IEventFilter
。但机制与您的方法相似。
// Blanks out every text-showing operation that lies on a line matching "Created by:" or
// "Calendar:" inside the form XObjects referenced from each page's resources.
// SOURCE_PDF_READER, TARGET_PDF_WRITER and the PdfCanvasEditor base class are defined outside
// this snippet.
try (PdfDocument pdfDocument = new PdfDocument(SOURCE_PDF_READER, TARGET_PDF_WRITER)) {
// Rectangles of the regex matches for the XObject currently being processed; cleared and
// refilled for every XObject in the loop below and read by the editor's write() override.
List<Rectangle> triggerRectangles = new ArrayList<>();
PdfCanvasEditor editor = new PdfCanvasEditor()
{
{
// Instance initializer: look up PdfCanvasProcessor's declared "textMatrix" field via
// reflection and make it accessible so it can be read in nextOperation().
Field field = PdfCanvasProcessor.class.getDeclaredField("textMatrix");
field.setAccessible(true);
textMatrixField = field;
}
// Snapshots the current value of the reflected text matrix field into recentTextMatrix.
// NOTE(review): presumably PdfCanvasEditor calls this hook before each operation is
// processed — the base class is not shown here; confirm.
@Override
protected void nextOperation(PdfLiteral operator, List<PdfObject> operands) {
try {
recentTextMatrix = (Matrix)textMatrixField.get(this);
} catch (IllegalArgumentException | IllegalAccessException e) {
throw new RuntimeException(e);
}
}
// For text-showing operators whose position falls vertically inside one of the trigger
// rectangles, replaces the text operand with an empty value; the (possibly modified)
// operation is then always passed on to super.write().
@Override
protected void write(PdfCanvasProcessor processor, PdfLiteral operator, List<PdfObject> operands)
{
String operatorString = operator.toString();
if (TEXT_SHOWING_OPERATORS.contains(operatorString))
{
Matrix matrix = null;
try {
// Combine the snapshotted text matrix with the current transformation matrix.
matrix = recentTextMatrix.multiply(getGraphicsState().getCtm());
} catch (IllegalArgumentException e) {
throw new RuntimeException(e);
}
// NOTE(review): Matrix.I32 is presumably the vertical translation component, i.e. the
// y position of the text origin — confirm against the Matrix documentation.
float y = matrix.get(Matrix.I32);
if (triggerRectangles.stream().anyMatch(rect -> rect.getBottom() <= y && y <= rect.getTop())) {
// TJ: the operand at index 0 is swapped for an empty array.
if ("TJ".equals(operatorString))
operands.set(0, new PdfArray());
else
// Tj, ' and ": the second-to-last list element is swapped for an empty string.
// Assumes the last element of operands is the operator itself — TODO confirm.
operands.set(operands.size() - 2, new PdfString(""));
}
}
super.write(processor, operator, operands);
}
// Operators treated as text-showing by write().
final List<String> TEXT_SHOWING_OPERATORS = Arrays.asList("Tj", "'", "\"", "TJ");
// Reflected handle on PdfCanvasProcessor's "textMatrix" field; assigned in the instance initializer above.
final Field textMatrixField;
// Text matrix value captured by the most recent nextOperation() call.
Matrix recentTextMatrix;
};
for (int i = 1; i <= pdfDocument.getNumberOfPages(); i++)
{
PdfPage page = pdfDocument.getPage(i);
// Only the XObjects named directly in the page resources are visited.
Set<PdfName> xobjectNames = page.getResources().getResourceNames(PdfName.XObject);
for (PdfName xobjectName : xobjectNames) {
// NOTE(review): the result of getForm() is used without a null check — confirm every
// XObject resource on these pages is a form XObject.
PdfFormXObject xobject = page.getResources().getForm(xobjectName);
byte[] content = xobject.getPdfObject().getBytes();
PdfResources resources = xobject.getResources();
// First pass: locate the rectangles of all "Created by:" / "Calendar:" matches.
RegexBasedLocationExtractionStrategy regexLocator = new RegexBasedLocationExtractionStrategy("Created by:|Calendar:");
new PdfCanvasProcessor(regexLocator).processContent(content, resources);
triggerRectangles.clear();
// Rectangles are collected into a Set before being copied into the list.
triggerRectangles.addAll(regexLocator.getResultantLocations().stream().map(loc -> loc.getRectangle()).collect(Collectors.toSet()));
// Second pass: rewrite the content through the editor into a fresh stream, then store
// the result as the XObject's data.
PdfCanvas pdfCanvas = new PdfCanvas(new PdfStream(), resources, pdfDocument);
editor.editContent(content, resources, pdfCanvas);
xobject.getPdfObject().setData(pdfCanvas.getContentStream().getBytes());
}
}
}
(EditPageContent 测试 testRemoveSpecificLinesCalendar
)
请注意,这只是一个概念验证,并且专门针对提问者(OP)的用例定制:此处的 PdfCanvasEditor 仅用于检查和编辑每个页面的第一级表单 XObject,因为从 Google 日历以议程格式创建的 PDF 把全部页面内容都放在一个表单 XObject 中,而该 XObject 又在页面内容流中被绘制。此外,这里假定文本与页面顶边平行。