RED-3114: Fixed unreliable redactions on rotated pages
This commit is contained in:
parent
681d17ad95
commit
f1071de3a9
@ -0,0 +1,377 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package com.iqser.red.service.redaction.v1.server.parsing;
|
||||||
|
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.WeakHashMap;
|
||||||
|
|
||||||
|
import org.apache.commons.logging.Log;
|
||||||
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
|
||||||
|
import org.apache.fontbox.ttf.TrueTypeFont;
|
||||||
|
import org.apache.fontbox.util.BoundingBox;
|
||||||
|
|
||||||
|
import org.apache.pdfbox.contentstream.PDFStreamEngine;
|
||||||
|
import org.apache.pdfbox.pdmodel.PDPage;
|
||||||
|
import org.apache.pdfbox.pdmodel.font.encoding.GlyphList;
|
||||||
|
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||||
|
import org.apache.pdfbox.pdmodel.font.PDCIDFont;
|
||||||
|
import org.apache.pdfbox.pdmodel.font.PDCIDFontType2;
|
||||||
|
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||||
|
import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
|
||||||
|
import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont;
|
||||||
|
import org.apache.pdfbox.pdmodel.font.PDType0Font;
|
||||||
|
import org.apache.pdfbox.pdmodel.font.PDType3Font;
|
||||||
|
import org.apache.pdfbox.pdmodel.graphics.state.PDGraphicsState;
|
||||||
|
import org.apache.pdfbox.text.TextPosition;
|
||||||
|
import org.apache.pdfbox.util.Matrix;
|
||||||
|
import org.apache.pdfbox.util.Vector;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.DrawObject;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.state.Concatenate;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.state.Restore;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.state.Save;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.state.SetGraphicsStateParameters;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.state.SetMatrix;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.text.BeginText;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.text.EndText;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.text.SetTextHorizontalScaling;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.text.ShowTextAdjusted;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.text.ShowTextLine;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.text.ShowTextLineAndSpace;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.text.MoveText;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.text.MoveTextSetLeading;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.text.NextLine;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.text.SetCharSpacing;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.text.SetTextLeading;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.text.SetTextRenderingMode;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.text.SetTextRise;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.text.SetWordSpacing;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.text.ShowText;
|
||||||
|
import org.apache.pdfbox.cos.COSDictionary;
|
||||||
|
import org.apache.pdfbox.pdmodel.font.PDFontDescriptor;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* LEGACY text calculations which are known to be incorrect but are depended on by PDFTextStripper.
|
||||||
|
*
|
||||||
|
* This class exists only so that we don't break the code of users who have their own subclasses of
|
||||||
|
* PDFTextStripper. It replaces the mostly empty implementation of showGlyph() in PDFStreamEngine
|
||||||
|
* with a heuristic implementation which is backwards compatible.
|
||||||
|
*
|
||||||
|
* DO NOT USE THIS CODE UNLESS YOU ARE WORKING WITH PDFTextStripper.
|
||||||
|
* THIS CODE IS DELIBERATELY INCORRECT, USE PDFStreamEngine INSTEAD.
|
||||||
|
*/
|
||||||
|
@SuppressWarnings({"PMD", "checkstyle:all"})
|
||||||
|
class LegacyPDFStreamEngine extends PDFStreamEngine
|
||||||
|
{
|
||||||
|
private static final Log LOG = LogFactory.getLog(LegacyPDFStreamEngine.class);
|
||||||
|
|
||||||
|
private int pageRotation;
|
||||||
|
private PDRectangle pageSize;
|
||||||
|
private Matrix translateMatrix;
|
||||||
|
private final GlyphList glyphList;
|
||||||
|
private final Map<COSDictionary, Float> fontHeightMap = new WeakHashMap<COSDictionary, Float>();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructor.
|
||||||
|
*/
|
||||||
|
LegacyPDFStreamEngine() throws IOException
|
||||||
|
{
|
||||||
|
addOperator(new BeginText());
|
||||||
|
addOperator(new Concatenate());
|
||||||
|
addOperator(new DrawObject()); // special text version
|
||||||
|
addOperator(new EndText());
|
||||||
|
addOperator(new SetGraphicsStateParameters());
|
||||||
|
addOperator(new Save());
|
||||||
|
addOperator(new Restore());
|
||||||
|
addOperator(new NextLine());
|
||||||
|
addOperator(new SetCharSpacing());
|
||||||
|
addOperator(new MoveText());
|
||||||
|
addOperator(new MoveTextSetLeading());
|
||||||
|
addOperator(new SetFontAndSize());
|
||||||
|
addOperator(new ShowText());
|
||||||
|
addOperator(new ShowTextAdjusted());
|
||||||
|
addOperator(new SetTextLeading());
|
||||||
|
addOperator(new SetMatrix());
|
||||||
|
addOperator(new SetTextRenderingMode());
|
||||||
|
addOperator(new SetTextRise());
|
||||||
|
addOperator(new SetWordSpacing());
|
||||||
|
addOperator(new SetTextHorizontalScaling());
|
||||||
|
addOperator(new ShowTextLine());
|
||||||
|
addOperator(new ShowTextLineAndSpace());
|
||||||
|
|
||||||
|
// load additional glyph list for Unicode mapping
|
||||||
|
String path = "/org/apache/pdfbox/resources/glyphlist/additional.txt";
|
||||||
|
InputStream input = GlyphList.class.getResourceAsStream(path);
|
||||||
|
glyphList = new GlyphList(GlyphList.getAdobeGlyphList(), input);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This will initialize and process the contents of the stream.
|
||||||
|
*
|
||||||
|
* @param page the page to process
|
||||||
|
* @throws java.io.IOException if there is an error accessing the stream.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public void processPage(PDPage page) throws IOException
|
||||||
|
{
|
||||||
|
this.pageRotation = page.getRotation();
|
||||||
|
this.pageSize = page.getCropBox();
|
||||||
|
|
||||||
|
if (pageSize.getLowerLeftX() == 0 && pageSize.getLowerLeftY() == 0)
|
||||||
|
{
|
||||||
|
translateMatrix = null;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// translation matrix for cropbox
|
||||||
|
translateMatrix = Matrix.getTranslateInstance(-pageSize.getLowerLeftX(), -pageSize.getLowerLeftY());
|
||||||
|
}
|
||||||
|
super.processPage(page);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Called when a glyph is to be processed. The heuristic calculations here were originally
|
||||||
|
* written by Ben Litchfield for PDFStreamEngine.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
protected void showGlyph(Matrix textRenderingMatrix, PDFont font, int code,
|
||||||
|
String unicode,
|
||||||
|
Vector displacement)
|
||||||
|
throws IOException
|
||||||
|
{
|
||||||
|
//
|
||||||
|
// legacy calculations which were previously in PDFStreamEngine
|
||||||
|
//
|
||||||
|
// DO NOT USE THIS CODE UNLESS YOU ARE WORKING WITH PDFTextStripper.
|
||||||
|
// THIS CODE IS DELIBERATELY INCORRECT
|
||||||
|
//
|
||||||
|
|
||||||
|
PDGraphicsState state = getGraphicsState();
|
||||||
|
Matrix ctm = state.getCurrentTransformationMatrix();
|
||||||
|
float fontSize = state.getTextState().getFontSize();
|
||||||
|
float horizontalScaling = state.getTextState().getHorizontalScaling() / 100f;
|
||||||
|
Matrix textMatrix = getTextMatrix();
|
||||||
|
|
||||||
|
float displacementX = displacement.getX();
|
||||||
|
// the sorting algorithm is based on the width of the character. As the displacement
|
||||||
|
// for vertical characters doesn't provide any suitable value for it, we have to
|
||||||
|
// calculate our own
|
||||||
|
if (font.isVertical())
|
||||||
|
{
|
||||||
|
displacementX = font.getWidth(code) / 1000;
|
||||||
|
// there may be an additional scaling factor for true type fonts
|
||||||
|
TrueTypeFont ttf = null;
|
||||||
|
if (font instanceof PDTrueTypeFont)
|
||||||
|
{
|
||||||
|
ttf = ((PDTrueTypeFont)font).getTrueTypeFont();
|
||||||
|
}
|
||||||
|
else if (font instanceof PDType0Font)
|
||||||
|
{
|
||||||
|
PDCIDFont cidFont = ((PDType0Font)font).getDescendantFont();
|
||||||
|
if (cidFont instanceof PDCIDFontType2)
|
||||||
|
{
|
||||||
|
ttf = ((PDCIDFontType2)cidFont).getTrueTypeFont();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (ttf != null && ttf.getUnitsPerEm() != 1000)
|
||||||
|
{
|
||||||
|
displacementX *= 1000f / ttf.getUnitsPerEm();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// legacy calculations which were previously in PDFStreamEngine
|
||||||
|
//
|
||||||
|
// DO NOT USE THIS CODE UNLESS YOU ARE WORKING WITH PDFTextStripper.
|
||||||
|
// THIS CODE IS DELIBERATELY INCORRECT
|
||||||
|
//
|
||||||
|
|
||||||
|
// (modified) combined displacement, this is calculated *without* taking the character
|
||||||
|
// spacing and word spacing into account, due to legacy code in TextStripper
|
||||||
|
float tx = displacementX * fontSize * horizontalScaling;
|
||||||
|
float ty = displacement.getY() * fontSize;
|
||||||
|
|
||||||
|
// (modified) combined displacement matrix
|
||||||
|
Matrix td = Matrix.getTranslateInstance(tx, ty);
|
||||||
|
|
||||||
|
// (modified) text rendering matrix
|
||||||
|
Matrix nextTextRenderingMatrix = td.multiply(textMatrix).multiply(ctm); // text space -> device space
|
||||||
|
float nextX = nextTextRenderingMatrix.getTranslateX();
|
||||||
|
float nextY = nextTextRenderingMatrix.getTranslateY();
|
||||||
|
|
||||||
|
// (modified) width and height calculations
|
||||||
|
float dxDisplay = nextX - textRenderingMatrix.getTranslateX();
|
||||||
|
Float fontHeight = fontHeightMap.get(font.getCOSObject());
|
||||||
|
if (fontHeight == null)
|
||||||
|
{
|
||||||
|
fontHeight = computeFontHeight(font);
|
||||||
|
fontHeightMap.put(font.getCOSObject(), fontHeight);
|
||||||
|
}
|
||||||
|
float dyDisplay = fontHeight * textRenderingMatrix.getScalingFactorY();
|
||||||
|
|
||||||
|
//
|
||||||
|
// start of the original method
|
||||||
|
//
|
||||||
|
|
||||||
|
// Note on variable names. There are three different units being used in this code.
|
||||||
|
// Character sizes are given in glyph units, text locations are initially given in text
|
||||||
|
// units, and we want to save the data in display units. The variable names should end with
|
||||||
|
// Text or Disp to represent if the values are in text or disp units (no glyph units are
|
||||||
|
// saved).
|
||||||
|
|
||||||
|
float glyphSpaceToTextSpaceFactor = 1 / 1000f;
|
||||||
|
if (font instanceof PDType3Font)
|
||||||
|
{
|
||||||
|
glyphSpaceToTextSpaceFactor = font.getFontMatrix().getScaleX();
|
||||||
|
}
|
||||||
|
|
||||||
|
float spaceWidthText = 0;
|
||||||
|
try
|
||||||
|
{
|
||||||
|
// to avoid crash as described in PDFBOX-614, see what the space displacement should be
|
||||||
|
spaceWidthText = font.getSpaceWidth() * glyphSpaceToTextSpaceFactor;
|
||||||
|
}
|
||||||
|
catch (Throwable exception)
|
||||||
|
{
|
||||||
|
LOG.warn(exception, exception);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (spaceWidthText == 0)
|
||||||
|
{
|
||||||
|
spaceWidthText = font.getAverageFontWidth() * glyphSpaceToTextSpaceFactor;
|
||||||
|
// the average space width appears to be higher than necessary so make it smaller
|
||||||
|
spaceWidthText *= .80f;
|
||||||
|
}
|
||||||
|
if (spaceWidthText == 0)
|
||||||
|
{
|
||||||
|
spaceWidthText = 1.0f; // if could not find font, use a generic value
|
||||||
|
}
|
||||||
|
|
||||||
|
// the space width has to be transformed into display units
|
||||||
|
float spaceWidthDisplay = spaceWidthText * textRenderingMatrix.getScalingFactorX();
|
||||||
|
|
||||||
|
// use our additional glyph list for Unicode mapping
|
||||||
|
String unicodeMapping = font.toUnicode(code, glyphList);
|
||||||
|
|
||||||
|
// when there is no Unicode mapping available, Acrobat simply coerces the character code
|
||||||
|
// into Unicode, so we do the same. Subclasses of PDFStreamEngine don't necessarily want
|
||||||
|
// this, which is why we leave it until this point in PDFTextStreamEngine.
|
||||||
|
if (unicodeMapping == null)
|
||||||
|
{
|
||||||
|
if (font instanceof PDSimpleFont)
|
||||||
|
{
|
||||||
|
char c = (char) code;
|
||||||
|
unicodeMapping = new String(new char[] { c });
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// Acrobat doesn't seem to coerce composite font's character codes, instead it
|
||||||
|
// skips them. See the "allah2.pdf" TestTextStripper file.
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// adjust for cropbox if needed
|
||||||
|
Matrix translatedTextRenderingMatrix;
|
||||||
|
if (translateMatrix == null)
|
||||||
|
{
|
||||||
|
translatedTextRenderingMatrix = textRenderingMatrix;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
translatedTextRenderingMatrix = Matrix.concatenate(translateMatrix, textRenderingMatrix);
|
||||||
|
nextX -= pageSize.getLowerLeftX();
|
||||||
|
nextY -= pageSize.getLowerLeftY();
|
||||||
|
}
|
||||||
|
|
||||||
|
processTextPosition(new TextPosition(pageRotation, pageSize.getWidth(),
|
||||||
|
pageSize.getHeight(), translatedTextRenderingMatrix, nextX, nextY,
|
||||||
|
Math.abs(dyDisplay), dxDisplay,
|
||||||
|
Math.abs(spaceWidthDisplay), unicodeMapping, new int[] { code }, font,
|
||||||
|
fontSize,
|
||||||
|
(int)(fontSize * textMatrix.getScalingFactorX())));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compute the font height. Override this if you want to use own calculations.
|
||||||
|
*
|
||||||
|
* @param font the font.
|
||||||
|
* @return the font height.
|
||||||
|
*
|
||||||
|
* @throws IOException if there is an error while getting the font bounding box.
|
||||||
|
*/
|
||||||
|
protected float computeFontHeight(PDFont font) throws IOException
|
||||||
|
{
|
||||||
|
BoundingBox bbox = font.getBoundingBox();
|
||||||
|
if (bbox.getLowerLeftY() < Short.MIN_VALUE)
|
||||||
|
{
|
||||||
|
// PDFBOX-2158 and PDFBOX-3130
|
||||||
|
// files by Salmat eSolutions / ClibPDF Library
|
||||||
|
bbox.setLowerLeftY(- (bbox.getLowerLeftY() + 65536));
|
||||||
|
}
|
||||||
|
// 1/2 the bbox is used as the height todo: why?
|
||||||
|
float glyphHeight = bbox.getHeight() / 2;
|
||||||
|
|
||||||
|
// sometimes the bbox has very high values, but CapHeight is OK
|
||||||
|
PDFontDescriptor fontDescriptor = font.getFontDescriptor();
|
||||||
|
if (fontDescriptor != null)
|
||||||
|
{
|
||||||
|
float capHeight = fontDescriptor.getCapHeight();
|
||||||
|
if (Float.compare(capHeight, 0) != 0 &&
|
||||||
|
(capHeight < glyphHeight || Float.compare(glyphHeight, 0) == 0))
|
||||||
|
{
|
||||||
|
glyphHeight = capHeight;
|
||||||
|
}
|
||||||
|
// PDFBOX-3464, PDFBOX-4480, PDFBOX-4553:
|
||||||
|
// sometimes even CapHeight has very high value, but Ascent and Descent are ok
|
||||||
|
float ascent = fontDescriptor.getAscent();
|
||||||
|
float descent = fontDescriptor.getDescent();
|
||||||
|
if (capHeight > ascent && ascent > 0 && descent < 0 &&
|
||||||
|
((ascent - descent) / 2 < glyphHeight || Float.compare(glyphHeight, 0) == 0))
|
||||||
|
{
|
||||||
|
glyphHeight = (ascent - descent) / 2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// transformPoint from glyph space -> text space
|
||||||
|
float height;
|
||||||
|
if (font instanceof PDType3Font)
|
||||||
|
{
|
||||||
|
height = font.getFontMatrix().transformPoint(0, glyphHeight).y;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
height = glyphHeight / 1000;
|
||||||
|
}
|
||||||
|
|
||||||
|
return height;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A method provided as an event interface to allow a subclass to perform some specific
|
||||||
|
* functionality when text needs to be processed.
|
||||||
|
*
|
||||||
|
* @param text The text to be processed.
|
||||||
|
*/
|
||||||
|
protected void processTextPosition(TextPosition text)
|
||||||
|
{
|
||||||
|
// subclasses can override to provide specific functionality
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -36,7 +36,6 @@ import org.apache.pdfbox.pdmodel.PDDocument;
|
|||||||
import org.apache.pdfbox.pdmodel.PDPage;
|
import org.apache.pdfbox.pdmodel.PDPage;
|
||||||
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
|
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
|
||||||
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
|
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
|
||||||
import org.apache.pdfbox.text.PDFTextStripper;
|
|
||||||
import org.apache.pdfbox.text.TextPosition;
|
import org.apache.pdfbox.text.TextPosition;
|
||||||
import org.apache.pdfbox.util.Matrix;
|
import org.apache.pdfbox.util.Matrix;
|
||||||
|
|
||||||
@ -125,11 +124,9 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
|
|
||||||
// The direction of vertical lines must always be from bottom to top for the table extraction algorithm.
|
// The direction of vertical lines must always be from bottom to top for the table extraction algorithm.
|
||||||
if (pos.getY() > path_y) {
|
if (pos.getY() > path_y) {
|
||||||
graphicsPath.add(new Ruling(new Point2D.Float(path_x, path_y), new Point2D.Float((float) pos.getX(), (float) pos
|
graphicsPath.add(new Ruling(new Point2D.Float(path_x, path_y), new Point2D.Float((float) pos.getX(), (float) pos.getY())));
|
||||||
.getY())));
|
|
||||||
} else {
|
} else {
|
||||||
graphicsPath.add(new Ruling(new Point2D.Float(path_x, (float) pos.getY()), new Point2D.Float((float) pos
|
graphicsPath.add(new Ruling(new Point2D.Float(path_x, (float) pos.getY()), new Point2D.Float((float) pos.getX(), path_y)));
|
||||||
.getX(), path_y)));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
path_x = (float) pos.getX();
|
path_x = (float) pos.getX();
|
||||||
@ -150,25 +147,19 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
Point2D p2 = transformPosition(x + width, y + height);
|
Point2D p2 = transformPosition(x + width, y + height);
|
||||||
|
|
||||||
// Horizontal lines
|
// Horizontal lines
|
||||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p1.getY()), new Point2D.Float((float) p2
|
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p1.getY()), new Point2D.Float((float) p2.getX(), (float) p1.getY())));
|
||||||
.getX(), (float) p1.getY())));
|
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p2.getY()), new Point2D.Float((float) p2.getX(), (float) p2.getY())));
|
||||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p2.getY()), new Point2D.Float((float) p2
|
|
||||||
.getX(), (float) p2.getY())));
|
|
||||||
|
|
||||||
// Vertical lines, direction must always be from bottom to top for the table extraction algorithm.
|
// Vertical lines, direction must always be from bottom to top for the table extraction algorithm.
|
||||||
if (p2.getY() > p1.getY()) {
|
if (p2.getY() > p1.getY()) {
|
||||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p2.getX(), (float) p1.getY()), new Point2D.Float((float) p2
|
graphicsPath.add(new Ruling(new Point2D.Float((float) p2.getX(), (float) p1.getY()), new Point2D.Float((float) p2.getX(), (float) p2.getY())));
|
||||||
.getX(), (float) p2.getY())));
|
|
||||||
} else {
|
} else {
|
||||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p2.getX(), (float) p2.getY()), new Point2D.Float((float) p2
|
graphicsPath.add(new Ruling(new Point2D.Float((float) p2.getX(), (float) p2.getY()), new Point2D.Float((float) p2.getX(), (float) p1.getY())));
|
||||||
.getX(), (float) p1.getY())));
|
|
||||||
}
|
}
|
||||||
if (p2.getY() > p1.getY()) {
|
if (p2.getY() > p1.getY()) {
|
||||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p1.getY()), new Point2D.Float((float) p1
|
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p1.getY()), new Point2D.Float((float) p1.getX(), (float) p2.getY())));
|
||||||
.getX(), (float) p2.getY())));
|
|
||||||
} else {
|
} else {
|
||||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p2.getY()), new Point2D.Float((float) p1
|
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p2.getY()), new Point2D.Float((float) p1.getX(), (float) p1.getY())));
|
||||||
.getX(), (float) p1.getY())));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
@ -211,8 +202,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
PDImageXObject image = (PDImageXObject) xobject;
|
PDImageXObject image = (PDImageXObject) xobject;
|
||||||
Matrix ctmNew = getGraphicsState().getCurrentTransformationMatrix();
|
Matrix ctmNew = getGraphicsState().getCurrentTransformationMatrix();
|
||||||
|
|
||||||
Rectangle2D rect = new Rectangle2D.Float(ctmNew.getTranslateX(), ctmNew.getTranslateY(), ctmNew.getScaleX(), ctmNew
|
Rectangle2D rect = new Rectangle2D.Float(ctmNew.getTranslateX(), ctmNew.getTranslateY(), ctmNew.getScaleX(), ctmNew.getScaleY());
|
||||||
.getScaleY());
|
|
||||||
|
|
||||||
// Memory Hack - sofReference kills me
|
// Memory Hack - sofReference kills me
|
||||||
FieldUtils.writeField(image, "cachedImageSubsampling", -1, true);
|
FieldUtils.writeField(image, "cachedImageSubsampling", -1, true);
|
||||||
@ -295,9 +285,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
|
|
||||||
if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i)
|
if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i)
|
||||||
.getUnicode()
|
.getUnicode()
|
||||||
.equals("\u00A0") || textPositions.get(i)
|
.equals("\u00A0") || textPositions.get(i).getUnicode().equals("\t"))) {
|
||||||
.getUnicode()
|
|
||||||
.equals("\t"))) {
|
|
||||||
startIndex++;
|
startIndex++;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -307,7 +295,9 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0)
|
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0)
|
||||||
.getUnicode()
|
.getUnicode()
|
||||||
.equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0).getUnicode().equals("\t")))) {
|
.equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
|
||||||
|
.getUnicode()
|
||||||
|
.equals("\t")))) {
|
||||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
||||||
}
|
}
|
||||||
startIndex = i;
|
startIndex = i;
|
||||||
@ -318,7 +308,9 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0)
|
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0)
|
||||||
.getUnicode()
|
.getUnicode()
|
||||||
.equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0).getUnicode().equals("\t")))) {
|
.equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
|
||||||
|
.getUnicode()
|
||||||
|
.equals("\t")))) {
|
||||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
||||||
}
|
}
|
||||||
startIndex = i;
|
startIndex = i;
|
||||||
@ -332,7 +324,9 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0)
|
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0)
|
||||||
.getUnicode()
|
.getUnicode()
|
||||||
.equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0).getUnicode().equals("\t")))) {
|
.equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
|
||||||
|
.getUnicode()
|
||||||
|
.equals("\t")))) {
|
||||||
|
|
||||||
// Remove false sequence ends (whitespaces)
|
// Remove false sequence ends (whitespaces)
|
||||||
if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
|
if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
|
||||||
@ -351,15 +345,15 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
List<TextPosition> sublist = textPositions.subList(startIndex, textPositions.size());
|
List<TextPosition> sublist = textPositions.subList(startIndex, textPositions.size());
|
||||||
if (!sublist.isEmpty() && (sublist.get(sublist.size() - 1)
|
if (!sublist.isEmpty() && (sublist.get(sublist.size() - 1)
|
||||||
.getUnicode()
|
.getUnicode()
|
||||||
.equals(" ") || sublist.get(sublist.size() - 1).getUnicode().equals("\u00A0") || sublist.get(sublist.size() - 1).getUnicode().equals("\t"))) {
|
.equals(" ") || sublist.get(sublist.size() - 1)
|
||||||
|
.getUnicode()
|
||||||
|
.equals("\u00A0") || sublist.get(sublist.size() - 1).getUnicode().equals("\t"))) {
|
||||||
sublist = sublist.subList(0, sublist.size() - 1);
|
sublist = sublist.subList(0, sublist.size() - 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0)
|
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0)
|
||||||
.getUnicode()
|
.getUnicode()
|
||||||
.equals("\u00A0") || sublist.get(0)
|
.equals("\u00A0") || sublist.get(0).getUnicode().equals("\t")))) {
|
||||||
.getUnicode()
|
|
||||||
.equals("\t")))) {
|
|
||||||
if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
|
if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
|
||||||
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) {
|
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) {
|
||||||
for (TextPosition t : sublist) {
|
for (TextPosition t : sublist) {
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@ -899,7 +899,7 @@ public class RedactionIntegrationTest {
|
|||||||
@Test
|
@Test
|
||||||
public void redactionTest() throws IOException {
|
public void redactionTest() throws IOException {
|
||||||
|
|
||||||
String fileName = "files/new/VV-511309.pdf";
|
String fileName = "files/new/S416.pdf";
|
||||||
String outputFileName = OsUtils.getTemporaryDirectory() + "/Annotated.pdf";
|
String outputFileName = OsUtils.getTemporaryDirectory() + "/Annotated.pdf";
|
||||||
|
|
||||||
long start = System.currentTimeMillis();
|
long start = System.currentTimeMillis();
|
||||||
|
|||||||
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user