RED-3114: Fixed unreliable redactions on rotated pages
This commit is contained in:
parent
681d17ad95
commit
f1071de3a9
@ -0,0 +1,377 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package com.iqser.red.service.redaction.v1.server.parsing;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
import java.util.WeakHashMap;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import org.apache.fontbox.ttf.TrueTypeFont;
|
||||
import org.apache.fontbox.util.BoundingBox;
|
||||
|
||||
import org.apache.pdfbox.contentstream.PDFStreamEngine;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.font.encoding.GlyphList;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.apache.pdfbox.pdmodel.font.PDCIDFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDCIDFontType2;
|
||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType0Font;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType3Font;
|
||||
import org.apache.pdfbox.pdmodel.graphics.state.PDGraphicsState;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
import org.apache.pdfbox.util.Vector;
|
||||
import org.apache.pdfbox.contentstream.operator.DrawObject;
|
||||
import org.apache.pdfbox.contentstream.operator.state.Concatenate;
|
||||
import org.apache.pdfbox.contentstream.operator.state.Restore;
|
||||
import org.apache.pdfbox.contentstream.operator.state.Save;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetGraphicsStateParameters;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetMatrix;
|
||||
import org.apache.pdfbox.contentstream.operator.text.BeginText;
|
||||
import org.apache.pdfbox.contentstream.operator.text.EndText;
|
||||
import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize;
|
||||
import org.apache.pdfbox.contentstream.operator.text.SetTextHorizontalScaling;
|
||||
import org.apache.pdfbox.contentstream.operator.text.ShowTextAdjusted;
|
||||
import org.apache.pdfbox.contentstream.operator.text.ShowTextLine;
|
||||
import org.apache.pdfbox.contentstream.operator.text.ShowTextLineAndSpace;
|
||||
import org.apache.pdfbox.contentstream.operator.text.MoveText;
|
||||
import org.apache.pdfbox.contentstream.operator.text.MoveTextSetLeading;
|
||||
import org.apache.pdfbox.contentstream.operator.text.NextLine;
|
||||
import org.apache.pdfbox.contentstream.operator.text.SetCharSpacing;
|
||||
import org.apache.pdfbox.contentstream.operator.text.SetTextLeading;
|
||||
import org.apache.pdfbox.contentstream.operator.text.SetTextRenderingMode;
|
||||
import org.apache.pdfbox.contentstream.operator.text.SetTextRise;
|
||||
import org.apache.pdfbox.contentstream.operator.text.SetWordSpacing;
|
||||
import org.apache.pdfbox.contentstream.operator.text.ShowText;
|
||||
import org.apache.pdfbox.cos.COSDictionary;
|
||||
import org.apache.pdfbox.pdmodel.font.PDFontDescriptor;
|
||||
|
||||
/**
|
||||
* LEGACY text calculations which are known to be incorrect but are depended on by PDFTextStripper.
|
||||
*
|
||||
* This class exists only so that we don't break the code of users who have their own subclasses of
|
||||
* PDFTextStripper. It replaces the mostly empty implementation of showGlyph() in PDFStreamEngine
|
||||
* with a heuristic implementation which is backwards compatible.
|
||||
*
|
||||
* DO NOT USE THIS CODE UNLESS YOU ARE WORKING WITH PDFTextStripper.
|
||||
* THIS CODE IS DELIBERATELY INCORRECT, USE PDFStreamEngine INSTEAD.
|
||||
*/
|
||||
@SuppressWarnings({"PMD", "checkstyle:all"})
|
||||
class LegacyPDFStreamEngine extends PDFStreamEngine
|
||||
{
|
||||
private static final Log LOG = LogFactory.getLog(LegacyPDFStreamEngine.class);
|
||||
|
||||
private int pageRotation;
|
||||
private PDRectangle pageSize;
|
||||
private Matrix translateMatrix;
|
||||
private final GlyphList glyphList;
|
||||
private final Map<COSDictionary, Float> fontHeightMap = new WeakHashMap<COSDictionary, Float>();
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
*/
|
||||
LegacyPDFStreamEngine() throws IOException
|
||||
{
|
||||
addOperator(new BeginText());
|
||||
addOperator(new Concatenate());
|
||||
addOperator(new DrawObject()); // special text version
|
||||
addOperator(new EndText());
|
||||
addOperator(new SetGraphicsStateParameters());
|
||||
addOperator(new Save());
|
||||
addOperator(new Restore());
|
||||
addOperator(new NextLine());
|
||||
addOperator(new SetCharSpacing());
|
||||
addOperator(new MoveText());
|
||||
addOperator(new MoveTextSetLeading());
|
||||
addOperator(new SetFontAndSize());
|
||||
addOperator(new ShowText());
|
||||
addOperator(new ShowTextAdjusted());
|
||||
addOperator(new SetTextLeading());
|
||||
addOperator(new SetMatrix());
|
||||
addOperator(new SetTextRenderingMode());
|
||||
addOperator(new SetTextRise());
|
||||
addOperator(new SetWordSpacing());
|
||||
addOperator(new SetTextHorizontalScaling());
|
||||
addOperator(new ShowTextLine());
|
||||
addOperator(new ShowTextLineAndSpace());
|
||||
|
||||
// load additional glyph list for Unicode mapping
|
||||
String path = "/org/apache/pdfbox/resources/glyphlist/additional.txt";
|
||||
InputStream input = GlyphList.class.getResourceAsStream(path);
|
||||
glyphList = new GlyphList(GlyphList.getAdobeGlyphList(), input);
|
||||
}
|
||||
|
||||
/**
|
||||
* This will initialize and process the contents of the stream.
|
||||
*
|
||||
* @param page the page to process
|
||||
* @throws java.io.IOException if there is an error accessing the stream.
|
||||
*/
|
||||
@Override
|
||||
public void processPage(PDPage page) throws IOException
|
||||
{
|
||||
this.pageRotation = page.getRotation();
|
||||
this.pageSize = page.getCropBox();
|
||||
|
||||
if (pageSize.getLowerLeftX() == 0 && pageSize.getLowerLeftY() == 0)
|
||||
{
|
||||
translateMatrix = null;
|
||||
}
|
||||
else
|
||||
{
|
||||
// translation matrix for cropbox
|
||||
translateMatrix = Matrix.getTranslateInstance(-pageSize.getLowerLeftX(), -pageSize.getLowerLeftY());
|
||||
}
|
||||
super.processPage(page);
|
||||
}
|
||||
|
||||
/**
|
||||
* Called when a glyph is to be processed. The heuristic calculations here were originally
|
||||
* written by Ben Litchfield for PDFStreamEngine.
|
||||
*/
|
||||
@Override
|
||||
protected void showGlyph(Matrix textRenderingMatrix, PDFont font, int code,
|
||||
String unicode,
|
||||
Vector displacement)
|
||||
throws IOException
|
||||
{
|
||||
//
|
||||
// legacy calculations which were previously in PDFStreamEngine
|
||||
//
|
||||
// DO NOT USE THIS CODE UNLESS YOU ARE WORKING WITH PDFTextStripper.
|
||||
// THIS CODE IS DELIBERATELY INCORRECT
|
||||
//
|
||||
|
||||
PDGraphicsState state = getGraphicsState();
|
||||
Matrix ctm = state.getCurrentTransformationMatrix();
|
||||
float fontSize = state.getTextState().getFontSize();
|
||||
float horizontalScaling = state.getTextState().getHorizontalScaling() / 100f;
|
||||
Matrix textMatrix = getTextMatrix();
|
||||
|
||||
float displacementX = displacement.getX();
|
||||
// the sorting algorithm is based on the width of the character. As the displacement
|
||||
// for vertical characters doesn't provide any suitable value for it, we have to
|
||||
// calculate our own
|
||||
if (font.isVertical())
|
||||
{
|
||||
displacementX = font.getWidth(code) / 1000;
|
||||
// there may be an additional scaling factor for true type fonts
|
||||
TrueTypeFont ttf = null;
|
||||
if (font instanceof PDTrueTypeFont)
|
||||
{
|
||||
ttf = ((PDTrueTypeFont)font).getTrueTypeFont();
|
||||
}
|
||||
else if (font instanceof PDType0Font)
|
||||
{
|
||||
PDCIDFont cidFont = ((PDType0Font)font).getDescendantFont();
|
||||
if (cidFont instanceof PDCIDFontType2)
|
||||
{
|
||||
ttf = ((PDCIDFontType2)cidFont).getTrueTypeFont();
|
||||
}
|
||||
}
|
||||
if (ttf != null && ttf.getUnitsPerEm() != 1000)
|
||||
{
|
||||
displacementX *= 1000f / ttf.getUnitsPerEm();
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// legacy calculations which were previously in PDFStreamEngine
|
||||
//
|
||||
// DO NOT USE THIS CODE UNLESS YOU ARE WORKING WITH PDFTextStripper.
|
||||
// THIS CODE IS DELIBERATELY INCORRECT
|
||||
//
|
||||
|
||||
// (modified) combined displacement, this is calculated *without* taking the character
|
||||
// spacing and word spacing into account, due to legacy code in TextStripper
|
||||
float tx = displacementX * fontSize * horizontalScaling;
|
||||
float ty = displacement.getY() * fontSize;
|
||||
|
||||
// (modified) combined displacement matrix
|
||||
Matrix td = Matrix.getTranslateInstance(tx, ty);
|
||||
|
||||
// (modified) text rendering matrix
|
||||
Matrix nextTextRenderingMatrix = td.multiply(textMatrix).multiply(ctm); // text space -> device space
|
||||
float nextX = nextTextRenderingMatrix.getTranslateX();
|
||||
float nextY = nextTextRenderingMatrix.getTranslateY();
|
||||
|
||||
// (modified) width and height calculations
|
||||
float dxDisplay = nextX - textRenderingMatrix.getTranslateX();
|
||||
Float fontHeight = fontHeightMap.get(font.getCOSObject());
|
||||
if (fontHeight == null)
|
||||
{
|
||||
fontHeight = computeFontHeight(font);
|
||||
fontHeightMap.put(font.getCOSObject(), fontHeight);
|
||||
}
|
||||
float dyDisplay = fontHeight * textRenderingMatrix.getScalingFactorY();
|
||||
|
||||
//
|
||||
// start of the original method
|
||||
//
|
||||
|
||||
// Note on variable names. There are three different units being used in this code.
|
||||
// Character sizes are given in glyph units, text locations are initially given in text
|
||||
// units, and we want to save the data in display units. The variable names should end with
|
||||
// Text or Disp to represent if the values are in text or disp units (no glyph units are
|
||||
// saved).
|
||||
|
||||
float glyphSpaceToTextSpaceFactor = 1 / 1000f;
|
||||
if (font instanceof PDType3Font)
|
||||
{
|
||||
glyphSpaceToTextSpaceFactor = font.getFontMatrix().getScaleX();
|
||||
}
|
||||
|
||||
float spaceWidthText = 0;
|
||||
try
|
||||
{
|
||||
// to avoid crash as described in PDFBOX-614, see what the space displacement should be
|
||||
spaceWidthText = font.getSpaceWidth() * glyphSpaceToTextSpaceFactor;
|
||||
}
|
||||
catch (Throwable exception)
|
||||
{
|
||||
LOG.warn(exception, exception);
|
||||
}
|
||||
|
||||
if (spaceWidthText == 0)
|
||||
{
|
||||
spaceWidthText = font.getAverageFontWidth() * glyphSpaceToTextSpaceFactor;
|
||||
// the average space width appears to be higher than necessary so make it smaller
|
||||
spaceWidthText *= .80f;
|
||||
}
|
||||
if (spaceWidthText == 0)
|
||||
{
|
||||
spaceWidthText = 1.0f; // if could not find font, use a generic value
|
||||
}
|
||||
|
||||
// the space width has to be transformed into display units
|
||||
float spaceWidthDisplay = spaceWidthText * textRenderingMatrix.getScalingFactorX();
|
||||
|
||||
// use our additional glyph list for Unicode mapping
|
||||
String unicodeMapping = font.toUnicode(code, glyphList);
|
||||
|
||||
// when there is no Unicode mapping available, Acrobat simply coerces the character code
|
||||
// into Unicode, so we do the same. Subclasses of PDFStreamEngine don't necessarily want
|
||||
// this, which is why we leave it until this point in PDFTextStreamEngine.
|
||||
if (unicodeMapping == null)
|
||||
{
|
||||
if (font instanceof PDSimpleFont)
|
||||
{
|
||||
char c = (char) code;
|
||||
unicodeMapping = new String(new char[] { c });
|
||||
}
|
||||
else
|
||||
{
|
||||
// Acrobat doesn't seem to coerce composite font's character codes, instead it
|
||||
// skips them. See the "allah2.pdf" TestTextStripper file.
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// adjust for cropbox if needed
|
||||
Matrix translatedTextRenderingMatrix;
|
||||
if (translateMatrix == null)
|
||||
{
|
||||
translatedTextRenderingMatrix = textRenderingMatrix;
|
||||
}
|
||||
else
|
||||
{
|
||||
translatedTextRenderingMatrix = Matrix.concatenate(translateMatrix, textRenderingMatrix);
|
||||
nextX -= pageSize.getLowerLeftX();
|
||||
nextY -= pageSize.getLowerLeftY();
|
||||
}
|
||||
|
||||
processTextPosition(new TextPosition(pageRotation, pageSize.getWidth(),
|
||||
pageSize.getHeight(), translatedTextRenderingMatrix, nextX, nextY,
|
||||
Math.abs(dyDisplay), dxDisplay,
|
||||
Math.abs(spaceWidthDisplay), unicodeMapping, new int[] { code }, font,
|
||||
fontSize,
|
||||
(int)(fontSize * textMatrix.getScalingFactorX())));
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute the font height. Override this if you want to use own calculations.
|
||||
*
|
||||
* @param font the font.
|
||||
* @return the font height.
|
||||
*
|
||||
* @throws IOException if there is an error while getting the font bounding box.
|
||||
*/
|
||||
protected float computeFontHeight(PDFont font) throws IOException
|
||||
{
|
||||
BoundingBox bbox = font.getBoundingBox();
|
||||
if (bbox.getLowerLeftY() < Short.MIN_VALUE)
|
||||
{
|
||||
// PDFBOX-2158 and PDFBOX-3130
|
||||
// files by Salmat eSolutions / ClibPDF Library
|
||||
bbox.setLowerLeftY(- (bbox.getLowerLeftY() + 65536));
|
||||
}
|
||||
// 1/2 the bbox is used as the height todo: why?
|
||||
float glyphHeight = bbox.getHeight() / 2;
|
||||
|
||||
// sometimes the bbox has very high values, but CapHeight is OK
|
||||
PDFontDescriptor fontDescriptor = font.getFontDescriptor();
|
||||
if (fontDescriptor != null)
|
||||
{
|
||||
float capHeight = fontDescriptor.getCapHeight();
|
||||
if (Float.compare(capHeight, 0) != 0 &&
|
||||
(capHeight < glyphHeight || Float.compare(glyphHeight, 0) == 0))
|
||||
{
|
||||
glyphHeight = capHeight;
|
||||
}
|
||||
// PDFBOX-3464, PDFBOX-4480, PDFBOX-4553:
|
||||
// sometimes even CapHeight has very high value, but Ascent and Descent are ok
|
||||
float ascent = fontDescriptor.getAscent();
|
||||
float descent = fontDescriptor.getDescent();
|
||||
if (capHeight > ascent && ascent > 0 && descent < 0 &&
|
||||
((ascent - descent) / 2 < glyphHeight || Float.compare(glyphHeight, 0) == 0))
|
||||
{
|
||||
glyphHeight = (ascent - descent) / 2;
|
||||
}
|
||||
}
|
||||
|
||||
// transformPoint from glyph space -> text space
|
||||
float height;
|
||||
if (font instanceof PDType3Font)
|
||||
{
|
||||
height = font.getFontMatrix().transformPoint(0, glyphHeight).y;
|
||||
}
|
||||
else
|
||||
{
|
||||
height = glyphHeight / 1000;
|
||||
}
|
||||
|
||||
return height;
|
||||
}
|
||||
|
||||
/**
|
||||
* A method provided as an event interface to allow a subclass to perform some specific
|
||||
* functionality when text needs to be processed.
|
||||
*
|
||||
* @param text The text to be processed.
|
||||
*/
|
||||
protected void processTextPosition(TextPosition text)
|
||||
{
|
||||
// subclasses can override to provide specific functionality
|
||||
}
|
||||
}
|
||||
@ -36,7 +36,6 @@ import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
|
||||
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
|
||||
import org.apache.pdfbox.text.PDFTextStripper;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
|
||||
@ -125,11 +124,9 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
|
||||
// The direction of vertical lines must always be from bottom to top for the table extraction algorithm.
|
||||
if (pos.getY() > path_y) {
|
||||
graphicsPath.add(new Ruling(new Point2D.Float(path_x, path_y), new Point2D.Float((float) pos.getX(), (float) pos
|
||||
.getY())));
|
||||
graphicsPath.add(new Ruling(new Point2D.Float(path_x, path_y), new Point2D.Float((float) pos.getX(), (float) pos.getY())));
|
||||
} else {
|
||||
graphicsPath.add(new Ruling(new Point2D.Float(path_x, (float) pos.getY()), new Point2D.Float((float) pos
|
||||
.getX(), path_y)));
|
||||
graphicsPath.add(new Ruling(new Point2D.Float(path_x, (float) pos.getY()), new Point2D.Float((float) pos.getX(), path_y)));
|
||||
}
|
||||
|
||||
path_x = (float) pos.getX();
|
||||
@ -150,25 +147,19 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
Point2D p2 = transformPosition(x + width, y + height);
|
||||
|
||||
// Horizontal lines
|
||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p1.getY()), new Point2D.Float((float) p2
|
||||
.getX(), (float) p1.getY())));
|
||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p2.getY()), new Point2D.Float((float) p2
|
||||
.getX(), (float) p2.getY())));
|
||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p1.getY()), new Point2D.Float((float) p2.getX(), (float) p1.getY())));
|
||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p2.getY()), new Point2D.Float((float) p2.getX(), (float) p2.getY())));
|
||||
|
||||
// Vertical lines, direction must always be from bottom to top for the table extraction algorithm.
|
||||
if (p2.getY() > p1.getY()) {
|
||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p2.getX(), (float) p1.getY()), new Point2D.Float((float) p2
|
||||
.getX(), (float) p2.getY())));
|
||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p2.getX(), (float) p1.getY()), new Point2D.Float((float) p2.getX(), (float) p2.getY())));
|
||||
} else {
|
||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p2.getX(), (float) p2.getY()), new Point2D.Float((float) p2
|
||||
.getX(), (float) p1.getY())));
|
||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p2.getX(), (float) p2.getY()), new Point2D.Float((float) p2.getX(), (float) p1.getY())));
|
||||
}
|
||||
if (p2.getY() > p1.getY()) {
|
||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p1.getY()), new Point2D.Float((float) p1
|
||||
.getX(), (float) p2.getY())));
|
||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p1.getY()), new Point2D.Float((float) p1.getX(), (float) p2.getY())));
|
||||
} else {
|
||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p2.getY()), new Point2D.Float((float) p1
|
||||
.getX(), (float) p1.getY())));
|
||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p2.getY()), new Point2D.Float((float) p1.getX(), (float) p1.getY())));
|
||||
}
|
||||
}
|
||||
break;
|
||||
@ -211,8 +202,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
PDImageXObject image = (PDImageXObject) xobject;
|
||||
Matrix ctmNew = getGraphicsState().getCurrentTransformationMatrix();
|
||||
|
||||
Rectangle2D rect = new Rectangle2D.Float(ctmNew.getTranslateX(), ctmNew.getTranslateY(), ctmNew.getScaleX(), ctmNew
|
||||
.getScaleY());
|
||||
Rectangle2D rect = new Rectangle2D.Float(ctmNew.getTranslateX(), ctmNew.getTranslateY(), ctmNew.getScaleX(), ctmNew.getScaleY());
|
||||
|
||||
// Memory Hack - SoftReference kills me
|
||||
FieldUtils.writeField(image, "cachedImageSubsampling", -1, true);
|
||||
@ -295,9 +285,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
|
||||
if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i)
|
||||
.getUnicode()
|
||||
.equals("\u00A0") || textPositions.get(i)
|
||||
.getUnicode()
|
||||
.equals("\t"))) {
|
||||
.equals("\u00A0") || textPositions.get(i).getUnicode().equals("\t"))) {
|
||||
startIndex++;
|
||||
continue;
|
||||
}
|
||||
@ -307,7 +295,9 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0)
|
||||
.getUnicode()
|
||||
.equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0).getUnicode().equals("\t")))) {
|
||||
.equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
|
||||
.getUnicode()
|
||||
.equals("\t")))) {
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
||||
}
|
||||
startIndex = i;
|
||||
@ -318,7 +308,9 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0)
|
||||
.getUnicode()
|
||||
.equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0).getUnicode().equals("\t")))) {
|
||||
.equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
|
||||
.getUnicode()
|
||||
.equals("\t")))) {
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
||||
}
|
||||
startIndex = i;
|
||||
@ -332,7 +324,9 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0)
|
||||
.getUnicode()
|
||||
.equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0).getUnicode().equals("\t")))) {
|
||||
.equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
|
||||
.getUnicode()
|
||||
.equals("\t")))) {
|
||||
|
||||
// Remove false sequence ends (whitespaces)
|
||||
if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
|
||||
@ -351,15 +345,15 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
List<TextPosition> sublist = textPositions.subList(startIndex, textPositions.size());
|
||||
if (!sublist.isEmpty() && (sublist.get(sublist.size() - 1)
|
||||
.getUnicode()
|
||||
.equals(" ") || sublist.get(sublist.size() - 1).getUnicode().equals("\u00A0") || sublist.get(sublist.size() - 1).getUnicode().equals("\t"))) {
|
||||
.equals(" ") || sublist.get(sublist.size() - 1)
|
||||
.getUnicode()
|
||||
.equals("\u00A0") || sublist.get(sublist.size() - 1).getUnicode().equals("\t"))) {
|
||||
sublist = sublist.subList(0, sublist.size() - 1);
|
||||
}
|
||||
|
||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0)
|
||||
.getUnicode()
|
||||
.equals("\u00A0") || sublist.get(0)
|
||||
.getUnicode()
|
||||
.equals("\t")))) {
|
||||
.equals("\u00A0") || sublist.get(0).getUnicode().equals("\t")))) {
|
||||
if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
|
||||
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) {
|
||||
for (TextPosition t : sublist) {
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -899,7 +899,7 @@ public class RedactionIntegrationTest {
|
||||
@Test
|
||||
public void redactionTest() throws IOException {
|
||||
|
||||
String fileName = "files/new/VV-511309.pdf";
|
||||
String fileName = "files/new/S416.pdf";
|
||||
String outputFileName = OsUtils.getTemporaryDirectory() + "/Annotated.pdf";
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
|
||||
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user