diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/LegacyPDFStreamEngine.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/LegacyPDFStreamEngine.java new file mode 100644 index 00000000..4bc6d0a0 --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/LegacyPDFStreamEngine.java @@ -0,0 +1,377 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.iqser.red.service.redaction.v1.server.parsing; + +import java.io.InputStream; +import java.io.IOException; +import java.util.Map; +import java.util.WeakHashMap; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import org.apache.fontbox.ttf.TrueTypeFont; +import org.apache.fontbox.util.BoundingBox; + +import org.apache.pdfbox.contentstream.PDFStreamEngine; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.font.encoding.GlyphList; +import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.pdmodel.font.PDCIDFont; +import org.apache.pdfbox.pdmodel.font.PDCIDFontType2; +import org.apache.pdfbox.pdmodel.font.PDFont; +import org.apache.pdfbox.pdmodel.font.PDSimpleFont; +import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont; +import org.apache.pdfbox.pdmodel.font.PDType0Font; +import org.apache.pdfbox.pdmodel.font.PDType3Font; +import org.apache.pdfbox.pdmodel.graphics.state.PDGraphicsState; +import org.apache.pdfbox.text.TextPosition; +import org.apache.pdfbox.util.Matrix; +import org.apache.pdfbox.util.Vector; +import org.apache.pdfbox.contentstream.operator.DrawObject; +import org.apache.pdfbox.contentstream.operator.state.Concatenate; +import org.apache.pdfbox.contentstream.operator.state.Restore; +import org.apache.pdfbox.contentstream.operator.state.Save; +import org.apache.pdfbox.contentstream.operator.state.SetGraphicsStateParameters; +import org.apache.pdfbox.contentstream.operator.state.SetMatrix; +import org.apache.pdfbox.contentstream.operator.text.BeginText; +import org.apache.pdfbox.contentstream.operator.text.EndText; +import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize; +import org.apache.pdfbox.contentstream.operator.text.SetTextHorizontalScaling; +import org.apache.pdfbox.contentstream.operator.text.ShowTextAdjusted; +import org.apache.pdfbox.contentstream.operator.text.ShowTextLine; +import 
org.apache.pdfbox.contentstream.operator.text.ShowTextLineAndSpace;
+import org.apache.pdfbox.contentstream.operator.text.MoveText;
+import org.apache.pdfbox.contentstream.operator.text.MoveTextSetLeading;
+import org.apache.pdfbox.contentstream.operator.text.NextLine;
+import org.apache.pdfbox.contentstream.operator.text.SetCharSpacing;
+import org.apache.pdfbox.contentstream.operator.text.SetTextLeading;
+import org.apache.pdfbox.contentstream.operator.text.SetTextRenderingMode;
+import org.apache.pdfbox.contentstream.operator.text.SetTextRise;
+import org.apache.pdfbox.contentstream.operator.text.SetWordSpacing;
+import org.apache.pdfbox.contentstream.operator.text.ShowText;
+import org.apache.pdfbox.cos.COSDictionary;
+import org.apache.pdfbox.pdmodel.font.PDFontDescriptor;
+
+/**
+ * LEGACY text calculations which are known to be incorrect but are depended on by PDFTextStripper.
+ *
+ * This class exists only so that we don't break the code of users who have their own subclasses of
+ * PDFTextStripper. It replaces the mostly empty implementation of showGlyph() in PDFStreamEngine
+ * with a heuristic implementation which is backwards compatible.
+ *
+ * DO NOT USE THIS CODE UNLESS YOU ARE WORKING WITH PDFTextStripper.
+ * THIS CODE IS DELIBERATELY INCORRECT, USE PDFStreamEngine INSTEAD.
+ */
+@SuppressWarnings({"PMD", "checkstyle:all"})
+class LegacyPDFStreamEngine extends PDFStreamEngine
+{
+    private static final Log LOG = LogFactory.getLog(LegacyPDFStreamEngine.class);
+
+    // rotation and crop box of the page currently being processed, set in processPage()
+    private int pageRotation;
+    private PDRectangle pageSize;
+    // translation that moves the crop box origin to (0, 0); null when the crop box already starts there
+    private Matrix translateMatrix;
+    // Adobe glyph list extended by PDFBox's additional.txt, used for the Unicode lookup in showGlyph()
+    private final GlyphList glyphList;
+    // cache of computeFontHeight() results, keyed by the font's COS dictionary (weakly referenced)
+    private final Map<COSDictionary, Float> fontHeightMap = new WeakHashMap<COSDictionary, Float>();
+
+    /**
+     * Constructor. Registers the content stream operators needed for text extraction and loads
+     * the additional glyph list.
+     *
+     * @throws IOException if the additional glyph list resource is missing or cannot be read.
+     */
+    LegacyPDFStreamEngine() throws IOException
+    {
+        addOperator(new BeginText());
+        addOperator(new Concatenate());
+        addOperator(new DrawObject()); // special text version
+        addOperator(new EndText());
+        addOperator(new SetGraphicsStateParameters());
+        addOperator(new Save());
+        addOperator(new Restore());
+        addOperator(new NextLine());
+        addOperator(new SetCharSpacing());
+        addOperator(new MoveText());
+        addOperator(new MoveTextSetLeading());
+        addOperator(new SetFontAndSize());
+        addOperator(new ShowText());
+        addOperator(new ShowTextAdjusted());
+        addOperator(new SetTextLeading());
+        addOperator(new SetMatrix());
+        addOperator(new SetTextRenderingMode());
+        addOperator(new SetTextRise());
+        addOperator(new SetWordSpacing());
+        addOperator(new SetTextHorizontalScaling());
+        addOperator(new ShowTextLine());
+        addOperator(new ShowTextLineAndSpace());
+
+        // load additional glyph list for Unicode mapping
+        String path = "/org/apache/pdfbox/resources/glyphlist/additional.txt";
+        InputStream input = GlyphList.class.getResourceAsStream(path);
+        if (input == null)
+        {
+            // getResourceAsStream() signals a missing resource with null; fail with a clear message
+            throw new IOException("Additional glyph list not found on the classpath: " + path);
+        }
+        try
+        {
+            glyphList = new GlyphList(GlyphList.getAdobeGlyphList(), input);
+        }
+        finally
+        {
+            // do not rely on the callee to release the stream
+            input.close();
+        }
+    }
+
+    /**
+     * This will initialize and process the contents of the stream.
+     *
+     * Remembers the page rotation and crop box and prepares the crop box translation before
+     * delegating to the superclass.
+     *
+     * @param page the page to process
+     * @throws java.io.IOException if there is an error accessing the stream.
+     */
+    @Override
+    public void processPage(PDPage page) throws IOException
+    {
+        this.pageRotation = page.getRotation();
+        this.pageSize = page.getCropBox();
+
+        if (pageSize.getLowerLeftX() == 0 && pageSize.getLowerLeftY() == 0)
+        {
+            translateMatrix = null;
+        }
+        else
+        {
+            // translation matrix for cropbox
+            translateMatrix = Matrix.getTranslateInstance(-pageSize.getLowerLeftX(), -pageSize.getLowerLeftY());
+        }
+        super.processPage(page);
+    }
+
+    /**
+     * Called when a glyph is to be processed. The heuristic calculations here were originally
+     * written by Ben Litchfield for PDFStreamEngine.
+     *
+     * Builds a {@link TextPosition} for the glyph and hands it to
+     * {@link #processTextPosition(TextPosition)}. Glyphs of composite fonts without a Unicode
+     * mapping are skipped.
+     *
+     * @param textRenderingMatrix the text rendering matrix of the glyph
+     * @param font the font the glyph is drawn with
+     * @param code the character code of the glyph
+     * @param unicode not read by this implementation; the mapping is looked up again with the
+     *                additional glyph list
+     * @param displacement the displacement vector of the glyph
+     * @throws IOException if the font data cannot be read
+     */
+    @Override
+    protected void showGlyph(Matrix textRenderingMatrix, PDFont font, int code,
+                             String unicode,
+                             Vector displacement)
+            throws IOException
+    {
+        //
+        // legacy calculations which were previously in PDFStreamEngine
+        //
+        // DO NOT USE THIS CODE UNLESS YOU ARE WORKING WITH PDFTextStripper.
+        // THIS CODE IS DELIBERATELY INCORRECT
+        //
+
+        PDGraphicsState state = getGraphicsState();
+        Matrix ctm = state.getCurrentTransformationMatrix();
+        float fontSize = state.getTextState().getFontSize();
+        float horizontalScaling = state.getTextState().getHorizontalScaling() / 100f;
+        Matrix textMatrix = getTextMatrix();
+
+        float displacementX = displacement.getX();
+        // the sorting algorithm is based on the width of the character. As the displacement
+        // for vertical characters doesn't provide any suitable value for it, we have to
+        // calculate our own
+        if (font.isVertical())
+        {
+            displacementX = font.getWidth(code) / 1000;
+            // there may be an additional scaling factor for true type fonts
+            TrueTypeFont ttf = null;
+            if (font instanceof PDTrueTypeFont)
+            {
+                ttf = ((PDTrueTypeFont)font).getTrueTypeFont();
+            }
+            else if (font instanceof PDType0Font)
+            {
+                PDCIDFont cidFont = ((PDType0Font)font).getDescendantFont();
+                if (cidFont instanceof PDCIDFontType2)
+                {
+                    ttf = ((PDCIDFontType2)cidFont).getTrueTypeFont();
+                }
+            }
+            if (ttf != null && ttf.getUnitsPerEm() != 1000)
+            {
+                displacementX *= 1000f / ttf.getUnitsPerEm();
+            }
+        }
+
+        //
+        // legacy calculations which were previously in PDFStreamEngine
+        //
+        // DO NOT USE THIS CODE UNLESS YOU ARE WORKING WITH PDFTextStripper.
+        // THIS CODE IS DELIBERATELY INCORRECT
+        //
+
+        // (modified) combined displacement, this is calculated *without* taking the character
+        // spacing and word spacing into account, due to legacy code in TextStripper
+        float tx = displacementX * fontSize * horizontalScaling;
+        float ty = displacement.getY() * fontSize;
+
+        // (modified) combined displacement matrix
+        Matrix td = Matrix.getTranslateInstance(tx, ty);
+
+        // (modified) text rendering matrix
+        Matrix nextTextRenderingMatrix = td.multiply(textMatrix).multiply(ctm); // text space -> device space
+        float nextX = nextTextRenderingMatrix.getTranslateX();
+        float nextY = nextTextRenderingMatrix.getTranslateY();
+
+        // (modified) width and height calculations
+        float dxDisplay = nextX - textRenderingMatrix.getTranslateX();
+        Float fontHeight = fontHeightMap.get(font.getCOSObject());
+        if (fontHeight == null)
+        {
+            fontHeight = computeFontHeight(font);
+            fontHeightMap.put(font.getCOSObject(), fontHeight);
+        }
+        float dyDisplay = fontHeight * textRenderingMatrix.getScalingFactorY();
+
+        //
+        // start of the original method
+        //
+
+        // Note on variable names. There are three different units being used in this code.
+        // Character sizes are given in glyph units, text locations are initially given in text
+        // units, and we want to save the data in display units. The variable names should end with
+        // Text or Disp to represent if the values are in text or disp units (no glyph units are
+        // saved).
+
+        float glyphSpaceToTextSpaceFactor = 1 / 1000f;
+        if (font instanceof PDType3Font)
+        {
+            glyphSpaceToTextSpaceFactor = font.getFontMatrix().getScaleX();
+        }
+
+        float spaceWidthText = 0;
+        try
+        {
+            // to avoid crash as described in PDFBOX-614, see what the space displacement should be
+            spaceWidthText = font.getSpaceWidth() * glyphSpaceToTextSpaceFactor;
+        }
+        catch (Throwable exception)
+        {
+            LOG.warn(exception, exception);
+        }
+
+        if (spaceWidthText == 0)
+        {
+            spaceWidthText = font.getAverageFontWidth() * glyphSpaceToTextSpaceFactor;
+            // the average space width appears to be higher than necessary so make it smaller
+            spaceWidthText *= .80f;
+        }
+        if (spaceWidthText == 0)
+        {
+            spaceWidthText = 1.0f; // if could not find font, use a generic value
+        }
+
+        // the space width has to be transformed into display units
+        float spaceWidthDisplay = spaceWidthText * textRenderingMatrix.getScalingFactorX();
+
+        // use our additional glyph list for Unicode mapping
+        String unicodeMapping = font.toUnicode(code, glyphList);
+
+        // when there is no Unicode mapping available, Acrobat simply coerces the character code
+        // into Unicode, so we do the same. Subclasses of PDFStreamEngine don't necessarily want
+        // this, which is why we leave it until this point in PDFTextStreamEngine.
+        if (unicodeMapping == null)
+        {
+            if (font instanceof PDSimpleFont)
+            {
+                char c = (char) code;
+                unicodeMapping = new String(new char[] { c });
+            }
+            else
+            {
+                // Acrobat doesn't seem to coerce composite font's character codes, instead it
+                // skips them. See the "allah2.pdf" TestTextStripper file.
+                return;
+            }
+        }
+
+        // adjust for cropbox if needed
+        Matrix translatedTextRenderingMatrix;
+        if (translateMatrix == null)
+        {
+            translatedTextRenderingMatrix = textRenderingMatrix;
+        }
+        else
+        {
+            translatedTextRenderingMatrix = Matrix.concatenate(translateMatrix, textRenderingMatrix);
+            nextX -= pageSize.getLowerLeftX();
+            nextY -= pageSize.getLowerLeftY();
+        }
+
+        processTextPosition(new TextPosition(pageRotation, pageSize.getWidth(),
+                pageSize.getHeight(), translatedTextRenderingMatrix, nextX, nextY,
+                Math.abs(dyDisplay), dxDisplay,
+                Math.abs(spaceWidthDisplay), unicodeMapping, new int[] { code }, font,
+                fontSize,
+                (int)(fontSize * textMatrix.getScalingFactorX())));
+    }
+
+    /**
+     * Compute the font height. Override this if you want to use own calculations.
+     *
+     * @param font the font.
+     * @return the font height.
+     *
+     * @throws IOException if there is an error while getting the font bounding box.
+     */
+    protected float computeFontHeight(PDFont font) throws IOException
+    {
+        BoundingBox bbox = font.getBoundingBox();
+        if (bbox.getLowerLeftY() < Short.MIN_VALUE)
+        {
+            // PDFBOX-2158 and PDFBOX-3130
+            // files by Salmat eSolutions / ClibPDF Library
+            bbox.setLowerLeftY(- (bbox.getLowerLeftY() + 65536));
+        }
+        // 1/2 the bbox is used as the height todo: why?
+        float glyphHeight = bbox.getHeight() / 2;
+
+        // sometimes the bbox has very high values, but CapHeight is OK
+        PDFontDescriptor fontDescriptor = font.getFontDescriptor();
+        if (fontDescriptor != null)
+        {
+            float capHeight = fontDescriptor.getCapHeight();
+            if (Float.compare(capHeight, 0) != 0 &&
+                (capHeight < glyphHeight || Float.compare(glyphHeight, 0) == 0))
+            {
+                glyphHeight = capHeight;
+            }
+            // PDFBOX-3464, PDFBOX-4480, PDFBOX-4553:
+            // sometimes even CapHeight has very high value, but Ascent and Descent are ok
+            float ascent = fontDescriptor.getAscent();
+            float descent = fontDescriptor.getDescent();
+            if (capHeight > ascent && ascent > 0 && descent < 0 &&
+                ((ascent - descent) / 2 < glyphHeight || Float.compare(glyphHeight, 0) == 0))
+            {
+                glyphHeight = (ascent - descent) / 2;
+            }
+        }
+
+        // transformPoint from glyph space -> text space
+        float height;
+        if (font instanceof PDType3Font)
+        {
+            height = font.getFontMatrix().transformPoint(0, glyphHeight).y;
+        }
+        else
+        {
+            height = glyphHeight / 1000;
+        }
+
+        return height;
+    }
+
+    /**
+     * A method provided as an event interface to allow a subclass to perform some specific
+     * functionality when text needs to be processed.
+     *
+     * @param text The text to be processed.
+     */
+    protected void processTextPosition(TextPosition text)
+    {
+        // subclasses can override to provide specific functionality
+    }
+}
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFLinesTextStripper.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFLinesTextStripper.java index 95fdad6a..6e46257d 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFLinesTextStripper.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFLinesTextStripper.java @@ -36,7 +36,6 @@ import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.graphics.PDXObject; import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; -import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.TextPosition; import org.apache.pdfbox.util.Matrix; @@ -125,11 +124,9 @@ public class PDFLinesTextStripper extends PDFTextStripper { // The direction of vertical lines must always be from bottom to top for the table extraction algorithm.
if (pos.getY() > path_y) { - graphicsPath.add(new Ruling(new Point2D.Float(path_x, path_y), new Point2D.Float((float) pos.getX(), (float) pos - .getY()))); + graphicsPath.add(new Ruling(new Point2D.Float(path_x, path_y), new Point2D.Float((float) pos.getX(), (float) pos.getY()))); } else { - graphicsPath.add(new Ruling(new Point2D.Float(path_x, (float) pos.getY()), new Point2D.Float((float) pos - .getX(), path_y))); + graphicsPath.add(new Ruling(new Point2D.Float(path_x, (float) pos.getY()), new Point2D.Float((float) pos.getX(), path_y))); } path_x = (float) pos.getX(); @@ -150,25 +147,19 @@ public class PDFLinesTextStripper extends PDFTextStripper { Point2D p2 = transformPosition(x + width, y + height); // Horizontal lines - graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p1.getY()), new Point2D.Float((float) p2 - .getX(), (float) p1.getY()))); - graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p2.getY()), new Point2D.Float((float) p2 - .getX(), (float) p2.getY()))); + graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p1.getY()), new Point2D.Float((float) p2.getX(), (float) p1.getY()))); + graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p2.getY()), new Point2D.Float((float) p2.getX(), (float) p2.getY()))); // Vertical lines, direction must always be from bottom to top for the table extraction algorithm. 
if (p2.getY() > p1.getY()) { - graphicsPath.add(new Ruling(new Point2D.Float((float) p2.getX(), (float) p1.getY()), new Point2D.Float((float) p2 - .getX(), (float) p2.getY()))); + graphicsPath.add(new Ruling(new Point2D.Float((float) p2.getX(), (float) p1.getY()), new Point2D.Float((float) p2.getX(), (float) p2.getY()))); } else { - graphicsPath.add(new Ruling(new Point2D.Float((float) p2.getX(), (float) p2.getY()), new Point2D.Float((float) p2 - .getX(), (float) p1.getY()))); + graphicsPath.add(new Ruling(new Point2D.Float((float) p2.getX(), (float) p2.getY()), new Point2D.Float((float) p2.getX(), (float) p1.getY()))); } if (p2.getY() > p1.getY()) { - graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p1.getY()), new Point2D.Float((float) p1 - .getX(), (float) p2.getY()))); + graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p1.getY()), new Point2D.Float((float) p1.getX(), (float) p2.getY()))); } else { - graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p2.getY()), new Point2D.Float((float) p1 - .getX(), (float) p1.getY()))); + graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p2.getY()), new Point2D.Float((float) p1.getX(), (float) p1.getY()))); } } break; @@ -211,8 +202,7 @@ public class PDFLinesTextStripper extends PDFTextStripper { PDImageXObject image = (PDImageXObject) xobject; Matrix ctmNew = getGraphicsState().getCurrentTransformationMatrix(); - Rectangle2D rect = new Rectangle2D.Float(ctmNew.getTranslateX(), ctmNew.getTranslateY(), ctmNew.getScaleX(), ctmNew - .getScaleY()); + Rectangle2D rect = new Rectangle2D.Float(ctmNew.getTranslateX(), ctmNew.getTranslateY(), ctmNew.getScaleX(), ctmNew.getScaleY()); // Memory Hack - sofReference kills me FieldUtils.writeField(image, "cachedImageSubsampling", -1, true); @@ -295,9 +285,7 @@ public class PDFLinesTextStripper extends PDFTextStripper { if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || 
textPositions.get(i) .getUnicode() - .equals("\u00A0") || textPositions.get(i) - .getUnicode() - .equals("\t"))) { + .equals("\u00A0") || textPositions.get(i).getUnicode().equals("\t"))) { startIndex++; continue; } @@ -307,7 +295,9 @@ public class PDFLinesTextStripper extends PDFTextStripper { List sublist = textPositions.subList(startIndex, i); if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0) .getUnicode() - .equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0).getUnicode().equals("\t")))) { + .equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0) + .getUnicode() + .equals("\t")))) { textPositionSequences.add(new TextPositionSequence(sublist, pageNumber)); } startIndex = i; @@ -318,7 +308,9 @@ public class PDFLinesTextStripper extends PDFTextStripper { List sublist = textPositions.subList(startIndex, i); if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0) .getUnicode() - .equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0).getUnicode().equals("\t")))) { + .equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0) + .getUnicode() + .equals("\t")))) { textPositionSequences.add(new TextPositionSequence(sublist, pageNumber)); } startIndex = i; @@ -332,7 +324,9 @@ public class PDFLinesTextStripper extends PDFTextStripper { List sublist = textPositions.subList(startIndex, i); if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0) .getUnicode() - .equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0).getUnicode().equals("\t")))) { + .equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0) + .getUnicode() + .equals("\t")))) { // Remove false sequence ends (whitespaces) if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0) @@ -351,15 +345,15 @@ public class PDFLinesTextStripper extends PDFTextStripper { List sublist = textPositions.subList(startIndex, 
textPositions.size()); if (!sublist.isEmpty() && (sublist.get(sublist.size() - 1) .getUnicode() - .equals(" ") || sublist.get(sublist.size() - 1).getUnicode().equals("\u00A0") || sublist.get(sublist.size() - 1).getUnicode().equals("\t"))) { + .equals(" ") || sublist.get(sublist.size() - 1) + .getUnicode() + .equals("\u00A0") || sublist.get(sublist.size() - 1).getUnicode().equals("\t"))) { sublist = sublist.subList(0, sublist.size() - 1); } if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0) .getUnicode() - .equals("\u00A0") || sublist.get(0) - .getUnicode() - .equals("\t")))) { + .equals("\u00A0") || sublist.get(0).getUnicode().equals("\t")))) { if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0) .getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) { for (TextPosition t : sublist) { diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFTextStripper.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFTextStripper.java new file mode 100644 index 00000000..d6430715 --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFTextStripper.java @@ -0,0 +1,2182 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.iqser.red.service.redaction.v1.server.parsing; + +import java.io.BufferedInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.LineNumberReader; +import java.io.StringWriter; +import java.io.Writer; +import java.text.Bidi; +import java.text.Normalizer; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.SortedMap; +import java.util.SortedSet; +import java.util.StringTokenizer; +import java.util.TreeMap; +import java.util.TreeSet; +import java.util.regex.Pattern; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.PDPageTree; +import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem; +import org.apache.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead; +import org.apache.pdfbox.text.TextPosition; +import org.apache.pdfbox.text.TextPositionComparator; +import org.apache.pdfbox.util.QuickSort; + +/** + * This is just a copy except i only adjusted lines 594-607 cause this is a bug in Pdfbox. 
+ * see S416.pdf
+ * */
+@SuppressWarnings({"PMD", "checkstyle:all"})
+public class PDFTextStripper extends LegacyPDFStreamEngine
+{
+
+    private static float defaultIndentThreshold = 2.0f;
+    private static float defaultDropThreshold = 2.5f;
+    private static final boolean useCustomQuickSort;
+
+    private static final Log LOG = LogFactory.getLog(PDFTextStripper.class);
+
+    // enable the ability to set the default indent/drop thresholds
+    // with -D system properties:
+    // pdftextstripper.indent
+    // pdftextstripper.drop
+    static
+    {
+        String strDrop = null, strIndent = null;
+        try
+        {
+            String className = PDFTextStripper.class.getSimpleName().toLowerCase();
+            String prop = className + ".indent";
+            strIndent = System.getProperty(prop);
+            prop = className + ".drop";
+            strDrop = System.getProperty(prop);
+        }
+        catch (SecurityException e)
+        {
+            // PDFBOX-1946 when run in an applet
+            // ignore and use default
+        }
+        if (strIndent != null && strIndent.length() > 0)
+        {
+            try
+            {
+                defaultIndentThreshold = Float.parseFloat(strIndent);
+            }
+            catch (NumberFormatException nfe)
+            {
+                // ignore and use default
+            }
+        }
+        if (strDrop != null && strDrop.length() > 0)
+        {
+            try
+            {
+                defaultDropThreshold = Float.parseFloat(strDrop);
+            }
+            catch (NumberFormatException nfe)
+            {
+                // ignore and use default
+            }
+        }
+    }
+
+    static
+    {
+        // check if we need to use the custom quicksort algorithm as a
+        // workaround to the PDFBOX-1512 transitivity issue of TextPositionComparator:
+        boolean is16orLess = false;
+        try
+        {
+            String version = System.getProperty("java.specification.version");
+            StringTokenizer st = new StringTokenizer(version, ".");
+            int majorVersion = Integer.parseInt(st.nextToken());
+            int minorVersion = 0;
+            if (st.hasMoreTokens())
+            {
+                minorVersion = Integer.parseInt(st.nextToken());
+            }
+            is16orLess = majorVersion == 1 && minorVersion <= 6;
+        }
+        catch (SecurityException x)
+        {
+            // when run in an applet ignore and use default
+            // assume 1.7 or higher so that quicksort is used
+        }
+        catch (NumberFormatException nfe)
+        {
+            // should never happen, but if it does,
+            // assume 1.7 or higher so that quicksort is used
+        }
+        useCustomQuickSort = !is16orLess;
+    }
+
+    /**
+     * The platform's line separator.
+     */
+    protected final String LINE_SEPARATOR = System.getProperty("line.separator");
+
+    // separators written around lines, words, paragraphs, pages and articles
+    private String lineSeparator = LINE_SEPARATOR;
+    private String wordSeparator = " ";
+    private String paragraphStart = "";
+    private String paragraphEnd = "";
+    private String pageStart = "";
+    private String pageEnd = LINE_SEPARATOR;
+    private String articleStart = "";
+    private String articleEnd = "";
+
+    // page counter (incremented in processPages()) and the page range to extract
+    private int currentPageNo = 0;
+    private int startPage = 1;
+    private int endPage = Integer.MAX_VALUE;
+    private PDOutlineItem startBookmark = null;
+
+    // 1-based bookmark pages
+    private int startBookmarkPageNumber = -1;
+    private int endBookmarkPageNumber = -1;
+
+    private PDOutlineItem endBookmark = null;
+    private boolean suppressDuplicateOverlappingText = true;
+    private boolean shouldSeparateByBeads = true;
+    private boolean sortByPosition = false;
+    private boolean addMoreFormatting = false;
+
+    private float indentThreshold = defaultIndentThreshold;
+    private float dropThreshold = defaultDropThreshold;
+
+    // we will need to estimate where to add spaces, these are used to help guess
+    private float spacingTolerance = .5f;
+    private float averageCharTolerance = .3f;
+
+    // bead rectangles of the current page; rebuilt by fillBeadRectangles(), may contain null entries
+    private List<PDRectangle> beadRectangles = null;
+
+    /**
+     * The charactersByArticle is used to extract text by article divisions. For example a PDF that has two columns like
+     * a newspaper, we want to extract the first column and then the second column. In this example the PDF would have 2
+     * beads(or articles), one for each column. The size of the charactersByArticle would be 5, because not all text on
+     * the screen will fall into one of the articles. The five divisions are shown below
+     *
+     * Text before first article
+     * first article text
+     * text between first article and second article
+     * second article text
+     * text after second article
+     *
+     * Most PDFs won't have any beads, so charactersByArticle will contain a single entry.
+     */
+    protected ArrayList<List<TextPosition>> charactersByArticle = new ArrayList<List<TextPosition>>();
+
+    // NOTE(review): type arguments restored from the upstream PDFBox 2.0 declaration; the map is
+    // cleared for every page in processPage() and on resetEngine()
+    private Map<String, TreeMap<Float, TreeSet<Float>>> characterListMapping = new HashMap<String, TreeMap<Float, TreeSet<Float>>>();
+
+    protected PDDocument document;
+    protected Writer output;
+
+    /**
+     * True if we started a paragraph but haven't ended it yet.
+     */
+    private boolean inParagraph;
+
+    /**
+     * Instantiate a new PDFTextStripper object.
+     *
+     * @throws IOException If there is an error loading the properties.
+     */
+    public PDFTextStripper() throws IOException
+    {
+    }
+
+    /**
+     * This will return the text of a document. See writeText. <br>
+     * NOTE: The document must not be encrypted when coming into this method.
+     *
+     * <p>IMPORTANT: By default, text is extracted in the same sequence in which it appears in the
+     * PDF page content stream. PDF is a graphic format, not a text format, and unlike HTML, it does
+     * not require the text on a page to be rendered in any particular order. The order is the one
+     * chosen by the software that created the PDF. To get text sorted from left to right and top to
+     * bottom, use {@link #setSortByPosition(boolean)}.
+     *
+     * @param doc The document to get the text from.
+     * @return The text of the PDF document.
+     * @throws IOException if the doc state is invalid or it is encrypted.
+     */
+    public String getText(PDDocument doc) throws IOException
+    {
+        StringWriter buffer = new StringWriter();
+        writeText(doc, buffer);
+        return buffer.toString();
+    }
+
+    /**
+     * Puts the engine back into its initial state: page counter at zero, no document, and no
+     * collected characters.
+     */
+    private void resetEngine()
+    {
+        currentPageNo = 0;
+        document = null;
+        if (null != charactersByArticle)
+        {
+            charactersByArticle.clear();
+        }
+        characterListMapping.clear();
+    }
+
+    /**
+     * Extracts the text of the given document and writes it to the given writer.
+     *
+     * @param doc The document to get the data from.
+     * @param outputStream The location to put the text.
+     *
+     * @throws IOException If the doc is in an invalid state.
+     */
+    public void writeText(PDDocument doc, Writer outputStream) throws IOException
+    {
+        resetEngine();
+        document = doc;
+        output = outputStream;
+
+        boolean moreFormatting = getAddMoreFormatting();
+        if (moreFormatting)
+        {
+            // with extra formatting, paragraphs, pages and articles are delimited by line separators
+            paragraphEnd = lineSeparator;
+            pageStart = lineSeparator;
+            articleStart = lineSeparator;
+            articleEnd = lineSeparator;
+        }
+
+        startDocument(document);
+        processPages(document.getPages());
+        endDocument(document);
+    }
+
+    /**
+     * Resolves the start and end bookmarks to 1-based page numbers and then processes every page
+     * of the tree that has contents.
+     *
+     * @param pages The pages object in the document.
+     *
+     * @throws IOException If there is an error parsing the text.
+     */
+    protected void processPages(PDPageTree pages) throws IOException
+    {
+        // -1 = undefined
+        int firstBookmarkPageNumber = -1;
+        if (startBookmark != null)
+        {
+            PDPage firstBookmarkPage = startBookmark.findDestinationPage(document);
+            if (firstBookmarkPage != null)
+            {
+                firstBookmarkPageNumber = pages.indexOf(firstBookmarkPage) + 1;
+            }
+        }
+        startBookmarkPageNumber = firstBookmarkPageNumber;
+
+        // -1 = undefined
+        int lastBookmarkPageNumber = -1;
+        if (endBookmark != null)
+        {
+            PDPage lastBookmarkPage = endBookmark.findDestinationPage(document);
+            if (lastBookmarkPage != null)
+            {
+                lastBookmarkPageNumber = pages.indexOf(lastBookmarkPage) + 1;
+            }
+        }
+        endBookmarkPageNumber = lastBookmarkPageNumber;
+
+        boolean startUnresolved = startBookmarkPageNumber == -1 && startBookmark != null;
+        boolean endUnresolved = endBookmarkPageNumber == -1 && endBookmark != null;
+        if (startUnresolved && endUnresolved
+                && startBookmark.getCOSObject() == endBookmark.getCOSObject())
+        {
+            // special case: start and end bookmark are the same object but point to nothing,
+            // so no text is extracted at all
+            startBookmarkPageNumber = 0;
+            endBookmarkPageNumber = 0;
+        }
+
+        for (PDPage page : pages)
+        {
+            currentPageNo++;
+            if (!page.hasContents())
+            {
+                continue;
+            }
+            processPage(page);
+        }
+    }
+
+    /**
+     * Hook for subclasses, invoked by writeText before any page is processed.
+     *
+     * @param document The PDF document that is being processed.
+     * @throws IOException If an IO error occurs.
+     */
+    protected void startDocument(PDDocument document) throws IOException
+    {
+        // intentionally empty; subclasses may override
+    }
+
+    /**
+     * Hook for subclasses, invoked by writeText after all pages have been processed.
+     *
+     * @param document The PDF document that is being processed.
+     * @throws IOException If an IO error occurs.
+     */
+    protected void endDocument(PDDocument document) throws IOException
+    {
+        // intentionally empty; subclasses may override
+    }
+
+    /**
+     * This will process the contents of a page.
+     *
+     * @param page The page to process.
+     *
+     * @throws IOException If there is an error processing the page.
+ */ + @Override + public void processPage(PDPage page) throws IOException + { + if (currentPageNo >= startPage && currentPageNo <= endPage + && (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber) + && (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber)) + { + startPage(page); + + int numberOfArticleSections = 1; + if (shouldSeparateByBeads) + { + fillBeadRectangles(page); + numberOfArticleSections += beadRectangles.size() * 2; + } + int originalSize = charactersByArticle.size(); + charactersByArticle.ensureCapacity(numberOfArticleSections); + int lastIndex = Math.max(numberOfArticleSections, originalSize); + for (int i = 0; i < lastIndex; i++) + { + if (i < originalSize) + { + charactersByArticle.get(i).clear(); + } + else + { + if (numberOfArticleSections < originalSize) + { + charactersByArticle.remove(i); + } + else + { + charactersByArticle.add(new ArrayList()); + } + } + } + characterListMapping.clear(); + super.processPage(page); + writePage(); + endPage(page); + } + } + + private void fillBeadRectangles(PDPage page) + { + beadRectangles = new ArrayList(); + for (PDThreadBead bead : page.getThreadBeads()) + { + if (bead == null || bead.getRectangle() == null) + { + // can't skip, because of null entry handling in processTextPosition() + beadRectangles.add(null); + continue; + } + + PDRectangle rect = bead.getRectangle(); + + // bead rectangle is in PDF coordinates (y=0 is bottom), + // glyphs are in image coordinates (y=0 is top), + // so we must flip + PDRectangle mediaBox = page.getMediaBox(); + float upperRightY = mediaBox.getUpperRightY() - rect.getLowerLeftY(); + float lowerLeftY = mediaBox.getUpperRightY() - rect.getUpperRightY(); + rect.setLowerLeftY(lowerLeftY); + rect.setUpperRightY(upperRightY); + + // adjust for cropbox + PDRectangle cropBox = page.getCropBox(); + if (cropBox.getLowerLeftX() != 0 || cropBox.getLowerLeftY() != 0) + { + rect.setLowerLeftX(rect.getLowerLeftX() - cropBox.getLowerLeftX()); + 
rect.setLowerLeftY(rect.getLowerLeftY() - cropBox.getLowerLeftY()); + rect.setUpperRightX(rect.getUpperRightX() - cropBox.getLowerLeftX()); + rect.setUpperRightY(rect.getUpperRightY() - cropBox.getLowerLeftY()); + } + + beadRectangles.add(rect); + } + } + + /** + * Start a new article, which is typically defined as a column on a single page (also referred to as a bead). This + * assumes that the primary direction of text is left to right. Default implementation is to do nothing. Subclasses + * may provide additional information. + * + * @throws IOException If there is any error writing to the stream. + */ + protected void startArticle() throws IOException + { + startArticle(true); + } + + /** + * Start a new article, which is typically defined as a column on a single page (also referred to as a bead). + * Default implementation is to do nothing. Subclasses may provide additional information. + * + * @param isLTR true if primary direction of text is left to right. + * @throws IOException If there is any error writing to the stream. + */ + protected void startArticle(boolean isLTR) throws IOException + { + output.write(getArticleStart()); + } + + /** + * End an article. Default implementation is to do nothing. Subclasses may provide additional information. + * + * @throws IOException If there is any error writing to the stream. + */ + protected void endArticle() throws IOException + { + output.write(getArticleEnd()); + } + + /** + * Start a new page. Default implementation is to do nothing. Subclasses may provide additional information. + * + * @param page The page we are about to process. + * + * @throws IOException If there is any error writing to the stream. + */ + protected void startPage(PDPage page) throws IOException + { + // default is to do nothing + } + + /** + * End a page. Default implementation is to do nothing. Subclasses may provide additional information. + * + * @param page The page we are about to process. 
+ * + * @throws IOException If there is any error writing to the stream. + */ + protected void endPage(PDPage page) throws IOException + { + // default is to do nothing + } + + private static final float END_OF_LAST_TEXT_X_RESET_VALUE = -1; + private static final float MAX_Y_FOR_LINE_RESET_VALUE = -Float.MAX_VALUE; + private static final float EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE = -Float.MAX_VALUE; + private static final float MAX_HEIGHT_FOR_LINE_RESET_VALUE = -1; + private static final float MIN_Y_TOP_FOR_LINE_RESET_VALUE = Float.MAX_VALUE; + private static final float LAST_WORD_SPACING_RESET_VALUE = -1; + + /** + * This will print the text of the processed page to "output". It will estimate, based on the coordinates of the + * text, where newlines and word spacings should be placed. The text will be sorted only if that feature was + * enabled. + * + * @throws IOException If there is an error writing the text. + */ + protected void writePage() throws IOException + { + float maxYForLine = MAX_Y_FOR_LINE_RESET_VALUE; + float minYTopForLine = MIN_Y_TOP_FOR_LINE_RESET_VALUE; + float endOfLastTextX = END_OF_LAST_TEXT_X_RESET_VALUE; + float lastWordSpacing = LAST_WORD_SPACING_RESET_VALUE; + float maxHeightForLine = MAX_HEIGHT_FOR_LINE_RESET_VALUE; + PositionWrapper lastPosition = null; + PositionWrapper lastLineStartPosition = null; + + boolean startOfPage = true; // flag to indicate start of page + boolean startOfArticle; + if (charactersByArticle.size() > 0) + { + writePageStart(); + } + + for (List textList : charactersByArticle) + { + if (getSortByPosition()) + { + TextPositionComparator comparator = new TextPositionComparator(); + + // because the TextPositionComparator is not transitive, but + // JDK7+ enforces transitivity on comparators, we need to use + // a custom quicksort implementation (which is slower, unfortunately). 
+ if (useCustomQuickSort) + { + QuickSort.sort(textList, comparator); + } + else + { + Collections.sort(textList, comparator); + } + } + + startArticle(); + startOfArticle = true; + + // Now cycle through to print the text. + // We queue up a line at a time before we print so that we can convert + // the line from presentation form to logical form (if needed). + List line = new ArrayList(); + + Iterator textIter = textList.iterator(); + // PDF files don't always store spaces. We will need to guess where we should add + // spaces based on the distances between TextPositions. Historically, this was done + // based on the size of the space character provided by the font. In general, this + // worked but there were cases where it did not work. Calculating the average character + // width and using that as a metric works better in some cases but fails in some cases + // where the spacing worked. So we use both. NOTE: Adobe reader also fails on some of + // these examples. + + // Keeps track of the previous average character width + float previousAveCharWidth = -1; + while (textIter.hasNext()) + { + TextPosition position = textIter.next(); + PositionWrapper current = new PositionWrapper(position); + String characterValue = position.getUnicode(); + + // Resets the average character width when we see a change in font + // or a change in the font size + if (lastPosition != null && (position.getFont() != lastPosition.getTextPosition() + .getFont() + || position.getFontSize() != lastPosition.getTextPosition().getFontSize())) + { + previousAveCharWidth = -1; + } + + float positionX; + float positionY; + float positionWidth; + float positionHeight; + + + // If we are sorting, then we need to use the text direction + // adjusted coordinates, because they were used in the sorting. 
+// if (getSortByPosition()) +// { + positionX = position.getXDirAdj(); + positionY = position.getYDirAdj(); + positionWidth = position.getWidthDirAdj(); + positionHeight = position.getHeightDir(); +// } +// else +// { +// positionX = position.getX(); +// positionY = position.getY(); +// positionWidth = position.getWidth(); +// positionHeight = position.getHeight(); +// } + + // The current amount of characters in a word + int wordCharCount = position.getIndividualWidths().length; + + // Estimate the expected width of the space based on the + // space character with some margin. + float wordSpacing = position.getWidthOfSpace(); + float deltaSpace; + if (wordSpacing == 0 || Float.isNaN(wordSpacing)) + { + deltaSpace = Float.MAX_VALUE; + } + else + { + if (lastWordSpacing < 0) + { + deltaSpace = wordSpacing * getSpacingTolerance(); + } + else + { + deltaSpace = (wordSpacing + lastWordSpacing) / 2f * getSpacingTolerance(); + } + } + + // Estimate the expected width of the space based on the average character width + // with some margin. This calculation does not make a true average (average of + // averages) but we found that it gave the best results after numerous experiments. + // Based on experiments we also found that .3 worked well. + float averageCharWidth; + if (previousAveCharWidth < 0) + { + averageCharWidth = positionWidth / wordCharCount; + } + else + { + averageCharWidth = (previousAveCharWidth + positionWidth / wordCharCount) / 2f; + } + float deltaCharWidth = averageCharWidth * getAverageCharTolerance(); + + // Compares the values obtained by the average method and the wordSpacing method + // and picks the smaller number. 
+ float expectedStartOfNextWordX = EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE; + if (endOfLastTextX != END_OF_LAST_TEXT_X_RESET_VALUE) + { + expectedStartOfNextWordX = endOfLastTextX + Math.min(deltaSpace, deltaCharWidth); + } + + if (lastPosition != null) + { + if (startOfArticle) + { + lastPosition.setArticleStart(); + startOfArticle = false; + } + // RDD - Here we determine whether this text object is on the current + // line. We use the lastBaselineFontSize to handle the superscript + // case, and the size of the current font to handle the subscript case. + // Text must overlap with the last rendered baseline text by at least + // a small amount in order to be considered as being on the same line. + + // XXX BC: In theory, this check should really check if the next char is in + // full range seen in this line. This is what I tried to do with minYTopForLine, + // but this caused a lot of regression test failures. So, I'm leaving it be for + // now + if (!overlap(positionY, positionHeight, maxYForLine, maxHeightForLine)) + { + writeLine(normalize(line)); + line.clear(); + lastLineStartPosition = handleLineSeparation(current, lastPosition, + lastLineStartPosition, maxHeightForLine); + expectedStartOfNextWordX = EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE; + maxYForLine = MAX_Y_FOR_LINE_RESET_VALUE; + maxHeightForLine = MAX_HEIGHT_FOR_LINE_RESET_VALUE; + minYTopForLine = MIN_Y_TOP_FOR_LINE_RESET_VALUE; + } + // test if our TextPosition starts after a new word would be expected to start + if (expectedStartOfNextWordX != EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE + && expectedStartOfNextWordX < positionX + // only bother adding a word separator if the last character was not a word separator + && (wordSeparator.isEmpty() || // + (lastPosition.getTextPosition().getUnicode() != null + && !lastPosition.getTextPosition().getUnicode() + .endsWith(wordSeparator)))) + { + line.add(LineItem.getWordSeparator()); + } + // if there is at least the equivalent of one space + // 
between the last character and the current one, + // reset the max line height as the font size may have completely changed + if (Math.abs(position.getX() + - lastPosition.getTextPosition().getX()) > (wordSpacing + deltaSpace)) + { + maxYForLine = MAX_Y_FOR_LINE_RESET_VALUE; + maxHeightForLine = MAX_HEIGHT_FOR_LINE_RESET_VALUE; + minYTopForLine = MIN_Y_TOP_FOR_LINE_RESET_VALUE; + } + } + if (positionY >= maxYForLine) + { + maxYForLine = positionY; + } + // RDD - endX is what PDF considers to be the x coordinate of the + // end position of the text. We use it in computing our metrics below. + endOfLastTextX = positionX + positionWidth; + + // add it to the list + if (characterValue != null) + { + if (startOfPage && lastPosition == null) + { + writeParagraphStart();// not sure this is correct for RTL? + } + line.add(new LineItem(position)); + } + maxHeightForLine = Math.max(maxHeightForLine, positionHeight); + minYTopForLine = Math.min(minYTopForLine, positionY - positionHeight); + lastPosition = current; + if (startOfPage) + { + lastPosition.setParagraphStart(); + lastPosition.setLineStart(); + lastLineStartPosition = lastPosition; + startOfPage = false; + } + lastWordSpacing = wordSpacing; + previousAveCharWidth = averageCharWidth; + } + // print the final line + if (line.size() > 0) + { + writeLine(normalize(line)); + writeParagraphEnd(); + } + endArticle(); + } + writePageEnd(); + } + + private boolean overlap(float y1, float height1, float y2, float height2) + { + return within(y1, y2, .1f) || y2 <= y1 && y2 >= y1 - height1 + || y1 <= y2 && y1 >= y2 - height2; + } + + /** + * Write the line separator value to the output stream. + * + * @throws IOException If there is a problem writing out the line separator to the document. + */ + protected void writeLineSeparator() throws IOException + { + output.write(getLineSeparator()); + } + + /** + * Write the word separator value to the output stream. 
+ * + * @throws IOException If there is a problem writing out the word separator to the document. + */ + protected void writeWordSeparator() throws IOException + { + output.write(getWordSeparator()); + } + + /** + * Write the string in TextPosition to the output stream. + * + * @param text The text to write to the stream. + * @throws IOException If there is an error when writing the text. + */ + protected void writeCharacters(TextPosition text) throws IOException + { + output.write(text.getUnicode()); + } + + /** + * Write a Java string to the output stream. The default implementation will ignore the textPositions + * and just calls {@link #writeString(String)}. + * + * @param text The text to write to the stream. + * @param textPositions The TextPositions belonging to the text. + * @throws IOException If there is an error when writing the text. + */ + protected void writeString(String text, List textPositions) throws IOException + { + writeString(text); + } + + /** + * Write a Java string to the output stream. + * + * @param text The text to write to the stream. + * @throws IOException If there is an error when writing the text. + */ + protected void writeString(String text) throws IOException + { + output.write(text); + } + + /** + * This will determine of two floating point numbers are within a specified variance. + * + * @param first The first number to compare to. + * @param second The second number to compare to. + * @param variance The allowed variance. + */ + private boolean within(float first, float second, float variance) + { + return second < first + variance && second > first - variance; + } + + /** + * This will process a TextPosition object and add the text to the list of characters on a page. It takes care of + * overlapping text. + * + * @param text The text to process. 
+ */ + @Override + protected void processTextPosition(TextPosition text) + { + boolean showCharacter = true; + if (suppressDuplicateOverlappingText) + { + showCharacter = false; + String textCharacter = text.getUnicode(); + float textX = text.getX(); + float textY = text.getY(); + TreeMap> sameTextCharacters = characterListMapping + .get(textCharacter); + if (sameTextCharacters == null) + { + sameTextCharacters = new TreeMap>(); + characterListMapping.put(textCharacter, sameTextCharacters); + } + // RDD - Here we compute the value that represents the end of the rendered + // text. This value is used to determine whether subsequent text rendered + // on the same line overwrites the current text. + // + // We subtract any positive padding to handle cases where extreme amounts + // of padding are applied, then backed off (not sure why this is done, but there + // are cases where the padding is on the order of 10x the character width, and + // the TJ just backs up to compensate after each character). Also, we subtract + // an amount to allow for kerning (a percentage of the width of the last + // character). 
+ boolean suppressCharacter = false; + float tolerance = text.getWidth() / textCharacter.length() / 3.0f; + + SortedMap> xMatches = sameTextCharacters.subMap(textX - tolerance, + textX + tolerance); + for (TreeSet xMatch : xMatches.values()) + { + SortedSet yMatches = xMatch.subSet(textY - tolerance, textY + tolerance); + if (!yMatches.isEmpty()) + { + suppressCharacter = true; + break; + } + } + if (!suppressCharacter) + { + TreeSet ySet = sameTextCharacters.get(textX); + if (ySet == null) + { + ySet = new TreeSet(); + sameTextCharacters.put(textX, ySet); + } + ySet.add(textY); + showCharacter = true; + } + } + if (showCharacter) + { + // if we are showing the character then we need to determine which article it belongs to + int foundArticleDivisionIndex = -1; + int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1; + int notFoundButFirstLeftArticleDivisionIndex = -1; + int notFoundButFirstAboveArticleDivisionIndex = -1; + float x = text.getX(); + float y = text.getY(); + if (shouldSeparateByBeads) + { + for (int i = 0; i < beadRectangles.size() && foundArticleDivisionIndex == -1; i++) + { + PDRectangle rect = beadRectangles.get(i); + if (rect != null) + { + if (rect.contains(x, y)) + { + foundArticleDivisionIndex = i * 2 + 1; + } + else if ((x < rect.getLowerLeftX() || y < rect.getUpperRightY()) + && notFoundButFirstLeftAndAboveArticleDivisionIndex == -1) + { + notFoundButFirstLeftAndAboveArticleDivisionIndex = i * 2; + } + else if (x < rect.getLowerLeftX() + && notFoundButFirstLeftArticleDivisionIndex == -1) + { + notFoundButFirstLeftArticleDivisionIndex = i * 2; + } + else if (y < rect.getUpperRightY() + && notFoundButFirstAboveArticleDivisionIndex == -1) + { + notFoundButFirstAboveArticleDivisionIndex = i * 2; + } + } + else + { + foundArticleDivisionIndex = 0; + } + } + } + else + { + foundArticleDivisionIndex = 0; + } + int articleDivisionIndex; + if (foundArticleDivisionIndex != -1) + { + articleDivisionIndex = foundArticleDivisionIndex; + } + else if 
(notFoundButFirstLeftAndAboveArticleDivisionIndex != -1) + { + articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex; + } + else if (notFoundButFirstLeftArticleDivisionIndex != -1) + { + articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex; + } + else if (notFoundButFirstAboveArticleDivisionIndex != -1) + { + articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex; + } + else + { + articleDivisionIndex = charactersByArticle.size() - 1; + } + + List textList = charactersByArticle.get(articleDivisionIndex); + + // In the wild, some PDF encoded documents put diacritics (accents on + // top of characters) into a separate Tj element. When displaying them + // graphically, the two chunks get overlaid. With text output though, + // we need to do the overlay. This code recombines the diacritic with + // its associated character if the two are consecutive. + if (textList.isEmpty()) + { + textList.add(text); + } + else + { + // test if we overlap the previous entry. + // Note that we are making an assumption that we need to only look back + // one TextPosition to find what we are overlapping. + // This may not always be true. */ + TextPosition previousTextPosition = textList.get(textList.size() - 1); + if (text.isDiacritic() && previousTextPosition.contains(text)) + { + previousTextPosition.mergeDiacritic(text); + } + // If the previous TextPosition was the diacritic, merge it into this + // one and remove it from the list. + else if (previousTextPosition.isDiacritic() && text.contains(previousTextPosition)) + { + text.mergeDiacritic(previousTextPosition); + textList.remove(textList.size() - 1); + textList.add(text); + } + else + { + textList.add(text); + } + } + } + } + + /** + * This is the page that the text extraction will start on. The pages start at page 1. For example in a 5 page PDF + * document, if the start page is 1 then all pages will be extracted. If the start page is 4 then pages 4 and 5 will + * be extracted. 
The default value is 1. + * + * @return Value of property startPage. + */ + public int getStartPage() + { + return startPage; + } + + /** + * This will set the first page to be extracted by this class. + * + * @param startPageValue New value of 1-based startPage property. + */ + public void setStartPage(int startPageValue) + { + startPage = startPageValue; + } + + /** + * This will get the last page that will be extracted. This is inclusive, for example if a 5 page PDF an endPage + * value of 5 would extract the entire document, an end page of 2 would extract pages 1 and 2. This defaults to + * Integer.MAX_VALUE such that all pages of the pdf will be extracted. + * + * @return Value of property endPage. + */ + public int getEndPage() + { + return endPage; + } + + /** + * This will set the last page to be extracted by this class. + * + * @param endPageValue New value of 1-based endPage property. + */ + public void setEndPage(int endPageValue) + { + endPage = endPageValue; + } + + /** + * Set the desired line separator for output text. The line.separator system property is used if the line separator + * preference is not set explicitly using this method. + * + * @param separator The desired line separator string. + */ + public void setLineSeparator(String separator) + { + lineSeparator = separator; + } + + /** + * This will get the line separator. + * + * @return The desired line separator string. + */ + public String getLineSeparator() + { + return lineSeparator; + } + + /** + * This will get the word separator. + * + * @return The desired word separator string. + */ + public String getWordSeparator() + { + return wordSeparator; + } + + /** + * Set the desired word separator for output text. The PDFBox text extraction algorithm will output a space + * character if there is enough space between two words. By default a space character is used. 
If you need an + * accurate count of characters that are found in a PDF document then you might want to set the word separator to + * the empty string. + * + * @param separator The desired word separator string. + */ + public void setWordSeparator(String separator) + { + wordSeparator = separator; + } + + /** + * @return Returns the suppressDuplicateOverlappingText. + */ + public boolean getSuppressDuplicateOverlappingText() + { + return suppressDuplicateOverlappingText; + } + + /** + * Get the current page number that is being processed. + * + * @return A 1 based number representing the current page. + */ + protected int getCurrentPageNo() + { + return currentPageNo; + } + + /** + * The output stream that is being written to. + * + * @return The stream that output is being written to. + */ + protected Writer getOutput() + { + return output; + } + + /** + * Character strings are grouped by articles. It is quite common that there will only be a single article. This + * returns a List that contains List objects, the inner lists will contain TextPosition objects. + * + * @return A double List of TextPositions for all text strings on the page. + */ + protected List> getCharactersByArticle() + { + return charactersByArticle; + } + + /** + * By default the text stripper will attempt to remove text that overlaps each other. Word paints the same + * character several times in order to make it look bold. By setting this to false all text will be extracted, which + * means that certain sections will be duplicated, but better performance will be noticed. + * + * @param suppressDuplicateOverlappingTextValue The suppressDuplicateOverlappingText to set. + */ + public void setSuppressDuplicateOverlappingText(boolean suppressDuplicateOverlappingTextValue) + { + suppressDuplicateOverlappingText = suppressDuplicateOverlappingTextValue; + } + + /** + * This will tell if the text stripper should separate by beads. + * + * @return If the text will be grouped by beads. 
+ */ + public boolean getSeparateByBeads() + { + return shouldSeparateByBeads; + } + + /** + * Set if the text stripper should group the text output by a list of beads. The default value is true! + * + * @param aShouldSeparateByBeads The new grouping of beads. + */ + public void setShouldSeparateByBeads(boolean aShouldSeparateByBeads) + { + shouldSeparateByBeads = aShouldSeparateByBeads; + } + + /** + * Get the bookmark where text extraction should end, inclusive. Default is null. + * + * @return The ending bookmark. + */ + public PDOutlineItem getEndBookmark() + { + return endBookmark; + } + + /** + * Set the bookmark where the text extraction should stop. + * + * @param aEndBookmark The ending bookmark. + */ + public void setEndBookmark(PDOutlineItem aEndBookmark) + { + endBookmark = aEndBookmark; + } + + /** + * Get the bookmark where text extraction should start, inclusive. Default is null. + * + * @return The starting bookmark. + */ + public PDOutlineItem getStartBookmark() + { + return startBookmark; + } + + /** + * Set the bookmark where text extraction should start, inclusive. + * + * @param aStartBookmark The starting bookmark. + */ + public void setStartBookmark(PDOutlineItem aStartBookmark) + { + startBookmark = aStartBookmark; + } + + /** + * This will tell if the text stripper should add some more text formatting. + * + * @return true if some more text formatting will be added + */ + public boolean getAddMoreFormatting() + { + return addMoreFormatting; + } + + /** + * There will some additional text formatting be added if addMoreFormatting is set to true. Default is false. + * + * @param newAddMoreFormatting Tell PDFBox to add some more text formatting + */ + public void setAddMoreFormatting(boolean newAddMoreFormatting) + { + addMoreFormatting = newAddMoreFormatting; + } + + /** + * This will tell if the text stripper should sort the text tokens before writing to the stream. + * + * @return true If the text tokens will be sorted before being written. 
+ */ + public boolean getSortByPosition() + { + return sortByPosition; + } + + /** + * The order of the text tokens in a PDF file may not be in the same order as they appear visually on the screen. For + * example, a PDF writer may write out all text by font, so all bold or larger text, then make a second pass and + * write out the normal text.
+ * The default is to not sort by position.
+ *
+ * A PDF writer could choose to write each character in a different order. By default PDFBox does not sort + * the text tokens before processing them due to performance reasons. + * + * @param newSortByPosition Tell PDFBox to sort the text positions. + */ + public void setSortByPosition(boolean newSortByPosition) + { + sortByPosition = newSortByPosition; + } + + /** + * Get the current space width-based tolerance value that is being used to estimate where spaces in text should be + * added. Note that the default value for this has been determined from trial and error. + * + * @return The current tolerance / scaling factor + */ + public float getSpacingTolerance() + { + return spacingTolerance; + } + + /** + * Set the space width-based tolerance value that is used to estimate where spaces in text should be added. Note + * that the default value for this has been determined from trial and error. Setting this value larger will reduce + * the number of spaces added. + * + * @param spacingToleranceValue tolerance / scaling factor to use + */ + public void setSpacingTolerance(float spacingToleranceValue) + { + spacingTolerance = spacingToleranceValue; + } + + /** + * Get the current character width-based tolerance value that is being used to estimate where spaces in text should + * be added. Note that the default value for this has been determined from trial and error. + * + * @return The current tolerance / scaling factor + */ + public float getAverageCharTolerance() + { + return averageCharTolerance; + } + + /** + * Set the character width-based tolerance value that is used to estimate where spaces in text should be added. Note + * that the default value for this has been determined from trial and error. Setting this value larger will reduce + * the number of spaces added. 
+ * + * @param averageCharToleranceValue average tolerance / scaling factor to use + */ + public void setAverageCharTolerance(float averageCharToleranceValue) + { + averageCharTolerance = averageCharToleranceValue; + } + + /** + * returns the multiple of whitespace character widths for the current text which the current line start can be + * indented from the previous line start beyond which the current line start is considered to be a paragraph start. + * + * @return the number of whitespace character widths to use when detecting paragraph indents. + */ + public float getIndentThreshold() + { + return indentThreshold; + } + + /** + * sets the multiple of whitespace character widths for the current text which the current line start can be + * indented from the previous line start beyond which the current line start is considered to be a paragraph start. + * The default value is 2.0. + * + * @param indentThresholdValue the number of whitespace character widths to use when detecting paragraph indents. + */ + public void setIndentThreshold(float indentThresholdValue) + { + indentThreshold = indentThresholdValue; + } + + /** + * the minimum whitespace, as a multiple of the max height of the current characters beyond which the current line + * start is considered to be a paragraph start. + * + * @return the character height multiple for max allowed whitespace between lines in the same paragraph. + */ + public float getDropThreshold() + { + return dropThreshold; + } + + /** + * sets the minimum whitespace, as a multiple of the max height of the current characters beyond which the current + * line start is considered to be a paragraph start. The default value is 2.5. + * + * @param dropThresholdValue the character height multiple for max allowed whitespace between lines in the same + * paragraph. 
+ */ + public void setDropThreshold(float dropThresholdValue) + { + dropThreshold = dropThresholdValue; + } + + /** + * Returns the string which will be used at the beginning of a paragraph. + * + * @return the paragraph start string + */ + public String getParagraphStart() + { + return paragraphStart; + } + + /** + * Sets the string which will be used at the beginning of a paragraph. + * + * @param s the paragraph start string + */ + public void setParagraphStart(String s) + { + paragraphStart = s; + } + + /** + * Returns the string which will be used at the end of a paragraph. + * + * @return the paragraph end string + */ + public String getParagraphEnd() + { + return paragraphEnd; + } + + /** + * Sets the string which will be used at the end of a paragraph. + * + * @param s the paragraph end string + */ + public void setParagraphEnd(String s) + { + paragraphEnd = s; + } + + /** + * Returns the string which will be used at the beginning of a page. + * + * @return the page start string + */ + public String getPageStart() + { + return pageStart; + } + + /** + * Sets the string which will be used at the beginning of a page. + * + * @param pageStartValue the page start string + */ + public void setPageStart(String pageStartValue) + { + pageStart = pageStartValue; + } + + /** + * Returns the string which will be used at the end of a page. + * + * @return the page end string + */ + public String getPageEnd() + { + return pageEnd; + } + + /** + * Sets the string which will be used at the end of a page. + * + * @param pageEndValue the page end string + */ + public void setPageEnd(String pageEndValue) + { + pageEnd = pageEndValue; + } + + /** + * Returns the string which will be used at the beginning of an article. + * + * @return the article start string + */ + public String getArticleStart() + { + return articleStart; + } + + /** + * Sets the string which will be used at the beginning of an article. 
+ * + * @param articleStartValue the article start string + */ + public void setArticleStart(String articleStartValue) + { + articleStart = articleStartValue; + } + + /** + * Returns the string which will be used at the end of an article. + * + * @return the article end string + */ + public String getArticleEnd() + { + return articleEnd; + } + + /** + * Sets the string which will be used at the end of an article. + * + * @param articleEndValue the article end string + */ + public void setArticleEnd(String articleEndValue) + { + articleEnd = articleEndValue; + } + + /** + * handles the line separator for a new line given the specified current and previous TextPositions. + * + * @param current the current text position + * @param lastPosition the previous text position + * @param lastLineStartPosition the last text position that followed a line separator. + * @param maxHeightForLine max height for positions since lastLineStartPosition + * @return start position of the last line + * @throws IOException if something went wrong + */ + private PositionWrapper handleLineSeparation(PositionWrapper current, + PositionWrapper lastPosition, PositionWrapper lastLineStartPosition, + float maxHeightForLine) throws IOException + { + current.setLineStart(); + isParagraphSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine); + lastLineStartPosition = current; + if (current.isParagraphStart()) + { + if (lastPosition.isArticleStart()) + { + if (lastPosition.isLineStart()) + { + writeLineSeparator(); + } + writeParagraphStart(); + } + else + { + writeLineSeparator(); + writeParagraphSeparator(); + } + } + else + { + writeLineSeparator(); + } + return lastLineStartPosition; + } + + /** + * tests the relationship between the last text position, the current text position and the last text position that + * followed a line separator to decide if the gap represents a paragraph separation. 
This should only be + * called for consecutive text positions that first pass the line separation test. + *

+ * This base implementation tests to see if the lastLineStartPosition is null OR if the current vertical position + * has dropped below the last text vertical position by at least 2.5 times the current text height OR if the current + * horizontal position is indented by at least 2 times the current width of a space character. + *

+ *

+ * This also attempts to identify text that is indented under a hanging indent. + *

+ *

+ * This method sets the isParagraphStart and isHangingIndent flags on the current position object. + *

+ * + * @param position the current text position. This may have its isParagraphStart or isHangingIndent flags set upon + * return. + * @param lastPosition the previous text position (should not be null). + * @param lastLineStartPosition the last text position that followed a line separator, or null. + * @param maxHeightForLine max height for text positions since lasLineStartPosition. + */ + private void isParagraphSeparation(PositionWrapper position, PositionWrapper lastPosition, + PositionWrapper lastLineStartPosition, float maxHeightForLine) + { + boolean result = false; + if (lastLineStartPosition == null) + { + result = true; + } + else + { + float yGap = Math.abs(position.getTextPosition().getYDirAdj() + - lastPosition.getTextPosition().getYDirAdj()); + float newYVal = multiplyFloat(getDropThreshold(), maxHeightForLine); + // do we need to flip this for rtl? + float xGap = position.getTextPosition().getXDirAdj() + - lastLineStartPosition.getTextPosition().getXDirAdj(); + float newXVal = multiplyFloat(getIndentThreshold(), + position.getTextPosition().getWidthOfSpace()); + float positionWidth = multiplyFloat(0.25f, position.getTextPosition().getWidth()); + + if (yGap > newYVal) + { + result = true; + } + else if (xGap > newXVal) + { + // text is indented, but try to screen for hanging indent + if (!lastLineStartPosition.isParagraphStart()) + { + result = true; + } + else + { + position.setHangingIndent(); + } + } + else if (xGap < -position.getTextPosition().getWidthOfSpace()) + { + // text is left of previous line. Was it a hanging indent? + if (!lastLineStartPosition.isParagraphStart()) + { + result = true; + } + } + else if (Math.abs(xGap) < positionWidth) + { + // current horizontal position is within 1/4 a char of the last + // linestart. We'll treat them as lined up. 
+ if (lastLineStartPosition.isHangingIndent()) + { + position.setHangingIndent(); + } + else if (lastLineStartPosition.isParagraphStart()) + { + // check to see if the previous line looks like + // any of a number of standard list item formats + Pattern liPattern = matchListItemPattern(lastLineStartPosition); + if (liPattern != null) + { + Pattern currentPattern = matchListItemPattern(position); + if (liPattern == currentPattern) + { + result = true; + } + } + } + } + } + if (result) + { + position.setParagraphStart(); + } + } + + private float multiplyFloat(float value1, float value2) + { + // multiply 2 floats and truncate the resulting value to 3 decimal places + // to avoid wrong results when comparing with another float + return Math.round(value1 * value2 * 1000) / 1000f; + } + + /** + * writes the paragraph separator string to the output. + * + * @throws IOException if something went wrong + */ + protected void writeParagraphSeparator() throws IOException + { + writeParagraphEnd(); + writeParagraphStart(); + } + + /** + * Write something (if defined) at the start of a paragraph. + * + * @throws IOException if something went wrong + */ + protected void writeParagraphStart() throws IOException + { + if (inParagraph) + { + writeParagraphEnd(); + inParagraph = false; + } + output.write(getParagraphStart()); + inParagraph = true; + } + + /** + * Write something (if defined) at the end of a paragraph. + * + * @throws IOException if something went wrong + */ + protected void writeParagraphEnd() throws IOException + { + if (!inParagraph) + { + writeParagraphStart(); + } + output.write(getParagraphEnd()); + inParagraph = false; + } + + /** + * Write something (if defined) at the start of a page. + * + * @throws IOException if something went wrong + */ + protected void writePageStart() throws IOException + { + output.write(getPageStart()); + } + + /** + * Write something (if defined) at the end of a page. 
+ * + * @throws IOException if something went wrong + */ + protected void writePageEnd() throws IOException + { + output.write(getPageEnd()); + } + + /** + * returns the list item Pattern object that matches the text at the specified PositionWrapper or null if the text + * does not match such a pattern. The list of Patterns tested against is given by the {@link #getListItemPatterns()} + * method. To add to the list, simply override that method (if sub-classing) or explicitly supply your own list + * using {@link #setListItemPatterns(List)}. + * + * @param pw position + * @return the matching pattern + */ + private Pattern matchListItemPattern(PositionWrapper pw) + { + TextPosition tp = pw.getTextPosition(); + String txt = tp.getUnicode(); + return matchPattern(txt, getListItemPatterns()); + } + + /** + * a list of regular expressions that match commonly used list item formats, i.e. bullets, numbers, letters, Roman + * numerals, etc. Not meant to be comprehensive. + */ + private static final String[] LIST_ITEM_EXPRESSIONS = { "\\.", "\\d+\\.", "\\[\\d+\\]", + "\\d+\\)", "[A-Z]\\.", "[a-z]\\.", "[A-Z]\\)", "[a-z]\\)", "[IVXL]+\\.", + "[ivxl]+\\.", }; + + private List listOfPatterns = null; + + /** + * use to supply a different set of regular expression patterns for matching list item starts. + * + * @param patterns list of patterns + */ + protected void setListItemPatterns(List patterns) + { + listOfPatterns = patterns; + } + + /** + * returns a list of regular expression Patterns representing different common list item formats. For example + * numbered items of form: + *
    + *
  1. some text
  2. + *
  3. more text
  4. + *
+ * or + *
    + *
  • some text
  • + *
  • more text
  • + *
+ * etc., all begin with some character pattern. The pattern "\\d+\." (matches "1.", "2.", ...) or "\[\\d+\]" + * (matches "[1]", "[2]", ...). + *

+ * This method returns a list of such regular expression Patterns. + * + * @return a list of Pattern objects. + */ + protected List getListItemPatterns() + { + if (listOfPatterns == null) + { + listOfPatterns = new ArrayList(); + for (String expression : LIST_ITEM_EXPRESSIONS) + { + Pattern p = Pattern.compile(expression); + listOfPatterns.add(p); + } + } + return listOfPatterns; + } + + /** + * iterates over the specified list of Patterns until it finds one that matches the specified string. Then returns + * the Pattern. + *

+ * Order of the supplied list of patterns is important as most common patterns should come first. Patterns should be + * strict in general, and all will be used with case sensitivity on. + *

+ * + * @param string the string to be searched + * @param patterns list of patterns + * @return matching pattern + */ + protected static Pattern matchPattern(String string, List patterns) + { + for (Pattern p : patterns) + { + if (p.matcher(string).matches()) + { + return p; + } + } + return null; + } + + /** + * Write a list of string containing a whole line of a document. + * + * @param line a list with the words of the given line + * @throws IOException if something went wrong + */ + private void writeLine(List line) + throws IOException + { + int numberOfStrings = line.size(); + for (int i = 0; i < numberOfStrings; i++) + { + WordWithTextPositions word = line.get(i); + writeString(word.getText(), word.getTextPositions()); + if (i < numberOfStrings - 1) + { + writeWordSeparator(); + } + } + } + + /** + * Normalize the given list of TextPositions. + * + * @param line list of TextPositions + * @return a list of strings, one string for every word + */ + private List normalize(List line) + { + List normalized = new LinkedList(); + StringBuilder lineBuilder = new StringBuilder(); + List wordPositions = new ArrayList(); + + for (LineItem item : line) + { + lineBuilder = normalizeAdd(normalized, lineBuilder, wordPositions, item); + } + + if (lineBuilder.length() > 0) + { + normalized.add(createWord(lineBuilder.toString(), wordPositions)); + } + return normalized; + } + + /** + * Handles the LTR and RTL direction of the given words. The whole implementation stands and falls with the given + * word. If the word is a full line, the results will be the best. If the word contains of single words or + * characters, the order of the characters in a word or words in a line may wrong, due to RTL and LTR marks and + * characters! 
+ * + * Based on http://www.nesterovsky-bros.com/weblog/2013/07/28/VisualToLogicalConversionInJava.aspx + * + * @param word The word that shall be processed + * @return new word with the correct direction of the containing characters + */ + private String handleDirection(String word) + { + Bidi bidi = new Bidi(word, Bidi.DIRECTION_DEFAULT_LEFT_TO_RIGHT); + + // if there is pure LTR text no need to process further + if (!bidi.isMixed() && bidi.getBaseLevel() == Bidi.DIRECTION_LEFT_TO_RIGHT) + { + return word; + } + + // collect individual bidi information + int runCount = bidi.getRunCount(); + byte[] levels = new byte[runCount]; + Integer[] runs = new Integer[runCount]; + + for (int i = 0; i < runCount; i++) + { + levels[i] = (byte)bidi.getRunLevel(i); + runs[i] = i; + } + + // reorder individual parts based on their levels + Bidi.reorderVisually(levels, 0, runs, 0, runCount); + + // collect the parts based on the direction within the run + StringBuilder result = new StringBuilder(); + + for (int i = 0; i < runCount; i++) + { + int index = runs[i]; + int start = bidi.getRunStart(index); + int end = bidi.getRunLimit(index); + + int level = levels[index]; + + if ((level & 1) != 0) + { + while (--end >= start) + { + char character = word.charAt(end); + if (Character.isMirrored(word.codePointAt(end))) + { + if (MIRRORING_CHAR_MAP.containsKey(character)) + { + result.append(MIRRORING_CHAR_MAP.get(character)); + } + else + { + result.append(character); + } + } + else + { + result.append(character); + } + } + } + else + { + result.append(word, start, end); + } + } + + return result.toString(); + } + + private static Map MIRRORING_CHAR_MAP = new HashMap(); + + static + { + String path = "/org/apache/pdfbox/resources/text/BidiMirroring.txt"; + InputStream input = new BufferedInputStream(PDFTextStripper.class.getResourceAsStream(path)); + try + { + parseBidiFile(input); + } + catch (IOException e) + { + LOG.warn("Could not parse BidiMirroring.txt, mirroring char map will be 
empty: " + + e.getMessage()); + } + finally + { + try + { + input.close(); + } + catch (IOException e) + { + LOG.error("Could not close BidiMirroring.txt ", e); + } + } + } + + /** + * This method parses the bidi file provided as inputstream. + * + * @param inputStream - The bidi file as inputstream + * @throws IOException if any line could not be read by the LineNumberReader + */ + private static void parseBidiFile(InputStream inputStream) throws IOException + { + LineNumberReader rd = new LineNumberReader(new InputStreamReader(inputStream)); + + do + { + String s = rd.readLine(); + if (s == null) + { + break; + } + + int comment = s.indexOf('#'); // ignore comments + if (comment != -1) + { + s = s.substring(0, comment); + } + + if (s.length() < 2) + { + continue; + } + + StringTokenizer st = new StringTokenizer(s, ";"); + int nFields = st.countTokens(); + Character[] fields = new Character[nFields]; + for (int i = 0; i < nFields; i++) + { + fields[i] = (char) Integer.parseInt(st.nextToken().trim(), 16); + } + + if (fields.length == 2) + { + // initialize the MIRRORING_CHAR_MAP + MIRRORING_CHAR_MAP.put(fields[0], fields[1]); + } + + } while (true); + } + + /** + * Used within {@link #normalize(List)} to create a single {@link WordWithTextPositions} entry. + */ + private WordWithTextPositions createWord(String word, List wordPositions) + { + return new WordWithTextPositions(normalizeWord(word), wordPositions); + } + + /** + * Normalize certain Unicode characters. For example, convert the single "fi" ligature to "f" and "i". Also + * normalises Arabic and Hebrew presentation forms. + * + * @param word Word to normalize + * @return Normalized word + */ + private String normalizeWord(String word) + { + StringBuilder builder = null; + int p = 0; + int q = 0; + int strLength = word.length(); + for (; q < strLength; q++) + { + // We only normalize if the codepoint is in a given range. + // Otherwise, NFKC converts too many things that would cause + // confusion. 
For example, it converts the micro symbol in + // extended Latin to the value in the Greek script. We normalize + // the Unicode Alphabetic and Arabic A&B Presentation forms. + char c = word.charAt(q); + if (0xFB00 <= c && c <= 0xFDFF || 0xFE70 <= c && c <= 0xFEFF) + { + if (builder == null) + { + builder = new StringBuilder(strLength * 2); + } + builder.append(word, p, q); + // Some fonts map U+FDF2 differently than the Unicode spec. + // They add an extra U+0627 character to compensate. + // This removes the extra character for those fonts. + if (c == 0xFDF2 && q > 0 + && (word.charAt(q - 1) == 0x0627 || word.charAt(q - 1) == 0xFE8D)) + { + builder.append("\u0644\u0644\u0647"); + } + else + { + // Trim because some decompositions have an extra space, such as U+FC5E + builder.append(Normalizer + .normalize(word.substring(q, q + 1), Normalizer.Form.NFKC).trim()); + } + p = q + 1; + } + } + if (builder == null) + { + return handleDirection(word); + } + else + { + builder.append(word, p, q); + return handleDirection(builder.toString()); + } + } + + /** + * Used within {@link #normalize(List)} to handle a {@link TextPosition}. + * + * @return The StringBuilder that must be used when calling this method. + */ + private StringBuilder normalizeAdd(List normalized, + StringBuilder lineBuilder, List wordPositions, LineItem item) + { + if (item.isWordSeparator()) + { + normalized.add( + createWord(lineBuilder.toString(), new ArrayList(wordPositions))); + lineBuilder = new StringBuilder(); + wordPositions.clear(); + } + else + { + TextPosition text = item.getTextPosition(); + lineBuilder.append(text.getUnicode()); + wordPositions.add(text); + } + return lineBuilder; + } + + /** + * internal marker class. Used as a place holder in a line of TextPositions. 
+ */ + private static final class LineItem + { + public static LineItem WORD_SEPARATOR = new LineItem(); + + public static LineItem getWordSeparator() + { + return WORD_SEPARATOR; + } + + private final TextPosition textPosition; + + private LineItem() + { + textPosition = null; + } + + LineItem(TextPosition textPosition) + { + this.textPosition = textPosition; + } + + public TextPosition getTextPosition() + { + return textPosition; + } + + public boolean isWordSeparator() + { + return textPosition == null; + } + } + + /** + * Internal class that maps strings to lists of {@link TextPosition} arrays. Note that the number of entries in that + * list may differ from the number of characters in the string due to normalization. + * + * @author Axel Dörfler + */ + private static final class WordWithTextPositions + { + String text; + List textPositions; + + WordWithTextPositions(String word, List positions) + { + text = word; + textPositions = positions; + } + + public String getText() + { + return text; + } + + public List getTextPositions() + { + return textPositions; + } + } + + /** + * wrapper of TextPosition that adds flags to track status as linestart and paragraph start positions. + *

+ * This is implemented as a wrapper since the TextPosition class doesn't provide complete access to its state fields + * to subclasses. Also, conceptually TextPosition is immutable while these flags need to be set post-creation so it + * makes sense to put these flags in this separate class. + *

+ * + * @author m.martinez@ll.mit.edu + */ + private static final class PositionWrapper + { + private boolean isLineStart = false; + private boolean isParagraphStart = false; + private boolean isPageBreak = false; + private boolean isHangingIndent = false; + private boolean isArticleStart = false; + + private TextPosition position = null; + + /** + * Constructs a PositionWrapper around the specified TextPosition object. + * + * @param position the text position. + */ + PositionWrapper(TextPosition position) + { + this.position = position; + } + + /** + * Returns the underlying TextPosition object. + * + * @return the text position + */ + public TextPosition getTextPosition() + { + return position; + } + + public boolean isLineStart() + { + return isLineStart; + } + + /** + * Sets the isLineStart() flag to true. + */ + public void setLineStart() + { + this.isLineStart = true; + } + + public boolean isParagraphStart() + { + return isParagraphStart; + } + + /** + * sets the isParagraphStart() flag to true. + */ + public void setParagraphStart() + { + this.isParagraphStart = true; + } + + public boolean isArticleStart() + { + return isArticleStart; + } + + /** + * Sets the isArticleStart() flag to true. + */ + public void setArticleStart() + { + this.isArticleStart = true; + } + + public boolean isPageBreak() + { + return isPageBreak; + } + + /** + * Sets the isPageBreak() flag to true. + */ + public void setPageBreak() + { + this.isPageBreak = true; + } + + public boolean isHangingIndent() + { + return isHangingIndent; + } + + /** + * Sets the isHangingIndent() flag to true. 
+ */ + public void setHangingIndent() + { + this.isHangingIndent = true; + } + } +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java index 41595cc4..1cb6e780 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java @@ -899,7 +899,7 @@ public class RedactionIntegrationTest { @Test public void redactionTest() throws IOException { - String fileName = "files/new/VV-511309.pdf"; + String fileName = "files/new/S416.pdf"; String outputFileName = OsUtils.getTemporaryDirectory() + "/Annotated.pdf"; long start = System.currentTimeMillis(); diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/new/S416.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/new/S416.pdf new file mode 100644 index 00000000..fb043424 Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/new/S416.pdf differ