RED-9976: Removed sorting that scrambles text in PDFTextStripper

2024-09-10 12:48:28 +02:00 · 2024-09-10 12:48:28 +02:00 · fec19f4afb
commit fec19f4afb
parent c726a643f0
1 changed files with 42 additions and 31 deletions
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFTextStripper.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFTextStripper.java
@ -25,10 +25,22 @@ import java.io.StringWriter;
 import java.io.Writer;
 import java.text.Bidi;
 import java.text.Normalizer;
-import java.util.*;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Deque;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.SortedMap;
+import java.util.SortedSet;
+import java.util.StringTokenizer;
+import java.util.TreeMap;
+import java.util.TreeSet;
 import java.util.regex.Pattern;
 import lombok.Getter;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.pdfbox.cos.COSDictionary;
@ -46,6 +58,8 @@ import org.apache.pdfbox.text.TextPositionComparator;
 import com.knecon.fforesight.service.layoutparser.processor.utils.QuickSort;
 import lombok.Getter;
 /**
 * This is just a copy except i only adjusted lines 594-607 cause this is a bug in Pdfbox.
 * see S416.pdf
@ -194,40 +208,33 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
    }
    public void beginMarkedContentSequence(COSName tag, COSDictionary properties) {
    public void beginMarkedContentSequence(COSName tag, COSDictionary properties)
    {
        PDMarkedContent markedContent = PDMarkedContent.create(tag, properties);
-        if (this.currentMarkedContents.isEmpty())
+        if (this.currentMarkedContents.isEmpty()) {
        {
            this.markedContents.add(markedContent);
-        }
+        } else {
-        else
+            PDMarkedContent currentMarkedContent = this.currentMarkedContents.peek();
-        {
+            if (currentMarkedContent != null) {
            PDMarkedContent currentMarkedContent =
                    this.currentMarkedContents.peek();
            if (currentMarkedContent != null)
            {
                currentMarkedContent.addMarkedContent(markedContent);
            }
        }
        this.currentMarkedContents.push(markedContent);
    }
    @Override
-    public void endMarkedContentSequence()
+    public void endMarkedContentSequence() {
-    {
+
-        if (!this.currentMarkedContents.isEmpty())
+        if (!this.currentMarkedContents.isEmpty()) {
        {
            this.currentMarkedContents.pop();
        }
    }
-    public void xobject(PDXObject xobject)
+    public void xobject(PDXObject xobject) {
-    {
+
-        if (!this.currentMarkedContents.isEmpty())
+        if (!this.currentMarkedContents.isEmpty()) {
        {
            this.currentMarkedContents.peek().addXObject(xobject);
        }
    }
@ -313,7 +320,11 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
            endBookmarkPageNumber = -1;
        }
-        if (startBookmarkPageNumber == -1 && startBookmark != null && endBookmarkPageNumber == -1 && endBookmark != null && startBookmark.getCOSObject() == endBookmark.getCOSObject()) {
+        if (startBookmarkPageNumber == -1
+            && startBookmark != null
+            && endBookmarkPageNumber == -1
+            && endBookmark != null
+            && startBookmark.getCOSObject() == endBookmark.getCOSObject()) {
            // this is a special case where both the start and end bookmark
            // are the same but point to nothing. In this case
            // we will not extract any text.
@ -360,7 +371,9 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
    @Override
    public void processPage(PDPage page) throws IOException {
-        if (currentPageNo >= startPage && currentPageNo <= endPage && (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber) && (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber)) {
+        if (currentPageNo >= startPage && currentPageNo <= endPage && (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber) && (endBookmarkPageNumber == -1
                                                                                                                                                      || currentPageNo
                                                                                                                                                         <= endBookmarkPageNumber)) {
            startPage(page);
            int numberOfArticleSections = 1;
@ -635,7 +648,6 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
                        var normalized = normalize(line);
 //                        normalized.stream().filter(l -> System.out.println(l.getText().contains("Plenarprotokoll 20/24")).findFirst().isPresent()
                        lastLineStartPosition = handleLineSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine);
                        writeLine(normalized, current.isParagraphStart);
                        line.clear();
@ -647,8 +659,8 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
                    }
                    // test if our TextPosition starts after a new word would be expected to start
                    if (expectedStartOfNextWordX != EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE && expectedStartOfNextWordX < positionX
-                            // only bother adding a word separator if the last character was not a word separator
+                        // only bother adding a word separator if the last character was not a word separator
-                            && (wordSeparator.isEmpty() || //
+                        && (wordSeparator.isEmpty() || //
                            (lastPosition.getTextPosition().getUnicode() != null && !lastPosition.getTextPosition().getUnicode().endsWith(wordSeparator)))) {
                        line.add(LineItem.getWordSeparator());
                    }
@ -914,8 +926,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
                    textList.add(text);
                }
            }
-            if (!this.currentMarkedContents.isEmpty())
+            if (!this.currentMarkedContents.isEmpty()) {
            {
                this.currentMarkedContents.peek().addText(text);
            }
        }
@ -1711,7 +1722,6 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
        int numberOfStrings = line.size();
        for (int i = 0; i < numberOfStrings; i++) {
            WordWithTextPositions word = line.get(i);
-            word.getTextPositions().sort(Comparator.comparing(TextPosition::getXDirAdj));
            writeString(word.getText(), word.getTextPositions(), isParagraphEnd && i == numberOfStrings - 1);
            if (i < numberOfStrings - 1) {
                writeWordSeparator();
@ -2102,7 +2112,9 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
            return endParagraphWritten;
        }
-        public void setEndParagraphWritten(){
+
+        public void setEndParagraphWritten() {
            endParagraphWritten = true;
        }
@ -2145,7 +2157,6 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
            this.isHangingIndent = true;
        }
    }
 }