diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFTextStripper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFTextStripper.java index 83fafea..45f142b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFTextStripper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFTextStripper.java @@ -25,10 +25,22 @@ import java.io.StringWriter; import java.io.Writer; import java.text.Bidi; import java.text.Normalizer; -import java.util.*; +import java.util.ArrayDeque; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Deque; +import java.util.HashMap; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.SortedMap; +import java.util.SortedSet; +import java.util.StringTokenizer; +import java.util.TreeMap; +import java.util.TreeSet; import java.util.regex.Pattern; -import lombok.Getter; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.pdfbox.cos.COSDictionary; @@ -46,6 +58,8 @@ import org.apache.pdfbox.text.TextPositionComparator; import com.knecon.fforesight.service.layoutparser.processor.utils.QuickSort; +import lombok.Getter; + /** * This is just a copy except i only adjusted lines 594-607 cause this is a bug in Pdfbox. * see S416.pdf @@ -194,40 +208,33 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { } + public void beginMarkedContentSequence(COSName tag, COSDictionary properties) { - public void beginMarkedContentSequence(COSName tag, COSDictionary properties) - { PDMarkedContent markedContent = PDMarkedContent.create(tag, properties); - if (this.currentMarkedContents.isEmpty()) - { + if (this.currentMarkedContents.isEmpty()) { this.markedContents.add(markedContent); - } - else - { - PDMarkedContent currentMarkedContent = - this.currentMarkedContents.peek(); - if (currentMarkedContent != null) - { + } else { + PDMarkedContent currentMarkedContent = this.currentMarkedContents.peek(); + if (currentMarkedContent != null) { currentMarkedContent.addMarkedContent(markedContent); } } this.currentMarkedContents.push(markedContent); } + @Override - public void endMarkedContentSequence() - { - if (!this.currentMarkedContents.isEmpty()) - { + public void endMarkedContentSequence() { + + if (!this.currentMarkedContents.isEmpty()) { this.currentMarkedContents.pop(); } } - public void xobject(PDXObject xobject) - { - if (!this.currentMarkedContents.isEmpty()) - { + public void xobject(PDXObject xobject) { + + if (!this.currentMarkedContents.isEmpty()) { this.currentMarkedContents.peek().addXObject(xobject); } } @@ -313,7 +320,11 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { endBookmarkPageNumber = -1; } - if (startBookmarkPageNumber == -1 && startBookmark != null && endBookmarkPageNumber == -1 && endBookmark != null && startBookmark.getCOSObject() == endBookmark.getCOSObject()) { + if (startBookmarkPageNumber == -1 + && startBookmark != null + && endBookmarkPageNumber == -1 + && endBookmark != null + && startBookmark.getCOSObject() == endBookmark.getCOSObject()) { // this is a special case where both the start and end bookmark // are the same but point to nothing. In this case // we will not extract any text. @@ -360,7 +371,9 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { @Override public void processPage(PDPage page) throws IOException { - if (currentPageNo >= startPage && currentPageNo <= endPage && (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber) && (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber)) { + if (currentPageNo >= startPage && currentPageNo <= endPage && (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber) && (endBookmarkPageNumber == -1 + || currentPageNo + <= endBookmarkPageNumber)) { startPage(page); int numberOfArticleSections = 1; @@ -635,7 +648,6 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { var normalized = normalize(line); // normalized.stream().filter(l -> System.out.println(l.getText().contains("Plenarprotokoll 20/24")).findFirst().isPresent() - lastLineStartPosition = handleLineSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine); writeLine(normalized, current.isParagraphStart); line.clear(); @@ -647,8 +659,8 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { } // test if our TextPosition starts after a new word would be expected to start if (expectedStartOfNextWordX != EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE && expectedStartOfNextWordX < positionX - // only bother adding a word separator if the last character was not a word separator - && (wordSeparator.isEmpty() || // + // only bother adding a word separator if the last character was not a word separator + && (wordSeparator.isEmpty() || // (lastPosition.getTextPosition().getUnicode() != null && !lastPosition.getTextPosition().getUnicode().endsWith(wordSeparator)))) { line.add(LineItem.getWordSeparator()); } @@ -914,8 +926,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { textList.add(text); } } - if (!this.currentMarkedContents.isEmpty()) - { + if (!this.currentMarkedContents.isEmpty()) { this.currentMarkedContents.peek().addText(text); } } @@ -1711,7 +1722,6 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { int numberOfStrings = line.size(); for (int i = 0; i < numberOfStrings; i++) { WordWithTextPositions word = line.get(i); - word.getTextPositions().sort(Comparator.comparing(TextPosition::getXDirAdj)); writeString(word.getText(), word.getTextPositions(), isParagraphEnd && i == numberOfStrings - 1); if (i < numberOfStrings - 1) { writeWordSeparator(); @@ -2102,7 +2112,9 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { return endParagraphWritten; } - public void setEndParagraphWritten(){ + + public void setEndParagraphWritten() { + endParagraphWritten = true; } @@ -2145,7 +2157,6 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { this.isHangingIndent = true; } - } }