RED-9976: Removed sorting that scrambles text in PDFTextStripper

This commit is contained in:
Dominique Eifländer 2024-09-10 12:48:28 +02:00
parent c726a643f0
commit fec19f4afb

View File

@ -25,10 +25,22 @@ import java.io.StringWriter;
import java.io.Writer; import java.io.Writer;
import java.text.Bidi; import java.text.Bidi;
import java.text.Normalizer; import java.text.Normalizer;
import java.util.*; import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Deque;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.StringTokenizer;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import lombok.Getter;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.cos.COSDictionary; import org.apache.pdfbox.cos.COSDictionary;
@ -46,6 +58,8 @@ import org.apache.pdfbox.text.TextPositionComparator;
import com.knecon.fforesight.service.layoutparser.processor.utils.QuickSort; import com.knecon.fforesight.service.layoutparser.processor.utils.QuickSort;
import lombok.Getter;
/** /**
 * This is just a copy, except that I only adjusted lines 594-607 because of a bug in PDFBox. * This is just a copy, except that I only adjusted lines 594-607 because of a bug in PDFBox.
* see S416.pdf * see S416.pdf
@ -194,40 +208,33 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
} }
public void beginMarkedContentSequence(COSName tag, COSDictionary properties) {
public void beginMarkedContentSequence(COSName tag, COSDictionary properties)
{
PDMarkedContent markedContent = PDMarkedContent.create(tag, properties); PDMarkedContent markedContent = PDMarkedContent.create(tag, properties);
if (this.currentMarkedContents.isEmpty()) if (this.currentMarkedContents.isEmpty()) {
{
this.markedContents.add(markedContent); this.markedContents.add(markedContent);
} } else {
else PDMarkedContent currentMarkedContent = this.currentMarkedContents.peek();
{ if (currentMarkedContent != null) {
PDMarkedContent currentMarkedContent =
this.currentMarkedContents.peek();
if (currentMarkedContent != null)
{
currentMarkedContent.addMarkedContent(markedContent); currentMarkedContent.addMarkedContent(markedContent);
} }
} }
this.currentMarkedContents.push(markedContent); this.currentMarkedContents.push(markedContent);
} }
@Override @Override
public void endMarkedContentSequence() public void endMarkedContentSequence() {
{
if (!this.currentMarkedContents.isEmpty()) if (!this.currentMarkedContents.isEmpty()) {
{
this.currentMarkedContents.pop(); this.currentMarkedContents.pop();
} }
} }
public void xobject(PDXObject xobject) public void xobject(PDXObject xobject) {
{
if (!this.currentMarkedContents.isEmpty()) if (!this.currentMarkedContents.isEmpty()) {
{
this.currentMarkedContents.peek().addXObject(xobject); this.currentMarkedContents.peek().addXObject(xobject);
} }
} }
@ -313,7 +320,11 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
endBookmarkPageNumber = -1; endBookmarkPageNumber = -1;
} }
if (startBookmarkPageNumber == -1 && startBookmark != null && endBookmarkPageNumber == -1 && endBookmark != null && startBookmark.getCOSObject() == endBookmark.getCOSObject()) { if (startBookmarkPageNumber == -1
&& startBookmark != null
&& endBookmarkPageNumber == -1
&& endBookmark != null
&& startBookmark.getCOSObject() == endBookmark.getCOSObject()) {
// this is a special case where both the start and end bookmark // this is a special case where both the start and end bookmark
// are the same but point to nothing. In this case // are the same but point to nothing. In this case
// we will not extract any text. // we will not extract any text.
@ -360,7 +371,9 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
@Override @Override
public void processPage(PDPage page) throws IOException { public void processPage(PDPage page) throws IOException {
if (currentPageNo >= startPage && currentPageNo <= endPage && (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber) && (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber)) { if (currentPageNo >= startPage && currentPageNo <= endPage && (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber) && (endBookmarkPageNumber == -1
|| currentPageNo
<= endBookmarkPageNumber)) {
startPage(page); startPage(page);
int numberOfArticleSections = 1; int numberOfArticleSections = 1;
@ -635,7 +648,6 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
var normalized = normalize(line); var normalized = normalize(line);
// normalized.stream().filter(l -> System.out.println(l.getText().contains("Plenarprotokoll 20/24")).findFirst().isPresent() // normalized.stream().filter(l -> System.out.println(l.getText().contains("Plenarprotokoll 20/24")).findFirst().isPresent()
lastLineStartPosition = handleLineSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine); lastLineStartPosition = handleLineSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine);
writeLine(normalized, current.isParagraphStart); writeLine(normalized, current.isParagraphStart);
line.clear(); line.clear();
@ -647,8 +659,8 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
} }
// test if our TextPosition starts after a new word would be expected to start // test if our TextPosition starts after a new word would be expected to start
if (expectedStartOfNextWordX != EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE && expectedStartOfNextWordX < positionX if (expectedStartOfNextWordX != EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE && expectedStartOfNextWordX < positionX
// only bother adding a word separator if the last character was not a word separator // only bother adding a word separator if the last character was not a word separator
&& (wordSeparator.isEmpty() || // && (wordSeparator.isEmpty() || //
(lastPosition.getTextPosition().getUnicode() != null && !lastPosition.getTextPosition().getUnicode().endsWith(wordSeparator)))) { (lastPosition.getTextPosition().getUnicode() != null && !lastPosition.getTextPosition().getUnicode().endsWith(wordSeparator)))) {
line.add(LineItem.getWordSeparator()); line.add(LineItem.getWordSeparator());
} }
@ -914,8 +926,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
textList.add(text); textList.add(text);
} }
} }
if (!this.currentMarkedContents.isEmpty()) if (!this.currentMarkedContents.isEmpty()) {
{
this.currentMarkedContents.peek().addText(text); this.currentMarkedContents.peek().addText(text);
} }
} }
@ -1711,7 +1722,6 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
int numberOfStrings = line.size(); int numberOfStrings = line.size();
for (int i = 0; i < numberOfStrings; i++) { for (int i = 0; i < numberOfStrings; i++) {
WordWithTextPositions word = line.get(i); WordWithTextPositions word = line.get(i);
word.getTextPositions().sort(Comparator.comparing(TextPosition::getXDirAdj));
writeString(word.getText(), word.getTextPositions(), isParagraphEnd && i == numberOfStrings - 1); writeString(word.getText(), word.getTextPositions(), isParagraphEnd && i == numberOfStrings - 1);
if (i < numberOfStrings - 1) { if (i < numberOfStrings - 1) {
writeWordSeparator(); writeWordSeparator();
@ -2102,7 +2112,9 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
return endParagraphWritten; return endParagraphWritten;
} }
public void setEndParagraphWritten(){
public void setEndParagraphWritten() {
endParagraphWritten = true; endParagraphWritten = true;
} }
@ -2145,7 +2157,6 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
this.isHangingIndent = true; this.isHangingIndent = true;
} }
} }
} }