RED-9976: Removed sorting that scrambles text in PDFTextStripper
This commit is contained in:
parent
c726a643f0
commit
fec19f4afb
@ -25,10 +25,22 @@ import java.io.StringWriter;
|
|||||||
import java.io.Writer;
|
import java.io.Writer;
|
||||||
import java.text.Bidi;
|
import java.text.Bidi;
|
||||||
import java.text.Normalizer;
|
import java.text.Normalizer;
|
||||||
import java.util.*;
|
import java.util.ArrayDeque;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.Deque;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.SortedMap;
|
||||||
|
import java.util.SortedSet;
|
||||||
|
import java.util.StringTokenizer;
|
||||||
|
import java.util.TreeMap;
|
||||||
|
import java.util.TreeSet;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
import lombok.Getter;
|
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
import org.apache.pdfbox.cos.COSDictionary;
|
import org.apache.pdfbox.cos.COSDictionary;
|
||||||
@ -46,6 +58,8 @@ import org.apache.pdfbox.text.TextPositionComparator;
|
|||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.QuickSort;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.QuickSort;
|
||||||
|
|
||||||
|
import lombok.Getter;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This is just a copy, except that I adjusted lines 594-607 because of a bug in PDFBox.
|
* This is just a copy, except that I adjusted lines 594-607 because of a bug in PDFBox.
|
||||||
* see S416.pdf
|
* see S416.pdf
|
||||||
@ -194,40 +208,33 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void beginMarkedContentSequence(COSName tag, COSDictionary properties) {
|
||||||
|
|
||||||
public void beginMarkedContentSequence(COSName tag, COSDictionary properties)
|
|
||||||
{
|
|
||||||
PDMarkedContent markedContent = PDMarkedContent.create(tag, properties);
|
PDMarkedContent markedContent = PDMarkedContent.create(tag, properties);
|
||||||
if (this.currentMarkedContents.isEmpty())
|
if (this.currentMarkedContents.isEmpty()) {
|
||||||
{
|
|
||||||
this.markedContents.add(markedContent);
|
this.markedContents.add(markedContent);
|
||||||
}
|
} else {
|
||||||
else
|
PDMarkedContent currentMarkedContent = this.currentMarkedContents.peek();
|
||||||
{
|
if (currentMarkedContent != null) {
|
||||||
PDMarkedContent currentMarkedContent =
|
|
||||||
this.currentMarkedContents.peek();
|
|
||||||
if (currentMarkedContent != null)
|
|
||||||
{
|
|
||||||
currentMarkedContent.addMarkedContent(markedContent);
|
currentMarkedContent.addMarkedContent(markedContent);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
this.currentMarkedContents.push(markedContent);
|
this.currentMarkedContents.push(markedContent);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void endMarkedContentSequence()
|
public void endMarkedContentSequence() {
|
||||||
{
|
|
||||||
if (!this.currentMarkedContents.isEmpty())
|
if (!this.currentMarkedContents.isEmpty()) {
|
||||||
{
|
|
||||||
this.currentMarkedContents.pop();
|
this.currentMarkedContents.pop();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void xobject(PDXObject xobject)
|
public void xobject(PDXObject xobject) {
|
||||||
{
|
|
||||||
if (!this.currentMarkedContents.isEmpty())
|
if (!this.currentMarkedContents.isEmpty()) {
|
||||||
{
|
|
||||||
this.currentMarkedContents.peek().addXObject(xobject);
|
this.currentMarkedContents.peek().addXObject(xobject);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -313,7 +320,11 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
|||||||
endBookmarkPageNumber = -1;
|
endBookmarkPageNumber = -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (startBookmarkPageNumber == -1 && startBookmark != null && endBookmarkPageNumber == -1 && endBookmark != null && startBookmark.getCOSObject() == endBookmark.getCOSObject()) {
|
if (startBookmarkPageNumber == -1
|
||||||
|
&& startBookmark != null
|
||||||
|
&& endBookmarkPageNumber == -1
|
||||||
|
&& endBookmark != null
|
||||||
|
&& startBookmark.getCOSObject() == endBookmark.getCOSObject()) {
|
||||||
// this is a special case where both the start and end bookmark
|
// this is a special case where both the start and end bookmark
|
||||||
// are the same but point to nothing. In this case
|
// are the same but point to nothing. In this case
|
||||||
// we will not extract any text.
|
// we will not extract any text.
|
||||||
@ -360,7 +371,9 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
|||||||
@Override
|
@Override
|
||||||
public void processPage(PDPage page) throws IOException {
|
public void processPage(PDPage page) throws IOException {
|
||||||
|
|
||||||
if (currentPageNo >= startPage && currentPageNo <= endPage && (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber) && (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber)) {
|
if (currentPageNo >= startPage && currentPageNo <= endPage && (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber) && (endBookmarkPageNumber == -1
|
||||||
|
|| currentPageNo
|
||||||
|
<= endBookmarkPageNumber)) {
|
||||||
startPage(page);
|
startPage(page);
|
||||||
|
|
||||||
int numberOfArticleSections = 1;
|
int numberOfArticleSections = 1;
|
||||||
@ -635,7 +648,6 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
|||||||
var normalized = normalize(line);
|
var normalized = normalize(line);
|
||||||
// normalized.stream().filter(l -> System.out.println(l.getText().contains("Plenarprotokoll 20/24")).findFirst().isPresent()
|
// normalized.stream().filter(l -> System.out.println(l.getText().contains("Plenarprotokoll 20/24")).findFirst().isPresent()
|
||||||
|
|
||||||
|
|
||||||
lastLineStartPosition = handleLineSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine);
|
lastLineStartPosition = handleLineSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine);
|
||||||
writeLine(normalized, current.isParagraphStart);
|
writeLine(normalized, current.isParagraphStart);
|
||||||
line.clear();
|
line.clear();
|
||||||
@ -647,8 +659,8 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
|||||||
}
|
}
|
||||||
// test if our TextPosition starts after a new word would be expected to start
|
// test if our TextPosition starts after a new word would be expected to start
|
||||||
if (expectedStartOfNextWordX != EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE && expectedStartOfNextWordX < positionX
|
if (expectedStartOfNextWordX != EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE && expectedStartOfNextWordX < positionX
|
||||||
// only bother adding a word separator if the last character was not a word separator
|
// only bother adding a word separator if the last character was not a word separator
|
||||||
&& (wordSeparator.isEmpty() || //
|
&& (wordSeparator.isEmpty() || //
|
||||||
(lastPosition.getTextPosition().getUnicode() != null && !lastPosition.getTextPosition().getUnicode().endsWith(wordSeparator)))) {
|
(lastPosition.getTextPosition().getUnicode() != null && !lastPosition.getTextPosition().getUnicode().endsWith(wordSeparator)))) {
|
||||||
line.add(LineItem.getWordSeparator());
|
line.add(LineItem.getWordSeparator());
|
||||||
}
|
}
|
||||||
@ -914,8 +926,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
|||||||
textList.add(text);
|
textList.add(text);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!this.currentMarkedContents.isEmpty())
|
if (!this.currentMarkedContents.isEmpty()) {
|
||||||
{
|
|
||||||
this.currentMarkedContents.peek().addText(text);
|
this.currentMarkedContents.peek().addText(text);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1711,7 +1722,6 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
|||||||
int numberOfStrings = line.size();
|
int numberOfStrings = line.size();
|
||||||
for (int i = 0; i < numberOfStrings; i++) {
|
for (int i = 0; i < numberOfStrings; i++) {
|
||||||
WordWithTextPositions word = line.get(i);
|
WordWithTextPositions word = line.get(i);
|
||||||
word.getTextPositions().sort(Comparator.comparing(TextPosition::getXDirAdj));
|
|
||||||
writeString(word.getText(), word.getTextPositions(), isParagraphEnd && i == numberOfStrings - 1);
|
writeString(word.getText(), word.getTextPositions(), isParagraphEnd && i == numberOfStrings - 1);
|
||||||
if (i < numberOfStrings - 1) {
|
if (i < numberOfStrings - 1) {
|
||||||
writeWordSeparator();
|
writeWordSeparator();
|
||||||
@ -2102,7 +2112,9 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
|||||||
return endParagraphWritten;
|
return endParagraphWritten;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setEndParagraphWritten(){
|
|
||||||
|
public void setEndParagraphWritten() {
|
||||||
|
|
||||||
endParagraphWritten = true;
|
endParagraphWritten = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2145,7 +2157,6 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
|||||||
this.isHangingIndent = true;
|
this.isHangingIndent = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user