RED-10127: fix typos and extend dot comparison

This commit is contained in:
Kilian Schuettler 2024-10-09 13:14:29 +02:00
parent 4b0c041d84
commit e7dbda813a
4 changed files with 36 additions and 16 deletions

View File

@ -33,8 +33,6 @@ public class TextPageBlock extends AbstractPageBlock {
private boolean underlined;
private double highestFontSize;
private PageBlockType classification;
private boolean toDuplicate;
@ -262,6 +260,12 @@ public class TextPageBlock extends AbstractPageBlock {
}
/**
 * Reports the highest value held by this block's font size frequency counter.
 *
 * @return whatever {@code getHighest()} yields on the font size frequency counter
 */
public double getHighestFontSize() {

    return this.frequencyCounters
            .getFontSizeFrequencyCounter()
            .getHighest();
}
@Override
public boolean isEmpty() {

View File

@ -57,11 +57,13 @@ public class TableOfContentsClassificationService {
continue;
}
int offset = identifyTOCItems(i + 1, textBlocks, document);
int end = identifyTOCItems(i + 1, textBlocks, document);
if (offset > 1) {
textBlock.textBlock().setClassification(PageBlockType.H1);
i += offset;
if (end > i + 1) {
if (textBlock.textBlock().getClassification() == null) {
textBlock.textBlock().setClassification(PageBlockType.H1);
}
i = end;
}
}
}
@ -352,7 +354,7 @@ public class TableOfContentsClassificationService {
return false;
}
int prev = getNumberAsInt(numbers, i);
int prev = getNumberAsInt(numbers, i - 1);
int curr = getNumberAsInt(numbers, i);
int next = getNumberAsInt(numbers, i + 1);

View File

@ -5,6 +5,7 @@ import java.awt.geom.Point2D;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.contentstream.operator.OperatorName;
@ -50,6 +51,7 @@ import lombok.extern.slf4j.Slf4j;
@Slf4j
public class PDFLinesTextStripper extends PDFTextStripper {
private final static Set<String> DOT_LIKE_CHARACTERS = Set.of(".", "·", "", "", "", "", "", "", "", "", "", "", "", "");
private final List<TextPositionSequence> textPositionSequences = new ArrayList<>();
private final List<Ruling> rulings = new ArrayList<>();
private final List<Ruling> graphicsPath = new ArrayList<>();
@ -336,20 +338,32 @@ public class PDFLinesTextStripper extends PDFTextStripper {
/**
 * Checks whether the positions {@code i - 2 .. i} form a run of three dots that directly follows
 * a non-dot position at {@code i - 3}. The check is only attempted when {@code i} lies at least
 * four positions past {@code startIndex}.
 *
 * NOTE(review): two condition chains appear in this span. One compares each position against the
 * literal "." and requires position {@code i - 3} to differ from it; the other delegates to
 * isDot(...) for the three dots and to alphanumeric(...) for position {@code i - 3}. They look
 * like the before and after of the same change - confirm which chain is meant to remain.
 *
 * @param textPositions text positions that are inspected by index
 * @param i             index of the last position of the candidate dot run
 * @param startIndex    index that {@code i} must exceed by at least four
 * @return {@code true} if the conditions described above hold
 */
private boolean isWordFollowedByDottedLine(List<TextPosition> textPositions, int i, int startIndex) {
return i - startIndex >= 4 //
&& textPositions.get(i).getUnicode().equals(".") //
&& textPositions.get(i - 1).getUnicode().equals(".") //
&& textPositions.get(i - 2).getUnicode().equals(".") //
&& !textPositions.get(i - 3).getUnicode().equals(".");
&& isDot(textPositions, i) //
&& isDot(textPositions, i - 1) //
&& isDot(textPositions, i - 2) //
&& alphanumeric(textPositions, i - 3);
}
/**
 * Checks whether the positions {@code i - 3 .. i - 1} form a run of three dots that is directly
 * followed by a non-dot position at {@code i}. The check is only attempted when {@code i} lies at
 * least four positions past {@code startIndex}.
 *
 * NOTE(review): two condition chains appear in this span. One compares each position against the
 * literal "." and requires position {@code i} to differ from it; the other delegates to
 * isDot(...) for the three dots and to alphanumeric(...) for position {@code i}. They look like
 * the before and after of the same change - confirm which chain is meant to remain.
 *
 * @param textPositions text positions that are inspected by index
 * @param i             index of the position right after the candidate dot run
 * @param startIndex    index that {@code i} must exceed by at least four
 * @return {@code true} if the conditions described above hold
 */
private static boolean isDottedLineFollowedByWord(List<TextPosition> textPositions, int i, int startIndex) {
return i - startIndex >= 4 //
&& !textPositions.get(i).getUnicode().equals(".") //
&& textPositions.get(i - 1).getUnicode().equals(".") //
&& textPositions.get(i - 2).getUnicode().equals(".") //
&& textPositions.get(i - 3).getUnicode().equals(".");
&& alphanumeric(textPositions, i) //
&& isDot(textPositions, i - 1) //
&& isDot(textPositions, i - 2) //
&& isDot(textPositions, i - 3);
}
/**
 * Tells whether the unicode string of the text position at index {@code i} is contained in
 * {@code DOT_LIKE_CHARACTERS}.
 *
 * @param textPositions text positions to look into
 * @param i             index of the position to test
 * @return {@code true} if that position's unicode string is a member of the dot-like set
 */
private static boolean isDot(List<TextPosition> textPositions, int i) {

    final String glyph = textPositions.get(i).getUnicode();
    return DOT_LIKE_CHARACTERS.contains(glyph);
}
/**
 * Tells whether the text position at index {@code i} starts with a letter or a digit.
 *
 * <p>Only the first code point of the position's unicode string is classified. A missing or
 * empty unicode string is treated as "not alphanumeric" instead of failing on index 0, and the
 * first character is read as a full code point so that characters outside the Basic
 * Multilingual Plane are classified by their real value rather than by a lone surrogate.
 *
 * @param textPositions text positions to look into
 * @param i             index of the position to test
 * @return {@code true} if the first code point is alphabetic or a digit, {@code false}
 *         otherwise (including for a null or empty unicode string)
 */
private static boolean alphanumeric(List<TextPosition> textPositions, int i) {

    String unicode = textPositions.get(i).getUnicode();
    if (unicode == null || unicode.isEmpty()) {
        // Nothing to classify; charAt(0)/codePointAt(0) would throw on an empty string.
        return false;
    }
    int firstCodePoint = unicode.codePointAt(0);
    return Character.isAlphabetic(firstCodePoint) || Character.isDigit(firstCodePoint);
}

View File

@ -37,7 +37,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
@Disabled
public void testLayoutParserEndToEnd() {
String filePath = "/home/kschuettler/Dokumente/TestFiles/NER Dataset/Syngenta prod/77c680315c31d403d2e023be023b2087.PREVIEW.pdf";
String filePath = "/home/kschuettler/Dokumente/LayoutparsingEvaluation/613c12dd5c14851cb37e413eb56a7a7b.UNTOUCHED.pdf";
runForFile(filePath);
}