diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java index 7ddeb1e..9b7940b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java @@ -33,8 +33,6 @@ public class TextPageBlock extends AbstractPageBlock { private boolean underlined; - private double highestFontSize; - private PageBlockType classification; private boolean toDuplicate; @@ -262,6 +260,12 @@ public class TextPageBlock extends AbstractPageBlock { } + public double getHighestFontSize() { + + return frequencyCounters.getFontSizeFrequencyCounter().getHighest(); + } + + @Override public boolean isEmpty() { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TableOfContentsClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TableOfContentsClassificationService.java index 854d087..0bce2cd 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TableOfContentsClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TableOfContentsClassificationService.java @@ -57,11 +57,13 @@ public class TableOfContentsClassificationService { continue; } - int offset = 
identifyTOCItems(i + 1, textBlocks, document); + int end = identifyTOCItems(i + 1, textBlocks, document); - if (offset > 1) { - textBlock.textBlock().setClassification(PageBlockType.H1); - i += offset; + if (end > i + 1) { + if (textBlock.textBlock().getClassification() == null) { + textBlock.textBlock().setClassification(PageBlockType.H1); + } + i = end; } } } @@ -352,7 +354,7 @@ public class TableOfContentsClassificationService { return false; } - int prev = getNumberAsInt(numbers, i); + int prev = getNumberAsInt(numbers, i - 1); int curr = getNumberAsInt(numbers, i); int next = getNumberAsInt(numbers, i + 1); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java index e39b666..c125acd 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java @@ -5,6 +5,7 @@ import java.awt.geom.Point2D; import java.io.IOException; import java.util.ArrayList; import java.util.List; +import java.util.Set; import org.apache.pdfbox.contentstream.operator.Operator; import org.apache.pdfbox.contentstream.operator.OperatorName; @@ -50,6 +51,7 @@ import lombok.extern.slf4j.Slf4j; @Slf4j public class PDFLinesTextStripper extends PDFTextStripper { + private final static Set DOT_LIKE_CHARACTERS = Set.of(".", "·", "•", "․", "‧", "∙", "⋅", "・", "\uFF0E", "\uFF65", "…", "⸱", "﹒", "ꞏ"); private final List textPositionSequences = new ArrayList<>(); private final List rulings = new ArrayList<>(); private final List graphicsPath = new 
ArrayList<>(); @@ -336,20 +338,32 @@ public class PDFLinesTextStripper extends PDFTextStripper { private boolean isWordFollowedByDottedLine(List textPositions, int i, int startIndex) { return i - startIndex >= 4 // - && textPositions.get(i).getUnicode().equals(".") // - && textPositions.get(i - 1).getUnicode().equals(".") // - && textPositions.get(i - 2).getUnicode().equals(".") // - && !textPositions.get(i - 3).getUnicode().equals("."); + && isDot(textPositions, i) // + && isDot(textPositions, i - 1) // + && isDot(textPositions, i - 2) // + && alphanumeric(textPositions, i - 3); } private static boolean isDottedLineFollowedByWord(List textPositions, int i, int startIndex) { return i - startIndex >= 4 // - && !textPositions.get(i).getUnicode().equals(".") // - && textPositions.get(i - 1).getUnicode().equals(".") // - && textPositions.get(i - 2).getUnicode().equals(".") // - && textPositions.get(i - 3).getUnicode().equals("."); + && alphanumeric(textPositions, i) // + && isDot(textPositions, i - 1) // + && isDot(textPositions, i - 2) // + && isDot(textPositions, i - 3); + } + + + private static boolean isDot(List textPositions, int i) { + + return DOT_LIKE_CHARACTERS.contains(textPositions.get(i).getUnicode()); + } + + + private static boolean alphanumeric(List textPositions, int i) { + + return Character.isAlphabetic(textPositions.get(i).getUnicode().charAt(0)) || Character.isDigit(textPositions.get(i).getUnicode().charAt(0)); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java index 4d9dd72..3a68b37 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java +++ 
b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java @@ -37,7 +37,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest { @Disabled public void testLayoutParserEndToEnd() { - String filePath = "/home/kschuettler/Dokumente/TestFiles/NER Dataset/Syngenta prod/77c680315c31d403d2e023be023b2087.PREVIEW.pdf"; + String filePath = "/home/kschuettler/Dokumente/LayoutparsingEvaluation/613c12dd5c14851cb37e413eb56a7a7b.UNTOUCHED.pdf"; runForFile(filePath); }