From ce41014d4b2dc1aa660fb1b836d2daefb445526c Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Fri, 8 Nov 2024 12:15:56 +0100 Subject: [PATCH] RED-9139: more robust TOC detection * detect numbers in words, and not just whole words that are numbers --- ...mparator.java => TocNumberComparator.java} | 4 +- .../TableOfContentsClassificationService.java | 114 ++++++++++++------ .../visualization/LayoutDebugLayer.java | 5 - 3 files changed, 80 insertions(+), 43 deletions(-) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/{TextPositionSequenceComparator.java => TocNumberComparator.java} (83%) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequenceComparator.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TocNumberComparator.java similarity index 83% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequenceComparator.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TocNumberComparator.java index 3875d7c..27e60d4 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequenceComparator.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TocNumberComparator.java @@ -5,12 +5,12 @@ import java.util.HashMap; import com.knecon.fforesight.service.layoutparser.processor.services.classification.NumberWord; -public class TextPositionSequenceComparator implements Comparator { +public class TocNumberComparator implements Comparator { private HashMap lookup; - public TextPositionSequenceComparator(HashMap lookup) { + public TocNumberComparator(HashMap lookup) { this.lookup = lookup; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TableOfContentsClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TableOfContentsClassificationService.java index 0b395cb..53bd0a8 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TableOfContentsClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TableOfContentsClassificationService.java @@ -28,7 +28,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentif import com.knecon.fforesight.service.layoutparser.processor.model.text.TextBlockOnPage; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequenceComparator; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TocNumberComparator; import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities; import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer; @@ -175,12 +175,16 @@ public class TableOfContentsClassificationService { for (int i = 0; i < words.size(); i++) { Word word = words.get(i); + if (!wordIsEndOfLine(i, words)) { + continue; + } if (AMOUNT_PATTERN.matcher(getSurroundingString(i, words)).matches()) { continue; } - if (SectionIdentifier.fromSearchText(word.toString()).level() > 1) { + Matcher matcher = SectionIdentifier.numericalIdentifierPattern.matcher(word.toString()); + if (matcher.find() && matcher.group(2) != null) { continue; } @@ -205,6 +209,17 @@ public class TableOfContentsClassificationService { } + private static boolean wordIsEndOfLine(int i, List words) { + + if (i == words.size() - 1) { + return true; + } + Word word = words.get(i); + Word nextWord = words.get(i + 1); + return !nextWord.rightOf(word); + } + + private static CharSequence getSurroundingString(int i, List sequences) { int end = Math.min(i + 5, sequences.size()); @@ -298,62 +313,89 @@ public class TableOfContentsClassificationService { .stream() .filter(cluster -> cluster.size() > MINIMUM_MATCHES) .map(cluster -> cluster.stream() - .sorted(new TextPositionSequenceComparator(lookup)) + .sorted(new TocNumberComparator(lookup)) .toList()) .map(this::removeOutliers) -// .map(this::filterByMinimumDensity) + .map(this::removeOnNonConsecutivePages) + .map(this::filterByWordNearTopOfPage) .filter(cluster -> cluster.size() > MINIMUM_MATCHES) .max(Comparator.comparingDouble(cluster -> cluster.get(0).word().getBBox().getMaxX())).orElse(Collections.emptyList()); } -// does not seem to be doing much, ideally instead of using the height of the blocks i would like to use the height, beginning from the MainBody top, -// but as the MainBody is often wrong, this results in some numbers being discarded even though they are correct. -// private List filterByMinimumDensity(List numbers) { -// -// Map> clustersPerPage = numbers.stream() -// .collect(Collectors.groupingBy(number -> lookup.get(number).page())); -// -// List result = new ArrayList<>(numbers.size()); -// clustersPerPage.keySet() -// .stream() -// .sorted(Comparator.comparingInt(ClassificationPage::getPageNumber)) -// .forEach(page -> { -// var numbersOnPage = clustersPerPage.get(page); -// -// double height = numbersOnPage.stream() -// .map(BoundingBox::getBBox) -// .collect(RectangleTransformations.collectBBox()).getHeight(); -// -// double count = numbersOnPage.size(); -// -// if ((count / height) >= (DENSITY_THRESHOLD_COUNT / page.getPageHeight())) { -// result.addAll(numbers); -// } -// }); -// return result; -// } - - public List removeOutliers(List numbers) { + private List removeOnNonConsecutivePages(List numbers) { List result = new ArrayList<>(); result.add(numbers.get(0)); + for (int i = 1; i < numbers.size(); i++) { + int prev = getPageNumber(numbers, i - 1); + int curr = getPageNumber(numbers, i); + + if (Math.abs(prev - curr) > 1) { + break; + } else { + result.add(numbers.get(i)); + } + } + return result; + } + + + private int getPageNumber(List numbers, int i) { + + return lookup.get(numbers.get(i)).page().getPageNumber(); + } + + + private List filterByWordNearTopOfPage(List numbers) { + + List result = new ArrayList<>(); + + result.add(numbers.get(0)); + + for (int i = 1; i < numbers.size(); i++) { + NumberWord prev = numbers.get(i - 1); + NumberWord curr = numbers.get(i); + ClassificationPage prevPage = lookup.get(prev).page(); + ClassificationPage currPage = lookup.get(curr).page(); + if (prevPage == currPage) { + result.add(curr); + } else if (curr.word().getBBox().getMinY() < currPage.getPageHeight() * 0.33) { + result.add(curr); + } + } + return result; + } + + + public List removeOutliers(List numbers) { + + List confirmedClusterNumbers = new ArrayList<>(); + + confirmedClusterNumbers.add(numbers.get(0)); + for (int i = 1; i < numbers.size() - 1; i++) { int prev = getNumberAsInt(numbers, i - 1); int curr = getNumberAsInt(numbers, i); int next = getNumberAsInt(numbers, i + 1); if (!(curr <= prev || curr >= next) || !isBetterWithout(numbers, i)) { - result.add(numbers.get(i)); + confirmedClusterNumbers.add(numbers.get(i)); } } - if (getNumberAsInt(numbers, numbers.size() - 1) >= getNumberAsInt(numbers, Math.max(0, numbers.size() - 2))) { - result.add(numbers.get(numbers.size() - 1)); + if (getNumberAsInt(numbers, numbers.size() - 1) >= getLatestNumber(confirmedClusterNumbers)) { + confirmedClusterNumbers.add(numbers.get(numbers.size() - 1)); } - return result; + return confirmedClusterNumbers; + } + + + private static int getLatestNumber(List confirmedClusterNumbers) { + + return confirmedClusterNumbers.get(confirmedClusterNumbers.size() - 1).number(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java index c48fa6a..c78ee1d 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java @@ -308,11 +308,6 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig { .map(BoundingBox::getBBoxPdf) .map(line -> new ColoredRectangle(line, LINES_COLOR, LINE_WIDTH)) .toList()); - visualizationsOnPage.getColoredRectangles() - .add(new ColoredRectangle(numbers.stream() - .map(NumberWord::word) - .map(BoundingBox::getBBoxPdf) - .collect(RectangleTransformations.collectBBox()), LINES_COLOR, LINE_WIDTH)); }