From 7ee1f9e360d1cdd1bf85f9441e27fe1ed0e4ce7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kilian=20Sch=C3=BCttler?= Date: Wed, 13 Nov 2024 10:54:39 +0100 Subject: [PATCH] RED-9139: more robust TOC detection --- .../processor/model/SectionIdentifier.java | 2 +- .../text/TextPositionSequenceComparator.java | 34 --- .../model/text/TocNumberComparator.java | 36 ++++ .../services/classification/NumberWord.java | 7 + .../TableOfContentsClassificationService.java | 204 +++++++++++------- .../factory/DocumentGraphFactory.java | 8 +- .../visualization/LayoutDebugLayer.java | 28 ++- .../model/SectionIdentifierTest.java | 7 + .../service/viewerdoc/LayerIdentifier.java | 1 + .../layers/LayoutDebugLayerConfig.java | 4 +- 10 files changed, 210 insertions(+), 121 deletions(-) delete mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequenceComparator.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TocNumberComparator.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/NumberWord.java diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/SectionIdentifier.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/SectionIdentifier.java index 9759695..f828180 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/SectionIdentifier.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/SectionIdentifier.java @@ -16,7 +16,7 @@ import lombok.experimental.FieldDefaults; @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) public class SectionIdentifier { - public static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d{1,2})[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?"); + public static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d{1,2})(?:[\\s.,;](\\d{1,2}))?(?:[\\s.,;](\\d{1,2}))?(?:[\\s.,;](\\d{1,2}))?"); public static Pattern alphanumericIdentifierPattern = Pattern.compile("^[\\s]?[A-Za-z][\\s.,;]?(\\d{1,2})[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?"); public enum Format { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequenceComparator.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequenceComparator.java deleted file mode 100644 index b5d328a..0000000 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequenceComparator.java +++ /dev/null @@ -1,34 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.processor.model.text; - -import java.util.Comparator; -import java.util.HashMap; - -public class TextPositionSequenceComparator implements Comparator { - - private HashMap lookup; - - - public TextPositionSequenceComparator(HashMap lookup) { - - this.lookup = lookup; - } - - - @Override - public int compare(Word number1, Word number2) { - - int page1 = lookup.get(number1).page().getPageNumber(); - int page2 = lookup.get(number2).page().getPageNumber(); - - if (page1 != page2) { - return Integer.compare(page1, page2); - } - - if (number1.getY() != number2.getY()) { - return Double.compare(number1.getY(), number2.getY()); - } - - return Integer.compare(Integer.parseInt(number1.toString()), Integer.parseInt(number2.toString())); - } - -} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TocNumberComparator.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TocNumberComparator.java new file mode 100644 index 0000000..27e60d4 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TocNumberComparator.java @@ -0,0 +1,36 @@ +package com.knecon.fforesight.service.layoutparser.processor.model.text; + +import java.util.Comparator; +import java.util.HashMap; + +import com.knecon.fforesight.service.layoutparser.processor.services.classification.NumberWord; + +public class TocNumberComparator implements Comparator { + + private HashMap lookup; + + + public TocNumberComparator(HashMap lookup) { + + this.lookup = lookup; + } + + + @Override + public int compare(NumberWord number1, NumberWord number2) { + + int page1 = lookup.get(number1).page().getPageNumber(); + int page2 = lookup.get(number2).page().getPageNumber(); + + if (page1 != page2) { + return Integer.compare(page1, page2); + } + + if (number1.word().getY() != number2.word().getY()) { + return Double.compare(number1.word().getY(), number2.word().getY()); + } + + return Integer.compare(number1.number(), number2.number()); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/NumberWord.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/NumberWord.java new file mode 100644 index 0000000..c8014fd --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/NumberWord.java @@ -0,0 +1,7 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.classification; + +import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; + +public record NumberWord(Word word, int number) { + +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TableOfContentsClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TableOfContentsClassificationService.java index c04f7d7..4cc7a86 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TableOfContentsClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TableOfContentsClassificationService.java @@ -14,6 +14,7 @@ import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Set; +import java.util.regex.Matcher; import java.util.stream.Collectors; import org.springframework.stereotype.Service; @@ -23,10 +24,11 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; +import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentifier; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextBlockOnPage; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequenceComparator; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TocNumberComparator; import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities; import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer; @@ -71,9 +73,9 @@ public class TableOfContentsClassificationService { ClassificationPage startPage = textBlocks.get(start).page(); List initialLookAhead = textBlocks.subList(start, Math.min(start + SURROUNDING_BLOCKS_RADIUS, textBlocks.size())); - HashMap lookup = new HashMap<>(); - List numbers = extractNumbers(initialLookAhead, lookup, document.getPages().size()); - TocNumberFinder tocNumberFinder = new TocNumberFinder(numbers, lookup); + HashMap numberToBlockLookup = new HashMap<>(); + List numbers = extractNumbers(initialLookAhead, numberToBlockLookup, document.getPages().size()); + TocNumberFinder tocNumberFinder = new TocNumberFinder(numbers, numberToBlockLookup); int lastCandidate = start; for (int i = start; i < Math.min(lastCandidate + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()); i++) { @@ -93,28 +95,28 @@ public class TableOfContentsClassificationService { break; } - List numbersFromBlock = extractNumbers(textBlockOnPage, lookup, document.getPages().size()); + List numbersFromBlock = extractNumbers(textBlockOnPage, numberToBlockLookup, document.getPages().size()); - List currentRightmostCluster = tocNumberFinder.getCurrentRightmostCluster(); + List currentRightmostCluster = tocNumberFinder.getCurrentRightmostCluster(); if (currentRightmostCluster.size() < MINIMUM_MATCHES) { log.debug("No numbers indicating a table of contents here."); return start; } - if (anyIntersection(currentRightmostCluster, numbersFromBlock, lookup)) { + if (anyIntersection(currentRightmostCluster, numbersFromBlock, numberToBlockLookup)) { lastCandidate = i; numbersFromBlock.forEach(tocNumberFinder::add); } } - addVisualization(document.getLayoutDebugLayer(), tocNumberFinder, lookup); - Set blocksWithNumberInCluster = tocNumberFinder.getCurrentRightmostCluster() .stream() - .map(lookup::get) + .map(numberToBlockLookup::get) .collect(Collectors.toSet()); + addVisualization(document.getLayoutDebugLayer(), tocNumberFinder, numberToBlockLookup, blocksWithNumberInCluster, textBlocks.get(start - 1)); + int lastConfirmed = start; for (int i = start; i < lastCandidate + 1; i++) { TextBlockOnPage textBlockOnPage = textBlocks.get(i); @@ -132,18 +134,22 @@ public class TableOfContentsClassificationService { } - private static void addVisualization(LayoutDebugLayer layoutDebugLayer, TocNumberFinder tocNumberFinder, Map lookup) { + private static void addVisualization(LayoutDebugLayer layoutDebugLayer, + TocNumberFinder tocNumberFinder, + Map lookup, + Set blocksWithNumberInCluster, + TextBlockOnPage startingHeadline) { tocNumberFinder.getCurrentRightmostCluster() .stream() .collect(Collectors.groupingBy(key -> lookup.get(key).page().getPageNumber())) .forEach((pageNumber, number) -> layoutDebugLayer.addTocPages(number, pageNumber)); + layoutDebugLayer.addTocBlocks(blocksWithNumberInCluster); + layoutDebugLayer.addTocBlocks(Set.of(startingHeadline)); } - private static boolean anyIntersection(Collection numbers1, - Collection numbers2, - Map lookup) { + private static boolean anyIntersection(Collection numbers1, Collection numbers2, Map lookup) { return numbers1.stream() .anyMatch(numberFromCluster -> numbers2.stream() @@ -151,9 +157,9 @@ public class TableOfContentsClassificationService { } - private static List extractNumbers(List textBlocks, Map lookup, int numberOfPages) { + private static List extractNumbers(List textBlocks, Map lookup, int numberOfPages) { - List blocks = new LinkedList<>(); + List blocks = new LinkedList<>(); for (TextBlockOnPage textBlock : textBlocks) { blocks.addAll(extractNumbers(textBlock, lookup, numberOfPages)); } @@ -161,30 +167,40 @@ public class TableOfContentsClassificationService { } - private static List extractNumbers(TextBlockOnPage textBlock, Map lookup, int numberOfPages) { + private static List extractNumbers(TextBlockOnPage textBlock, Map lookup, int numberOfPages) { - List blocks = new LinkedList<>(); + List blocks = new LinkedList<>(); TextPageBlock block = textBlock.textBlock(); - List sequences = block.getWords(); - for (int i = 0; i < sequences.size(); i++) { + List words = block.getWords(); + for (int i = 0; i < words.size(); i++) { - Word word = sequences.get(i); - - if (!NUMERIC.matcher(word).matches() || word.length() > 5) { + Word word = words.get(i); + if (!wordIsEndOfLine(i, words)) { continue; } - if (AMOUNT_PATTERN.matcher(getSurroundingString(i, sequences)).matches()) { + if (AMOUNT_PATTERN.matcher(getSurroundingString(i, words)).matches()) { + continue; + } + + Matcher matcher = SectionIdentifier.numericalIdentifierPattern.matcher(word.toString()); + if (matcher.find() && matcher.group(2) != null) { + continue; + } + + Matcher numberFinder = NUMERIC.matcher(word); + if (!numberFinder.find() || word.length() > 5) { continue; } try { - int pageNumber = Integer.parseInt(word.toString()); + int pageNumber = Integer.parseInt(numberFinder.group()); if (0 >= pageNumber || pageNumber > numberOfPages) { continue; } - lookup.put(word, textBlock); - blocks.add(word); + NumberWord numberWord = new NumberWord(word, pageNumber); + lookup.put(numberWord, textBlock); + blocks.add(numberWord); } catch (NumberFormatException e) { log.debug("That wasn't a number! Should not happen, due to numeric check beforehand."); } @@ -193,6 +209,17 @@ public class TableOfContentsClassificationService { } + private static boolean wordIsEndOfLine(int i, List words) { + + if (i == words.size() - 1) { + return true; + } + Word word = words.get(i); + Word nextWord = words.get(i + 1); + return !nextWord.rightOf(word); + } + + private static CharSequence getSurroundingString(int i, List sequences) { int end = Math.min(i + 5, sequences.size()); @@ -203,13 +230,13 @@ public class TableOfContentsClassificationService { } - private static boolean matches(Word number1, Word number2, Map lookup) { + private static boolean matches(NumberWord number1, NumberWord number2, Map lookup) { - if (number1.getDir() != number2.getDir()) { + if (number1.word().getDir() != number2.word().getDir()) { return false; } - return number1.intersectsXDirAdj(number2, INTERSECTION_TOLERANCE); + return number1.word().intersectsXDirAdj(number2.word(), INTERSECTION_TOLERANCE); } @@ -247,11 +274,11 @@ public class TableOfContentsClassificationService { private static class TocNumberFinder { - final UnionFind numberClusters; - final HashMap lookup; + final UnionFind numberClusters; + final HashMap lookup; - TocNumberFinder(List blocks, HashMap lookup) { + TocNumberFinder(List blocks, HashMap lookup) { this.numberClusters = new UnionFind<>(new HashSet<>(blocks)); for (int i = 0; i < blocks.size(); i++) { @@ -265,14 +292,14 @@ public class TableOfContentsClassificationService { } - public void add(Word number) { + public void add(NumberWord number) { if (numberClusters.getElements().contains(number)) { return; } numberClusters.addElement(number); - for (Word element : numberClusters.getElements()) { + for (NumberWord element : numberClusters.getElements()) { if (matches(number, element, lookup)) { numberClusters.union(element, number); } @@ -280,73 +307,100 @@ public class TableOfContentsClassificationService { } - public List getCurrentRightmostCluster() { + public List getCurrentRightmostCluster() { return numberClusters.getGroups() .stream() .filter(cluster -> cluster.size() > MINIMUM_MATCHES) .map(cluster -> cluster.stream() - .sorted(new TextPositionSequenceComparator(lookup)) + .sorted(new TocNumberComparator(lookup)) .toList()) .map(this::removeOutliers) -// .map(this::filterByMinimumDensity) + .map(this::removeOnNonConsecutivePages) + .map(this::filterByWordNearTopOfPage) .filter(cluster -> cluster.size() > MINIMUM_MATCHES) - .max(Comparator.comparingDouble(cluster -> cluster.get(0).getBBox().getMaxX())).orElse(Collections.emptyList()); + .max(Comparator.comparingDouble(cluster -> cluster.get(0).word().getBBox().getMaxX())).orElse(Collections.emptyList()); } -// does not seem to be doing much, ideally instead of using the height of the blocks i would like to use the height, beginning from the MainBody top, -// but as the MainBody is often wrong, this results in some numbers being discarded even though they are correct. -// private List filterByMinimumDensity(List numbers) { -// -// Map> clustersPerPage = numbers.stream() -// .collect(Collectors.groupingBy(number -> lookup.get(number).page())); -// -// List result = new ArrayList<>(numbers.size()); -// clustersPerPage.keySet() -// .stream() -// .sorted(Comparator.comparingInt(ClassificationPage::getPageNumber)) -// .forEach(page -> { -// var numbersOnPage = clustersPerPage.get(page); -// -// double height = numbersOnPage.stream() -// .map(BoundingBox::getBBox) -// .collect(RectangleTransformations.collectBBox()).getHeight(); -// -// double count = numbersOnPage.size(); -// -// if ((count / height) >= (DENSITY_THRESHOLD_COUNT / page.getPageHeight())) { -// result.addAll(numbers); -// } -// }); -// return result; -// } + private List removeOnNonConsecutivePages(List numbers) { - public List removeOutliers(List numbers) { - - List result = new ArrayList<>(); + List result = new ArrayList<>(); result.add(numbers.get(0)); + for (int i = 1; i < numbers.size(); i++) { + int prev = getPageNumber(numbers, i - 1); + int curr = getPageNumber(numbers, i); + + if (Math.abs(prev - curr) > 1) { + break; + } else { + result.add(numbers.get(i)); + } + } + return result; + } + + + private int getPageNumber(List numbers, int i) { + + return lookup.get(numbers.get(i)).page().getPageNumber(); + } + + + private List filterByWordNearTopOfPage(List numbers) { + + List result = new ArrayList<>(); + + result.add(numbers.get(0)); + + for (int i = 1; i < numbers.size(); i++) { + NumberWord prev = numbers.get(i - 1); + NumberWord curr = numbers.get(i); + ClassificationPage prevPage = lookup.get(prev).page(); + ClassificationPage currPage = lookup.get(curr).page(); + if (prevPage.equals(currPage)) { + result.add(curr); + } else if (curr.word().getBBox().getMinY() < currPage.getPageHeight() * 0.33) { + result.add(curr); + } + } + return result; + } + + + public List removeOutliers(List numbers) { + + List confirmedClusterNumbers = new ArrayList<>(); + + confirmedClusterNumbers.add(numbers.get(0)); + for (int i = 1; i < numbers.size() - 1; i++) { int prev = getNumberAsInt(numbers, i - 1); int curr = getNumberAsInt(numbers, i); int next = getNumberAsInt(numbers, i + 1); if (!(curr <= prev || curr >= next) || !isBetterWithout(numbers, i)) { - result.add(numbers.get(i)); + confirmedClusterNumbers.add(numbers.get(i)); } } - if (getNumberAsInt(numbers, numbers.size() - 1) >= getNumberAsInt(numbers, Math.max(0, numbers.size() - 2))) { - result.add(numbers.get(numbers.size() - 1)); + if (getNumberAsInt(numbers, numbers.size() - 1) >= getLatestNumber(confirmedClusterNumbers)) { + confirmedClusterNumbers.add(numbers.get(numbers.size() - 1)); } - return result; + return confirmedClusterNumbers; + } + + + private static int getLatestNumber(List confirmedClusterNumbers) { + + return confirmedClusterNumbers.get(confirmedClusterNumbers.size() - 1).number(); } // Helper method to check if removing the current number results in a better order - public static boolean isBetterWithout(List numbers, int i) { + public static boolean isBetterWithout(List numbers, int i) { if (i == 0 || i == numbers.size() - 1) { return false; @@ -362,9 +416,9 @@ public class TableOfContentsClassificationService { } - private static int getNumberAsInt(List numbers, int i) { + private static int getNumberAsInt(List numbers, int i) { - return Integer.parseInt(numbers.get(i).toString()); + return numbers.get(i).number(); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java index 191e39f..1906c28 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java @@ -16,7 +16,7 @@ import java.util.Optional; import java.util.Set; import java.util.stream.Collectors; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; @@ -82,9 +82,9 @@ public class DocumentGraphFactory { documentGraph.streamAllSubNodes() .filter(SemanticNode::isLeaf) - .filter(node -> !node.getType().equals(NodeType.HEADER)) - .filter(node -> !node.getType().equals(NodeType.FOOTER)) - .filter(node -> !node.getType().equals(NodeType.IMAGE)) + .filter(node -> !node.getType().equals(NodeTypeProto.NodeType.HEADER)) + .filter(node -> !node.getType().equals(NodeTypeProto.NodeType.FOOTER)) + .filter(node -> !node.getType().equals(NodeTypeProto.NodeType.IMAGE)) .map(SemanticNode::getTextBlock) .map(TextBlock::getAtomicTextBlocks) .flatMap(Collection::stream) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java index 3a20603..c78ee1d 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java @@ -27,7 +27,9 @@ import com.knecon.fforesight.service.layoutparser.processor.model.outline.Outlin import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.text.ListIdentifier; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextBlockOnPage; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; +import com.knecon.fforesight.service.layoutparser.processor.services.classification.NumberWord; import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms; import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation; @@ -293,7 +295,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig { } - public void addTocPages(List numbers, int page) { + public void addTocPages(List numbers, int page) { if (!active) { return; @@ -302,13 +304,10 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig { VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, this.tocPages); visualizationsOnPage.getColoredRectangles() .addAll(numbers.stream() + .map(NumberWord::word) .map(BoundingBox::getBBoxPdf) .map(line -> new ColoredRectangle(line, LINES_COLOR, LINE_WIDTH)) .toList()); - visualizationsOnPage.getColoredRectangles() - .add(new ColoredRectangle(numbers.stream() - .map(BoundingBox::getBBoxPdf) - .collect(RectangleTransformations.collectBBox()), LINES_COLOR, LINE_WIDTH)); } @@ -332,8 +331,10 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig { private void addOutlineObject(OutlineObject outlineObject, PageInformation pageInformation) { + if (!active) { + return; + } int rectSize = 5; - Point2D point2D; if (outlineObject.getPoint().isPresent()) { point2D = outlineObject.getPoint().get(); @@ -357,10 +358,25 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig { public void addListIdentifiers(List listIdentifiers) { + if (!active) { + return; + } for (ListIdentifier listIdentifier : listIdentifiers) { getOrCreateVisualizationsOnPage(listIdentifier.getPage(), this.listIdentifiers).getColoredRectangles() .add(new ColoredRectangle(listIdentifier.getWord().getBBoxPdf(), WORDS_COLOR, LINE_WIDTH)); } } + + public void addTocBlocks(Set blocksWithNumberInCluster) { + + if (!active) { + return; + } + for (TextBlockOnPage textBlockOnPage : blocksWithNumberInCluster) { + getOrCreateVisualizationsOnPage(textBlockOnPage.page().getPageNumber(), this.tocBlocks).getColoredRectangles() + .add(new ColoredRectangle(textBlockOnPage.textBlock().getBBoxPdf(), TOC_COLOR, LINE_WIDTH)); + } + } + } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/processor/model/SectionIdentifierTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/processor/model/SectionIdentifierTest.java index c4151de..3b188eb 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/processor/model/SectionIdentifierTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/processor/model/SectionIdentifierTest.java @@ -83,4 +83,11 @@ class SectionIdentifierTest { assertEquals(List.of(4, 1, 2, 4), identifier.getIdentifiers()); } + @Test + void testFalsePositive111() { + SectionIdentifier identifier = SectionIdentifier.fromSearchText("111: Headline"); + assertEquals(SectionIdentifier.Format.NUMERICAL, identifier.getFormat()); + assertEquals(1, identifier.level()); + } + } \ No newline at end of file diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/LayerIdentifier.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/LayerIdentifier.java index 331542d..54f2f07 100644 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/LayerIdentifier.java +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/LayerIdentifier.java @@ -57,6 +57,7 @@ public record LayerIdentifier(String name, String markedContentName) { public static final LayerIdentifier OUTLINE_OBJECTS = new LayerIdentifier("Outline Positions", "OUTLINE_OBJECTS"); public static final LayerIdentifier SENTENCES = new LayerIdentifier("Sentences", "SENTENCES"); public static final LayerIdentifier TOC_PAGES = new LayerIdentifier("TOC pages", "TOC_PAGES"); + public static final LayerIdentifier TOC_BLOCKS = new LayerIdentifier("TOC blocks", "TOC_BLOCKS"); public static final LayerIdentifier LIST_IDENTIFIERS = new LayerIdentifier("List identifiers", "LIST_IDENTIFIERS"); // Visual layout parser diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayoutDebugLayerConfig.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayoutDebugLayerConfig.java index da50e01..919a041 100644 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayoutDebugLayerConfig.java +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayoutDebugLayerConfig.java @@ -22,6 +22,7 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup { protected static final Color WORDS_COLOR = new Color(68, 84, 147); protected static final Color LINES_COLOR = new Color(152, 45, 179); + protected static final Color TOC_COLOR = new Color(33, 159, 144); protected static final Color ZONES_COLOR = new Color(131, 38, 38); protected static final Color RULINGS_COLOR = new Color(21, 221, 174); @@ -59,6 +60,7 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup { protected final Visualizations outlineObjects = Visualizations.builder().layer(LayerIdentifier.OUTLINE_OBJECTS).build(); protected final Visualizations sentences = Visualizations.builder().layer(LayerIdentifier.SENTENCES).build(); protected final Visualizations tocPages = Visualizations.builder().layer(LayerIdentifier.TOC_PAGES).build(); + protected final Visualizations tocBlocks = Visualizations.builder().layer(LayerIdentifier.TOC_BLOCKS).build(); protected final Visualizations listIdentifiers = Visualizations.builder().layer(LayerIdentifier.LIST_IDENTIFIERS).build(); @@ -77,9 +79,9 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup { markedContent, // outlineObjects, // tocPages, // + tocBlocks, // listIdentifiers // ); } - }