From e9b406af16694affcd35fc64e7338ee2296d9b0a Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Thu, 10 Oct 2024 10:34:59 +0200 Subject: [PATCH] RED-10127: add list classification * refactor headline font sizes * remove title case * no real drawbacks, mostly edge cases * +0.1% F1 Score (2 Files with +8%) --- .../processor/LayoutParsingPipeline.java | 9 +- .../outline/OutlineValidationService.java | 17 ++- .../DocuMineClassificationService.java | 136 +++++++++--------- .../server/OutlineDetectionTest.java | 6 +- 4 files changed, 88 insertions(+), 80 deletions(-) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index 57a0a4a..906b71e 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -374,14 +374,7 @@ public class LayoutParsingPipeline { classificationService.classify(classificationDocument, layoutParsingType, identifier); - List headlines = classificationDocument.getPages() - .stream() - .flatMap(classificationPage -> classificationPage.getTextBlocks() - .stream() - .filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline()) - .map(tb -> (TextPageBlock) tb)) - .toList(); - TableOfContents tableOfContents = outlineValidationService.createToC(headlines); + TableOfContents tableOfContents = outlineValidationService.createToC(classificationDocument); classificationDocument.setTableOfContents(tableOfContents); log.info("Building Sections for {}", identifier); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineValidationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineValidationService.java index 1c8c2bf..e40ad72 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineValidationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineValidationService.java @@ -10,6 +10,7 @@ import java.util.TreeSet; import org.springframework.stereotype.Service; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import io.micrometer.observation.annotation.Observed; @@ -20,7 +21,9 @@ import lombok.extern.slf4j.Slf4j; public class OutlineValidationService { @Observed(name = "OutlineValidationService", contextualName = "create-toc") - public TableOfContents createToC(List headlines) { + public TableOfContents createToC(ClassificationDocument classificationDocument) { + + List headlines = extractHeadlines(classificationDocument); List mainSections = new ArrayList<>(); Map lastItemsPerDepth = new HashMap<>(); @@ -60,4 +63,16 @@ public class OutlineValidationService { return new TableOfContents(mainSections); } + + private static List extractHeadlines(ClassificationDocument classificationDocument) { + + return classificationDocument.getPages() + .stream() + .flatMap(classificationPage -> classificationPage.getTextBlocks() + .stream() + .filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline()) + .map(tb -> (TextPageBlock) tb)) + .toList(); + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java index d9e8642..a072748 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java @@ -62,74 +62,6 @@ public class DocuMineClassificationService { } - private static List buildHeadlineFontSizes(ClassificationDocument document) { - - if (document.getFontSizeCounter().getCountPerValue().size() <= 6) { - return document.getFontSizeCounter().getValuesInReverseOrder(); - } - - List> sortedEntries = new ArrayList<>(document.getFontSizeCounter().getCountPerValue().entrySet()); - sortedEntries.sort(Map.Entry.comparingByKey()); - - int totalCount = sortedEntries.stream() - .mapToInt(Map.Entry::getValue).sum(); - - int cumulativeCount = 0; - Iterator> iterator = sortedEntries.iterator(); - while (iterator.hasNext()) { - Map.Entry entry = iterator.next(); - cumulativeCount += entry.getValue(); - if (cumulativeCount > totalCount * 0.3) { - break; // We've filtered the bottom 30%, so stop. - } - iterator.remove(); - } - - if (sortedEntries.size() < 6) { - return document.getFontSizeCounter().getValuesInReverseOrder(); - } - int clusterSize = Math.max(1, sortedEntries.size() / 6); - - List> clusters = new ArrayList<>(); - for (int i = 0; i < 6; i++) { - clusters.add(new ArrayList<>()); - } - - for (int i = 0; i < sortedEntries.size(); i++) { - int clusterIndex = Math.min(i / clusterSize, 5); - clusters.get(clusterIndex).add(sortedEntries.get(i).getKey()); - } - - return clusters.stream() - .map(cluster -> cluster.stream() - .mapToDouble(d -> d).average() - .orElseThrow()) - .sorted(Comparator.reverseOrder()) - .toList(); - } - - - private List getSurroundingBlocksOnPage(int originalIndex, List textBlocks) { - - int start = Math.max(originalIndex - SURROUNDING_BLOCKS_RADIUS, 0); - int end = Math.min(originalIndex + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()); - List surroundingBlocks = new ArrayList<>(2 * SURROUNDING_BLOCKS_RADIUS); - for (int i = start; i < end; i++) { - if (i == originalIndex) { - continue; - } - if (textBlocks.get(i).block().getText().length() <= 1) { - continue; - } - if (textBlocks.get(i).page() != textBlocks.get(originalIndex).page()) { - continue; - } - surroundingBlocks.add(textBlocks.get(i).block()); - } - return surroundingBlocks; - } - - private void classifyBlock(HeadlineClassificationService headlineClassificationService, int currentIndex, List allBlocks, @@ -331,6 +263,74 @@ public class DocuMineClassificationService { return blocks; } + + private static List buildHeadlineFontSizes(ClassificationDocument document) { + + if (document.getFontSizeCounter().getCountPerValue().size() <= 6) { + return document.getFontSizeCounter().getValuesInReverseOrder(); + } + + List> sortedEntries = new ArrayList<>(document.getFontSizeCounter().getCountPerValue().entrySet()); + sortedEntries.sort(Map.Entry.comparingByKey()); + + int totalCount = sortedEntries.stream() + .mapToInt(Map.Entry::getValue).sum(); + + int cumulativeCount = 0; + Iterator> iterator = sortedEntries.iterator(); + while (iterator.hasNext()) { + Map.Entry entry = iterator.next(); + cumulativeCount += entry.getValue(); + if (cumulativeCount > totalCount * 0.3) { + break; // We've filtered the bottom 30%, so stop. + } + iterator.remove(); + } + + if (sortedEntries.size() < 6) { + return document.getFontSizeCounter().getValuesInReverseOrder(); + } + int clusterSize = Math.max(1, sortedEntries.size() / 6); + + List> clusters = new ArrayList<>(); + for (int i = 0; i < 6; i++) { + clusters.add(new ArrayList<>()); + } + + for (int i = 0; i < sortedEntries.size(); i++) { + int clusterIndex = Math.min(i / clusterSize, 5); + clusters.get(clusterIndex).add(sortedEntries.get(i).getKey()); + } + + return clusters.stream() + .map(cluster -> cluster.stream() + .mapToDouble(d -> d).average() + .orElseThrow()) + .sorted(Comparator.reverseOrder()) + .toList(); + } + + + private List getSurroundingBlocksOnPage(int originalIndex, List textBlocks) { + + int start = Math.max(originalIndex - SURROUNDING_BLOCKS_RADIUS, 0); + int end = Math.min(originalIndex + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()); + List surroundingBlocks = new ArrayList<>(2 * SURROUNDING_BLOCKS_RADIUS); + for (int i = start; i < end; i++) { + if (i == originalIndex) { + continue; + } + if (textBlocks.get(i).block().getText().length() <= 1) { + continue; + } + if (!textBlocks.get(i).page().equals(textBlocks.get(originalIndex).page())) { + continue; + } + surroundingBlocks.add(textBlocks.get(i).block()); + } + return surroundingBlocks; + } + } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/OutlineDetectionTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/OutlineDetectionTest.java index c548a81..374e8f5 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/OutlineDetectionTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/OutlineDetectionTest.java @@ -79,7 +79,7 @@ public class OutlineDetectionTest extends AbstractTest { var documentFile = new ClassPathResource(fileName).getFile(); long start = System.currentTimeMillis(); - ClassificationDocument classificationDocument = parseLayout(fileName, LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH); + ClassificationDocument classificationDocument = parseLayout(fileName, LayoutParsingType.DOCUMINE_OLD); Document document = buildGraph(fileName, classificationDocument); layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true); OutlineObjectTree outlineObjectTree = classificationDocument.getOutlineObjectTree(); @@ -102,7 +102,7 @@ public class OutlineDetectionTest extends AbstractTest { TableOfContents tableOfContents = classificationDocument.getTableOfContents(); - assertEquals(tableOfContents.getMainSections().size(), 10); + assertEquals(tableOfContents.getMainSections().size(), 9); assertEquals(tableOfContents.getMainSections().subList(1, 9) .stream() .map(tableOfContentItem -> sanitizeString(tableOfContentItem.getHeadline().toString())) @@ -135,7 +135,7 @@ public class OutlineDetectionTest extends AbstractTest { List childrenOfTypeSectionOrSuperSection = document.getChildrenOfTypeSectionOrSuperSection(); - assertEquals(childrenOfTypeSectionOrSuperSection.size(), 10); + assertEquals(childrenOfTypeSectionOrSuperSection.size(), 9); assertEquals(childrenOfTypeSectionOrSuperSection.subList(1, 9) .stream() .map(section -> sanitizeString(section.getHeadline().getLeafTextBlock().toString()))