From 2fcaeb3d8c0b49504b448649ef9af56679e89d64 Mon Sep 17 00:00:00 2001 From: maverickstuder Date: Tue, 14 May 2024 10:51:05 +0200 Subject: [PATCH] RED-7074: Design Subsection section tree structure algorithm * added supersection and changed logic so that each normal section only contains leaf nodes * added SectionIdentifier logic for headline splitting and merging * fixed many edge cases which resulted in error state files --- .../processor/model/SectionIdentifier.java | 5 +- .../processor/model/graph/nodes/Section.java | 7 +-- .../model/graph/nodes/SuperSection.java | 33 ++++++++++ .../BlockificationPostprocessingService.java | 61 +++++++------------ .../DocstrumBlockificationService.java | 7 ++- .../factory/DocumentGraphFactory.java | 8 ++- .../services/factory/SectionNodeFactory.java | 22 ++++++- .../services/factory/TableNodeFactory.java | 1 + .../visualization/LayoutGridService.java | 5 +- .../server/graph/ViewerDocumentTest.java | 5 +- 10 files changed, 101 insertions(+), 53 deletions(-) create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SuperSection.java diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/SectionIdentifier.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/SectionIdentifier.java index 7b6f8c4..dbcb2ce 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/SectionIdentifier.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/SectionIdentifier.java @@ -8,6 +8,7 @@ import java.util.regex.Pattern; import lombok.AccessLevel; import lombok.AllArgsConstructor; +import lombok.Getter; import lombok.experimental.FieldDefaults; @AllArgsConstructor @@ -16,13 +17,15 @@ public class SectionIdentifier { static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?"); - private enum Format { + public enum Format { EMPTY, NUMERICAL, DOCUMENT } + @Getter Format format; + @Getter String identifierString; List identifiers; boolean asChild; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Section.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Section.java index 532115a..953af03 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Section.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Section.java @@ -20,11 +20,12 @@ import lombok.Builder; import lombok.Data; import lombok.EqualsAndHashCode; import lombok.experimental.FieldDefaults; +import lombok.experimental.SuperBuilder; import lombok.extern.slf4j.Slf4j; @Slf4j @Data -@Builder +@SuperBuilder @AllArgsConstructor @FieldDefaults(level = AccessLevel.PRIVATE) public class Section implements GenericSemanticNode { @@ -44,10 +45,6 @@ public class Section implements GenericSemanticNode { @EqualsAndHashCode.Exclude Map bBoxCache; - @EqualsAndHashCode.Exclude - boolean isMainSection; - - @Override public NodeType getType() { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SuperSection.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SuperSection.java new file mode 100644 index 0000000..2876c96 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SuperSection.java @@ -0,0 +1,33 @@ +package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes; + +import java.awt.geom.Rectangle2D; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; + +import lombok.Builder; +import lombok.Data; +import lombok.EqualsAndHashCode; +import lombok.experimental.SuperBuilder; + +@Data +@SuperBuilder +@EqualsAndHashCode(callSuper = true) +public class SuperSection extends Section { + + public SuperSection(Set engines, + List treeId, + TextBlock textBlock, + DocumentTree documentTree, + Set entities, + Map bBoxCache) { + + super(engines, treeId, textBlock, documentTree, entities, bBoxCache); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java index f79697a..063a209 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java @@ -9,10 +9,12 @@ import java.util.ListIterator; import java.util.Locale; import java.util.function.Function; +import org.apache.commons.lang3.StringUtils; import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; +import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentifier; import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; @@ -125,7 +127,7 @@ public class BlockificationPostprocessingService { if (minDistance == distanceToDirectMatch) { directMatch.setClassification(headlineType); } else if (minDistance == distanceToSplitCandidate) { - List others = splitBlock(classificationPage, splitCandidate, outlineObject.getTitle()); + List others = splitBlock(classificationPage, splitCandidate, context.sectionIdentifier + outlineObject.getTitle()); splitCandidate.setClassification(headlineType); others.forEach(other -> other.setClassification(null)); } else { @@ -279,38 +281,6 @@ public class BlockificationPostprocessingService { } - private static WordSequenceResultOld findWordSequenceOld(List textPositionSequences, String text) { - - String target = sanitizeString(text); - List inSequence = new ArrayList<>(); - List preSequence = new ArrayList<>(); - StringBuilder currentSequence = new StringBuilder(); - - for (TextPositionSequence sequence : textPositionSequences) { - - currentSequence.append(sanitizeString(sequence.toString())); - inSequence.add(sequence); - - if (currentSequence.length() > target.length()) { - TextPositionSequence removed = inSequence.remove(0); - currentSequence.delete(0, removed.toString().length()); - preSequence.add(removed); - - while (currentSequence.length() > target.length()) { - removed = inSequence.remove(0); - currentSequence.delete(0, removed.toString().length()); - preSequence.add(removed); - } - } - - if (currentSequence.toString().equals(target)) { - return new WordSequenceResultOld(inSequence, preSequence); - } - } - return new WordSequenceResultOld(new ArrayList<>(), new ArrayList<>()); - } - - private TextPageBlock mergeBlocks(ClassificationPage classificationPage, List blocksToMerge) { TextPageBlock firstBlock = blocksToMerge.get(0); @@ -405,8 +375,21 @@ public class BlockificationPostprocessingService { context.mergeCandidates.add(pageBlock); } - if (blockTextContainsOutlineTitle && context.splitCandidate == null) { - context.splitCandidate = pageBlock; + if (blockTextContainsOutlineTitle) { + SectionIdentifier sectionIdentifier = SectionIdentifier.fromSearchText(blockText); + + if (sectionIdentifier.getFormat() != SectionIdentifier.Format.EMPTY) { + + if (blockText.startsWith(sectionIdentifier.getIdentifierString()) && blockText.endsWith(outlineTitle) && context.directMatch == null) { + context.directMatch = pageBlock; + return true; + } else if (context.splitCandidate == null) { + context.sectionIdentifier = sectionIdentifier.getIdentifierString(); + } + } + if (context.splitCandidate == null) { + context.splitCandidate = pageBlock; + } } return false; } @@ -414,14 +397,10 @@ public class BlockificationPostprocessingService { private static String sanitizeString(String text) { - return text.replaceAll("\\s", "").toLowerCase(Locale.ROOT); + return StringUtils.deleteWhitespace(text).toLowerCase(Locale.ROOT); } - private record WordSequenceResultOld(List inSequence, List preSequence) { - - } - @Data private static class OutlineProcessionContext { @@ -429,6 +408,7 @@ public class BlockificationPostprocessingService { private OutlineObject outlineObject; private List mergeCandidates; private TextPageBlock splitCandidate; + private String sectionIdentifier; public OutlineProcessionContext(OutlineObject outlineObject) { @@ -437,6 +417,7 @@ public class BlockificationPostprocessingService { this.directMatch = null; this.mergeCandidates = new ArrayList<>(); this.splitCandidate = null; + this.sectionIdentifier = ""; } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java index 5bb9da1..2ab953d 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java @@ -90,7 +90,7 @@ public class DocstrumBlockificationService { while (itty.hasNext()) { AbstractPageBlock block = itty.next(); - if (block instanceof TablePageBlock || (block.getClassification() != null && block.getClassification().isHeadline())) { + if (block instanceof TablePageBlock || previous.isHeadline()) { previous = new TextPageBlock(); continue; } @@ -98,7 +98,7 @@ public class DocstrumBlockificationService { if (previous != null && !previous.getSequences().isEmpty()) { - if (current.getDir() != previous.getDir()) { + if (current.getDir() != previous.getDir() || current.isHeadline()) { previous = current; continue; } @@ -162,6 +162,9 @@ public class DocstrumBlockificationService { previous.getSequences().addAll(current.getSequences()); previous = buildTextBlock(previous.getSequences(), 0); previous.setToDuplicate(toDuplicate); + if(current.getClassification() != null && previous.getClassification() == null) { + previous.setClassification(current.getClassification()); + } itty.remove(); itty.previous(); itty.set(previous); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java index d15b336..b1ad145 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java @@ -78,7 +78,13 @@ public class DocumentGraphFactory { for (TableOfContentItem tocItem : classificationDocument.getTableOfContents()) { var parent = tocItem.getParent() == null ? null : tocItem.getParent().getSection(); - Optional
section = SectionNodeFactory.addSection(layoutParsingType, parent, tocItem.getNonEmptySectionBlocks(), tocItem.getImages(), context, document); + Optional
section = SectionNodeFactory.addSection(layoutParsingType, + parent, + tocItem.getChildren().isEmpty(), + tocItem.getNonEmptySectionBlocks(), + tocItem.getImages(), + context, + document); tocItem.setSection(section.orElse(null)); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java index 459932c..f341572 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java @@ -18,6 +18,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Do import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; @@ -30,6 +31,7 @@ public class SectionNodeFactory { public Optional
addSection(LayoutParsingType layoutParsingType, GenericSemanticNode parentNode, + boolean isLeaf, List pageBlocks, List images, DocumentGraphFactory.Context context, @@ -50,7 +52,13 @@ public class SectionNodeFactory { Map> blocksPerPage = pageBlocks.stream() .collect(groupingBy(AbstractPageBlock::getPage)); - Section section = Section.builder().isMainSection(parentNode == null).documentTree(context.getDocumentTree()).build(); + + Section section; + if (isLeaf) { + section = Section.builder().documentTree(context.getDocumentTree()).build(); + } else { + section = SuperSection.builder().documentTree(context.getDocumentTree()).build(); + } context.getSections().add(section); blocksPerPage.keySet() @@ -60,12 +68,24 @@ public class SectionNodeFactory { addFirstHeadlineDirectlyToSection(layoutParsingType, pageBlocks, context, section, document); if (containsTablesAndTextBlocks(pageBlocks)) { + + if (pageBlocks.get(0).isHeadline()) { + pageBlocks.remove(0); + } + splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(layoutParsingType, section, + true, subSectionPageBlocks, emptyList(), context, document)); + } else if (!isLeaf) { + + if (pageBlocks.get(0).isHeadline()) { + pageBlocks.remove(0); + } + addSection(layoutParsingType, section, true, pageBlocks, emptyList(), context, document); } else { addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, pageBlocks, context, section, document); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java index f71669c..1a097fc 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java @@ -146,6 +146,7 @@ public class TableNodeFactory { } else if (firstTextBlockIsHeadline(cell)) { SectionNodeFactory.addSection(layoutParsingType, tableCell, + true, cell.getTextBlocks() .stream() .map(tb -> (AbstractPageBlock) tb) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java index 9159742..82e9b42 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java @@ -179,10 +179,11 @@ public class LayoutGridService { Map bBoxMap = semanticNode.getBBox(); List subSections = semanticNode.streamAllSubNodesOfType(NodeType.SECTION).toList(); Page firstPage = semanticNode.getFirstPage(); + String treeIdString = buildTreeIdString(semanticNode); if (!subSections.isEmpty()) { - addPlacedText(firstPage, bBoxMap.get(firstPage), buildTreeIdString(semanticNode), layoutGrid); + addPlacedText(firstPage, bBoxMap.get(firstPage), treeIdString, layoutGrid); } else { - bBoxMap.forEach(((page, textBBox) -> addPlacedText(page, textBBox, buildTreeIdString(semanticNode), layoutGrid))); + bBoxMap.forEach(((page, textBBox) -> addPlacedText(page, textBBox, treeIdString, layoutGrid))); } if (bBoxMap.values().size() == 1) { Rectangle2D r = RectangleTransformations.pad(bBoxMap.get(firstPage), LINE_WIDTH, LINE_WIDTH); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index 4a590c6..37e02e7 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -83,6 +83,10 @@ public class ViewerDocumentTest extends BuildDocumentTest { public void testViewerDocument() { + //String fileName = "files/new/UTT-Books-53.pdf"; + String fileName = "files/documine/A21924A - Acute Oral Toxicity - Rats.pdf"; + + //String fileName = "files/documine/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica.pdf"; //String fileName = "files/documine/Study Document 1 - Acute Eye IrritationCorrosion - Rabbits.pdf"; //String fileName = "files/documine/ITEM 20_Sensibilização cutânea.pdf"; @@ -95,7 +99,6 @@ public class ViewerDocumentTest extends BuildDocumentTest { //String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf"; //String fileName = "files/new/mistitled_outlines_example.pdf"; //String fileName = "files/bdr/Plenarprotokoll 1 (keine Druchsache!) (1) 1.pdf"; - String fileName = "files/new/UTT-Books-53.pdf"; String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; var documentFile = new ClassPathResource(fileName).getFile();