From 79ac1105f649c992cc6bedfb15027913f8ec5500 Mon Sep 17 00:00:00 2001 From: Yannik Hampe Date: Tue, 27 Feb 2024 08:38:03 +0100 Subject: [PATCH] RED-8481: use visual layout parser for signature extraction --- .../redaction-service-server-v1/build.gradle.kts | 2 +- .../v1/server/model/document/nodes/Document.java | 3 +++ .../v1/server/model/document/nodes/Footer.java | 3 +++ .../v1/server/model/document/nodes/Header.java | 3 +++ .../v1/server/model/document/nodes/Headline.java | 3 +++ .../v1/server/model/document/nodes/Image.java | 11 +++++++++++ .../server/model/document/nodes/Paragraph.java | 4 ++++ .../v1/server/model/document/nodes/Section.java | 6 ++++++ .../model/document/nodes/SemanticNode.java | 9 ++++++++- .../v1/server/model/document/nodes/Table.java | 3 +++ .../server/model/document/nodes/TableCell.java | 4 ++++ .../service/document/DocumentGraphMapper.java | 2 ++ .../document/SemanticNodeComparatorsTest.java | 16 ++++++++-------- .../utils/LayoutParsingRequestProvider.java | 1 + 14 files changed, 60 insertions(+), 10 deletions(-) diff --git a/redaction-service-v1/redaction-service-server-v1/build.gradle.kts b/redaction-service-v1/redaction-service-server-v1/build.gradle.kts index c8582faa..883eb37f 100644 --- a/redaction-service-v1/redaction-service-server-v1/build.gradle.kts +++ b/redaction-service-v1/redaction-service-server-v1/build.gradle.kts @@ -12,7 +12,7 @@ plugins { description = "redaction-service-server-v1" -val layoutParserVersion = "0.91.0" +val layoutParserVersion = "0.93.0" val jacksonVersion = "2.15.2" val droolsVersion = "9.44.0.Final" val pdfBoxVersion = "3.0.0" diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Document.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Document.java index 79484363..4208b81f 100644 --- 
a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Document.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Document.java @@ -14,6 +14,7 @@ import com.iqser.red.service.redaction.v1.server.model.document.DocumentTree; import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity; import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock; import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlockCollector; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; import lombok.AccessLevel; import lombok.AllArgsConstructor; @@ -33,6 +34,8 @@ public class Document implements GenericSemanticNode { @EqualsAndHashCode.Include List treeId = Collections.emptyList(); + @Builder.Default + Set engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM)); Set pages; DocumentTree documentTree; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Footer.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Footer.java index c61bff64..85fcc279 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Footer.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Footer.java @@ -9,6 +9,7 @@ import java.util.Set; import com.iqser.red.service.redaction.v1.server.model.document.DocumentTree; import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity; import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock; +import 
com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; import lombok.AccessLevel; import lombok.AllArgsConstructor; @@ -27,6 +28,8 @@ import lombok.experimental.FieldDefaults; public class Footer implements GenericSemanticNode { final static SectionIdentifier sectionIdentifier = SectionIdentifier.empty(); + @Builder.Default + Set engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM)); @EqualsAndHashCode.Include List treeId; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Header.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Header.java index 270a4960..5e0f15bc 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Header.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Header.java @@ -9,6 +9,7 @@ import java.util.Set; import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity; import com.iqser.red.service.redaction.v1.server.model.document.DocumentTree; import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; import lombok.AccessLevel; import lombok.AllArgsConstructor; @@ -27,6 +28,8 @@ import lombok.experimental.FieldDefaults; public class Header implements GenericSemanticNode { final static SectionIdentifier sectionIdentifier = SectionIdentifier.empty(); + @Builder.Default + Set engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM)); @EqualsAndHashCode.Include List treeId; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Headline.java 
b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Headline.java index a2c748e9..5c2fd280 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Headline.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Headline.java @@ -10,6 +10,7 @@ import com.iqser.red.service.redaction.v1.server.model.document.DocumentTree; import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity; import com.iqser.red.service.redaction.v1.server.model.document.textblock.AtomicTextBlock; import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; import lombok.AccessLevel; import lombok.AllArgsConstructor; @@ -27,6 +28,8 @@ import lombok.experimental.FieldDefaults; @EqualsAndHashCode(onlyExplicitlyIncluded = true) public class Headline implements GenericSemanticNode { + @Builder.Default + Set engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM)); @EqualsAndHashCode.Include List treeId; TextBlock leafTextBlock; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Image.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Image.java index e591af11..102c9f20 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Image.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Image.java @@ -18,6 +18,7 @@ import com.iqser.red.service.redaction.v1.server.model.document.entity.MatchedRu import 
com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity;
 import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
 import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlockCollector;
+import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
 
 import lombok.AccessLevel;
 import lombok.AllArgsConstructor;
@@ -34,6 +35,8 @@ import lombok.experimental.FieldDefaults;
 @FieldDefaults(level = AccessLevel.PRIVATE)
 @EqualsAndHashCode(onlyExplicitlyIncluded = true)
 public class Image implements GenericSemanticNode, IEntity {
+    @Builder.Default
+    Set engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
 
     @EqualsAndHashCode.Include
     List treeId;
@@ -123,6 +126,37 @@ public class Image implements GenericSemanticNode, IEntity {
 
         return name.charAt(0) + name.substring(1).toLowerCase(Locale.ENGLISH);
     }
+
+
+    /**
+     * Checks whether this image lies mostly inside the given image.
+     * <p>
+     * Both bounding boxes are looked up for this image's page; the result is {@code true} only
+     * when their overlap covers more than 80% of this image's own bounding-box area.
+     *
+     * @param image the image that potentially contains this one
+     * @return {@code true} if more than 80% of this image's bounding box overlaps the other box;
+     *         {@code false} if either image has no bounding box for this page, if this image's
+     *         box is empty, or if the two boxes do not overlap
+     */
+    public boolean mostlyContainedBy(Image image) {
+        Rectangle2D ownBox = this.getBBox().get(this.page);
+        Rectangle2D otherBox = image.getBBox().get(this.page);
+        // A missing box (e.g. the other image sits on a different page) cannot contain anything.
+        if (ownBox == null || otherBox == null) {
+            return false;
+        }
+        // createIntersection() reports negative width/height for disjoint boxes, and two negatives
+        // multiply to a bogus positive area. intersects() is also false for empty boxes, which
+        // keeps the division below away from a zero area.
+        if (!ownBox.intersects(otherBox)) {
+            return false;
+        }
+        Rectangle2D overlap = otherBox.createIntersection(ownBox);
+        double overlapArea = overlap.getWidth() * overlap.getHeight();
+        double ownArea = ownBox.getWidth() * ownBox.getHeight();
+        return (overlapArea / ownArea) > 0.8;
+    }
 
 
     public int length() {
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Paragraph.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Paragraph.java
index db5c38aa..e9061be4 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Paragraph.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Paragraph.java
@@ -9,6 +9,7 @@ import
java.util.Set; import com.iqser.red.service.redaction.v1.server.model.document.DocumentTree; import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity; import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; import lombok.AccessLevel; import lombok.AllArgsConstructor; @@ -24,6 +25,9 @@ import lombok.experimental.FieldDefaults; @EqualsAndHashCode(onlyExplicitlyIncluded = true) public class Paragraph implements GenericSemanticNode { + @Builder.Default + Set engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM)); + @EqualsAndHashCode.Include List treeId; TextBlock leafTextBlock; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Section.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Section.java index 58259feb..afc87b9f 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Section.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Section.java @@ -10,12 +10,14 @@ import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntit import com.iqser.red.service.redaction.v1.server.model.document.DocumentTree; import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock; import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlockCollector; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; import lombok.AccessLevel; import lombok.AllArgsConstructor; import lombok.Builder; import lombok.Data; import lombok.EqualsAndHashCode; +import lombok.RequiredArgsConstructor; import 
lombok.experimental.FieldDefaults;
 import lombok.extern.slf4j.Slf4j;
 
@@ -26,6 +28,7 @@ import lombok.extern.slf4j.Slf4j;
 @FieldDefaults(level = AccessLevel.PRIVATE)
 @EqualsAndHashCode(onlyExplicitlyIncluded = true)
 public class Section implements GenericSemanticNode {
+
 
     @EqualsAndHashCode.Include
     List treeId;
@@ -37,6 +40,9 @@ public class Section implements GenericSemanticNode {
 
     Map bBoxCache;
 
+    @Builder.Default
+    Set engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
+
 
     @Override
     public NodeType getType() {
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/SemanticNode.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/SemanticNode.java
index d874ea24..b38b97a4 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/SemanticNode.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/SemanticNode.java
@@ -22,6 +22,7 @@ import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBl
 import com.iqser.red.service.redaction.v1.server.service.document.NodeVisitor;
 import com.iqser.red.service.redaction.v1.server.utils.RectangleTransformations;
 import com.iqser.red.service.redaction.v1.server.utils.RedactionSearchUtility;
+import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
 
 public interface SemanticNode {
 
@@ -356,6 +357,13 @@ public interface SemanticNode {
 
         return getTextBlock().getSearchText().contains(string);
     }
+    /** Returns the layout engines recorded for this node; {@link #addEngine} inserts into this very set. */
+    Set getEngines();
+
+    /** Adds {@code engine} to the set returned by {@link #getEngines()}, which therefore has to be non-null and mutable. */
+    default void addEngine(LayoutEngine engine) {
+        getEngines().add(engine);
+    }
 
     /**
      * Checks whether this SemanticNode contains all the provided Strings.
@@ -554,7 +562,6 @@ public interface SemanticNode { if (textBlock.containsTextRange(textEntity.getTextRange())) { textEntity.setDeepestFullyContainingNode(this); } - textEntity.addIntersectingNode(this); streamChildren().filter(semanticNode -> semanticNode.getTextRange().intersects(textEntity.getTextRange())) .forEach(node -> node.addThisToEntityIfIntersects(textEntity)); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Table.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Table.java index 4d56c729..f6259528 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Table.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Table.java @@ -17,6 +17,7 @@ import com.iqser.red.service.redaction.v1.server.model.document.DocumentTree; import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity; import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock; import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlockCollector; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; import lombok.AccessLevel; import lombok.AllArgsConstructor; @@ -32,6 +33,8 @@ import lombok.experimental.FieldDefaults; @EqualsAndHashCode(onlyExplicitlyIncluded = true) public class Table implements SemanticNode { + @Builder.Default + Set engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM)); @EqualsAndHashCode.Include List treeId; DocumentTree documentTree; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/TableCell.java 
b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/TableCell.java index 24658449..67026fff 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/TableCell.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/TableCell.java @@ -11,6 +11,7 @@ import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntit import com.iqser.red.service.redaction.v1.server.model.document.DocumentTree; import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock; import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlockCollector; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; import lombok.AccessLevel; import lombok.AllArgsConstructor; @@ -26,6 +27,9 @@ import lombok.experimental.FieldDefaults; @EqualsAndHashCode(onlyExplicitlyIncluded = true) public class TableCell implements GenericSemanticNode { + @Builder.Default + Set engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM)); + @EqualsAndHashCode.Include List treeId; int row; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/document/DocumentGraphMapper.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/document/DocumentGraphMapper.java index 3578128b..c8164789 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/document/DocumentGraphMapper.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/document/DocumentGraphMapper.java @@ -6,6 +6,7 @@ import java.util.LinkedList; import java.util.List; import 
java.util.Map; import java.util.NoSuchElementException; +import java.util.Set; import com.iqser.red.service.redaction.v1.server.model.document.DocumentTree; import com.iqser.red.service.redaction.v1.server.model.document.nodes.Footer; @@ -76,6 +77,7 @@ public class DocumentGraphMapper { node.setLeafTextBlock(textBlock); } List treeId = Arrays.stream(entryData.getTreeId()).boxed().toList(); + entryData.getEngines().forEach(engine -> node.addEngine(engine)); node.setTreeId(treeId); switch (entryData.getType()) { diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/service/document/SemanticNodeComparatorsTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/service/document/SemanticNodeComparatorsTest.java index b68c304e..1c1a5800 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/service/document/SemanticNodeComparatorsTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/service/document/SemanticNodeComparatorsTest.java @@ -16,8 +16,8 @@ class SemanticNodeComparatorsTest { @Test public void testFirstSemanticNode() { - var node = new Section(List.of(0, 1), null, null, null, null); - var otherNode = new Section(List.of(0, 2), null, null, null, null); + var node = new Section(List.of(0, 1), null, null, null, null,null); + var otherNode = new Section(List.of(0, 2), null, null, null, null,null); List list = new ArrayList<>(); list.add(otherNode); list.add(node); @@ -29,8 +29,8 @@ class SemanticNodeComparatorsTest { @Test public void testFirstSemanticNode2() { - var node = new Section(Collections.emptyList(), null, null, null, null); - var otherNode = new Section(List.of(0, 2), null, null, null, null); + var node = new Section(Collections.emptyList(), null, null, null, null, null); + var otherNode = new 
Section(List.of(0, 2), null, null, null, null, null); List list = new ArrayList<>(); list.add(otherNode); list.add(node); @@ -42,8 +42,8 @@ class SemanticNodeComparatorsTest { @Test public void testFirstSemanticNode3() { - var node = new Section(List.of(1, 5, 8), null, null, null, null); - var otherNode = new Section(List.of(0, 2), null, null, null, null); + var node = new Section(List.of(1, 5, 8), null, null, null, null, null); + var otherNode = new Section(List.of(0, 2), null, null, null, null, null); List list = new ArrayList<>(); list.add(otherNode); list.add(node); @@ -55,8 +55,8 @@ class SemanticNodeComparatorsTest { @Test public void testFirstSemanticNode4() { - var node = new Section(List.of(1, 5, 8), null, null, null, null); - var otherNode = new Section(List.of(1, 5, 9), null, null, null, null); + var node = new Section(List.of(1, 5, 8), null, null, null, null,null); + var otherNode = new Section(List.of(1, 5, 9), null, null, null, null,null); List list = new ArrayList<>(); list.add(otherNode); list.add(node); diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/utils/LayoutParsingRequestProvider.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/utils/LayoutParsingRequestProvider.java index f4625012..04c10fc3 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/utils/LayoutParsingRequestProvider.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/utils/LayoutParsingRequestProvider.java @@ -29,6 +29,7 @@ public class LayoutParsingRequestProvider { .originFileStorageId(originFileStorageId) .tablesFileStorageId(Optional.of(tablesFileStorageId)) .imagesFileStorageId(Optional.of(imagesFileStorageId)) + .visualLayoutParsingFileId(Optional.empty()) .structureFileStorageId(structureFileStorageId) 
.textBlockFileStorageId(textBlockFileStorageId) .positionBlockFileStorageId(positionBlockFileStorageId)