RED-7141: Adapted to layout parser using docstrum

2024-03-05 13:06:36 +01:00 · 2024-03-05 13:06:36 +01:00 · b93f9a2c20
commit b93f9a2c20
parent fa55917a89
13 changed files with 90 additions and 30621 deletions
--- a/redaction-service-v1/redaction-service-server-v1/build.gradle.kts
+++ b/redaction-service-v1/redaction-service-server-v1/build.gradle.kts
@ -12,7 +12,7 @@ plugins {
 description = "redaction-service-server-v1"


-val layoutParserVersion = "0.94.0"
+val layoutParserVersion = "0.96.0"
 val jacksonVersion = "2.15.2"
 val droolsVersion = "9.44.0.Final"
 val pdfBoxVersion = "3.0.0"
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Document.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Document.java
@ -57,7 +57,7 @@ public class Document implements GenericSemanticNode {
    public TextBlock getTextBlock() {

        if (textBlock == null) {
-            textBlock = streamTerminalTextBlocksInOrder().collect(new TextBlockCollector());
+            textBlock = GenericSemanticNode.super.getTextBlock();
        }
        return textBlock;
    }
@ -72,8 +72,7 @@ public class Document implements GenericSemanticNode {

    public Stream<TextBlock> streamTerminalTextBlocksInOrder() {

-        return streamAllNodes().filter(SemanticNode::isLeaf)
-                .map(SemanticNode::getLeafTextBlock);
+        return streamAllNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getTextBlock);
    }


--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/DuplicatedParagraph.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/DuplicatedParagraph.java
@ -0,0 +1,34 @@
+package com.iqser.red.service.redaction.v1.server.model.document.nodes;
+
+import java.util.stream.Stream;
+
+import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
+import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlockCollector;
+
+import lombok.Data;
+import lombok.EqualsAndHashCode;
+import lombok.experimental.SuperBuilder;
+
+/**
+ * A {@link Paragraph} that holds a second leaf text block, {@code unsortedLeafTextBlock}, next to the
+ * {@code leafTextBlock} it inherits from {@link Paragraph}.
+ */
+@Data
+@EqualsAndHashCode(callSuper = true)
+@SuperBuilder
+public class DuplicatedParagraph extends Paragraph {
+
+    TextBlock unsortedLeafTextBlock;
+
+
+    /**
+     * Collects the inherited {@code leafTextBlock} followed by {@code unsortedLeafTextBlock} into one TextBlock.
+     * The result is rebuilt on every call; nothing is cached here.
+     *
+     * @return the TextBlock produced by a {@link TextBlockCollector} over both leaf text blocks, in that order
+     */
+    @Override
+    public TextBlock getTextBlock() {
+
+        Stream<TextBlock> bothLeafTextBlocks = Stream.of(leafTextBlock, unsortedLeafTextBlock);
+        return bothLeafTextBlocks.collect(new TextBlockCollector());
+    }
+
+
+    /**
+     * Delegates to the parent's string representation. Declaring the method explicitly keeps Lombok's
+     * {@code @Data} from generating a {@code toString()} for this class.
+     */
+    @Override
+    public String toString() {
+
+        return super.toString();
+    }
+
+}
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Paragraph.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Paragraph.java
@ -17,11 +17,12 @@ import lombok.Builder;
 import lombok.Data;
 import lombok.EqualsAndHashCode;
 import lombok.experimental.FieldDefaults;
+import lombok.experimental.SuperBuilder;

@Data
-@Builder
+@SuperBuilder
@AllArgsConstructor
-@FieldDefaults(level = AccessLevel.PRIVATE)
+@FieldDefaults(level = AccessLevel.PROTECTED)
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
 public class Paragraph implements GenericSemanticNode {

--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Section.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Section.java
@ -69,9 +69,7 @@ public class Section implements GenericSemanticNode {
    public TextBlock getTextBlock() {

        if (textBlock == null) {
-            textBlock = streamAllSubNodes().filter(SemanticNode::isLeaf)
-                    .map(SemanticNode::getLeafTextBlock)
-                    .collect(new TextBlockCollector());
+            textBlock = GenericSemanticNode.super.getTextBlock();
        }
        return textBlock;
    }
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/SemanticNode.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/SemanticNode.java
@ -19,6 +19,7 @@ import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
 import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity;
 import com.iqser.red.service.redaction.v1.server.model.document.textblock.AtomicTextBlock;
 import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
+import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlockCollector;
 import com.iqser.red.service.redaction.v1.server.service.document.NodeVisitor;
 import com.iqser.red.service.redaction.v1.server.utils.RectangleTransformations;
 import com.iqser.red.service.redaction.v1.server.utils.RedactionSearchUtility;
@ -41,7 +42,11 @@ public interface SemanticNode {
     *
     * @return TextBlock containing all AtomicTextBlocks that are located under this Node.
     */
-    TextBlock getTextBlock();
+    default TextBlock getTextBlock() {
+
+        // Default implementation: take every sub-node that reports isLeaf(), ask it for its own TextBlock and
+        // merge the results with a TextBlockCollector, in the order streamAllSubNodes() yields them.
+        // NOTE(review): leaf node types presumably override getTextBlock() to return their own text; confirm.
+        return streamAllSubNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getTextBlock).collect(new TextBlockCollector());
+    }
+


    /**
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Table.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Table.java
@ -408,9 +408,7 @@ public class Table implements SemanticNode {
    public TextBlock getTextBlock() {

        if (textBlock == null) {
-            textBlock = streamAllSubNodes().filter(SemanticNode::isLeaf)
-                    .map(SemanticNode::getLeafTextBlock)
-                    .collect(new TextBlockCollector());
+            textBlock = SemanticNode.super.getTextBlock();
        }
        return textBlock;
    }
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/document/DocumentGraphMapper.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/document/DocumentGraphMapper.java
@ -9,6 +9,7 @@ import java.util.NoSuchElementException;
 import java.util.Set;

 import com.iqser.red.service.redaction.v1.server.model.document.DocumentTree;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.DuplicatedParagraph;
 import com.iqser.red.service.redaction.v1.server.model.document.nodes.Footer;
 import com.iqser.red.service.redaction.v1.server.model.document.nodes.Header;
 import com.iqser.red.service.redaction.v1.server.model.document.nodes.Image;
@ -66,7 +67,7 @@ public class DocumentGraphMapper {

            SemanticNode node = switch (entryData.getType()) {
                case SECTION -> buildSection(context);
-                case PARAGRAPH -> buildParagraph(context);
+                case PARAGRAPH -> buildParagraph(context, entryData.getProperties());
                case HEADLINE -> buildHeadline(context);
                case HEADER -> buildHeader(context);
                case FOOTER -> buildFooter(context);
@ -148,13 +149,22 @@ public class DocumentGraphMapper {
    }


-    private Paragraph buildParagraph(Context context) {
+    /**
+     * Builds the node for a PARAGRAPH entry.
+     * If the entry's properties mark it as a duplicated paragraph (see
+     * {@code PropertiesMapper.isDuplicateParagraph}), a {@link DuplicatedParagraph} is created and its unsorted
+     * leaf text block is resolved from the IDs stored in the properties. Otherwise a plain Paragraph is built.
+     *
+     * @param context    the mapping context; supplies the document tree the node is attached to
+     * @param properties the properties of the entry being mapped
+     * @return the new Paragraph, or a DuplicatedParagraph when the properties carry unsorted text block IDs
+     */
+    private Paragraph buildParagraph(Context context, Map<String, String> properties) {
+
+        if (PropertiesMapper.isDuplicateParagraph(properties)) {
+
+            DuplicatedParagraph duplicatedParagraph = DuplicatedParagraph.builder().documentTree(context.documentTree).build();
+
+            // The unsorted text block is set after build() because toTextBlock takes the finished node as parent.
+            Long[] unsortedTextblockIds = PropertiesMapper.getUnsortedTextblockIds(properties);
+            duplicatedParagraph.setUnsortedLeafTextBlock(toTextBlock(unsortedTextblockIds, context, duplicatedParagraph));
+            return duplicatedParagraph;
+        }

        return Paragraph.builder().documentTree(context.documentTree).build();
    }


-    private TextBlock toTextBlock(Long[] atomicTextBlockIds, Context context, SemanticNode parent) {
+        private TextBlock toTextBlock(Long[] atomicTextBlockIds, Context context, SemanticNode parent) {

        return Arrays.stream(atomicTextBlockIds)
                .map(atomicTextBlockId -> getAtomicTextBlock(context, parent, atomicTextBlockId))
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/document/PropertiesMapper.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/document/PropertiesMapper.java
@ -49,4 +49,23 @@ public class PropertiesMapper {
        return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
    }

+
+    /**
+     * Tells whether the given entry properties mark a duplicated paragraph, i.e. whether they contain the key
+     * {@code DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID}.
+     *
+     * @param properties the properties of a document structure entry
+     * @return {@code true} if the unsorted text block ID key is present
+     */
+    public static boolean isDuplicateParagraph(Map<String, String> properties) {
+
+        var markerKey = DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID;
+        return properties.containsKey(markerKey);
+    }
+
+
+    /**
+     * Reads the value stored under {@code DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID}
+     * and converts it with {@link #toLongArray(String)}.
+     *
+     * @param properties the properties of a document structure entry
+     * @return the unsorted text block IDs parsed from the property value
+     */
+    public static Long[] getUnsortedTextblockIds(Map<String, String> properties) {
+
+        String rawIds = properties.get(DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID);
+        return toLongArray(rawIds);
+    }
+
+
+    /**
+     * Parses a bracketed, comma-separated list of IDs such as {@code "[1, 2, 3]"} into a {@code Long} array.
+     * <p>
+     * The first and last character of {@code ids} are dropped as the enclosing brackets. Every element is trimmed
+     * before it is parsed, so {@code "[1,2]"} and {@code "[1, 2]"} give the same result. A list without elements,
+     * e.g. {@code "[]"}, yields an empty array.
+     *
+     * @param ids the bracketed ID list; must not be {@code null} and must contain both enclosing characters
+     * @return the parsed IDs in their original order
+     * @throws NumberFormatException if an element is not a valid {@code long}
+     */
+    public static Long[] toLongArray(String ids) {
+
+        String content = ids.substring(1, ids.length() - 1).trim();
+        if (content.isEmpty()) {
+            // "".split(",") would return one empty element, which Long.valueOf rejects.
+            return new Long[0];
+        }
+        return Arrays.stream(content.split(","))
+                .map(String::trim)
+                .map(Long::valueOf)
+                .toArray(Long[]::new);
+    }
+
+
 }
--- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java
@ -246,6 +246,7 @@ public class RedactionIntegrationTest extends AbstractRedactionIntegrationTest {


    @Test
+    @SneakyThrows
    public void redactionExpansionOverlap() {

        // F. Lastname, J. Doe, M. Mustermann
@ -297,7 +298,7 @@ public class RedactionIntegrationTest extends AbstractRedactionIntegrationTest {
    @Test
    public void titleExtraction() throws IOException {

-        AnalyzeRequest request = uploadFileToStorage("files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
+        AnalyzeRequest request = uploadFileToStorage("files/new/crafted document.pdf");
        System.out.println("Start Full integration test");
        analyzeDocumentStructure(LayoutParsingType.REDACT_MANAGER, request);
        System.out.println("Finished structure analysis");
--- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/document/graph/DocumentIEntityInsertionIntegrationTest.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/document/graph/DocumentIEntityInsertionIntegrationTest.java
@ -272,7 +272,7 @@ public class DocumentIEntityInsertionIntegrationTest extends BuildDocumentIntegr
    public void assertTextBeforeAndTextAfterForHeadlineMetolachlor() {

        Document document = buildGraph("files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06");
-        String searchTerm = "absorption, distribution, metabolism";
+        String searchTerm = "of subject matter";
        int start = document.getTextBlock().indexOf(searchTerm);
        assert start != -1;
        start = document.getTextBlock().indexOf(searchTerm, start + 1);
@ -282,16 +282,16 @@ public class DocumentIEntityInsertionIntegrationTest extends BuildDocumentIntegr
        TextEntity textEntity = TextEntity.initialEntityNode(textRange, "123", EntityType.ENTITY, document);
        entityCreationService.addEntityToGraph(textEntity, document);

-        assertEquals("2.6.1 Summary of ", textEntity.getTextBefore());
-        assertEquals(" and excretion in", textEntity.getTextAfter());
-        assertEquals("2.6.1 Summary of absorption, distribution, metabolism and excretion in mammals ",
+        assertEquals("1 Statement ", textEntity.getTextBefore());
+        assertEquals(" and purpose for", textEntity.getTextAfter());
+        assertEquals("1 Statement of subject matter and purpose for which this report has been prepared and background information on the application ",
                     textEntity.getDeepestFullyContainingNode().getHeadline().getTextBlock().getSearchText());
        assertEquals(searchTerm, textEntity.getValue());
        assertEquals(3, textEntity.getIntersectingNodes().size());
-        assertEquals(4, textEntity.getDeepestFullyContainingNode().getNumberOnPage());
+        assertEquals(1, textEntity.getDeepestFullyContainingNode().getNumberOnPage());
        assertTrue(textEntity.getPages()
                           .stream()
-                           .allMatch(pageNode -> pageNode.getNumber() == 33));
+                           .allMatch(pageNode -> pageNode.getNumber() == 9));
        assertInstanceOf(Headline.class, textEntity.getDeepestFullyContainingNode());

        assertSameOffsetInAllIntersectingNodes(searchTerm, textEntity);
--- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/ner_entities/S-Metolachlor_RAR_02_Volume_2_2018-09-06.NER_ENTITIES.json
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/ner_entities/S-Metolachlor_RAR_02_Volume_2_2018-09-06.NER_ENTITIES.json
--- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/ner_entities/crafted
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/ner_entities/crafted