From bc605aec8cdd34279b7e82060cbf43ef7d8611ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kilian=20Sch=C3=BCttler?= Date: Thu, 15 Jun 2023 21:44:25 +0200 Subject: [PATCH] RED-6009: Document Tree Structure --- .../server/layoutparsing/document/graph/Boundary.java | 3 +++ .../graph/textblock/ConcatenatedTextBlock.java | 4 ++++ .../document/services/EntityCreationService.java | 10 +++++++--- .../redaction/v1/server/RedactionIntegrationTest.java | 6 ++++-- .../src/test/resources/drools/rules.drl | 11 ++++++++++- 5 files changed, 28 insertions(+), 6 deletions(-) diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/Boundary.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/Boundary.java index e581082c..845ce384 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/Boundary.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/Boundary.java @@ -147,6 +147,9 @@ public class Boundary implements Comparable { */ public Boundary trim(TextBlock textBlock) { + if (textBlock.isEmpty()) { + return textBlock.getBoundary(); + } int trimmedStart = this.start; while (Character.isWhitespace(textBlock.charAt(trimmedStart))) { trimmedStart++; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/textblock/ConcatenatedTextBlock.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/textblock/ConcatenatedTextBlock.java index e53fe5c8..6fda9ad8 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/textblock/ConcatenatedTextBlock.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/textblock/ConcatenatedTextBlock.java @@ -149,6 +149,10 @@ public class ConcatenatedTextBlock implements TextBlock { List textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringBoundary); + if (textBlocks.isEmpty()) { + return new HashMap<>(); + } + if (textBlocks.size() == 1) { return textBlocks.get(0).getPositionsPerPage(stringBoundary); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/services/EntityCreationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/services/EntityCreationService.java index 5e60d66f..c37bb0a2 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/services/EntityCreationService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/services/EntityCreationService.java @@ -188,13 +188,17 @@ public class EntityCreationService { } - public RedactionEntity bySemanticNode(SemanticNode node, String type, EntityType entityType) { + public Optional bySemanticNode(SemanticNode node, String type, EntityType entityType) { Boundary boundary = node.getTextBlock().getBoundary(); + if (boundary.length() > 0) { boundary = new Boundary(boundary.start(), boundary.end() - 1); } - return byBoundary(boundary, type, entityType, node); + if (!isValidEntityBoundary(node.getTextBlock(), boundary)) { + return Optional.empty(); + } + return Optional.of(byBoundary(boundary, type, entityType, node)); } @@ -294,7 +298,7 @@ public class EntityCreationService { private boolean isValidEntityBoundary(TextBlock textBlock, Boundary boundary) { - return boundaryIsSurroundedBySeparators(textBlock, boundary); + return boundary.length() > 0 && boundaryIsSurroundedBySeparators(textBlock, boundary); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java index 5627dcfa..65112ab7 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java @@ -2,6 +2,8 @@ package com.iqser.red.service.redaction.v1.server; import static org.assertj.core.api.Assertions.assertThat; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.mockito.ArgumentMatchers.anyLong; +import static org.mockito.ArgumentMatchers.anyString; import static org.mockito.Mockito.when; import static org.wildfly.common.Assert.assertTrue; @@ -220,7 +222,7 @@ public class RedactionIntegrationTest extends AbstractRedactionIntegrationTest { @Test public void titleExtraction() throws IOException { - AnalyzeRequest request = uploadFileToStorage("files/new/crafted document.pdf"); + AnalyzeRequest request = uploadFileToStorage("files/new/S157.pdf"); System.out.println("Start Full integration test"); analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId())); System.out.println("Finished structure analysis"); @@ -298,7 +300,7 @@ public class RedactionIntegrationTest extends AbstractRedactionIntegrationTest { ClassLoader loader = getClass().getClassLoader(); URL url = loader.getResource("files"); Path path = Paths.get(URI.create(url.toString())); - + when(dictionaryClient.getDictionaryForType(anyString(), anyLong())).thenReturn(new Type()); Files.walk(path)// .filter(currentPath -> currentPath.toString().endsWith(".pdf"))// .map(currentPath -> path.getParent().relativize(currentPath))// diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl index 9ed55619..74cb4267 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl @@ -6,10 +6,11 @@ import static com.iqser.red.service.redaction.v1.server.layoutparsing.document.u import java.util.List; import java.util.LinkedList; -import java.util.Set +import java.util.Set; import java.util.stream.Collectors; import java.util.Collection; import java.util.stream.Stream; +import java.util.Optional; import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.*; import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.*; @@ -249,6 +250,8 @@ rule "CBI.9.0: Redact all Cell's with Header Author(s) as CBI_author (non verteb then $table.streamTableCellsWithHeader("Author(s)") .map(tableCell -> entityCreationService.bySemanticNode(tableCell, "CBI_author", EntityType.ENTITY)) + .filter(Optional::isPresent) + .map(Optional::get) .forEach(redactionEntity -> { redactionEntity.setRedaction(true); redactionEntity.addMatchedRule(9); @@ -267,6 +270,8 @@ rule "CBI.9.1: Redact all Cell's with Header Author as CBI_author (non vertebrat then $table.streamTableCellsWithHeader("Author") .map(tableCell -> entityCreationService.bySemanticNode(tableCell, "CBI_author", EntityType.ENTITY)) + .filter(Optional::isPresent) + .map(Optional::get) .forEach(redactionEntity -> { redactionEntity.setRedaction(true); redactionEntity.addMatchedRule(9); @@ -300,6 +305,8 @@ rule "CBI.12.0: Add all Cell's with Header Author(s) as CBI_author" $table.streamTableCellsWithHeader("Author") ) .map(tableCell -> entityCreationService.bySemanticNode(tableCell, "CBI_author", EntityType.ENTITY)) + .filter(Optional::isPresent) + .map(Optional::get) .forEach(redactionEntity -> { redactionEntity.addMatchedRule(12); redactionEntity.setRedactionReason("Author(s) header found"); @@ -998,6 +1005,8 @@ rule "ETC.6.0: Redact CAS Number" then $table.streamTableCellsWithHeader("Sample #") .map(tableCell -> entityCreationService.bySemanticNode(tableCell, "PII", EntityType.ENTITY)) + .filter(Optional::isPresent) + .map(Optional::get) .forEach(redactionEntity -> { redactionEntity.setRedaction(true); redactionEntity.addMatchedRule(101);