diff --git a/redaction-service-v1/redaction-service-server-v1/build.gradle.kts b/redaction-service-v1/redaction-service-server-v1/build.gradle.kts index 35bb6b18..117257a9 100644 --- a/redaction-service-v1/redaction-service-server-v1/build.gradle.kts +++ b/redaction-service-v1/redaction-service-server-v1/build.gradle.kts @@ -12,7 +12,7 @@ plugins { description = "redaction-service-server-v1" -val layoutParserVersion = "0.107.0" +val layoutParserVersion = "0.116.0" val jacksonVersion = "2.15.2" val droolsVersion = "9.44.0.Final" val pdfBoxVersion = "3.0.0" diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/DocumentTree.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/DocumentTree.java index 375a9080..8527cff0 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/DocumentTree.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/DocumentTree.java @@ -2,6 +2,7 @@ package com.iqser.red.service.redaction.v1.server.model.document; import static java.lang.String.format; +import java.util.ArrayList; import java.util.Collections; import java.util.LinkedList; import java.util.List; @@ -122,6 +123,68 @@ public class DocumentTree { } + /** + * Finds all child nodes of the specified entry, whose nodes textRange intersects the given textRange. It achieves this by finding the first entry, whose textRange contains the start idx of the TextRange using a binary search. + * It then iterates over the remaining children adding them to the intersections, until one does not contain the end of the TextRange. All intersected Entries are returned as SemanticNodes. + * + * @param treeId the treeId of the Entry whose children shall be checked. + * @param textRange The TextRange to find intersecting childNodes for. + * @return A list of all SemanticNodes, that are direct children of the specified Entry, whose TextRange intersects the given TextRange + */ + public List findIntersectingChildNodes(List treeId, TextRange textRange) { + + List childEntries = getEntryById(treeId).getChildren(); + List intersectingChildEntries = new LinkedList<>(); + int startIdx = findFirstIdxOfContainingChildBinarySearch(childEntries, textRange.start()); + if (startIdx < 0) { + return intersectingChildEntries; + } + for (int i = startIdx; i < childEntries.size(); i++) { + if (childEntries.get(i).getNode().getTextRange().start() < textRange.end()) { + intersectingChildEntries.add(childEntries.get(i).getNode()); + } else { + break; + } + } + return intersectingChildEntries; + } + + + public Optional findFirstContainingChild(List treeId, TextRange textRange) { + + List childEntries = getEntryById(treeId).getChildren(); + int startIdx = findFirstIdxOfContainingChildBinarySearch(childEntries, textRange.start()); + if (startIdx < 0) { + return Optional.empty(); + } + + if (childEntries.get(startIdx).getNode().getTextRange().contains(textRange.end())) { + return Optional.of(childEntries.get(startIdx).getNode()); + } + + return Optional.empty(); + } + + + private int findFirstIdxOfContainingChildBinarySearch(List childNodes, int start) { + + int low = 0; + int high = childNodes.size() - 1; + while (low <= high) { + int mid = low + (high - low) / 2; + TextRange range = childNodes.get(mid).getNode().getTextRange(); + if (range.start() > start) { + high = mid - 1; + } else if (range.end() <= start) { + low = mid + 1; + } else { + return mid; + } + } + return -1; + } + + public Stream childNodesOfType(List treeId, NodeType nodeType) { return getEntryById(treeId).children.stream() @@ -252,7 +315,7 @@ public class DocumentTree { List treeId; SemanticNode node; @Builder.Default - List children = new LinkedList<>(); + List children = new ArrayList<>(); @Override diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/TextRange.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/TextRange.java index ef3ad47d..d0f2e8a5 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/TextRange.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/TextRange.java @@ -165,7 +165,8 @@ public class TextRange implements Comparable { } List splitBoundaries = new LinkedList<>(); int previousIndex = start; - for (int splitIndex : splitIndices) { + for (int i = 0, splitIndicesSize = splitIndices.size(); i < splitIndicesSize; i++) { + int splitIndex = splitIndices.get(i); // skip split if it would produce a boundary of length 0 if (splitIndex == previousIndex) { diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Image.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Image.java index 5ac8bed9..f58aacd9 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Image.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Image.java @@ -47,6 +47,8 @@ public class Image implements GenericSemanticNode, IEntity { List treeId; String id; + TextBlock leafTextBlock; + ImageType imageType; boolean transparent; Rectangle2D position; @@ -57,14 +59,11 @@ public class Image implements GenericSemanticNode, IEntity { @Builder.Default ManualChangeOverwrite manualOverwrite = new ManualChangeOverwrite(); - @EqualsAndHashCode.Exclude Page page; - @EqualsAndHashCode.Exclude DocumentTree documentTree; @Builder.Default - @EqualsAndHashCode.Exclude Set entities = new HashSet<>(); @@ -78,9 +77,7 @@ public class Image implements GenericSemanticNode, IEntity { @Override public TextBlock getTextBlock() { - return streamAllSubNodes().filter(SemanticNode::isLeaf) - .map(SemanticNode::getLeafTextBlock) - .collect(new TextBlockCollector()); + return leafTextBlock; } @@ -94,15 +91,21 @@ public class Image implements GenericSemanticNode, IEntity { @Override public TextRange getTextRange() { - return GenericSemanticNode.super.getTextRange(); + return leafTextBlock.getTextRange(); + } + + + @Override + public int length() { + + return getTextRange().length(); } @Override public String type() { - return getManualOverwrite().getType() - .orElse(imageType.toString().toLowerCase(Locale.ENGLISH)); + return getManualOverwrite().getType().orElse(imageType.toString().toLowerCase(Locale.ENGLISH)); } @@ -160,10 +163,4 @@ public class Image implements GenericSemanticNode, IEntity { return (area / calculatedIntersection) > containmentThreshold; } - - public int length() { - - return 0; - } - } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/SemanticNode.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/SemanticNode.java index b148832b..1c262b1d 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/SemanticNode.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/SemanticNode.java @@ -626,7 +626,7 @@ public interface SemanticNode { textEntity.setDeepestFullyContainingNode(this); } textEntity.addIntersectingNode(this); - streamChildren().filter(semanticNode -> semanticNode.getTextRange().intersects(textEntity.getTextRange())) + getDocumentTree().findIntersectingChildNodes(getTreeId(), textEntity.getTextRange()) .forEach(node -> node.addThisToEntityIfIntersects(textEntity)); } } @@ -714,8 +714,7 @@ public interface SemanticNode { if (isLeaf()) { return getTextBlock().getPositionsPerPage(textRange); } - Optional containingChildNode = streamChildren().filter(child -> child.getTextRange().contains(textRange)) - .findFirst(); + Optional containingChildNode = getDocumentTree().findFirstContainingChild(getTreeId(), textRange); if (containingChildNode.isEmpty()) { return getTextBlock().getPositionsPerPage(textRange); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/queue/MessageReceiver.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/queue/MessageReceiver.java index fdd9c96d..c3766d19 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/queue/MessageReceiver.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/queue/MessageReceiver.java @@ -17,7 +17,7 @@ public class MessageReceiver { @RabbitHandler - @RabbitListener(queues = REDACTION_QUEUE) + @RabbitListener(queues = REDACTION_QUEUE, concurrency = "1") public void receiveAnalyzeRequest(Message message) { redactionMessageReceiver.receiveAnalyzeRequest(message, false); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/queue/PriorityMessageReceiver.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/queue/PriorityMessageReceiver.java index 9e099b3d..93f86ee8 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/queue/PriorityMessageReceiver.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/queue/PriorityMessageReceiver.java @@ -17,7 +17,7 @@ public class PriorityMessageReceiver { @RabbitHandler - @RabbitListener(queues = REDACTION_PRIORITY_QUEUE) + @RabbitListener(queues = REDACTION_PRIORITY_QUEUE, concurrency = "1") public void receiveAnalyzeRequest(Message message) { redactionMessageReceiver.receiveAnalyzeRequest(message, true); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/document/DocumentGraphMapper.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/document/DocumentGraphMapper.java index cd783be3..3a8bdf6a 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/document/DocumentGraphMapper.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/document/DocumentGraphMapper.java @@ -1,5 +1,6 @@ package com.iqser.red.service.redaction.v1.server.service.document; +import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import java.util.LinkedList; @@ -8,23 +9,23 @@ import java.util.Map; import java.util.NoSuchElementException; import java.util.Set; +import com.iqser.red.service.redaction.v1.server.model.document.DocumentData; import com.iqser.red.service.redaction.v1.server.model.document.DocumentTree; import com.iqser.red.service.redaction.v1.server.model.document.nodes.DuplicatedParagraph; +import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document; import com.iqser.red.service.redaction.v1.server.model.document.nodes.Footer; import com.iqser.red.service.redaction.v1.server.model.document.nodes.Header; +import com.iqser.red.service.redaction.v1.server.model.document.nodes.Headline; import com.iqser.red.service.redaction.v1.server.model.document.nodes.Image; import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page; import com.iqser.red.service.redaction.v1.server.model.document.nodes.Paragraph; import com.iqser.red.service.redaction.v1.server.model.document.nodes.Section; import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNode; +import com.iqser.red.service.redaction.v1.server.model.document.nodes.Table; import com.iqser.red.service.redaction.v1.server.model.document.nodes.TableCell; import com.iqser.red.service.redaction.v1.server.model.document.textblock.AtomicTextBlock; import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock; import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlockCollector; -import com.iqser.red.service.redaction.v1.server.model.document.DocumentData; -import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document; -import com.iqser.red.service.redaction.v1.server.model.document.nodes.Headline; -import com.iqser.red.service.redaction.v1.server.model.document.nodes.Table; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure; @@ -58,7 +59,7 @@ public class DocumentGraphMapper { private List buildEntries(List entries, Context context) { - List newEntries = new LinkedList<>(); + List newEntries = new ArrayList<>(entries.size()); for (DocumentStructure.EntryData entryData : entries) { List pages = Arrays.stream(entryData.getPageNumbers()) @@ -191,8 +192,7 @@ public class DocumentGraphMapper { return context.pageData.stream() .filter(page -> page.getNumber() == Math.toIntExact(pageIndex)) - .findFirst() - .orElseThrow(() -> new NoSuchElementException(String.format("ClassificationPage with number %d not found", pageIndex))); + .findFirst().orElseThrow(() -> new NoSuchElementException(String.format("ClassificationPage with number %d not found", pageIndex))); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/document/EntityCreationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/document/EntityCreationService.java index 819a9e23..04d2d408 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/document/EntityCreationService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/document/EntityCreationService.java @@ -1,14 +1,18 @@ package com.iqser.red.service.redaction.v1.server.service.document; -import static com.iqser.red.service.redaction.v1.server.service.document.EntityCreationUtility.*; +import static com.iqser.red.service.redaction.v1.server.service.document.EntityCreationUtility.addEntityToNodeEntitySets; +import static com.iqser.red.service.redaction.v1.server.service.document.EntityCreationUtility.addToPages; +import static com.iqser.red.service.redaction.v1.server.service.document.EntityCreationUtility.allEntitiesIntersectAndHaveSameTypes; +import static com.iqser.red.service.redaction.v1.server.service.document.EntityCreationUtility.checkIfBothStartAndEndAreEmpty; +import static com.iqser.red.service.redaction.v1.server.service.document.EntityCreationUtility.findIntersectingSubNodes; +import static com.iqser.red.service.redaction.v1.server.service.document.EntityCreationUtility.toLineAfterTextRange; +import static com.iqser.red.service.redaction.v1.server.service.document.EntityCreationUtility.truncateEndIfLineBreakIsBetween; import static com.iqser.red.service.redaction.v1.server.utils.SeparatorUtils.boundaryIsSurroundedBySeparators; import java.util.Collection; -import java.util.Collections; import java.util.Comparator; import java.util.LinkedList; import java.util.List; -import java.util.NoSuchElementException; import java.util.Optional; import java.util.Set; import java.util.stream.Collectors; @@ -985,7 +989,7 @@ public class EntityCreationService { .peek(e -> e.addEngines(engines)) .findAny(); if (optionalTextEntity.isEmpty()) { - return optionalTextEntity; // Entity has been recategorized and should not be created at all. + return Optional.empty(); // Entity has been recategorized and should not be created at all. } TextEntity existingEntity = optionalTextEntity.get(); if (existingEntity.getTextRange().equals(textRange)) { @@ -997,7 +1001,7 @@ public class EntityCreationService { } return Optional.empty(); // Entity has been resized, if there are duplicates they should be treated there } - addEntityToGraph(entity, node); + addEntityToGraph(entity, node.getDocumentTree()); entity.addEngines(engines); insertToKieSession(entity); return Optional.of(entity); @@ -1027,15 +1031,16 @@ public class EntityCreationService { /** - * Merges a list of text entities into a single entity, assuming they intersect and are of the same type. - * * @param entitiesToMerge The list of entities to merge. * @param type The type for the merged entity. * @param entityType The entity's classification. * @param node The semantic node related to these entities. * @return A single merged {@link TextEntity}. * @throws IllegalArgumentException If entities do not intersect or have different types. + * @deprecated Do not use anymore. This might not work correctly due to duplicate textranges not being taken into account here. + * Merges a list of text entities into a single entity, assuming they intersect and are of the same type. */ + @Deprecated(forRemoval = true) public TextEntity mergeEntitiesOfSameType(List entitiesToMerge, String type, EntityType entityType, SemanticNode node) { if (!allEntitiesIntersectAndHaveSameTypes(entitiesToMerge)) { @@ -1070,6 +1075,8 @@ public class EntityCreationService { mergedEntity.setDossierDictionaryEntry(entitiesToMerge.stream() .anyMatch(TextEntity::isDossierDictionaryEntry)); + entityEnrichmentService.enrichEntity(mergedEntity, node.getTextBlock()); + addEntityToGraph(mergedEntity, node); insertToKieSession(mergedEntity); @@ -1245,38 +1252,19 @@ public class EntityCreationService { public void addEntityToGraph(TextEntity entity, SemanticNode node) { DocumentTree documentTree = node.getDocumentTree(); - try { - if (node.getEntities().contains(entity)) { - // If entity already exists and it has a different text range, we add the text range to the list of duplicated text ranges - Optional optionalTextEntity = node.getEntities() - .stream()// - .filter(e -> e.equals(entity))// - .filter(e -> !e.getTextRange().equals(entity.getTextRange()))// - .findAny(); - if (optionalTextEntity.isPresent()) { - addDuplicateEntityToGraph(optionalTextEntity.get(), entity.getTextRange(), node); - } else { - node.getEntities().remove(entity); - addNewEntityToGraph(entity, documentTree); - } + if (node.getEntities().contains(entity)) { + // If entity already exists and it has a different text range, we add the text range to the list of duplicated text ranges + node.getEntities() + .stream()// + .filter(e -> e.equals(entity))// + .filter(e -> !e.getTextRange().equals(entity.getTextRange()))// + .findAny() + .ifPresent(e -> addDuplicateEntityToGraph(e, entity.getTextRange(), node)); - } else { - entity.addIntersectingNode(documentTree.getRoot().getNode()); - addEntityToGraph(entity, documentTree); - } - } catch (NoSuchElementException e) { - addNewEntityToGraph(entity, documentTree); + } else { + addEntityToGraph(entity, documentTree); } - } - - private void addNewEntityToGraph(TextEntity entity, DocumentTree documentTree) { - - entity.setDeepestFullyContainingNode(documentTree.getRoot().getNode()); - entityEnrichmentService.enrichEntity(entity, entity.getDeepestFullyContainingNode().getTextBlock()); - entity.addIntersectingNode(documentTree.getRoot().getNode()); - addToPages(entity); - addEntityToNodeEntitySets(entity); } @@ -1312,12 +1300,7 @@ public class EntityCreationService { private void addEntityToGraph(TextEntity entity, DocumentTree documentTree) { - SemanticNode containingNode = documentTree.childNodes(Collections.emptyList()) - .filter(node -> node.getTextBlock().containsTextRange(entity.getTextRange())) - .findFirst() - .orElseThrow(() -> new NoSuchElementException("No containing Node found!")); - - containingNode.addThisToEntityIfIntersects(entity); + documentTree.getRoot().getNode().addThisToEntityIfIntersects(entity); TextBlock textBlock = entity.getDeepestFullyContainingNode().getTextBlock(); entityEnrichmentService.enrichEntity(entity, textBlock); diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/AnalysisEnd2EndTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/AnalysisEnd2EndTest.java new file mode 100644 index 00000000..e79035e5 --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/AnalysisEnd2EndTest.java @@ -0,0 +1,327 @@ +package com.iqser.red.service.redaction.v1.server; + +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.when; + +import java.io.File; +import java.io.FileInputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.UUID; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; +import java.util.zip.GZIPInputStream; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.springframework.amqp.rabbit.core.RabbitTemplate; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.boot.test.mock.mockito.MockBean; +import org.springframework.context.annotation.Import; +import org.springframework.core.io.ClassPathResource; +import org.springframework.test.context.junit.jupiter.SpringExtension; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Sets; +import com.iqser.red.commons.jackson.ObjectMapperFactory; +import com.iqser.red.service.dictionarymerge.commons.DictionaryEntryModel; +import com.iqser.red.service.persistence.service.v1.api.shared.model.AnalyzeRequest; +import com.iqser.red.service.persistence.service.v1.api.shared.model.RuleFileType; +import com.iqser.red.service.persistence.service.v1.api.shared.model.annotations.ManualRedactions; +import com.iqser.red.service.persistence.service.v1.api.shared.model.common.JSONPrimitive; +import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileType; +import com.iqser.red.service.redaction.v1.server.client.DictionaryClient; +import com.iqser.red.service.redaction.v1.server.client.LegalBasisClient; +import com.iqser.red.service.redaction.v1.server.client.RulesClient; +import com.iqser.red.service.redaction.v1.server.model.dictionary.Dictionary; +import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryIncrement; +import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryModel; +import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryVersion; +import com.iqser.red.service.redaction.v1.server.service.AnalyzeService; +import com.iqser.red.service.redaction.v1.server.service.DictionaryService; +import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService; +import com.iqser.red.service.redaction.v1.server.utils.exception.NotFoundException; +import com.iqser.red.storage.commons.service.StorageService; +import com.knecon.fforesight.tenantcommons.TenantsClient; + +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +@ExtendWith(SpringExtension.class) +@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT) +@Import(RedactionIntegrationTest.RedactionIntegrationTestConfiguration.class) +@Disabled +/* + * This test is meant to be used directly with a download from blob storage (e.g. minio). You need to define the dossier template you want to use by supplying an absolute path. + * The dossier template will then be parsed for dictionaries, colors, entities, and rules. This is defined for the all tests once. + * Inside a test you supply a path to your minio download folder. The files should still be zipped in this folder. + * The files will then be checked for completeness and uploaded to the FileSystemBackedStorageService. + * This way you can recreate what is happening on the stack almost exactly. + */ public class AnalysisEnd2EndTest { + + Path dossierTemplateToUse = Path.of("/home/kschuettler/iqser/business-logic/redactmanager/prod-cp-eu-reg/EFSA_sanitisation_GFL_v1"); // Add your dossier-template here + ObjectMapper mapper = ObjectMapperFactory.create(); + final String TENANT_ID = "tenant"; + + @Autowired + StorageService storageService; + + @Autowired + protected AnalyzeService analyzeService; + + @MockBean + DictionaryService dictionaryService; + + @MockBean + RabbitTemplate rabbitTemplate; + + TestDossierTemplate testDossierTemplate; + @MockBean + protected LegalBasisClient legalBasisClient; + + @MockBean + private TenantsClient tenantsClient; + + @MockBean + protected RulesClient rulesClient; + + @MockBean + protected DictionaryClient dictionaryClient; + + + @Test + @SneakyThrows + public void runAnalysisEnd2End() { + + String folder = "files/end2end/file0"; // Should contain all files from minio directly, still zipped. Can contain multiple files. + + Path absoluteFolderPath; + if (folder.startsWith("files")) { // if it starts with "files" it is most likely in the resources folder, else it should be an absolute path + ClassPathResource classPathResource = new ClassPathResource(folder); + absoluteFolderPath = classPathResource.getFile().toPath(); + } else { + absoluteFolderPath = Path.of(folder); + } + + log.info("Starting end2end analyses for all distinct filenames in folder: {}", folder); + List analyzeRequests = prepareStorageForFolder(absoluteFolderPath); + log.info("Found {} distinct fileIds", analyzeRequests.size()); + for (int i = 0; i < analyzeRequests.size(); i++) { + AnalyzeRequest analyzeRequest = analyzeRequests.get(i); + log.info("{}/{}: Starting analysis for file {}", i + 1, analyzeRequests.size(), analyzeRequest.getFileId()); + analyzeService.analyze(analyzeRequest); + } + } + + + @BeforeEach + public void setup() { + + testDossierTemplate = new TestDossierTemplate(dossierTemplateToUse); + when(dictionaryService.updateDictionary(any(), any())).thenReturn(new DictionaryVersion(0, 0)); + when(dictionaryService.getDeepCopyDictionary(any(), any())).thenReturn(testDossierTemplate.testDictionary); + when(dictionaryService.getDictionaryIncrements(any(), any(), any())).thenReturn(new DictionaryIncrement(Collections.emptySet(), new DictionaryVersion(0, 0))); + when(dictionaryService.isHint(any(String.class), any())).thenAnswer(invocation -> { + String type = invocation.getArgument(0); + return testDossierTemplate.testDictionary.getType(type).isHint(); + }); + when(dictionaryService.getColor(any(String.class), any())).thenAnswer(invocation -> { + String type = invocation.getArgument(0); + return testDossierTemplate.testDictionary.getType(type).getColor(); + }); + when(dictionaryService.getNotRedactedColor(any())).thenReturn(new float[]{0.2f, 0.2f, 0.2f}); + + when(rulesClient.getVersion(testDossierTemplate.id, RuleFileType.ENTITY)).thenReturn(System.currentTimeMillis()); + when(rulesClient.getRules(testDossierTemplate.id, RuleFileType.ENTITY)).thenReturn(JSONPrimitive.of(testDossierTemplate.rules)); + when(rulesClient.getVersion(testDossierTemplate.id, RuleFileType.COMPONENT)).thenReturn(testDossierTemplate.componentRules != null ? System.currentTimeMillis() : -1); + when(rulesClient.getRules(testDossierTemplate.id, RuleFileType.COMPONENT)).thenReturn(JSONPrimitive.of(testDossierTemplate.componentRules)); + } + + + @SneakyThrows + private List prepareStorageForFolder(Path folder) { + + return Files.list(folder) + .map(this::parseFileId) + .distinct() + .map(fileId -> prepareStorageForFile(fileId, folder)) + .toList(); + } + + + private String parseFileId(Path path) { + + return path.getFileName().toString().split("\\.")[0]; + } + + + @SneakyThrows + private AnalyzeRequest prepareStorageForFile(String fileId, Path folder) { + + AnalyzeRequest request = new AnalyzeRequest(); + request.setDossierId(UUID.randomUUID().toString()); + request.setFileId(UUID.randomUUID().toString()); + request.setDossierTemplateId(testDossierTemplate.id); + request.setManualRedactions(new ManualRedactions()); + request.setAnalysisNumber(-1); + + Set endingsToUpload = Set.of("ORIGIN", + "DOCUMENT_PAGES", + "DOCUMENT_POSITION", + "DOCUMENT_STRUCTURE", + "DOCUMENT_TEXT", + "IMAGE_INFO", + "NER_ENTITIES", + "TABLES", + "IMPORTED_REDACTIONS") + .stream() + .map(FileType::valueOf) + .collect(Collectors.toSet()); + + Set uploadedFileTypes = Files.walk(folder) + .filter(path -> path.toFile().isFile()) + .filter(path -> endingsToUpload.contains(parseFileTypeFromPath(path))) + .map(filePath -> uploadFile(filePath, request)) + .collect(Collectors.toUnmodifiableSet()); + + Set missingFileTypes = Sets.difference(endingsToUpload, uploadedFileTypes); + if (!missingFileTypes.isEmpty()) { + log.error("Folder {} is missing files of type {}", + folder.toFile(), + missingFileTypes.stream() + .map(Enum::toString) + .collect(Collectors.joining(", "))); + throw new NotFoundException("Not all required file types are present."); + } + return request; + } + + + private static FileType parseFileTypeFromPath(Path path) { + + return FileType.valueOf(path.getFileName().toString().split("\\.")[1]); + } + + + @SneakyThrows + private FileType uploadFile(Path path, AnalyzeRequest request) { + + FileType fileType = parseFileTypeFromPath(path); + try (var fis = new FileInputStream(path.toFile()); var in = new GZIPInputStream(fis);) { + storageService.storeObject(TENANT_ID, RedactionStorageService.StorageIdUtils.getStorageId(request.getDossierId(), request.getFileId(), fileType), in); + + } + return fileType; + } + + + private class TestDossierTemplate { + + String id; + Dictionary testDictionary; + AtomicInteger dictEntryIdCounter = new AtomicInteger(0); + String rules; + String componentRules; + + + @SneakyThrows + TestDossierTemplate(Path dossierTemplateToUse) { + + Map dossierTemplate = mapper.readValue(dossierTemplateToUse.resolve("dossierTemplate.json").toFile(), HashMap.class); + this.id = (String) dossierTemplate.get("dossierTemplateId"); + List dictionaries = Files.walk(dossierTemplateToUse) + .filter(path -> path.getFileName().toString().equals("dossierType.json")) + .map(this::loadDictionaryModel) + .toList(); + File ruleFile = dossierTemplateToUse.resolve("rules.drl").toFile(); + rules = new String(Files.readAllBytes(ruleFile.toPath())); + + File componentRuleFile = dossierTemplateToUse.resolve("componentRules.drl").toFile(); + if (componentRuleFile.exists()) { + componentRules = new String(Files.readAllBytes(componentRuleFile.toPath())); + } + + testDictionary = new Dictionary(dictionaries, new DictionaryVersion(0, 0)); + } + + + @SneakyThrows + private DictionaryModel loadDictionaryModel(Path path) { + + Map model = mapper.readValue(path.toFile(), HashMap.class); + Set entries = new HashSet<>(); + Set falsePositives = new HashSet<>(); + Set falseRecommendations = new HashSet<>(); + + String type = (String) model.get("type"); + Integer rank = (Integer) model.get("rank"); + float[] color = hexToFloatArr((String) model.get("hexColor")); + Boolean caseInsensitive = (Boolean) model.get("caseInsensitive"); + Boolean hint = (Boolean) model.get("hint"); + Boolean hasDictionary = (Boolean) model.get("hasDictionary"); + + boolean isDossierDictionary; + if (model.containsKey("dossierDictionaryOnly")) { + isDossierDictionary = true; + } else { + isDossierDictionary = ((String) model.get("id")).split(":").length == 3; + } + + if (hasDictionary) { + try (var in = new FileInputStream(path.getParent().resolve("entries.txt").toFile())) { + entries.addAll(parseDictionaryEntryModelFromFile(new String(in.readAllBytes()), dictEntryIdCounter, (String) model.get("typeId"))); + } + try (var in = new FileInputStream(path.getParent().resolve("falsePositives.txt").toFile())) { + falsePositives.addAll(parseDictionaryEntryModelFromFile(new String(in.readAllBytes()), dictEntryIdCounter, (String) model.get("typeId"))); + } + try (var in = new FileInputStream(path.getParent().resolve("falseRecommendations.txt").toFile())) { + falseRecommendations.addAll(parseDictionaryEntryModelFromFile(new String(in.readAllBytes()), dictEntryIdCounter, (String) model.get("typeId"))); + } + } + + return new DictionaryModel(type, rank, color, caseInsensitive, hint, entries, falsePositives, falseRecommendations, isDossierDictionary); + } + + + private Set parseDictionaryEntryModelFromFile(String s, AtomicInteger dictEntryIdCounter, String typeId) { + + String[] values = s.split("\n"); + return Arrays.stream(values) + .map(value -> new DictionaryEntryModel(dictEntryIdCounter.getAndIncrement(), value, 0L, false, typeId)) + .collect(Collectors.toUnmodifiableSet()); + } + + + private float[] hexToFloatArr(String hexColor) { + + // Remove # symbol if present + String cleanHexColor = hexColor.replace("#", ""); + + // Parse hex string into RGB components + int r = Integer.parseInt(cleanHexColor.substring(0, 2), 16); + int g = Integer.parseInt(cleanHexColor.substring(2, 4), 16); + int b = Integer.parseInt(cleanHexColor.substring(4, 6), 16); + + // Normalize RGB values to floats between 0 and 1 + float[] rgbFloat = new float[3]; + rgbFloat[0] = r / 255.0f; + rgbFloat[1] = g / 255.0f; + rgbFloat[2] = b / 255.0f; + + return rgbFloat; + } + + } + +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/log4j2-test.xml b/redaction-service-v1/redaction-service-server-v1/src/test/resources/log4j2-test.xml deleted file mode 100644 index b4895cfb..00000000 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/log4j2-test.xml +++ /dev/null @@ -1,16 +0,0 @@ - - - - - - - - - - - - - - - - diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/logback-spring.xml b/redaction-service-v1/redaction-service-server-v1/src/test/resources/logback-spring.xml new file mode 100644 index 00000000..33b2cef7 --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/logback-spring.xml @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + \ No newline at end of file