Merge branch 'RED-7384-fp' into 'master'
RED-7384: improve performance significantly Closes RED-7384 See merge request redactmanager/redaction-service!385
This commit is contained in:
commit
dce2d1b898
@ -12,7 +12,7 @@ plugins {
|
||||
description = "redaction-service-server-v1"
|
||||
|
||||
|
||||
val layoutParserVersion = "0.107.0"
|
||||
val layoutParserVersion = "0.116.0"
|
||||
val jacksonVersion = "2.15.2"
|
||||
val droolsVersion = "9.44.0.Final"
|
||||
val pdfBoxVersion = "3.0.0"
|
||||
|
||||
@ -2,6 +2,7 @@ package com.iqser.red.service.redaction.v1.server.model.document;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
@ -122,6 +123,68 @@ public class DocumentTree {
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Finds all child nodes of the specified entry, whose nodes textRange intersects the given textRange. It achieves this by finding the first entry, whose textRange contains the start idx of the TextRange using a binary search.
|
||||
* It then iterates over the remaining children adding them to the intersections, until one does not contain the end of the TextRange. All intersected Entries are returned as SemanticNodes.
|
||||
*
|
||||
* @param treeId the treeId of the Entry whose children shall be checked.
|
||||
* @param textRange The TextRange to find intersecting childNodes for.
|
||||
* @return A list of all SemanticNodes, that are direct children of the specified Entry, whose TextRange intersects the given TextRange
|
||||
*/
|
||||
public List<SemanticNode> findIntersectingChildNodes(List<Integer> treeId, TextRange textRange) {
|
||||
|
||||
List<Entry> childEntries = getEntryById(treeId).getChildren();
|
||||
List<SemanticNode> intersectingChildEntries = new LinkedList<>();
|
||||
int startIdx = findFirstIdxOfContainingChildBinarySearch(childEntries, textRange.start());
|
||||
if (startIdx < 0) {
|
||||
return intersectingChildEntries;
|
||||
}
|
||||
for (int i = startIdx; i < childEntries.size(); i++) {
|
||||
if (childEntries.get(i).getNode().getTextRange().start() < textRange.end()) {
|
||||
intersectingChildEntries.add(childEntries.get(i).getNode());
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return intersectingChildEntries;
|
||||
}
|
||||
|
||||
|
||||
public Optional<SemanticNode> findFirstContainingChild(List<Integer> treeId, TextRange textRange) {
|
||||
|
||||
List<Entry> childEntries = getEntryById(treeId).getChildren();
|
||||
int startIdx = findFirstIdxOfContainingChildBinarySearch(childEntries, textRange.start());
|
||||
if (startIdx < 0) {
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
if (childEntries.get(startIdx).getNode().getTextRange().contains(textRange.end())) {
|
||||
return Optional.of(childEntries.get(startIdx).getNode());
|
||||
}
|
||||
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
|
||||
private int findFirstIdxOfContainingChildBinarySearch(List<Entry> childNodes, int start) {
|
||||
|
||||
int low = 0;
|
||||
int high = childNodes.size() - 1;
|
||||
while (low <= high) {
|
||||
int mid = low + (high - low) / 2;
|
||||
TextRange range = childNodes.get(mid).getNode().getTextRange();
|
||||
if (range.start() > start) {
|
||||
high = mid - 1;
|
||||
} else if (range.end() <= start) {
|
||||
low = mid + 1;
|
||||
} else {
|
||||
return mid;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
public Stream<SemanticNode> childNodesOfType(List<Integer> treeId, NodeType nodeType) {
|
||||
|
||||
return getEntryById(treeId).children.stream()
|
||||
@ -252,7 +315,7 @@ public class DocumentTree {
|
||||
List<Integer> treeId;
|
||||
SemanticNode node;
|
||||
@Builder.Default
|
||||
List<Entry> children = new LinkedList<>();
|
||||
List<Entry> children = new ArrayList<>();
|
||||
|
||||
|
||||
@Override
|
||||
|
||||
@ -165,7 +165,8 @@ public class TextRange implements Comparable<TextRange> {
|
||||
}
|
||||
List<TextRange> splitBoundaries = new LinkedList<>();
|
||||
int previousIndex = start;
|
||||
for (int splitIndex : splitIndices) {
|
||||
for (int i = 0, splitIndicesSize = splitIndices.size(); i < splitIndicesSize; i++) {
|
||||
int splitIndex = splitIndices.get(i);
|
||||
|
||||
// skip split if it would produce a boundary of length 0
|
||||
if (splitIndex == previousIndex) {
|
||||
|
||||
@ -47,6 +47,8 @@ public class Image implements GenericSemanticNode, IEntity {
|
||||
List<Integer> treeId;
|
||||
String id;
|
||||
|
||||
TextBlock leafTextBlock;
|
||||
|
||||
ImageType imageType;
|
||||
boolean transparent;
|
||||
Rectangle2D position;
|
||||
@ -57,14 +59,11 @@ public class Image implements GenericSemanticNode, IEntity {
|
||||
@Builder.Default
|
||||
ManualChangeOverwrite manualOverwrite = new ManualChangeOverwrite();
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
Page page;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
DocumentTree documentTree;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<TextEntity> entities = new HashSet<>();
|
||||
|
||||
|
||||
@ -78,9 +77,7 @@ public class Image implements GenericSemanticNode, IEntity {
|
||||
@Override
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
return streamAllSubNodes().filter(SemanticNode::isLeaf)
|
||||
.map(SemanticNode::getLeafTextBlock)
|
||||
.collect(new TextBlockCollector());
|
||||
return leafTextBlock;
|
||||
}
|
||||
|
||||
|
||||
@ -94,15 +91,21 @@ public class Image implements GenericSemanticNode, IEntity {
|
||||
@Override
|
||||
public TextRange getTextRange() {
|
||||
|
||||
return GenericSemanticNode.super.getTextRange();
|
||||
return leafTextBlock.getTextRange();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int length() {
|
||||
|
||||
return getTextRange().length();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String type() {
|
||||
|
||||
return getManualOverwrite().getType()
|
||||
.orElse(imageType.toString().toLowerCase(Locale.ENGLISH));
|
||||
return getManualOverwrite().getType().orElse(imageType.toString().toLowerCase(Locale.ENGLISH));
|
||||
}
|
||||
|
||||
|
||||
@ -160,10 +163,4 @@ public class Image implements GenericSemanticNode, IEntity {
|
||||
return (area / calculatedIntersection) > containmentThreshold;
|
||||
}
|
||||
|
||||
|
||||
public int length() {
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -626,7 +626,7 @@ public interface SemanticNode {
|
||||
textEntity.setDeepestFullyContainingNode(this);
|
||||
}
|
||||
textEntity.addIntersectingNode(this);
|
||||
streamChildren().filter(semanticNode -> semanticNode.getTextRange().intersects(textEntity.getTextRange()))
|
||||
getDocumentTree().findIntersectingChildNodes(getTreeId(), textEntity.getTextRange())
|
||||
.forEach(node -> node.addThisToEntityIfIntersects(textEntity));
|
||||
}
|
||||
}
|
||||
@ -714,8 +714,7 @@ public interface SemanticNode {
|
||||
if (isLeaf()) {
|
||||
return getTextBlock().getPositionsPerPage(textRange);
|
||||
}
|
||||
Optional<SemanticNode> containingChildNode = streamChildren().filter(child -> child.getTextRange().contains(textRange))
|
||||
.findFirst();
|
||||
Optional<SemanticNode> containingChildNode = getDocumentTree().findFirstContainingChild(getTreeId(), textRange);
|
||||
if (containingChildNode.isEmpty()) {
|
||||
return getTextBlock().getPositionsPerPage(textRange);
|
||||
}
|
||||
|
||||
@ -17,7 +17,7 @@ public class MessageReceiver {
|
||||
|
||||
|
||||
@RabbitHandler
|
||||
@RabbitListener(queues = REDACTION_QUEUE)
|
||||
@RabbitListener(queues = REDACTION_QUEUE, concurrency = "1")
|
||||
public void receiveAnalyzeRequest(Message message) {
|
||||
|
||||
redactionMessageReceiver.receiveAnalyzeRequest(message, false);
|
||||
|
||||
@ -17,7 +17,7 @@ public class PriorityMessageReceiver {
|
||||
|
||||
|
||||
@RabbitHandler
|
||||
@RabbitListener(queues = REDACTION_PRIORITY_QUEUE)
|
||||
@RabbitListener(queues = REDACTION_PRIORITY_QUEUE, concurrency = "1")
|
||||
public void receiveAnalyzeRequest(Message message) {
|
||||
|
||||
redactionMessageReceiver.receiveAnalyzeRequest(message, true);
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
package com.iqser.red.service.redaction.v1.server.service.document;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
@ -8,23 +9,23 @@ import java.util.Map;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.Set;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.DocumentData;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.DocumentTree;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.DuplicatedParagraph;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Footer;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Header;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Headline;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Image;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Paragraph;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Section;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNode;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Table;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.TableCell;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.textblock.AtomicTextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlockCollector;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.DocumentData;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Headline;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
||||
@ -58,7 +59,7 @@ public class DocumentGraphMapper {
|
||||
|
||||
private List<DocumentTree.Entry> buildEntries(List<DocumentStructure.EntryData> entries, Context context) {
|
||||
|
||||
List<DocumentTree.Entry> newEntries = new LinkedList<>();
|
||||
List<DocumentTree.Entry> newEntries = new ArrayList<>(entries.size());
|
||||
for (DocumentStructure.EntryData entryData : entries) {
|
||||
|
||||
List<Page> pages = Arrays.stream(entryData.getPageNumbers())
|
||||
@ -191,8 +192,7 @@ public class DocumentGraphMapper {
|
||||
|
||||
return context.pageData.stream()
|
||||
.filter(page -> page.getNumber() == Math.toIntExact(pageIndex))
|
||||
.findFirst()
|
||||
.orElseThrow(() -> new NoSuchElementException(String.format("ClassificationPage with number %d not found", pageIndex)));
|
||||
.findFirst().orElseThrow(() -> new NoSuchElementException(String.format("ClassificationPage with number %d not found", pageIndex)));
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -1,14 +1,18 @@
|
||||
package com.iqser.red.service.redaction.v1.server.service.document;
|
||||
|
||||
import static com.iqser.red.service.redaction.v1.server.service.document.EntityCreationUtility.*;
|
||||
import static com.iqser.red.service.redaction.v1.server.service.document.EntityCreationUtility.addEntityToNodeEntitySets;
|
||||
import static com.iqser.red.service.redaction.v1.server.service.document.EntityCreationUtility.addToPages;
|
||||
import static com.iqser.red.service.redaction.v1.server.service.document.EntityCreationUtility.allEntitiesIntersectAndHaveSameTypes;
|
||||
import static com.iqser.red.service.redaction.v1.server.service.document.EntityCreationUtility.checkIfBothStartAndEndAreEmpty;
|
||||
import static com.iqser.red.service.redaction.v1.server.service.document.EntityCreationUtility.findIntersectingSubNodes;
|
||||
import static com.iqser.red.service.redaction.v1.server.service.document.EntityCreationUtility.toLineAfterTextRange;
|
||||
import static com.iqser.red.service.redaction.v1.server.service.document.EntityCreationUtility.truncateEndIfLineBreakIsBetween;
|
||||
import static com.iqser.red.service.redaction.v1.server.utils.SeparatorUtils.boundaryIsSurroundedBySeparators;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
@ -985,7 +989,7 @@ public class EntityCreationService {
|
||||
.peek(e -> e.addEngines(engines))
|
||||
.findAny();
|
||||
if (optionalTextEntity.isEmpty()) {
|
||||
return optionalTextEntity; // Entity has been recategorized and should not be created at all.
|
||||
return Optional.empty(); // Entity has been recategorized and should not be created at all.
|
||||
}
|
||||
TextEntity existingEntity = optionalTextEntity.get();
|
||||
if (existingEntity.getTextRange().equals(textRange)) {
|
||||
@ -997,7 +1001,7 @@ public class EntityCreationService {
|
||||
}
|
||||
return Optional.empty(); // Entity has been resized, if there are duplicates they should be treated there
|
||||
}
|
||||
addEntityToGraph(entity, node);
|
||||
addEntityToGraph(entity, node.getDocumentTree());
|
||||
entity.addEngines(engines);
|
||||
insertToKieSession(entity);
|
||||
return Optional.of(entity);
|
||||
@ -1027,15 +1031,16 @@ public class EntityCreationService {
|
||||
|
||||
|
||||
/**
|
||||
* Merges a list of text entities into a single entity, assuming they intersect and are of the same type.
|
||||
*
|
||||
* @param entitiesToMerge The list of entities to merge.
|
||||
* @param type The type for the merged entity.
|
||||
* @param entityType The entity's classification.
|
||||
* @param node The semantic node related to these entities.
|
||||
* @return A single merged {@link TextEntity}.
|
||||
* @throws IllegalArgumentException If entities do not intersect or have different types.
|
||||
* @deprecated Do not use anymore. This might not work correctly due to duplicate textranges not being taken into account here.
|
||||
* Merges a list of text entities into a single entity, assuming they intersect and are of the same type.
|
||||
*/
|
||||
@Deprecated(forRemoval = true)
|
||||
public TextEntity mergeEntitiesOfSameType(List<TextEntity> entitiesToMerge, String type, EntityType entityType, SemanticNode node) {
|
||||
|
||||
if (!allEntitiesIntersectAndHaveSameTypes(entitiesToMerge)) {
|
||||
@ -1070,6 +1075,8 @@ public class EntityCreationService {
|
||||
mergedEntity.setDossierDictionaryEntry(entitiesToMerge.stream()
|
||||
.anyMatch(TextEntity::isDossierDictionaryEntry));
|
||||
|
||||
entityEnrichmentService.enrichEntity(mergedEntity, node.getTextBlock());
|
||||
|
||||
addEntityToGraph(mergedEntity, node);
|
||||
insertToKieSession(mergedEntity);
|
||||
|
||||
@ -1245,38 +1252,19 @@ public class EntityCreationService {
|
||||
public void addEntityToGraph(TextEntity entity, SemanticNode node) {
|
||||
|
||||
DocumentTree documentTree = node.getDocumentTree();
|
||||
try {
|
||||
if (node.getEntities().contains(entity)) {
|
||||
// If entity already exists and it has a different text range, we add the text range to the list of duplicated text ranges
|
||||
Optional<TextEntity> optionalTextEntity = node.getEntities()
|
||||
.stream()//
|
||||
.filter(e -> e.equals(entity))//
|
||||
.filter(e -> !e.getTextRange().equals(entity.getTextRange()))//
|
||||
.findAny();
|
||||
if (optionalTextEntity.isPresent()) {
|
||||
addDuplicateEntityToGraph(optionalTextEntity.get(), entity.getTextRange(), node);
|
||||
} else {
|
||||
node.getEntities().remove(entity);
|
||||
addNewEntityToGraph(entity, documentTree);
|
||||
}
|
||||
if (node.getEntities().contains(entity)) {
|
||||
// If entity already exists and it has a different text range, we add the text range to the list of duplicated text ranges
|
||||
node.getEntities()
|
||||
.stream()//
|
||||
.filter(e -> e.equals(entity))//
|
||||
.filter(e -> !e.getTextRange().equals(entity.getTextRange()))//
|
||||
.findAny()
|
||||
.ifPresent(e -> addDuplicateEntityToGraph(e, entity.getTextRange(), node));
|
||||
|
||||
} else {
|
||||
entity.addIntersectingNode(documentTree.getRoot().getNode());
|
||||
addEntityToGraph(entity, documentTree);
|
||||
}
|
||||
} catch (NoSuchElementException e) {
|
||||
addNewEntityToGraph(entity, documentTree);
|
||||
} else {
|
||||
addEntityToGraph(entity, documentTree);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void addNewEntityToGraph(TextEntity entity, DocumentTree documentTree) {
|
||||
|
||||
entity.setDeepestFullyContainingNode(documentTree.getRoot().getNode());
|
||||
entityEnrichmentService.enrichEntity(entity, entity.getDeepestFullyContainingNode().getTextBlock());
|
||||
entity.addIntersectingNode(documentTree.getRoot().getNode());
|
||||
addToPages(entity);
|
||||
addEntityToNodeEntitySets(entity);
|
||||
}
|
||||
|
||||
|
||||
@ -1312,12 +1300,7 @@ public class EntityCreationService {
|
||||
|
||||
private void addEntityToGraph(TextEntity entity, DocumentTree documentTree) {
|
||||
|
||||
SemanticNode containingNode = documentTree.childNodes(Collections.emptyList())
|
||||
.filter(node -> node.getTextBlock().containsTextRange(entity.getTextRange()))
|
||||
.findFirst()
|
||||
.orElseThrow(() -> new NoSuchElementException("No containing Node found!"));
|
||||
|
||||
containingNode.addThisToEntityIfIntersects(entity);
|
||||
documentTree.getRoot().getNode().addThisToEntityIfIntersects(entity);
|
||||
|
||||
TextBlock textBlock = entity.getDeepestFullyContainingNode().getTextBlock();
|
||||
entityEnrichmentService.enrichEntity(entity, textBlock);
|
||||
|
||||
@ -0,0 +1,327 @@
|
||||
package com.iqser.red.service.redaction.v1.server;
|
||||
|
||||
import static org.mockito.ArgumentMatchers.any;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.UUID;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.extension.ExtendWith;
|
||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.boot.test.context.SpringBootTest;
|
||||
import org.springframework.boot.test.mock.mockito.MockBean;
|
||||
import org.springframework.context.annotation.Import;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
import org.springframework.test.context.junit.jupiter.SpringExtension;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.common.collect.Sets;
|
||||
import com.iqser.red.commons.jackson.ObjectMapperFactory;
|
||||
import com.iqser.red.service.dictionarymerge.commons.DictionaryEntryModel;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.AnalyzeRequest;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.RuleFileType;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.annotations.ManualRedactions;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.common.JSONPrimitive;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileType;
|
||||
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
|
||||
import com.iqser.red.service.redaction.v1.server.client.LegalBasisClient;
|
||||
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
|
||||
import com.iqser.red.service.redaction.v1.server.model.dictionary.Dictionary;
|
||||
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryIncrement;
|
||||
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryModel;
|
||||
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryVersion;
|
||||
import com.iqser.red.service.redaction.v1.server.service.AnalyzeService;
|
||||
import com.iqser.red.service.redaction.v1.server.service.DictionaryService;
|
||||
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
|
||||
import com.iqser.red.service.redaction.v1.server.utils.exception.NotFoundException;
|
||||
import com.iqser.red.storage.commons.service.StorageService;
|
||||
import com.knecon.fforesight.tenantcommons.TenantsClient;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@ExtendWith(SpringExtension.class)
|
||||
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
|
||||
@Import(RedactionIntegrationTest.RedactionIntegrationTestConfiguration.class)
|
||||
@Disabled
|
||||
/*
|
||||
* This test is meant to be used directly with a download from blob storage (e.g. minio). You need to define the dossier template you want to use by supplying an absolute path.
|
||||
* The dossier template will then be parsed for dictionaries, colors, entities, and rules. This is defined for the all tests once.
|
||||
* Inside a test you supply a path to your minio download folder. The files should still be zipped in this folder.
|
||||
* The files will then be checked for completeness and uploaded to the FileSystemBackedStorageService.
|
||||
* This way you can recreate what is happening on the stack almost exactly.
|
||||
*/ public class AnalysisEnd2EndTest {
|
||||
|
||||
Path dossierTemplateToUse = Path.of("/home/kschuettler/iqser/business-logic/redactmanager/prod-cp-eu-reg/EFSA_sanitisation_GFL_v1"); // Add your dossier-template here
|
||||
ObjectMapper mapper = ObjectMapperFactory.create();
|
||||
final String TENANT_ID = "tenant";
|
||||
|
||||
@Autowired
|
||||
StorageService storageService;
|
||||
|
||||
@Autowired
|
||||
protected AnalyzeService analyzeService;
|
||||
|
||||
@MockBean
|
||||
DictionaryService dictionaryService;
|
||||
|
||||
@MockBean
|
||||
RabbitTemplate rabbitTemplate;
|
||||
|
||||
TestDossierTemplate testDossierTemplate;
|
||||
@MockBean
|
||||
protected LegalBasisClient legalBasisClient;
|
||||
|
||||
@MockBean
|
||||
private TenantsClient tenantsClient;
|
||||
|
||||
@MockBean
|
||||
protected RulesClient rulesClient;
|
||||
|
||||
@MockBean
|
||||
protected DictionaryClient dictionaryClient;
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void runAnalysisEnd2End() {
|
||||
|
||||
String folder = "files/end2end/file0"; // Should contain all files from minio directly, still zipped. Can contain multiple files.
|
||||
|
||||
Path absoluteFolderPath;
|
||||
if (folder.startsWith("files")) { // if it starts with "files" it is most likely in the resources folder, else it should be an absolute path
|
||||
ClassPathResource classPathResource = new ClassPathResource(folder);
|
||||
absoluteFolderPath = classPathResource.getFile().toPath();
|
||||
} else {
|
||||
absoluteFolderPath = Path.of(folder);
|
||||
}
|
||||
|
||||
log.info("Starting end2end analyses for all distinct filenames in folder: {}", folder);
|
||||
List<AnalyzeRequest> analyzeRequests = prepareStorageForFolder(absoluteFolderPath);
|
||||
log.info("Found {} distinct fileIds", analyzeRequests.size());
|
||||
for (int i = 0; i < analyzeRequests.size(); i++) {
|
||||
AnalyzeRequest analyzeRequest = analyzeRequests.get(i);
|
||||
log.info("{}/{}: Starting analysis for file {}", i + 1, analyzeRequests.size(), analyzeRequest.getFileId());
|
||||
analyzeService.analyze(analyzeRequest);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@BeforeEach
|
||||
public void setup() {
|
||||
|
||||
testDossierTemplate = new TestDossierTemplate(dossierTemplateToUse);
|
||||
when(dictionaryService.updateDictionary(any(), any())).thenReturn(new DictionaryVersion(0, 0));
|
||||
when(dictionaryService.getDeepCopyDictionary(any(), any())).thenReturn(testDossierTemplate.testDictionary);
|
||||
when(dictionaryService.getDictionaryIncrements(any(), any(), any())).thenReturn(new DictionaryIncrement(Collections.emptySet(), new DictionaryVersion(0, 0)));
|
||||
when(dictionaryService.isHint(any(String.class), any())).thenAnswer(invocation -> {
|
||||
String type = invocation.getArgument(0);
|
||||
return testDossierTemplate.testDictionary.getType(type).isHint();
|
||||
});
|
||||
when(dictionaryService.getColor(any(String.class), any())).thenAnswer(invocation -> {
|
||||
String type = invocation.getArgument(0);
|
||||
return testDossierTemplate.testDictionary.getType(type).getColor();
|
||||
});
|
||||
when(dictionaryService.getNotRedactedColor(any())).thenReturn(new float[]{0.2f, 0.2f, 0.2f});
|
||||
|
||||
when(rulesClient.getVersion(testDossierTemplate.id, RuleFileType.ENTITY)).thenReturn(System.currentTimeMillis());
|
||||
when(rulesClient.getRules(testDossierTemplate.id, RuleFileType.ENTITY)).thenReturn(JSONPrimitive.of(testDossierTemplate.rules));
|
||||
when(rulesClient.getVersion(testDossierTemplate.id, RuleFileType.COMPONENT)).thenReturn(testDossierTemplate.componentRules != null ? System.currentTimeMillis() : -1);
|
||||
when(rulesClient.getRules(testDossierTemplate.id, RuleFileType.COMPONENT)).thenReturn(JSONPrimitive.of(testDossierTemplate.componentRules));
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private List<AnalyzeRequest> prepareStorageForFolder(Path folder) {
|
||||
|
||||
return Files.list(folder)
|
||||
.map(this::parseFileId)
|
||||
.distinct()
|
||||
.map(fileId -> prepareStorageForFile(fileId, folder))
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
private String parseFileId(Path path) {
|
||||
|
||||
return path.getFileName().toString().split("\\.")[0];
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private AnalyzeRequest prepareStorageForFile(String fileId, Path folder) {
|
||||
|
||||
AnalyzeRequest request = new AnalyzeRequest();
|
||||
request.setDossierId(UUID.randomUUID().toString());
|
||||
request.setFileId(UUID.randomUUID().toString());
|
||||
request.setDossierTemplateId(testDossierTemplate.id);
|
||||
request.setManualRedactions(new ManualRedactions());
|
||||
request.setAnalysisNumber(-1);
|
||||
|
||||
Set<FileType> endingsToUpload = Set.of("ORIGIN",
|
||||
"DOCUMENT_PAGES",
|
||||
"DOCUMENT_POSITION",
|
||||
"DOCUMENT_STRUCTURE",
|
||||
"DOCUMENT_TEXT",
|
||||
"IMAGE_INFO",
|
||||
"NER_ENTITIES",
|
||||
"TABLES",
|
||||
"IMPORTED_REDACTIONS")
|
||||
.stream()
|
||||
.map(FileType::valueOf)
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
Set<FileType> uploadedFileTypes = Files.walk(folder)
|
||||
.filter(path -> path.toFile().isFile())
|
||||
.filter(path -> endingsToUpload.contains(parseFileTypeFromPath(path)))
|
||||
.map(filePath -> uploadFile(filePath, request))
|
||||
.collect(Collectors.toUnmodifiableSet());
|
||||
|
||||
Set<FileType> missingFileTypes = Sets.difference(endingsToUpload, uploadedFileTypes);
|
||||
if (!missingFileTypes.isEmpty()) {
|
||||
log.error("Folder {} is missing files of type {}",
|
||||
folder.toFile(),
|
||||
missingFileTypes.stream()
|
||||
.map(Enum::toString)
|
||||
.collect(Collectors.joining(", ")));
|
||||
throw new NotFoundException("Not all required file types are present.");
|
||||
}
|
||||
return request;
|
||||
}
|
||||
|
||||
|
||||
private static FileType parseFileTypeFromPath(Path path) {
|
||||
|
||||
return FileType.valueOf(path.getFileName().toString().split("\\.")[1]);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private FileType uploadFile(Path path, AnalyzeRequest request) {
|
||||
|
||||
FileType fileType = parseFileTypeFromPath(path);
|
||||
try (var fis = new FileInputStream(path.toFile()); var in = new GZIPInputStream(fis);) {
|
||||
storageService.storeObject(TENANT_ID, RedactionStorageService.StorageIdUtils.getStorageId(request.getDossierId(), request.getFileId(), fileType), in);
|
||||
|
||||
}
|
||||
return fileType;
|
||||
}
|
||||
|
||||
|
||||
private class TestDossierTemplate {
|
||||
|
||||
String id;
|
||||
Dictionary testDictionary;
|
||||
AtomicInteger dictEntryIdCounter = new AtomicInteger(0);
|
||||
String rules;
|
||||
String componentRules;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
TestDossierTemplate(Path dossierTemplateToUse) {
|
||||
|
||||
Map<String, Object> dossierTemplate = mapper.readValue(dossierTemplateToUse.resolve("dossierTemplate.json").toFile(), HashMap.class);
|
||||
this.id = (String) dossierTemplate.get("dossierTemplateId");
|
||||
List<DictionaryModel> dictionaries = Files.walk(dossierTemplateToUse)
|
||||
.filter(path -> path.getFileName().toString().equals("dossierType.json"))
|
||||
.map(this::loadDictionaryModel)
|
||||
.toList();
|
||||
File ruleFile = dossierTemplateToUse.resolve("rules.drl").toFile();
|
||||
rules = new String(Files.readAllBytes(ruleFile.toPath()));
|
||||
|
||||
File componentRuleFile = dossierTemplateToUse.resolve("componentRules.drl").toFile();
|
||||
if (componentRuleFile.exists()) {
|
||||
componentRules = new String(Files.readAllBytes(componentRuleFile.toPath()));
|
||||
}
|
||||
|
||||
testDictionary = new Dictionary(dictionaries, new DictionaryVersion(0, 0));
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private DictionaryModel loadDictionaryModel(Path path) {
|
||||
|
||||
Map<String, Object> model = mapper.readValue(path.toFile(), HashMap.class);
|
||||
Set<DictionaryEntryModel> entries = new HashSet<>();
|
||||
Set<DictionaryEntryModel> falsePositives = new HashSet<>();
|
||||
Set<DictionaryEntryModel> falseRecommendations = new HashSet<>();
|
||||
|
||||
String type = (String) model.get("type");
|
||||
Integer rank = (Integer) model.get("rank");
|
||||
float[] color = hexToFloatArr((String) model.get("hexColor"));
|
||||
Boolean caseInsensitive = (Boolean) model.get("caseInsensitive");
|
||||
Boolean hint = (Boolean) model.get("hint");
|
||||
Boolean hasDictionary = (Boolean) model.get("hasDictionary");
|
||||
|
||||
boolean isDossierDictionary;
|
||||
if (model.containsKey("dossierDictionaryOnly")) {
|
||||
isDossierDictionary = true;
|
||||
} else {
|
||||
isDossierDictionary = ((String) model.get("id")).split(":").length == 3;
|
||||
}
|
||||
|
||||
if (hasDictionary) {
|
||||
try (var in = new FileInputStream(path.getParent().resolve("entries.txt").toFile())) {
|
||||
entries.addAll(parseDictionaryEntryModelFromFile(new String(in.readAllBytes()), dictEntryIdCounter, (String) model.get("typeId")));
|
||||
}
|
||||
try (var in = new FileInputStream(path.getParent().resolve("falsePositives.txt").toFile())) {
|
||||
falsePositives.addAll(parseDictionaryEntryModelFromFile(new String(in.readAllBytes()), dictEntryIdCounter, (String) model.get("typeId")));
|
||||
}
|
||||
try (var in = new FileInputStream(path.getParent().resolve("falseRecommendations.txt").toFile())) {
|
||||
falseRecommendations.addAll(parseDictionaryEntryModelFromFile(new String(in.readAllBytes()), dictEntryIdCounter, (String) model.get("typeId")));
|
||||
}
|
||||
}
|
||||
|
||||
return new DictionaryModel(type, rank, color, caseInsensitive, hint, entries, falsePositives, falseRecommendations, isDossierDictionary);
|
||||
}
|
||||
|
||||
|
||||
private Set<DictionaryEntryModel> parseDictionaryEntryModelFromFile(String s, AtomicInteger dictEntryIdCounter, String typeId) {
|
||||
|
||||
String[] values = s.split("\n");
|
||||
return Arrays.stream(values)
|
||||
.map(value -> new DictionaryEntryModel(dictEntryIdCounter.getAndIncrement(), value, 0L, false, typeId))
|
||||
.collect(Collectors.toUnmodifiableSet());
|
||||
}
|
||||
|
||||
|
||||
private float[] hexToFloatArr(String hexColor) {
|
||||
|
||||
// Remove # symbol if present
|
||||
String cleanHexColor = hexColor.replace("#", "");
|
||||
|
||||
// Parse hex string into RGB components
|
||||
int r = Integer.parseInt(cleanHexColor.substring(0, 2), 16);
|
||||
int g = Integer.parseInt(cleanHexColor.substring(2, 4), 16);
|
||||
int b = Integer.parseInt(cleanHexColor.substring(4, 6), 16);
|
||||
|
||||
// Normalize RGB values to floats between 0 and 1
|
||||
float[] rgbFloat = new float[3];
|
||||
rgbFloat[0] = r / 255.0f;
|
||||
rgbFloat[1] = g / 255.0f;
|
||||
rgbFloat[2] = b / 255.0f;
|
||||
|
||||
return rgbFloat;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,16 +0,0 @@
|
||||
<Configuration>
|
||||
|
||||
<Appenders>
|
||||
<Console name="CONSOLE" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="%d{HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/>
|
||||
</Console>
|
||||
</Appenders>
|
||||
|
||||
<Loggers>
|
||||
<Root level="warn">
|
||||
<AppenderRef ref="CONSOLE"/>
|
||||
</Root>
|
||||
<Logger name="com.iqser" level="info"/>
|
||||
</Loggers>
|
||||
|
||||
</Configuration>
|
||||
@ -0,0 +1,17 @@
|
||||
<configuration>

    <!-- Name of the appender used by the root logger, taken from the Spring property logging.type.
         Falls back to CONSOLE if the property is not set, so ${logType} never resolves to an undefined appender name. -->
    <springProperty scope="configuration" name="logType" source="logging.type" defaultValue="CONSOLE"/>
    <springProperty scope="context" name="application.name" source="spring.application.name"/>
    <springProperty scope="context" name="version" source="project.version"/>
    <!-- Spring Boot's logback defaults and its console appender (presumably the one named CONSOLE - confirm) -->
    <include resource="org/springframework/boot/logging/logback/defaults.xml"/>
    <include resource="org/springframework/boot/logging/logback/console-appender.xml"/>

    <!-- Console appender with Logstash encoding, selected with logging.type=JSON -->
    <appender name="JSON" class="ch.qos.logback.core.ConsoleAppender">
        <encoder class="net.logstash.logback.encoder.LogstashEncoder"/>
    </appender>

    <root level="INFO">
        <appender-ref ref="${logType}"/>
    </root>

</configuration>
|
||||
Loading…
x
Reference in New Issue
Block a user