Merge branch 'RED-7384-fp' into 'master'

RED-7384: improve performance significantly

Closes RED-7384

See merge request redactmanager/redaction-service!385
This commit is contained in:
Dominique Eifländer 2024-04-24 11:57:04 +02:00
commit dce2d1b898
12 changed files with 459 additions and 88 deletions

View File

@ -12,7 +12,7 @@ plugins {
description = "redaction-service-server-v1"
val layoutParserVersion = "0.107.0"
val layoutParserVersion = "0.116.0"
val jacksonVersion = "2.15.2"
val droolsVersion = "9.44.0.Final"
val pdfBoxVersion = "3.0.0"

View File

@ -2,6 +2,7 @@ package com.iqser.red.service.redaction.v1.server.model.document;
import static java.lang.String.format;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
@ -122,6 +123,68 @@ public class DocumentTree {
}
/**
 * Collects all direct children of the entry identified by {@code treeId} whose node's textRange
 * intersects the given textRange. A binary search first locates the child whose textRange contains
 * the start index of {@code textRange}; from there the children are scanned forward and collected
 * until one starts at or after the end of {@code textRange}.
 *
 * @param treeId the treeId of the Entry whose children shall be checked.
 * @param textRange the TextRange to find intersecting child nodes for.
 * @return all SemanticNodes that are direct children of the specified Entry and whose TextRange intersects the given TextRange
 */
public List<SemanticNode> findIntersectingChildNodes(List<Integer> treeId, TextRange textRange) {
    List<Entry> children = getEntryById(treeId).getChildren();
    List<SemanticNode> intersecting = new LinkedList<>();
    int first = findFirstIdxOfContainingChildBinarySearch(children, textRange.start());
    if (first < 0) {
        // No child contains the start index -> nothing intersects.
        return intersecting;
    }
    int i = first;
    // Children are ordered by textRange, so we can stop at the first child starting past the end.
    while (i < children.size() && children.get(i).getNode().getTextRange().start() < textRange.end()) {
        intersecting.add(children.get(i).getNode());
        i++;
    }
    return intersecting;
}
/**
 * Finds the direct child of the entry identified by {@code treeId} whose textRange fully contains
 * the given textRange, using a binary search over the ordered children.
 *
 * @param treeId the treeId of the Entry whose children shall be checked.
 * @param textRange the TextRange that must be fully contained by the returned child.
 * @return the containing child node, or {@link Optional#empty()} if no single child contains the whole range.
 */
public Optional<SemanticNode> findFirstContainingChild(List<Integer> treeId, TextRange textRange) {
    List<Entry> children = getEntryById(treeId).getChildren();
    int idx = findFirstIdxOfContainingChildBinarySearch(children, textRange.start());
    if (idx >= 0) {
        SemanticNode candidate = children.get(idx).getNode();
        // The candidate contains the start of the range; it only qualifies if it also contains the end.
        if (candidate.getTextRange().contains(textRange.end())) {
            return Optional.of(candidate);
        }
    }
    return Optional.empty();
}
/**
 * Binary search over {@code childNodes} (assumed ordered by textRange) for the index of the child
 * whose textRange contains {@code start}, i.e. range.start() <= start < range.end().
 *
 * @param childNodes the ordered children to search.
 * @param start the character index to locate.
 * @return the index of the containing child, or -1 if no child's range contains {@code start}.
 */
private int findFirstIdxOfContainingChildBinarySearch(List<Entry> childNodes, int start) {
    int lo = 0;
    int hi = childNodes.size() - 1;
    while (lo <= hi) {
        int mid = (lo + hi) >>> 1; // unsigned shift: overflow-safe midpoint
        TextRange range = childNodes.get(mid).getNode().getTextRange();
        if (start < range.start()) {
            hi = mid - 1;
        } else if (start >= range.end()) {
            lo = mid + 1;
        } else {
            return mid;
        }
    }
    return -1;
}
public Stream<SemanticNode> childNodesOfType(List<Integer> treeId, NodeType nodeType) {
return getEntryById(treeId).children.stream()
@ -252,7 +315,7 @@ public class DocumentTree {
List<Integer> treeId;
SemanticNode node;
@Builder.Default
List<Entry> children = new LinkedList<>();
List<Entry> children = new ArrayList<>();
@Override

View File

@ -165,7 +165,8 @@ public class TextRange implements Comparable<TextRange> {
}
List<TextRange> splitBoundaries = new LinkedList<>();
int previousIndex = start;
for (int splitIndex : splitIndices) {
for (int i = 0, splitIndicesSize = splitIndices.size(); i < splitIndicesSize; i++) {
int splitIndex = splitIndices.get(i);
// skip split if it would produce a boundary of length 0
if (splitIndex == previousIndex) {

View File

@ -47,6 +47,8 @@ public class Image implements GenericSemanticNode, IEntity {
List<Integer> treeId;
String id;
TextBlock leafTextBlock;
ImageType imageType;
boolean transparent;
Rectangle2D position;
@ -57,14 +59,11 @@ public class Image implements GenericSemanticNode, IEntity {
@Builder.Default
ManualChangeOverwrite manualOverwrite = new ManualChangeOverwrite();
@EqualsAndHashCode.Exclude
Page page;
@EqualsAndHashCode.Exclude
DocumentTree documentTree;
@Builder.Default
@EqualsAndHashCode.Exclude
Set<TextEntity> entities = new HashSet<>();
@ -78,9 +77,7 @@ public class Image implements GenericSemanticNode, IEntity {
@Override
public TextBlock getTextBlock() {
return streamAllSubNodes().filter(SemanticNode::isLeaf)
.map(SemanticNode::getLeafTextBlock)
.collect(new TextBlockCollector());
return leafTextBlock;
}
@ -94,15 +91,21 @@ public class Image implements GenericSemanticNode, IEntity {
@Override
public TextRange getTextRange() {
return GenericSemanticNode.super.getTextRange();
return leafTextBlock.getTextRange();
}
@Override
public int length() {
return getTextRange().length();
}
@Override
public String type() {
return getManualOverwrite().getType()
.orElse(imageType.toString().toLowerCase(Locale.ENGLISH));
return getManualOverwrite().getType().orElse(imageType.toString().toLowerCase(Locale.ENGLISH));
}
@ -160,10 +163,4 @@ public class Image implements GenericSemanticNode, IEntity {
return (area / calculatedIntersection) > containmentThreshold;
}
public int length() {
return 0;
}
}

View File

@ -626,7 +626,7 @@ public interface SemanticNode {
textEntity.setDeepestFullyContainingNode(this);
}
textEntity.addIntersectingNode(this);
streamChildren().filter(semanticNode -> semanticNode.getTextRange().intersects(textEntity.getTextRange()))
getDocumentTree().findIntersectingChildNodes(getTreeId(), textEntity.getTextRange())
.forEach(node -> node.addThisToEntityIfIntersects(textEntity));
}
}
@ -714,8 +714,7 @@ public interface SemanticNode {
if (isLeaf()) {
return getTextBlock().getPositionsPerPage(textRange);
}
Optional<SemanticNode> containingChildNode = streamChildren().filter(child -> child.getTextRange().contains(textRange))
.findFirst();
Optional<SemanticNode> containingChildNode = getDocumentTree().findFirstContainingChild(getTreeId(), textRange);
if (containingChildNode.isEmpty()) {
return getTextBlock().getPositionsPerPage(textRange);
}

View File

@ -17,7 +17,7 @@ public class MessageReceiver {
@RabbitHandler
@RabbitListener(queues = REDACTION_QUEUE)
@RabbitListener(queues = REDACTION_QUEUE, concurrency = "1")
public void receiveAnalyzeRequest(Message message) {
redactionMessageReceiver.receiveAnalyzeRequest(message, false);

View File

@ -17,7 +17,7 @@ public class PriorityMessageReceiver {
@RabbitHandler
@RabbitListener(queues = REDACTION_PRIORITY_QUEUE)
@RabbitListener(queues = REDACTION_PRIORITY_QUEUE, concurrency = "1")
public void receiveAnalyzeRequest(Message message) {
redactionMessageReceiver.receiveAnalyzeRequest(message, true);

View File

@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.service.document;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedList;
@ -8,23 +9,23 @@ import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
import com.iqser.red.service.redaction.v1.server.model.document.DocumentData;
import com.iqser.red.service.redaction.v1.server.model.document.DocumentTree;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.DuplicatedParagraph;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Footer;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Header;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Headline;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Image;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Paragraph;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Section;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNode;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Table;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.TableCell;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.AtomicTextBlock;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlockCollector;
import com.iqser.red.service.redaction.v1.server.model.document.DocumentData;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Headline;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Table;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
@ -58,7 +59,7 @@ public class DocumentGraphMapper {
private List<DocumentTree.Entry> buildEntries(List<DocumentStructure.EntryData> entries, Context context) {
List<DocumentTree.Entry> newEntries = new LinkedList<>();
List<DocumentTree.Entry> newEntries = new ArrayList<>(entries.size());
for (DocumentStructure.EntryData entryData : entries) {
List<Page> pages = Arrays.stream(entryData.getPageNumbers())
@ -191,8 +192,7 @@ public class DocumentGraphMapper {
return context.pageData.stream()
.filter(page -> page.getNumber() == Math.toIntExact(pageIndex))
.findFirst()
.orElseThrow(() -> new NoSuchElementException(String.format("ClassificationPage with number %d not found", pageIndex)));
.findFirst().orElseThrow(() -> new NoSuchElementException(String.format("ClassificationPage with number %d not found", pageIndex)));
}

View File

@ -1,14 +1,18 @@
package com.iqser.red.service.redaction.v1.server.service.document;
import static com.iqser.red.service.redaction.v1.server.service.document.EntityCreationUtility.*;
import static com.iqser.red.service.redaction.v1.server.service.document.EntityCreationUtility.addEntityToNodeEntitySets;
import static com.iqser.red.service.redaction.v1.server.service.document.EntityCreationUtility.addToPages;
import static com.iqser.red.service.redaction.v1.server.service.document.EntityCreationUtility.allEntitiesIntersectAndHaveSameTypes;
import static com.iqser.red.service.redaction.v1.server.service.document.EntityCreationUtility.checkIfBothStartAndEndAreEmpty;
import static com.iqser.red.service.redaction.v1.server.service.document.EntityCreationUtility.findIntersectingSubNodes;
import static com.iqser.red.service.redaction.v1.server.service.document.EntityCreationUtility.toLineAfterTextRange;
import static com.iqser.red.service.redaction.v1.server.service.document.EntityCreationUtility.truncateEndIfLineBreakIsBetween;
import static com.iqser.red.service.redaction.v1.server.utils.SeparatorUtils.boundaryIsSurroundedBySeparators;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
@ -985,7 +989,7 @@ public class EntityCreationService {
.peek(e -> e.addEngines(engines))
.findAny();
if (optionalTextEntity.isEmpty()) {
return optionalTextEntity; // Entity has been recategorized and should not be created at all.
return Optional.empty(); // Entity has been recategorized and should not be created at all.
}
TextEntity existingEntity = optionalTextEntity.get();
if (existingEntity.getTextRange().equals(textRange)) {
@ -997,7 +1001,7 @@ public class EntityCreationService {
}
return Optional.empty(); // Entity has been resized, if there are duplicates they should be treated there
}
addEntityToGraph(entity, node);
addEntityToGraph(entity, node.getDocumentTree());
entity.addEngines(engines);
insertToKieSession(entity);
return Optional.of(entity);
@ -1027,15 +1031,16 @@ public class EntityCreationService {
/**
* Merges a list of text entities into a single entity, assuming they intersect and are of the same type.
*
* @param entitiesToMerge The list of entities to merge.
* @param type The type for the merged entity.
* @param entityType The entity's classification.
* @param node The semantic node related to these entities.
* @return A single merged {@link TextEntity}.
* @throws IllegalArgumentException If entities do not intersect or have different types.
* @deprecated Do not use anymore. This might not work correctly due to duplicate textranges not being taken into account here.
* Merges a list of text entities into a single entity, assuming they intersect and are of the same type.
*/
@Deprecated(forRemoval = true)
public TextEntity mergeEntitiesOfSameType(List<TextEntity> entitiesToMerge, String type, EntityType entityType, SemanticNode node) {
if (!allEntitiesIntersectAndHaveSameTypes(entitiesToMerge)) {
@ -1070,6 +1075,8 @@ public class EntityCreationService {
mergedEntity.setDossierDictionaryEntry(entitiesToMerge.stream()
.anyMatch(TextEntity::isDossierDictionaryEntry));
entityEnrichmentService.enrichEntity(mergedEntity, node.getTextBlock());
addEntityToGraph(mergedEntity, node);
insertToKieSession(mergedEntity);
@ -1245,38 +1252,19 @@ public class EntityCreationService {
public void addEntityToGraph(TextEntity entity, SemanticNode node) {
DocumentTree documentTree = node.getDocumentTree();
try {
if (node.getEntities().contains(entity)) {
// If entity already exists and it has a different text range, we add the text range to the list of duplicated text ranges
Optional<TextEntity> optionalTextEntity = node.getEntities()
.stream()//
.filter(e -> e.equals(entity))//
.filter(e -> !e.getTextRange().equals(entity.getTextRange()))//
.findAny();
if (optionalTextEntity.isPresent()) {
addDuplicateEntityToGraph(optionalTextEntity.get(), entity.getTextRange(), node);
} else {
node.getEntities().remove(entity);
addNewEntityToGraph(entity, documentTree);
}
if (node.getEntities().contains(entity)) {
// If entity already exists and it has a different text range, we add the text range to the list of duplicated text ranges
node.getEntities()
.stream()//
.filter(e -> e.equals(entity))//
.filter(e -> !e.getTextRange().equals(entity.getTextRange()))//
.findAny()
.ifPresent(e -> addDuplicateEntityToGraph(e, entity.getTextRange(), node));
} else {
entity.addIntersectingNode(documentTree.getRoot().getNode());
addEntityToGraph(entity, documentTree);
}
} catch (NoSuchElementException e) {
addNewEntityToGraph(entity, documentTree);
} else {
addEntityToGraph(entity, documentTree);
}
}
private void addNewEntityToGraph(TextEntity entity, DocumentTree documentTree) {
entity.setDeepestFullyContainingNode(documentTree.getRoot().getNode());
entityEnrichmentService.enrichEntity(entity, entity.getDeepestFullyContainingNode().getTextBlock());
entity.addIntersectingNode(documentTree.getRoot().getNode());
addToPages(entity);
addEntityToNodeEntitySets(entity);
}
@ -1312,12 +1300,7 @@ public class EntityCreationService {
private void addEntityToGraph(TextEntity entity, DocumentTree documentTree) {
SemanticNode containingNode = documentTree.childNodes(Collections.emptyList())
.filter(node -> node.getTextBlock().containsTextRange(entity.getTextRange()))
.findFirst()
.orElseThrow(() -> new NoSuchElementException("No containing Node found!"));
containingNode.addThisToEntityIfIntersects(entity);
documentTree.getRoot().getNode().addThisToEntityIfIntersects(entity);
TextBlock textBlock = entity.getDeepestFullyContainingNode().getTextBlock();
entityEnrichmentService.enrichEntity(entity, textBlock);

View File

@ -0,0 +1,327 @@
package com.iqser.red.service.redaction.v1.server;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.when;
import java.io.File;
import java.io.FileInputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import java.util.zip.GZIPInputStream;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.context.annotation.Import;
import org.springframework.core.io.ClassPathResource;
import org.springframework.test.context.junit.jupiter.SpringExtension;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Sets;
import com.iqser.red.commons.jackson.ObjectMapperFactory;
import com.iqser.red.service.dictionarymerge.commons.DictionaryEntryModel;
import com.iqser.red.service.persistence.service.v1.api.shared.model.AnalyzeRequest;
import com.iqser.red.service.persistence.service.v1.api.shared.model.RuleFileType;
import com.iqser.red.service.persistence.service.v1.api.shared.model.annotations.ManualRedactions;
import com.iqser.red.service.persistence.service.v1.api.shared.model.common.JSONPrimitive;
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileType;
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
import com.iqser.red.service.redaction.v1.server.client.LegalBasisClient;
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
import com.iqser.red.service.redaction.v1.server.model.dictionary.Dictionary;
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryIncrement;
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryModel;
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryVersion;
import com.iqser.red.service.redaction.v1.server.service.AnalyzeService;
import com.iqser.red.service.redaction.v1.server.service.DictionaryService;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
import com.iqser.red.service.redaction.v1.server.utils.exception.NotFoundException;
import com.iqser.red.storage.commons.service.StorageService;
import com.knecon.fforesight.tenantcommons.TenantsClient;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@ExtendWith(SpringExtension.class)
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
@Import(RedactionIntegrationTest.RedactionIntegrationTestConfiguration.class)
@Disabled
/*
* This test is meant to be used directly with a download from blob storage (e.g. minio). You need to define the dossier template you want to use by supplying an absolute path.
* The dossier template will then be parsed for dictionaries, colors, entities, and rules. This is defined once for all tests.
* Inside a test you supply a path to your minio download folder. The files should still be zipped in this folder.
* The files will then be checked for completeness and uploaded to the FileSystemBackedStorageService.
* This way you can recreate what is happening on the stack almost exactly.
*/ public class AnalysisEnd2EndTest {
Path dossierTemplateToUse = Path.of("/home/kschuettler/iqser/business-logic/redactmanager/prod-cp-eu-reg/EFSA_sanitisation_GFL_v1"); // Add your dossier-template here
ObjectMapper mapper = ObjectMapperFactory.create();
final String TENANT_ID = "tenant";
@Autowired
StorageService storageService;
@Autowired
protected AnalyzeService analyzeService;
@MockBean
DictionaryService dictionaryService;
@MockBean
RabbitTemplate rabbitTemplate;
TestDossierTemplate testDossierTemplate;
@MockBean
protected LegalBasisClient legalBasisClient;
@MockBean
private TenantsClient tenantsClient;
@MockBean
protected RulesClient rulesClient;
@MockBean
protected DictionaryClient dictionaryClient;
@Test
@SneakyThrows
public void runAnalysisEnd2End() {
    String folder = "files/end2end/file0"; // Should contain all files from minio directly, still zipped. Can contain multiple files.
    // Paths starting with "files" are resolved from the test resources; anything else is treated as an absolute path.
    Path absoluteFolderPath;
    if (folder.startsWith("files")) {
        absoluteFolderPath = new ClassPathResource(folder).getFile().toPath();
    } else {
        absoluteFolderPath = Path.of(folder);
    }
    log.info("Starting end2end analyses for all distinct filenames in folder: {}", folder);
    List<AnalyzeRequest> analyzeRequests = prepareStorageForFolder(absoluteFolderPath);
    log.info("Found {} distinct fileIds", analyzeRequests.size());
    int current = 0;
    for (AnalyzeRequest analyzeRequest : analyzeRequests) {
        current++;
        log.info("{}/{}: Starting analysis for file {}", current, analyzeRequests.size(), analyzeRequest.getFileId());
        analyzeService.analyze(analyzeRequest);
    }
}
@BeforeEach
public void setup() {
    // Parse the configured dossier template (dictionaries, rules, colors) fresh before every test.
    testDossierTemplate = new TestDossierTemplate(dossierTemplateToUse);
    // Dictionary service stubs: answer everything from the locally parsed test dictionary
    // instead of calling the real dictionary backend.
    when(dictionaryService.updateDictionary(any(), any())).thenReturn(new DictionaryVersion(0, 0));
    when(dictionaryService.getDeepCopyDictionary(any(), any())).thenReturn(testDossierTemplate.testDictionary);
    when(dictionaryService.getDictionaryIncrements(any(), any(), any())).thenReturn(new DictionaryIncrement(Collections.emptySet(), new DictionaryVersion(0, 0)));
    // isHint/getColor delegate per-type lookups to the test dictionary at call time.
    when(dictionaryService.isHint(any(String.class), any())).thenAnswer(invocation -> {
        String type = invocation.getArgument(0);
        return testDossierTemplate.testDictionary.getType(type).isHint();
    });
    when(dictionaryService.getColor(any(String.class), any())).thenAnswer(invocation -> {
        String type = invocation.getArgument(0);
        return testDossierTemplate.testDictionary.getType(type).getColor();
    });
    when(dictionaryService.getNotRedactedColor(any())).thenReturn(new float[]{0.2f, 0.2f, 0.2f});
    // Rules client stubs: hand out the rule files read from the dossier template folder.
    when(rulesClient.getVersion(testDossierTemplate.id, RuleFileType.ENTITY)).thenReturn(System.currentTimeMillis());
    when(rulesClient.getRules(testDossierTemplate.id, RuleFileType.ENTITY)).thenReturn(JSONPrimitive.of(testDossierTemplate.rules));
    // -1 presumably signals "no component rules available" — confirm against RulesClient contract.
    when(rulesClient.getVersion(testDossierTemplate.id, RuleFileType.COMPONENT)).thenReturn(testDossierTemplate.componentRules != null ? System.currentTimeMillis() : -1);
    when(rulesClient.getRules(testDossierTemplate.id, RuleFileType.COMPONENT)).thenReturn(JSONPrimitive.of(testDossierTemplate.componentRules));
}
/**
 * Scans {@code folder}, derives the distinct fileIds from the file names and prepares
 * storage (uploading the still-gzipped minio files) for each of them.
 *
 * @param folder the directory containing the gzipped minio download.
 * @return one AnalyzeRequest per distinct fileId found in the folder.
 */
@SneakyThrows
private List<AnalyzeRequest> prepareStorageForFolder(Path folder) {
    // Files.list returns a lazily populated stream backed by an open directory handle;
    // it must be closed explicitly, otherwise the handle leaks.
    try (var paths = Files.list(folder)) {
        return paths.map(this::parseFileId)
                .distinct()
                .map(fileId -> prepareStorageForFile(fileId, folder))
                .toList();
    }
}
/** Extracts the fileId, i.e. everything before the first dot of the file name. */
private String parseFileId(Path path) {
    String fileName = path.getFileName().toString();
    return fileName.split("\\.")[0];
}
/**
 * Builds an AnalyzeRequest with freshly generated dossier/file ids and uploads every required
 * file from {@code folder} to the storage service.
 *
 * NOTE(review): the {@code fileId} parameter is not used in this method — every call uploads all
 * matching files in the folder under a newly generated fileId. Confirm this is intended when the
 * folder contains multiple distinct fileIds.
 *
 * @param fileId the fileId parsed from the folder contents (currently unused).
 * @param folder the folder holding the gzipped minio download.
 * @return the prepared AnalyzeRequest pointing at the uploaded files.
 * @throws NotFoundException if any required file type is missing from the folder.
 */
@SneakyThrows
private AnalyzeRequest prepareStorageForFile(String fileId, Path folder) {
    AnalyzeRequest request = new AnalyzeRequest();
    request.setDossierId(UUID.randomUUID().toString());
    request.setFileId(UUID.randomUUID().toString());
    request.setDossierTemplateId(testDossierTemplate.id);
    request.setManualRedactions(new ManualRedactions());
    request.setAnalysisNumber(-1);
    Set<FileType> endingsToUpload = Set.of("ORIGIN",
                    "DOCUMENT_PAGES",
                    "DOCUMENT_POSITION",
                    "DOCUMENT_STRUCTURE",
                    "DOCUMENT_TEXT",
                    "IMAGE_INFO",
                    "NER_ENTITIES",
                    "TABLES",
                    "IMPORTED_REDACTIONS")
            .stream()
            .map(FileType::valueOf)
            .collect(Collectors.toSet());
    // Files.walk streams hold open directory handles and must be closed explicitly.
    Set<FileType> uploadedFileTypes;
    try (var paths = Files.walk(folder)) {
        uploadedFileTypes = paths.filter(path -> path.toFile().isFile())
                .filter(path -> endingsToUpload.contains(parseFileTypeFromPath(path)))
                .map(filePath -> uploadFile(filePath, request))
                .collect(Collectors.toUnmodifiableSet());
    }
    Set<FileType> missingFileTypes = Sets.difference(endingsToUpload, uploadedFileTypes);
    if (!missingFileTypes.isEmpty()) {
        log.error("Folder {} is missing files of type {}",
                folder.toFile(),
                missingFileTypes.stream()
                        .map(Enum::toString)
                        .collect(Collectors.joining(", ")));
        throw new NotFoundException("Not all required file types are present.");
    }
    return request;
}
/** Derives the FileType from the second dot-separated segment of the file name (e.g. "id.ORIGIN.gz"). */
private static FileType parseFileTypeFromPath(Path path) {
    String[] segments = path.getFileName().toString().split("\\.");
    return FileType.valueOf(segments[1]);
}
/**
 * Gunzips the given file and stores it under the storage id derived from the request and the file's type.
 *
 * @param path the gzipped file to upload.
 * @param request the request supplying dossierId and fileId for the storage id.
 * @return the FileType that was uploaded.
 */
@SneakyThrows
private FileType uploadFile(Path path, AnalyzeRequest request) {
    FileType fileType = parseFileTypeFromPath(path);
    String storageId = RedactionStorageService.StorageIdUtils.getStorageId(request.getDossierId(), request.getFileId(), fileType);
    // Two separate resources so the FileInputStream is closed even if GZIPInputStream construction fails.
    try (var fileStream = new FileInputStream(path.toFile()); var gzipStream = new GZIPInputStream(fileStream)) {
        storageService.storeObject(TENANT_ID, storageId, gzipStream);
    }
    return fileType;
}
/**
 * In-memory representation of a dossier template read from disk: its dictionaries (including
 * false positives and false recommendations), entity rules and optional component rules.
 */
private class TestDossierTemplate {

    String id;
    Dictionary testDictionary;
    // Monotonic counter so every dictionary entry across all loaded dictionaries gets a unique id.
    AtomicInteger dictEntryIdCounter = new AtomicInteger(0);
    String rules;
    String componentRules; // stays null when the template has no componentRules.drl

    /**
     * Parses dossierTemplate.json, every dossierType.json found below the template folder,
     * rules.drl and (if present) componentRules.drl.
     */
    @SneakyThrows
    TestDossierTemplate(Path dossierTemplateToUse) {
        Map<String, Object> dossierTemplate = mapper.readValue(dossierTemplateToUse.resolve("dossierTemplate.json").toFile(), HashMap.class);
        this.id = (String) dossierTemplate.get("dossierTemplateId");
        // Files.walk returns a stream backed by open directory handles; close it explicitly.
        List<DictionaryModel> dictionaries;
        try (var paths = Files.walk(dossierTemplateToUse)) {
            dictionaries = paths.filter(path -> path.getFileName().toString().equals("dossierType.json"))
                    .map(this::loadDictionaryModel)
                    .toList();
        }
        // Files.readString decodes UTF-8 explicitly; new String(readAllBytes(...)) would use the
        // platform default charset, which is not portable across machines.
        rules = Files.readString(dossierTemplateToUse.resolve("rules.drl"));
        Path componentRuleFile = dossierTemplateToUse.resolve("componentRules.drl");
        if (Files.exists(componentRuleFile)) {
            componentRules = Files.readString(componentRuleFile);
        }
        testDictionary = new Dictionary(dictionaries, new DictionaryVersion(0, 0));
    }

    /** Reads one dossierType.json plus its sibling entry files into a DictionaryModel. */
    @SneakyThrows
    private DictionaryModel loadDictionaryModel(Path path) {
        Map<String, Object> model = mapper.readValue(path.toFile(), HashMap.class);
        Set<DictionaryEntryModel> entries = new HashSet<>();
        Set<DictionaryEntryModel> falsePositives = new HashSet<>();
        Set<DictionaryEntryModel> falseRecommendations = new HashSet<>();
        String type = (String) model.get("type");
        Integer rank = (Integer) model.get("rank");
        float[] color = hexToFloatArr((String) model.get("hexColor"));
        Boolean caseInsensitive = (Boolean) model.get("caseInsensitive");
        Boolean hint = (Boolean) model.get("hint");
        Boolean hasDictionary = (Boolean) model.get("hasDictionary");
        boolean isDossierDictionary;
        if (model.containsKey("dossierDictionaryOnly")) {
            // NOTE(review): the mere presence of the key marks a dossier dictionary; its value is ignored — confirm intended.
            isDossierDictionary = true;
        } else {
            // Ids with three colon-separated segments identify dossier dictionaries.
            isDossierDictionary = ((String) model.get("id")).split(":").length == 3;
        }
        // Boolean.TRUE.equals guards against a missing "hasDictionary" key (plain unboxing would NPE).
        if (Boolean.TRUE.equals(hasDictionary)) {
            String typeId = (String) model.get("typeId");
            Path dictionaryDir = path.getParent();
            entries.addAll(parseDictionaryEntryModelFromFile(Files.readString(dictionaryDir.resolve("entries.txt")), dictEntryIdCounter, typeId));
            falsePositives.addAll(parseDictionaryEntryModelFromFile(Files.readString(dictionaryDir.resolve("falsePositives.txt")), dictEntryIdCounter, typeId));
            falseRecommendations.addAll(parseDictionaryEntryModelFromFile(Files.readString(dictionaryDir.resolve("falseRecommendations.txt")), dictEntryIdCounter, typeId));
        }
        return new DictionaryModel(type, rank, color, caseInsensitive, hint, entries, falsePositives, falseRecommendations, isDossierDictionary);
    }

    /** Splits the newline-separated file content into one DictionaryEntryModel per non-empty line-split value. */
    private Set<DictionaryEntryModel> parseDictionaryEntryModelFromFile(String s, AtomicInteger dictEntryIdCounter, String typeId) {
        String[] values = s.split("\n");
        return Arrays.stream(values)
                .map(value -> new DictionaryEntryModel(dictEntryIdCounter.getAndIncrement(), value, 0L, false, typeId))
                .collect(Collectors.toUnmodifiableSet());
    }

    /** Converts a hex color string (with or without leading '#') into normalized RGB floats in [0, 1]. */
    private float[] hexToFloatArr(String hexColor) {
        String cleanHexColor = hexColor.replace("#", "");
        int r = Integer.parseInt(cleanHexColor.substring(0, 2), 16);
        int g = Integer.parseInt(cleanHexColor.substring(2, 4), 16);
        int b = Integer.parseInt(cleanHexColor.substring(4, 6), 16);
        return new float[] { r / 255.0f, g / 255.0f, b / 255.0f };
    }
}
}

View File

@ -1,16 +0,0 @@
<Configuration>
<Appenders>
<Console name="CONSOLE" target="SYSTEM_OUT">
<PatternLayout pattern="%d{HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/>
</Console>
</Appenders>
<Loggers>
<Root level="warn">
<AppenderRef ref="CONSOLE"/>
</Root>
<Logger name="com.iqser" level="info"/>
</Loggers>
</Configuration>

View File

@ -0,0 +1,17 @@
<configuration>
<springProperty scope="configuration" name="logType" source="logging.type"/>
<springProperty scope="context" name="application.name" source="spring.application.name"/>
<springProperty scope="context" name="version" source="project.version"/>
<include resource="org/springframework/boot/logging/logback/defaults.xml"/>
<include resource="org/springframework/boot/logging/logback/console-appender.xml"/>
<appender name="JSON" class="ch.qos.logback.core.ConsoleAppender">
<encoder class="net.logstash.logback.encoder.LogstashEncoder"/>
</appender>
<root level="INFO">
<appender-ref ref="${logType}"/>
</root>
</configuration>