diff --git a/redaction-service-v1/redaction-service-server-v1/build.gradle.kts b/redaction-service-v1/redaction-service-server-v1/build.gradle.kts index 93ac5f0e..b8380366 100644 --- a/redaction-service-v1/redaction-service-server-v1/build.gradle.kts +++ b/redaction-service-v1/redaction-service-server-v1/build.gradle.kts @@ -12,7 +12,7 @@ plugins { description = "redaction-service-server-v1" -val layoutParserVersion = "0.89.3" +val layoutParserVersion = "0.89.11" val jacksonVersion = "2.15.2" val droolsVersion = "9.44.0.Final" val pdfBoxVersion = "3.0.0" diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/DocumentTree.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/DocumentTree.java index 951a64ef..8527cff0 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/DocumentTree.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/DocumentTree.java @@ -2,6 +2,7 @@ package com.iqser.red.service.redaction.v1.server.model.document; import static java.lang.String.format; +import java.util.ArrayList; import java.util.Collections; import java.util.LinkedList; import java.util.List; @@ -40,7 +41,10 @@ public class DocumentTree { public TextBlock buildTextBlock() { - return allEntriesInOrder().map(Entry::getNode).filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock).collect(new TextBlockCollector()); + return allEntriesInOrder().map(Entry::getNode) + .filter(SemanticNode::isLeaf) + .map(SemanticNode::getLeafTextBlock) + .collect(new TextBlockCollector()); } @@ -114,13 +118,78 @@ public class DocumentTree { public Stream childNodes(List treeId) { - return getEntryById(treeId).children.stream().map(Entry::getNode); + return 
getEntryById(treeId).children.stream() + .map(Entry::getNode); + } + + + /** + * Finds all child nodes of the specified entry, whose nodes textRange intersects the given textRange. It achieves this by finding the first entry, whose textRange contains the start idx of the TextRange using a binary search. + * It then iterates over the remaining children adding them to the intersections, until one does not contain the end of the TextRange. All intersected Entries are returned as SemanticNodes. + * + * @param treeId the treeId of the Entry whose children shall be checked. + * @param textRange The TextRange to find intersecting childNodes for. + * @return A list of all SemanticNodes, that are direct children of the specified Entry, whose TextRange intersects the given TextRange + */ + public List findIntersectingChildNodes(List treeId, TextRange textRange) { + + List childEntries = getEntryById(treeId).getChildren(); + List intersectingChildEntries = new LinkedList<>(); + int startIdx = findFirstIdxOfContainingChildBinarySearch(childEntries, textRange.start()); + if (startIdx < 0) { + return intersectingChildEntries; + } + for (int i = startIdx; i < childEntries.size(); i++) { + if (childEntries.get(i).getNode().getTextRange().start() < textRange.end()) { + intersectingChildEntries.add(childEntries.get(i).getNode()); + } else { + break; + } + } + return intersectingChildEntries; + } + + + public Optional findFirstContainingChild(List treeId, TextRange textRange) { + + List childEntries = getEntryById(treeId).getChildren(); + int startIdx = findFirstIdxOfContainingChildBinarySearch(childEntries, textRange.start()); + if (startIdx < 0) { + return Optional.empty(); + } + + if (childEntries.get(startIdx).getNode().getTextRange().contains(textRange.end())) { + return Optional.of(childEntries.get(startIdx).getNode()); + } + + return Optional.empty(); + } + + + private int findFirstIdxOfContainingChildBinarySearch(List childNodes, int start) { + + int low = 0; + int high = 
childNodes.size() - 1; + while (low <= high) { + int mid = low + (high - low) / 2; + TextRange range = childNodes.get(mid).getNode().getTextRange(); + if (range.start() > start) { + high = mid - 1; + } else if (range.end() <= start) { + low = mid + 1; + } else { + return mid; + } + } + return -1; } public Stream childNodesOfType(List treeId, NodeType nodeType) { - return getEntryById(treeId).children.stream().filter(entry -> entry.node.getType().equals(nodeType)).map(Entry::getNode); + return getEntryById(treeId).children.stream() + .filter(entry -> entry.node.getType().equals(nodeType)) + .map(Entry::getNode); } @@ -199,26 +268,32 @@ public class DocumentTree { public Stream allEntriesInOrder() { - return Stream.of(root).flatMap(DocumentTree::flatten); + return Stream.of(root) + .flatMap(DocumentTree::flatten); } public Stream allSubEntriesInOrder(List parentId) { - return getEntryById(parentId).children.stream().flatMap(DocumentTree::flatten); + return getEntryById(parentId).children.stream() + .flatMap(DocumentTree::flatten); } @Override public String toString() { - return String.join("\n", allEntriesInOrder().map(Entry::toString).toList()); + return String.join("\n", + allEntriesInOrder().map(Entry::toString) + .toList()); } private static Stream flatten(Entry entry) { - return Stream.concat(Stream.of(entry), entry.children.stream().flatMap(DocumentTree::flatten)); + return Stream.concat(Stream.of(entry), + entry.children.stream() + .flatMap(DocumentTree::flatten)); } @@ -240,7 +315,7 @@ public class DocumentTree { List treeId; SemanticNode node; @Builder.Default - List children = new LinkedList<>(); + List children = new ArrayList<>(); @Override diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/TextRange.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/TextRange.java index 9ec00087..43b3861e 100644 --- 
a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/TextRange.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/TextRange.java @@ -92,12 +92,18 @@ public class TextRange implements Comparable { public List split(List splitIndices) { - if (splitIndices.stream().anyMatch(idx -> !this.contains(idx))) { - throw new IndexOutOfBoundsException(format("%s splitting indices are out of range for %s", splitIndices.stream().filter(idx -> !this.contains(idx)).toList(), this)); + if (splitIndices.stream() + .anyMatch(idx -> !this.contains(idx))) { + throw new IndexOutOfBoundsException(format("%s splitting indices are out of range for %s", + splitIndices.stream() + .filter(idx -> !this.contains(idx)) + .toList(), + this)); } List splitBoundaries = new LinkedList<>(); int previousIndex = start; - for (int splitIndex : splitIndices) { + for (int i = 0, splitIndicesSize = splitIndices.size(); i < splitIndicesSize; i++) { + int splitIndex = splitIndices.get(i); // skip split if it would produce a boundary of length 0 if (splitIndex == previousIndex) { @@ -113,8 +119,12 @@ public class TextRange implements Comparable { public static TextRange merge(Collection boundaries) { - int minStart = boundaries.stream().mapToInt(TextRange::start).min().orElseThrow(IllegalArgumentException::new); - int maxEnd = boundaries.stream().mapToInt(TextRange::end).max().orElseThrow(IllegalArgumentException::new); + int minStart = boundaries.stream() + .mapToInt(TextRange::start) + .min().orElseThrow(IllegalArgumentException::new); + int maxEnd = boundaries.stream() + .mapToInt(TextRange::end) + .max().orElseThrow(IllegalArgumentException::new); return new TextRange(minStart, maxEnd); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Image.java 
b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Image.java index 0a3243fb..04aee944 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Image.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Image.java @@ -17,7 +17,6 @@ import com.iqser.red.service.redaction.v1.server.model.document.entity.ManualCha import com.iqser.red.service.redaction.v1.server.model.document.entity.MatchedRule; import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity; import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock; -import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlockCollector; import lombok.AccessLevel; import lombok.AllArgsConstructor; @@ -39,6 +38,8 @@ public class Image implements GenericSemanticNode, IEntity { List treeId; String id; + TextBlock leafTextBlock; + ImageType imageType; boolean transparent; Rectangle2D position; @@ -49,14 +50,11 @@ public class Image implements GenericSemanticNode, IEntity { @Builder.Default ManualChangeOverwrite manualOverwrite = new ManualChangeOverwrite(); - @EqualsAndHashCode.Exclude Page page; - @EqualsAndHashCode.Exclude DocumentTree documentTree; @Builder.Default - @EqualsAndHashCode.Exclude Set entities = new HashSet<>(); @@ -70,9 +68,7 @@ public class Image implements GenericSemanticNode, IEntity { @Override public TextBlock getTextBlock() { - return streamAllSubNodes().filter(SemanticNode::isLeaf) - .map(SemanticNode::getLeafTextBlock) - .collect(new TextBlockCollector()); + return leafTextBlock; } @@ -86,15 +82,21 @@ public class Image implements GenericSemanticNode, IEntity { @Override public TextRange getTextRange() { - return GenericSemanticNode.super.getTextRange(); + return leafTextBlock.getTextRange(); 
+ } + + + @Override + public int length() { + + return getTextRange().length(); } @Override public String type() { - return getManualOverwrite().getType() - .orElse(imageType.toString().toLowerCase(Locale.ENGLISH)); + return getManualOverwrite().getType().orElse(imageType.toString().toLowerCase(Locale.ENGLISH)); } @@ -126,10 +128,4 @@ public class Image implements GenericSemanticNode, IEntity { return name.charAt(0) + name.substring(1).toLowerCase(Locale.ENGLISH); } - - public int length() { - - return 0; - } - } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/SemanticNode.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/SemanticNode.java index d874ea24..5d1d52f3 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/SemanticNode.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/SemanticNode.java @@ -70,7 +70,9 @@ public interface SemanticNode { */ default Page getFirstPage() { - return getTextBlock().getPages().stream().min(Comparator.comparingInt(Page::getNumber)).orElseThrow(); + return getTextBlock().getPages() + .stream() + .min(Comparator.comparingInt(Page::getNumber)).orElseThrow(); } @@ -96,7 +98,8 @@ public interface SemanticNode { */ default boolean onPage(int pageNumber) { - return getPages().stream().anyMatch(page -> page.getNumber() == pageNumber); + return getPages().stream() + .anyMatch(page -> page.getNumber() == pageNumber); } @@ -248,7 +251,9 @@ public interface SemanticNode { */ default boolean hasEntitiesOfType(String type) { - return getEntities().stream().filter(TextEntity::active).anyMatch(redactionEntity -> redactionEntity.type().equals(type)); + return getEntities().stream() + 
.filter(TextEntity::active) + .anyMatch(redactionEntity -> redactionEntity.type().equals(type)); } @@ -261,7 +266,10 @@ public interface SemanticNode { */ default boolean hasEntitiesOfAnyType(String... types) { - return getEntities().stream().filter(TextEntity::active).anyMatch(redactionEntity -> Arrays.stream(types).anyMatch(type -> redactionEntity.type().equals(type))); + return getEntities().stream() + .filter(TextEntity::active) + .anyMatch(redactionEntity -> Arrays.stream(types) + .anyMatch(type -> redactionEntity.type().equals(type))); } @@ -274,7 +282,12 @@ public interface SemanticNode { */ default boolean hasEntitiesOfAllTypes(String... types) { - return getEntities().stream().filter(TextEntity::active).map(TextEntity::type).collect(Collectors.toUnmodifiableSet()).containsAll(Arrays.stream(types).toList()); + return getEntities().stream() + .filter(TextEntity::active) + .map(TextEntity::type) + .collect(Collectors.toUnmodifiableSet()) + .containsAll(Arrays.stream(types) + .toList()); } @@ -287,7 +300,10 @@ public interface SemanticNode { */ default List getEntitiesOfType(String type) { - return getEntities().stream().filter(TextEntity::active).filter(redactionEntity -> redactionEntity.type().equals(type)).toList(); + return getEntities().stream() + .filter(TextEntity::active) + .filter(redactionEntity -> redactionEntity.type().equals(type)) + .toList(); } @@ -300,7 +316,10 @@ public interface SemanticNode { */ default List getEntitiesOfType(List types) { - return getEntities().stream().filter(TextEntity::active).filter(redactionEntity -> redactionEntity.isAnyType(types)).toList(); + return getEntities().stream() + .filter(TextEntity::active) + .filter(redactionEntity -> redactionEntity.isAnyType(types)) + .toList(); } @@ -313,7 +332,11 @@ public interface SemanticNode { */ default List getEntitiesOfType(String... 
types) { - return getEntities().stream().filter(TextEntity::active).filter(redactionEntity -> redactionEntity.isAnyType(Arrays.stream(types).toList())).toList(); + return getEntities().stream() + .filter(TextEntity::active) + .filter(redactionEntity -> redactionEntity.isAnyType(Arrays.stream(types) + .toList())) + .toList(); } @@ -365,7 +388,8 @@ public interface SemanticNode { */ default boolean containsAllStrings(String... strings) { - return Arrays.stream(strings).allMatch(this::containsString); + return Arrays.stream(strings) + .allMatch(this::containsString); } @@ -377,7 +401,8 @@ public interface SemanticNode { */ default boolean containsAnyString(String... strings) { - return Arrays.stream(strings).anyMatch(this::containsString); + return Arrays.stream(strings) + .anyMatch(this::containsString); } @@ -389,7 +414,8 @@ public interface SemanticNode { */ default boolean containsAnyString(List strings) { - return strings.stream().anyMatch(this::containsString); + return strings.stream() + .anyMatch(this::containsString); } @@ -413,7 +439,8 @@ public interface SemanticNode { */ default boolean containsAnyStringIgnoreCase(String... strings) { - return Arrays.stream(strings).anyMatch(this::containsStringIgnoreCase); + return Arrays.stream(strings) + .anyMatch(this::containsStringIgnoreCase); } @@ -425,7 +452,8 @@ public interface SemanticNode { */ default boolean containsAllStringsIgnoreCase(String... 
strings) { - return Arrays.stream(strings).allMatch(this::containsStringIgnoreCase); + return Arrays.stream(strings) + .allMatch(this::containsStringIgnoreCase); } @@ -437,7 +465,9 @@ public interface SemanticNode { */ default boolean containsWord(String word) { - return getTextBlock().getWords().stream().anyMatch(s -> s.equals(word)); + return getTextBlock().getWords() + .stream() + .anyMatch(s -> s.equals(word)); } @@ -449,7 +479,10 @@ public interface SemanticNode { */ default boolean containsWordIgnoreCase(String word) { - return getTextBlock().getWords().stream().map(String::toLowerCase).anyMatch(s -> s.equals(word.toLowerCase(Locale.ENGLISH))); + return getTextBlock().getWords() + .stream() + .map(String::toLowerCase) + .anyMatch(s -> s.equals(word.toLowerCase(Locale.ENGLISH))); } @@ -461,7 +494,10 @@ public interface SemanticNode { */ default boolean containsAnyWord(String... words) { - return Arrays.stream(words).anyMatch(word -> getTextBlock().getWords().stream().anyMatch(word::equals)); + return Arrays.stream(words) + .anyMatch(word -> getTextBlock().getWords() + .stream() + .anyMatch(word::equals)); } @@ -473,7 +509,12 @@ public interface SemanticNode { */ default boolean containsAnyWordIgnoreCase(String... words) { - return Arrays.stream(words).map(String::toLowerCase).anyMatch(word -> getTextBlock().getWords().stream().map(String::toLowerCase).anyMatch(word::equals)); + return Arrays.stream(words) + .map(String::toLowerCase) + .anyMatch(word -> getTextBlock().getWords() + .stream() + .map(String::toLowerCase) + .anyMatch(word::equals)); } @@ -485,7 +526,10 @@ public interface SemanticNode { */ default boolean containsAllWords(String... 
words) { - return Arrays.stream(words).allMatch(word -> getTextBlock().getWords().stream().anyMatch(word::equals)); + return Arrays.stream(words) + .allMatch(word -> getTextBlock().getWords() + .stream() + .anyMatch(word::equals)); } @@ -497,7 +541,12 @@ public interface SemanticNode { */ default boolean containsAllWordsIgnoreCase(String... words) { - return Arrays.stream(words).map(String::toLowerCase).allMatch(word -> getTextBlock().getWords().stream().map(String::toLowerCase).anyMatch(word::equals)); + return Arrays.stream(words) + .map(String::toLowerCase) + .allMatch(word -> getTextBlock().getWords() + .stream() + .map(String::toLowerCase) + .anyMatch(word::equals)); } @@ -537,7 +586,11 @@ public interface SemanticNode { */ default boolean intersectsRectangle(int x, int y, int w, int h, int pageNumber) { - return getBBox().entrySet().stream().filter(entry -> entry.getKey().getNumber() == pageNumber).map(Map.Entry::getValue).anyMatch(rect -> rect.intersects(x, y, w, h)); + return getBBox().entrySet() + .stream() + .filter(entry -> entry.getKey().getNumber() == pageNumber) + .map(Map.Entry::getValue) + .anyMatch(rect -> rect.intersects(x, y, w, h)); } @@ -556,7 +609,7 @@ public interface SemanticNode { } textEntity.addIntersectingNode(this); - streamChildren().filter(semanticNode -> semanticNode.getTextRange().intersects(textEntity.getTextRange())) + getDocumentTree().findIntersectingChildNodes(getTreeId(), textEntity.getTextRange()) .forEach(node -> node.addThisToEntityIfIntersects(textEntity)); } } @@ -591,7 +644,8 @@ public interface SemanticNode { */ default Stream streamAllSubNodes() { - return getDocumentTree().allSubEntriesInOrder(getTreeId()).map(DocumentTree.Entry::getNode); + return getDocumentTree().allSubEntriesInOrder(getTreeId()) + .map(DocumentTree.Entry::getNode); } @@ -602,7 +656,9 @@ public interface SemanticNode { */ default Stream streamAllSubNodesOfType(NodeType nodeType) { - return 
getDocumentTree().allSubEntriesInOrder(getTreeId()).filter(entry -> entry.getType().equals(nodeType)).map(DocumentTree.Entry::getNode); + return getDocumentTree().allSubEntriesInOrder(getTreeId()) + .filter(entry -> entry.getType().equals(nodeType)) + .map(DocumentTree.Entry::getNode); } @@ -641,7 +697,8 @@ public interface SemanticNode { if (isLeaf()) { return getTextBlock().getPositionsPerPage(textRange); } - Optional containingChildNode = streamChildren().filter(child -> child.getTextRange().contains(textRange)).findFirst(); + Optional containingChildNode = getDocumentTree().findFirstContainingChild(getTreeId(), textRange); + if (containingChildNode.isEmpty()) { return getTextBlock().getPositionsPerPage(textRange); } @@ -691,8 +748,12 @@ public interface SemanticNode { private Map getBBoxFromChildren() { Map bBoxPerPage = new HashMap<>(); - List> childrenBBoxes = streamChildren().map(SemanticNode::getBBox).toList(); - Set pages = childrenBBoxes.stream().flatMap(map -> map.keySet().stream()).collect(Collectors.toSet()); + List> childrenBBoxes = streamChildren().map(SemanticNode::getBBox) + .toList(); + Set pages = childrenBBoxes.stream() + .flatMap(map -> map.keySet() + .stream()) + .collect(Collectors.toSet()); for (Page page : pages) { Rectangle2D bBoxOnPage = childrenBBoxes.stream() .filter(childBboxPerPage -> childBboxPerPage.containsKey(page)) @@ -710,7 +771,9 @@ public interface SemanticNode { private Map getBBoxFromLeafTextBlock() { Map bBoxPerPage = new HashMap<>(); - Map> atomicTextBlockPerPage = getTextBlock().getAtomicTextBlocks().stream().collect(Collectors.groupingBy(AtomicTextBlock::getPage)); + Map> atomicTextBlockPerPage = getTextBlock().getAtomicTextBlocks() + .stream() + .collect(Collectors.groupingBy(AtomicTextBlock::getPage)); atomicTextBlockPerPage.forEach((page, atomicTextBlocks) -> bBoxPerPage.put(page, RectangleTransformations.atomicTextBlockBBox(atomicTextBlocks))); return bBoxPerPage; } diff --git 
a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/document/DocumentGraphMapper.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/document/DocumentGraphMapper.java index 3578128b..61233875 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/document/DocumentGraphMapper.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/document/DocumentGraphMapper.java @@ -1,5 +1,6 @@ package com.iqser.red.service.redaction.v1.server.service.document; +import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import java.util.LinkedList; @@ -7,22 +8,22 @@ import java.util.List; import java.util.Map; import java.util.NoSuchElementException; +import com.iqser.red.service.redaction.v1.server.model.document.DocumentData; import com.iqser.red.service.redaction.v1.server.model.document.DocumentTree; +import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document; import com.iqser.red.service.redaction.v1.server.model.document.nodes.Footer; import com.iqser.red.service.redaction.v1.server.model.document.nodes.Header; +import com.iqser.red.service.redaction.v1.server.model.document.nodes.Headline; import com.iqser.red.service.redaction.v1.server.model.document.nodes.Image; import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page; import com.iqser.red.service.redaction.v1.server.model.document.nodes.Paragraph; import com.iqser.red.service.redaction.v1.server.model.document.nodes.Section; import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNode; +import com.iqser.red.service.redaction.v1.server.model.document.nodes.Table; import com.iqser.red.service.redaction.v1.server.model.document.nodes.TableCell; import 
com.iqser.red.service.redaction.v1.server.model.document.textblock.AtomicTextBlock; import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock; import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlockCollector; -import com.iqser.red.service.redaction.v1.server.model.document.DocumentData; -import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document; -import com.iqser.red.service.redaction.v1.server.model.document.nodes.Headline; -import com.iqser.red.service.redaction.v1.server.model.document.nodes.Table; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure; @@ -39,7 +40,9 @@ public class DocumentGraphMapper { DocumentTree documentTree = new DocumentTree(document); Context context = new Context(documentData, documentTree); - context.pageData.addAll(Arrays.stream(documentData.getDocumentPages()).map(DocumentGraphMapper::buildPage).toList()); + context.pageData.addAll(Arrays.stream(documentData.getDocumentPages()) + .map(DocumentGraphMapper::buildPage) + .toList()); context.documentTree.getRoot().getChildren().addAll(buildEntries(documentData.getDocumentStructure().getRoot().getChildren(), context)); @@ -54,10 +57,12 @@ public class DocumentGraphMapper { private List buildEntries(List entries, Context context) { - List newEntries = new LinkedList<>(); + List newEntries = new ArrayList<>(entries.size()); for (DocumentStructure.EntryData entryData : entries) { - List pages = Arrays.stream(entryData.getPageNumbers()).map(pageNumber -> getPage(pageNumber, context)).toList(); + List pages = Arrays.stream(entryData.getPageNumbers()) + .map(pageNumber -> getPage(pageNumber, context)) + .toList(); SemanticNode node = switch (entryData.getType()) { case SECTION -> 
buildSection(context); @@ -75,7 +80,8 @@ public class DocumentGraphMapper { TextBlock textBlock = toTextBlock(entryData.getAtomicBlockIds(), context, node); node.setLeafTextBlock(textBlock); } - List treeId = Arrays.stream(entryData.getTreeId()).boxed().toList(); + List treeId = Arrays.stream(entryData.getTreeId()).boxed() + .toList(); node.setTreeId(treeId); switch (entryData.getType()) { @@ -148,16 +154,18 @@ public class DocumentGraphMapper { private TextBlock toTextBlock(Long[] atomicTextBlockIds, Context context, SemanticNode parent) { - return Arrays.stream(atomicTextBlockIds).map(atomicTextBlockId -> getAtomicTextBlock(context, parent, atomicTextBlockId)).collect(new TextBlockCollector()); + return Arrays.stream(atomicTextBlockIds) + .map(atomicTextBlockId -> getAtomicTextBlock(context, parent, atomicTextBlockId)) + .collect(new TextBlockCollector()); } private AtomicTextBlock getAtomicTextBlock(Context context, SemanticNode parent, Long atomicTextBlockId) { return AtomicTextBlock.fromAtomicTextBlockData(context.documentTextData.get(Math.toIntExact(atomicTextBlockId)), - context.documentPositionData.get(Math.toIntExact(atomicTextBlockId)), - parent, - getPage(context.documentTextData.get(Math.toIntExact(atomicTextBlockId)).getPage(), context)); + context.documentPositionData.get(Math.toIntExact(atomicTextBlockId)), + parent, + getPage(context.documentTextData.get(Math.toIntExact(atomicTextBlockId)).getPage(), context)); } @@ -171,8 +179,7 @@ public class DocumentGraphMapper { return context.pageData.stream() .filter(page -> page.getNumber() == Math.toIntExact(pageIndex)) - .findFirst() - .orElseThrow(() -> new NoSuchElementException(String.format("ClassificationPage with number %d not found", pageIndex))); + .findFirst().orElseThrow(() -> new NoSuchElementException(String.format("ClassificationPage with number %d not found", pageIndex))); } @@ -188,8 +195,10 @@ public class DocumentGraphMapper { this.documentTree = documentTree; this.pageData = new 
LinkedList<>(); - this.documentTextData = Arrays.stream(documentData.getDocumentTextData()).toList(); - this.documentPositionData = Arrays.stream(documentData.getDocumentPositionData()).toList(); + this.documentTextData = Arrays.stream(documentData.getDocumentTextData()) + .toList(); + this.documentPositionData = Arrays.stream(documentData.getDocumentPositionData()) + .toList(); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/document/EntityCreationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/document/EntityCreationService.java index b7a09411..2dc5c3c1 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/document/EntityCreationService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/document/EntityCreationService.java @@ -1,14 +1,18 @@ package com.iqser.red.service.redaction.v1.server.service.document; -import static com.iqser.red.service.redaction.v1.server.service.document.EntityCreationUtility.*; +import static com.iqser.red.service.redaction.v1.server.service.document.EntityCreationUtility.addEntityToNodeEntitySets; +import static com.iqser.red.service.redaction.v1.server.service.document.EntityCreationUtility.addToPages; +import static com.iqser.red.service.redaction.v1.server.service.document.EntityCreationUtility.allEntitiesIntersectAndHaveSameTypes; +import static com.iqser.red.service.redaction.v1.server.service.document.EntityCreationUtility.checkIfBothStartAndEndAreEmpty; +import static com.iqser.red.service.redaction.v1.server.service.document.EntityCreationUtility.findIntersectingSubNodes; +import static com.iqser.red.service.redaction.v1.server.service.document.EntityCreationUtility.toLineAfterTextRange; +import static 
com.iqser.red.service.redaction.v1.server.service.document.EntityCreationUtility.truncateEndIfLineBreakIsBetween; import static com.iqser.red.service.redaction.v1.server.utils.SeparatorUtils.boundaryIsSurroundedBySeparators; import java.util.Collection; -import java.util.Collections; import java.util.Comparator; import java.util.LinkedList; import java.util.List; -import java.util.NoSuchElementException; import java.util.Optional; import java.util.Set; import java.util.stream.Collectors; @@ -276,7 +280,8 @@ public class EntityCreationService { "this is some text. a here is more text" and "here is more text". We only want to keep the latter. */ return entityTextRanges.stream() - .filter(boundary -> entityTextRanges.stream().noneMatch(innerBoundary -> !innerBoundary.equals(boundary) && innerBoundary.containedBy(boundary))) + .filter(boundary -> entityTextRanges.stream() + .noneMatch(innerBoundary -> !innerBoundary.equals(boundary) && innerBoundary.containedBy(boundary))) .toList(); } @@ -351,10 +356,10 @@ public class EntityCreationService { return tableNode.streamTableCells() .flatMap(tableCell -> lineAfterBoundariesAcrossColumns(RedactionSearchUtility.findTextRangesByString(string, tableCell.getTextBlock()), - tableCell, - type, - entityType, - tableNode)); + tableCell, + type, + entityType, + tableNode)); } @@ -362,10 +367,10 @@ public class EntityCreationService { return tableNode.streamTableCells() .flatMap(tableCell -> lineAfterBoundariesAcrossColumns(RedactionSearchUtility.findTextRangesByStringIgnoreCase(string, tableCell.getTextBlock()), - tableCell, - type, - entityType, - tableNode)); + tableCell, + type, + entityType, + tableNode)); } @@ -500,7 +505,10 @@ public class EntityCreationService { public Stream bySemanticNodeParagraphsOnly(SemanticNode node, String type, EntityType entityType) { - return node.streamAllSubNodesOfType(NodeType.PARAGRAPH).map(semanticNode -> bySemanticNode(semanticNode, type, 
entityType)).filter(Optional::isPresent).map(Optional::get); + return node.streamAllSubNodesOfType(NodeType.PARAGRAPH) + .map(semanticNode -> bySemanticNode(semanticNode, type, entityType)) + .filter(Optional::isPresent) + .map(Optional::get); } @@ -590,14 +598,18 @@ public class EntityCreationService { throw new IllegalArgumentException(String.format("%s is not in the %s of the provided semantic node %s", textRange, node.getTextRange(), node)); } TextRange trimmedTextRange = textRange.trim(node.getTextBlock()); - if (trimmedTextRange.length() == 0){ + if (trimmedTextRange.length() == 0) { return Optional.empty(); } TextEntity entity = TextEntity.initialEntityNode(trimmedTextRange, type, entityType, node); if (node.getEntities().contains(entity)) { - Optional optionalTextEntity = node.getEntities().stream().filter(e -> e.equals(entity) && e.type().equals(type)).peek(e -> e.addEngines(engines)).findAny(); + Optional optionalTextEntity = node.getEntities() + .stream() + .filter(e -> e.equals(entity) && e.type().equals(type)) + .peek(e -> e.addEngines(engines)) + .findAny(); if (optionalTextEntity.isEmpty()) { - return optionalTextEntity; // Entity has been recategorized and should not be created at all. + return Optional.empty(); // Entity has been recategorized and should not be created at all. } TextEntity existingEntity = optionalTextEntity.get(); if (existingEntity.getTextRange().equals(textRange)) { @@ -609,7 +621,7 @@ public class EntityCreationService { } return Optional.empty(); // Entity has been resized, if there are duplicates they should be treated there } - addEntityToGraph(entity, node); + addEntityToGraph(entity, node.getDocumentTree()); entity.addEngines(engines); insertToKieSession(entity); return Optional.of(entity); @@ -638,6 +650,8 @@ public class EntityCreationService { } + // Do not use anymore. This might not work correctly due to duplicate textranges not being taken into account here. 
+ @Deprecated(forRemoval = true) public TextEntity mergeEntitiesOfSameType(List entitiesToMerge, String type, EntityType entityType, SemanticNode node) { if (!allEntitiesIntersectAndHaveSameTypes(entitiesToMerge)) { @@ -650,29 +664,44 @@ public class EntityCreationService { return entitiesToMerge.get(0); } - TextEntity mergedEntity = TextEntity.initialEntityNode(TextRange.merge(entitiesToMerge.stream().map(TextEntity::getTextRange).toList()), type, entityType, node); - mergedEntity.addEngines(entitiesToMerge.stream().flatMap(entityNode -> entityNode.getEngines().stream()).collect(Collectors.toSet())); - entitiesToMerge.stream().map(TextEntity::getMatchedRuleList).flatMap(Collection::stream).forEach(matchedRule -> mergedEntity.getMatchedRuleList().add(matchedRule)); + TextEntity mergedEntity = TextEntity.initialEntityNode(TextRange.merge(entitiesToMerge.stream() + .map(TextEntity::getTextRange) + .toList()), type, entityType, node); + mergedEntity.addEngines(entitiesToMerge.stream() + .flatMap(entityNode -> entityNode.getEngines() + .stream()) + .collect(Collectors.toSet())); + entitiesToMerge.stream() + .map(TextEntity::getMatchedRuleList) + .flatMap(Collection::stream) + .forEach(matchedRule -> mergedEntity.getMatchedRuleList().add(matchedRule)); entitiesToMerge.stream() .map(TextEntity::getManualOverwrite) .map(ManualChangeOverwrite::getManualChangeLog) .flatMap(Collection::stream) .forEach(manualChange -> mergedEntity.getManualOverwrite().addChange(manualChange)); - mergedEntity.setDictionaryEntry(entitiesToMerge.stream().anyMatch(TextEntity::isDictionaryEntry)); - mergedEntity.setDossierDictionaryEntry(entitiesToMerge.stream().anyMatch(TextEntity::isDossierDictionaryEntry)); + mergedEntity.setDictionaryEntry(entitiesToMerge.stream() + .anyMatch(TextEntity::isDictionaryEntry)); + mergedEntity.setDossierDictionaryEntry(entitiesToMerge.stream() + .anyMatch(TextEntity::isDossierDictionaryEntry)); + + entityEnrichmentService.enrichEntity(mergedEntity, 
node.getTextBlock()); addEntityToGraph(mergedEntity, node); insertToKieSession(mergedEntity); - entitiesToMerge.stream().filter(e -> !e.equals(mergedEntity)).forEach(node.getEntities()::remove); + entitiesToMerge.stream() + .filter(e -> !e.equals(mergedEntity)) + .forEach(node.getEntities()::remove); return mergedEntity; } public Stream copyEntities(List entities, String type, EntityType entityType, SemanticNode node) { - return entities.stream().map(entity -> copyEntity(entity, type, entityType, node)); + return entities.stream() + .map(entity -> copyEntity(entity, type, entityType, node)); } @@ -746,38 +775,19 @@ public class EntityCreationService { public void addEntityToGraph(TextEntity entity, SemanticNode node) { DocumentTree documentTree = node.getDocumentTree(); - try { - if (node.getEntities().contains(entity)) { - // If entity already exists and it has a different text range, we add the text range to the list of duplicated text ranges - Optional optionalTextEntity = node.getEntities() - .stream()// - .filter(e -> e.equals(entity))// - .filter(e -> !e.getTextRange().equals(entity.getTextRange()))// - .findAny(); - if (optionalTextEntity.isPresent()) { - addDuplicateEntityToGraph(optionalTextEntity.get(), entity.getTextRange(), node); - } else { - node.getEntities().remove(entity); - addNewEntityToGraph(entity, documentTree); - } + if (node.getEntities().contains(entity)) { + // If entity already exists and it has a different text range, we add the text range to the list of duplicated text ranges + node.getEntities() + .stream()// + .filter(e -> e.equals(entity))// + .filter(e -> !e.getTextRange().equals(entity.getTextRange()))// + .findAny() + .ifPresent(e -> addDuplicateEntityToGraph(e, entity.getTextRange(), node)); - } else { - entity.addIntersectingNode(documentTree.getRoot().getNode()); - addEntityToGraph(entity, documentTree); - } - } catch (NoSuchElementException e) { - addNewEntityToGraph(entity, documentTree); + } else { + addEntityToGraph(entity, 
documentTree); } - } - - private void addNewEntityToGraph(TextEntity entity, DocumentTree documentTree) { - - entity.setDeepestFullyContainingNode(documentTree.getRoot().getNode()); - entityEnrichmentService.enrichEntity(entity, entity.getDeepestFullyContainingNode().getTextBlock()); - entity.addIntersectingNode(documentTree.getRoot().getNode()); - addToPages(entity); - addEntityToNodeEntitySets(entity); } @@ -788,10 +798,11 @@ public class EntityCreationService { SemanticNode deepestSharedNode = entityToDuplicate.getIntersectingNodes() .stream() .sorted(Comparator.comparingInt(n -> -n.getTreeId().size())) - .filter(intersectingNode -> entityToDuplicate.getDuplicateTextRanges().stream().allMatch(tr -> intersectingNode.getTextRange().contains(tr)) && // - intersectingNode.getTextRange().contains(entityToDuplicate.getTextRange())) - .findFirst() - .orElse(node.getDocumentTree().getRoot().getNode()); + .filter(intersectingNode -> entityToDuplicate.getDuplicateTextRanges() + .stream() + .allMatch(tr -> intersectingNode.getTextRange().contains(tr)) && // + intersectingNode.getTextRange().contains(entityToDuplicate.getTextRange())) + .findFirst().orElse(node.getDocumentTree().getRoot().getNode()); entityToDuplicate.setDeepestFullyContainingNode(deepestSharedNode); @@ -802,7 +813,8 @@ public class EntityCreationService { return; } additionalIntersectingNode.getEntities().add(entityToDuplicate); - additionalIntersectingNode.getPages(newTextRange).forEach(page -> page.getEntities().add(entityToDuplicate)); + additionalIntersectingNode.getPages(newTextRange) + .forEach(page -> page.getEntities().add(entityToDuplicate)); entityToDuplicate.addIntersectingNode(additionalIntersectingNode); }); } @@ -810,12 +822,7 @@ public class EntityCreationService { private void addEntityToGraph(TextEntity entity, DocumentTree documentTree) { - SemanticNode containingNode = documentTree.childNodes(Collections.emptyList()) - .filter(node -> 
node.getTextBlock().containsTextRange(entity.getTextRange())) - .findFirst() - .orElseThrow(() -> new NoSuchElementException("No containing Node found!")); - - containingNode.addThisToEntityIfIntersects(entity); + documentTree.getRoot().getNode().addThisToEntityIfIntersects(entity); TextBlock textBlock = entity.getDeepestFullyContainingNode().getTextBlock(); entityEnrichmentService.enrichEntity(entity, textBlock); @@ -824,5 +831,4 @@ public class EntityCreationService { addEntityToNodeEntitySets(entity); } - } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/AnalysisEnd2EndTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/AnalysisEnd2EndTest.java new file mode 100644 index 00000000..e79035e5 --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/AnalysisEnd2EndTest.java @@ -0,0 +1,327 @@ +package com.iqser.red.service.redaction.v1.server; + +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.when; + +import java.io.File; +import java.io.FileInputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.UUID; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; +import java.util.zip.GZIPInputStream; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.springframework.amqp.rabbit.core.RabbitTemplate; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import 
org.springframework.boot.test.mock.mockito.MockBean; +import org.springframework.context.annotation.Import; +import org.springframework.core.io.ClassPathResource; +import org.springframework.test.context.junit.jupiter.SpringExtension; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Sets; +import com.iqser.red.commons.jackson.ObjectMapperFactory; +import com.iqser.red.service.dictionarymerge.commons.DictionaryEntryModel; +import com.iqser.red.service.persistence.service.v1.api.shared.model.AnalyzeRequest; +import com.iqser.red.service.persistence.service.v1.api.shared.model.RuleFileType; +import com.iqser.red.service.persistence.service.v1.api.shared.model.annotations.ManualRedactions; +import com.iqser.red.service.persistence.service.v1.api.shared.model.common.JSONPrimitive; +import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileType; +import com.iqser.red.service.redaction.v1.server.client.DictionaryClient; +import com.iqser.red.service.redaction.v1.server.client.LegalBasisClient; +import com.iqser.red.service.redaction.v1.server.client.RulesClient; +import com.iqser.red.service.redaction.v1.server.model.dictionary.Dictionary; +import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryIncrement; +import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryModel; +import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryVersion; +import com.iqser.red.service.redaction.v1.server.service.AnalyzeService; +import com.iqser.red.service.redaction.v1.server.service.DictionaryService; +import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService; +import com.iqser.red.service.redaction.v1.server.utils.exception.NotFoundException; +import com.iqser.red.storage.commons.service.StorageService; +import com.knecon.fforesight.tenantcommons.TenantsClient; + +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; 
+ +@Slf4j +@ExtendWith(SpringExtension.class) +@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT) +@Import(RedactionIntegrationTest.RedactionIntegrationTestConfiguration.class) +@Disabled +/* + * This test is meant to be used directly with a download from blob storage (e.g. minio). You need to define the dossier template you want to use by supplying an absolute path. + * The dossier template will then be parsed for dictionaries, colors, entities, and rules. This is defined once for all tests. + * Inside a test you supply a path to your minio download folder. The files should still be zipped in this folder. + * The files will then be checked for completeness and uploaded to the FileSystemBackedStorageService. + * This way you can recreate what is happening on the stack almost exactly. + */ public class AnalysisEnd2EndTest { + + Path dossierTemplateToUse = Path.of("/home/kschuettler/iqser/business-logic/redactmanager/prod-cp-eu-reg/EFSA_sanitisation_GFL_v1"); // Add your dossier-template here + ObjectMapper mapper = ObjectMapperFactory.create(); + final String TENANT_ID = "tenant"; + + @Autowired + StorageService storageService; + + @Autowired + protected AnalyzeService analyzeService; + + @MockBean + DictionaryService dictionaryService; + + @MockBean + RabbitTemplate rabbitTemplate; + + TestDossierTemplate testDossierTemplate; + @MockBean + protected LegalBasisClient legalBasisClient; + + @MockBean + private TenantsClient tenantsClient; + + @MockBean + protected RulesClient rulesClient; + + @MockBean + protected DictionaryClient dictionaryClient; + + + @Test + @SneakyThrows + public void runAnalysisEnd2End() { + + String folder = "files/end2end/file0"; // Should contain all files from minio directly, still zipped. Can contain multiple files.
+ + Path absoluteFolderPath; + if (folder.startsWith("files")) { // if it starts with "files" it is most likely in the resources folder, else it should be an absolute path + ClassPathResource classPathResource = new ClassPathResource(folder); + absoluteFolderPath = classPathResource.getFile().toPath(); + } else { + absoluteFolderPath = Path.of(folder); + } + + log.info("Starting end2end analyses for all distinct filenames in folder: {}", folder); + List analyzeRequests = prepareStorageForFolder(absoluteFolderPath); + log.info("Found {} distinct fileIds", analyzeRequests.size()); + for (int i = 0; i < analyzeRequests.size(); i++) { + AnalyzeRequest analyzeRequest = analyzeRequests.get(i); + log.info("{}/{}: Starting analysis for file {}", i + 1, analyzeRequests.size(), analyzeRequest.getFileId()); + analyzeService.analyze(analyzeRequest); + } + } + + + @BeforeEach + public void setup() { + + testDossierTemplate = new TestDossierTemplate(dossierTemplateToUse); + when(dictionaryService.updateDictionary(any(), any())).thenReturn(new DictionaryVersion(0, 0)); + when(dictionaryService.getDeepCopyDictionary(any(), any())).thenReturn(testDossierTemplate.testDictionary); + when(dictionaryService.getDictionaryIncrements(any(), any(), any())).thenReturn(new DictionaryIncrement(Collections.emptySet(), new DictionaryVersion(0, 0))); + when(dictionaryService.isHint(any(String.class), any())).thenAnswer(invocation -> { + String type = invocation.getArgument(0); + return testDossierTemplate.testDictionary.getType(type).isHint(); + }); + when(dictionaryService.getColor(any(String.class), any())).thenAnswer(invocation -> { + String type = invocation.getArgument(0); + return testDossierTemplate.testDictionary.getType(type).getColor(); + }); + when(dictionaryService.getNotRedactedColor(any())).thenReturn(new float[]{0.2f, 0.2f, 0.2f}); + + when(rulesClient.getVersion(testDossierTemplate.id, RuleFileType.ENTITY)).thenReturn(System.currentTimeMillis()); + 
when(rulesClient.getRules(testDossierTemplate.id, RuleFileType.ENTITY)).thenReturn(JSONPrimitive.of(testDossierTemplate.rules)); + when(rulesClient.getVersion(testDossierTemplate.id, RuleFileType.COMPONENT)).thenReturn(testDossierTemplate.componentRules != null ? System.currentTimeMillis() : -1); + when(rulesClient.getRules(testDossierTemplate.id, RuleFileType.COMPONENT)).thenReturn(JSONPrimitive.of(testDossierTemplate.componentRules)); + } + + + @SneakyThrows + private List prepareStorageForFolder(Path folder) { + + return Files.list(folder) + .map(this::parseFileId) + .distinct() + .map(fileId -> prepareStorageForFile(fileId, folder)) + .toList(); + } + + + private String parseFileId(Path path) { + + return path.getFileName().toString().split("\\.")[0]; + } + + + @SneakyThrows + private AnalyzeRequest prepareStorageForFile(String fileId, Path folder) { + + AnalyzeRequest request = new AnalyzeRequest(); + request.setDossierId(UUID.randomUUID().toString()); + request.setFileId(UUID.randomUUID().toString()); + request.setDossierTemplateId(testDossierTemplate.id); + request.setManualRedactions(new ManualRedactions()); + request.setAnalysisNumber(-1); + + Set endingsToUpload = Set.of("ORIGIN", + "DOCUMENT_PAGES", + "DOCUMENT_POSITION", + "DOCUMENT_STRUCTURE", + "DOCUMENT_TEXT", + "IMAGE_INFO", + "NER_ENTITIES", + "TABLES", + "IMPORTED_REDACTIONS") + .stream() + .map(FileType::valueOf) + .collect(Collectors.toSet()); + + Set uploadedFileTypes = Files.walk(folder) + .filter(path -> path.toFile().isFile()) + .filter(path -> endingsToUpload.contains(parseFileTypeFromPath(path))) + .map(filePath -> uploadFile(filePath, request)) + .collect(Collectors.toUnmodifiableSet()); + + Set missingFileTypes = Sets.difference(endingsToUpload, uploadedFileTypes); + if (!missingFileTypes.isEmpty()) { + log.error("Folder {} is missing files of type {}", + folder.toFile(), + missingFileTypes.stream() + .map(Enum::toString) + .collect(Collectors.joining(", "))); + throw new 
NotFoundException("Not all required file types are present."); + } + return request; + } + + + private static FileType parseFileTypeFromPath(Path path) { + + return FileType.valueOf(path.getFileName().toString().split("\\.")[1]); + } + + + @SneakyThrows + private FileType uploadFile(Path path, AnalyzeRequest request) { + + FileType fileType = parseFileTypeFromPath(path); + try (var fis = new FileInputStream(path.toFile()); var in = new GZIPInputStream(fis);) { + storageService.storeObject(TENANT_ID, RedactionStorageService.StorageIdUtils.getStorageId(request.getDossierId(), request.getFileId(), fileType), in); + + } + return fileType; + } + + + private class TestDossierTemplate { + + String id; + Dictionary testDictionary; + AtomicInteger dictEntryIdCounter = new AtomicInteger(0); + String rules; + String componentRules; + + + @SneakyThrows + TestDossierTemplate(Path dossierTemplateToUse) { + + Map dossierTemplate = mapper.readValue(dossierTemplateToUse.resolve("dossierTemplate.json").toFile(), HashMap.class); + this.id = (String) dossierTemplate.get("dossierTemplateId"); + List dictionaries = Files.walk(dossierTemplateToUse) + .filter(path -> path.getFileName().toString().equals("dossierType.json")) + .map(this::loadDictionaryModel) + .toList(); + File ruleFile = dossierTemplateToUse.resolve("rules.drl").toFile(); + rules = new String(Files.readAllBytes(ruleFile.toPath())); + + File componentRuleFile = dossierTemplateToUse.resolve("componentRules.drl").toFile(); + if (componentRuleFile.exists()) { + componentRules = new String(Files.readAllBytes(componentRuleFile.toPath())); + } + + testDictionary = new Dictionary(dictionaries, new DictionaryVersion(0, 0)); + } + + + @SneakyThrows + private DictionaryModel loadDictionaryModel(Path path) { + + Map model = mapper.readValue(path.toFile(), HashMap.class); + Set entries = new HashSet<>(); + Set falsePositives = new HashSet<>(); + Set falseRecommendations = new HashSet<>(); + + String type = (String) model.get("type"); 
+ Integer rank = (Integer) model.get("rank"); + float[] color = hexToFloatArr((String) model.get("hexColor")); + Boolean caseInsensitive = (Boolean) model.get("caseInsensitive"); + Boolean hint = (Boolean) model.get("hint"); + Boolean hasDictionary = (Boolean) model.get("hasDictionary"); + + boolean isDossierDictionary; + if (model.containsKey("dossierDictionaryOnly")) { + isDossierDictionary = true; + } else { + isDossierDictionary = ((String) model.get("id")).split(":").length == 3; + } + + if (hasDictionary) { + try (var in = new FileInputStream(path.getParent().resolve("entries.txt").toFile())) { + entries.addAll(parseDictionaryEntryModelFromFile(new String(in.readAllBytes()), dictEntryIdCounter, (String) model.get("typeId"))); + } + try (var in = new FileInputStream(path.getParent().resolve("falsePositives.txt").toFile())) { + falsePositives.addAll(parseDictionaryEntryModelFromFile(new String(in.readAllBytes()), dictEntryIdCounter, (String) model.get("typeId"))); + } + try (var in = new FileInputStream(path.getParent().resolve("falseRecommendations.txt").toFile())) { + falseRecommendations.addAll(parseDictionaryEntryModelFromFile(new String(in.readAllBytes()), dictEntryIdCounter, (String) model.get("typeId"))); + } + } + + return new DictionaryModel(type, rank, color, caseInsensitive, hint, entries, falsePositives, falseRecommendations, isDossierDictionary); + } + + + private Set parseDictionaryEntryModelFromFile(String s, AtomicInteger dictEntryIdCounter, String typeId) { + + String[] values = s.split("\n"); + return Arrays.stream(values) + .map(value -> new DictionaryEntryModel(dictEntryIdCounter.getAndIncrement(), value, 0L, false, typeId)) + .collect(Collectors.toUnmodifiableSet()); + } + + + private float[] hexToFloatArr(String hexColor) { + + // Remove # symbol if present + String cleanHexColor = hexColor.replace("#", ""); + + // Parse hex string into RGB components + int r = Integer.parseInt(cleanHexColor.substring(0, 2), 16); + int g = 
Integer.parseInt(cleanHexColor.substring(2, 4), 16); + int b = Integer.parseInt(cleanHexColor.substring(4, 6), 16); + + // Normalize RGB values to floats between 0 and 1 + float[] rgbFloat = new float[3]; + rgbFloat[0] = r / 255.0f; + rgbFloat[1] = g / 255.0f; + rgbFloat[2] = b / 255.0f; + + return rgbFloat; + } + + } + +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/log4j2-test.xml b/redaction-service-v1/redaction-service-server-v1/src/test/resources/log4j2-test.xml deleted file mode 100644 index b4895cfb..00000000 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/log4j2-test.xml +++ /dev/null @@ -1,16 +0,0 @@ - - - - - - - - - - - - - - - - diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/logback-spring.xml b/redaction-service-v1/redaction-service-server-v1/src/test/resources/logback-spring.xml new file mode 100644 index 00000000..33b2cef7 --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/logback-spring.xml @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + \ No newline at end of file