From 292869c502fd2ec9b90f4d264c5e1b49f203ec1b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kilian=20Sch=C3=BCttler?=
Date: Mon, 2 Sep 2024 16:51:12 +0200
Subject: [PATCH] RED-9964: refactor getMainBody() and getMainBodyTextBlock() in Page

---
 .../v1/server/model/document/nodes/Page.java  | 59 +++++++++++++++++--
 .../model/document/nodes/SemanticNode.java    | 13 ++++
 .../service/document/DocumentGraphMapper.java | 41 +++++++------
 .../document/SectionFinderService.java        |  7 +--
 .../v1/server/AnalysisEnd2EndTest.java        | 27 +++++----
 5 files changed, 107 insertions(+), 40 deletions(-)

diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Page.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Page.java
index 04d51c17..b68f0207 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Page.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Page.java
@@ -3,8 +3,10 @@ package com.iqser.red.service.redaction.v1.server.model.document.nodes;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
+import java.util.stream.Stream;
 
 import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity;
+import com.iqser.red.service.redaction.v1.server.model.document.textblock.AtomicTextBlock;
 import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
 import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlockCollector;
 
@@ -35,7 +37,7 @@ public class Page {
     Integer width;
     Integer rotation;
 
-    List<SemanticNode> mainBody;
+    List<AtomicTextBlock> textBlocksOnPage;
 
     Header header;
     Footer footer;
@@ -53,13 +55,62 @@ public class Page {
     */
    public TextBlock getMainBodyTextBlock() {
 
-        return mainBody.stream()
-                .filter(SemanticNode::isLeaf)
-                .map(SemanticNode::getTextBlock)
+        return textBlocksOnPage.stream()
                 .collect(new TextBlockCollector());
    }
 
 
+   /**
+    * Retrieves the highest SemanticNodes that appear only on this page. This is achieved by traversing up the DocumentTree until a SemanticNode's direct parent is no longer exclusively on this page.
+    *
+    * @return A list containing the highest SemanticNodes that appear only on this page.
+    */
+   public List<SemanticNode> getMainBody() {
+
+        return textBlocksOnPage.stream()
+                .map(AtomicTextBlock::getParent)
+                .map(this::getHighestParentOnlyOnPage)
+                .distinct()
+                .toList();
+   }
+
+
+   /**
+    * Retrieves the highest SemanticNodes present on this page. There may be more than one, as two or more Main Sections can start on the same page.
+    * This is achieved by traversing up the document tree and returning all SemanticNodes whose direct parent is the Document.
+    *
+    * @return A stream of the highest SemanticNodes present on this page.
+    */
+   public Stream<SemanticNode> streamHighestSemanticNodesOnPage() {
+
+        return textBlocksOnPage.stream()
+                .map(AtomicTextBlock::getParent)
+                .map(this::getHighestSemanticNodeOnPage)
+                .distinct();
+   }
+
+
+   private SemanticNode getHighestParentOnlyOnPage(SemanticNode node) {
+
+        SemanticNode currentNode = node;
+        while (currentNode.hasParent() && currentNode.getParent().onlyOnPage(this)) {
+            currentNode = currentNode.getParent();
+        }
+        return currentNode;
+   }
+
+
+   private SemanticNode getHighestSemanticNodeOnPage(SemanticNode node) {
+
+        SemanticNode currentNode = node;
+        while (currentNode.hasParent() //
+                && !currentNode.getParent().getType().equals(NodeType.DOCUMENT)) {
+            currentNode = currentNode.getParent();
+        }
+        return currentNode;
+   }
+
+
    @Override
    public String toString() {
 
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/SemanticNode.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/SemanticNode.java
index 1c262b1d..f06669b0 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/SemanticNode.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/SemanticNode.java
@@ -808,4 +808,17 @@ public interface SemanticNode {
 
        streamChildren().forEach(childNode -> childNode.accept(visitor));
    }
+
+   /**
+    * Checks whether this SemanticNode appears on a single page only, and if that page is the provided one.
+    *
+    * @param page the page to check
+    * @return {@code true} if this SemanticNode is on exactly one page and that page is the provided one; {@code false} otherwise.
+ */ + default boolean onlyOnPage(Page page) { + + Set pages = getPages(); + return pages.size() == 1 && pages.contains(page); + } + } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/document/DocumentGraphMapper.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/document/DocumentGraphMapper.java index 42134e26..b4548a42 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/document/DocumentGraphMapper.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/document/DocumentGraphMapper.java @@ -1,6 +1,5 @@ package com.iqser.red.service.redaction.v1.server.service.document; - import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -64,7 +63,7 @@ public class DocumentGraphMapper { for (DocumentStructure.EntryData entryData : entries) { List pages = Arrays.stream(entryData.getPageNumbers()) - .map(pageNumber -> getPage(pageNumber, context)) + .map(context::getPage) .toList(); SemanticNode node = switch (entryData.getType()) { @@ -83,6 +82,14 @@ public class DocumentGraphMapper { if (entryData.getAtomicBlockIds().length > 0) { TextBlock textBlock = toTextBlock(entryData.getAtomicBlockIds(), context, node); node.setLeafTextBlock(textBlock); + + switch (entryData.getType()) { + case HEADER -> pages.forEach(page -> page.setHeader((Header) node)); + case FOOTER -> pages.forEach(page -> page.setFooter((Footer) node)); + default -> textBlock.getAtomicTextBlocks() + .forEach(atb -> atb.getPage().getTextBlocksOnPage().add(atb)); + } + } List treeId = Arrays.stream(entryData.getTreeId()).boxed() .toList(); @@ -94,13 +101,8 @@ public class DocumentGraphMapper { } node.setTreeId(treeId); - switch (entryData.getType()) { - case HEADER -> pages.forEach(page -> page.setHeader((Header) node)); - case FOOTER -> pages.forEach(page -> page.setFooter((Footer) node)); - default -> pages.forEach(page -> page.getMainBody().add(node)); - } - newEntries.add(DocumentTree.Entry.builder().treeId(treeId).children(buildEntries(entryData.getChildren(), context)).node(node).build()); + } return newEntries; } @@ -115,7 +117,7 @@ public class DocumentGraphMapper { private Image buildImage(Context context, Map properties, Long[] pageNumbers) { assert pageNumbers.length == 1; - Page page = getPage(pageNumbers[0], context); + Page page = context.getPage(pageNumbers[0]); var builder = Image.builder(); PropertiesMapper.parseImageProperties(properties, builder); return builder.documentTree(context.documentTree).page(page).build(); @@ -161,6 +163,7 @@ public class DocumentGraphMapper { return SuperSection.builder().documentTree(context.documentTree).build(); } + private Paragraph buildParagraph(Context context, Map properties) { if (PropertiesMapper.isDuplicateParagraph(properties)) { @@ -189,21 +192,13 @@ public class DocumentGraphMapper { return AtomicTextBlock.fromAtomicTextBlockData(context.documentTextData.get(Math.toIntExact(atomicTextBlockId)), context.documentPositionData.get(Math.toIntExact(atomicTextBlockId)), parent, - getPage(context.documentTextData.get(Math.toIntExact(atomicTextBlockId)).getPage(), context)); + context.getPage(context.documentTextData.get(Math.toIntExact(atomicTextBlockId)).getPage())); } private Page buildPage(DocumentPage p) { - return 
-        return Page.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).mainBody(new LinkedList<>()).build();
-    }
-
-
-    private Page getPage(Long pageIndex, Context context) {
-
-        Page page = context.pageData.get(Math.toIntExact(pageIndex) - 1);
-        assert page.getNumber() == Math.toIntExact(pageIndex);
-        return page;
+        return Page.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).textBlocksOnPage(new LinkedList<>()).build();
    }
 
 
@@ -226,6 +221,14 @@
 
        }
 
+
+        private Page getPage(Long pageIndex) {
+
+            Page page = pageData.get(Math.toIntExact(pageIndex) - 1);
+            assert page.getNumber() == Math.toIntExact(pageIndex);
+            return page;
+        }
+
    }
 
 }
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/document/SectionFinderService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/document/SectionFinderService.java
index f45a3299..1973681c 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/document/SectionFinderService.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/document/SectionFinderService.java
@@ -11,7 +11,6 @@ import java.util.stream.Stream;
 import org.springframework.stereotype.Service;
 
 import com.iqser.red.service.persistence.service.v1.api.shared.model.AnalyzeRequest;
-import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.EntityLogEntry;
 import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.Position;
 import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.imported.ImportedRedaction;
 import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.imported.ImportedRedactions;
@@ -27,7 +26,6 @@ import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryIncr
 import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryIncrementValue;
 import com.iqser.red.service.redaction.v1.server.model.dictionary.SearchImplementation;
 import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
-import com.iqser.red.service.redaction.v1.server.model.document.nodes.NodeType;
 
 import io.micrometer.core.annotation.Timed;
 import lombok.AccessLevel;
@@ -51,7 +49,6 @@ public class SectionFinderService {
        long start = System.currentTimeMillis();
 
        Set sectionsToReanalyse = new HashSet<>();
-
        var dictionaryIncrementsSearch = new SearchImplementation(dictionaryIncrement.getValues()
                .stream()
                .map(DictionaryIncrementValue::getValue)
@@ -82,9 +79,7 @@ public class SectionFinderService {
        return document.getPages()
                .stream()
                .filter(page -> relevantPagesForReanalysis.contains(page.getNumber()))
-                .flatMap(page -> Stream.concat(page.getMainBody()
-                        .stream()
-                        .filter(node -> node.getType().equals(NodeType.SECTION)), Stream.of(page.getHeader(), page.getFooter())))
+                .flatMap(page -> Stream.concat(page.streamHighestSemanticNodesOnPage(), Stream.of(page.getHeader(), page.getFooter())))
                .map(node -> node.getTreeId()
                        .get(0))
                .toList();
diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/AnalysisEnd2EndTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/AnalysisEnd2EndTest.java
index 7b0cc6bc..6530373b 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/AnalysisEnd2EndTest.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/AnalysisEnd2EndTest.java
@@ -9,6 +9,7 @@ import static org.mockito.Mockito.when;
 
 import java.io.File;
 import java.io.FileInputStream;
+import java.io.IOException;
 import java.nio.file.FileVisitOption;
 import java.nio.file.Files;
 import java.nio.file.Path;
@@ -16,7 +17,6 @@ import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
-import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
 import java.util.Optional;
@@ -124,7 +124,7 @@ import lombok.extern.slf4j.Slf4j;
    @SneakyThrows
    public void runAnalysisEnd2End() {
 
-        String folder = "/home/kschuettler/Downloads/New Folder (4)/17a25133-e098-4610-b553-d1bf11a56d96/560e6ab1ab4754b9a62fd2e6d4d71327"; // Should contain all files from minio directly, still zipped. Can contain multiple files.
+        String folder = "/home/kschuettler/Dokumente/Ticket Related/RED-9964/17a25133-e098-4610-b553-d1bf11a56d96/560e6ab1ab4754b9a62fd2e6d4d71327"; // Should contain all files from minio directly, still zipped. Can contain multiple files.
 
        Path absoluteFolderPath;
        if (folder.startsWith("files")) { // if it starts with "files" it is most likely in the resources folder, else it should be an absolute path
@@ -140,13 +140,7 @@
        for (int i = 0; i < analyzeRequests.size(); i++) {
            AnalyzeRequest analyzeRequest = analyzeRequests.get(i);
            log.info("{}/{}: Starting analysis for file {}", i + 1, analyzeRequests.size(), analyzeRequest.getFileId());
-            var times = new LinkedList();
-            for (int j = 1; j <= 10; j++) {
-                var start = System.currentTimeMillis();
-                analyzeService.analyze(analyzeRequest);
-                times.add(System.currentTimeMillis() - start);
-            }
-            System.out.println("times in ms for each analyze run: " + times);
+            analyzeService.analyze(analyzeRequest);
        }
    }
 
@@ -222,7 +216,7 @@
        Path manualRedactionFile = folder.resolve(fileId + ".MANUAL_REDACTIONS.json");
 
        if (Files.exists(manualRedactionFile)) {
-            request.setManualRedactions(mapper.readValue(manualRedactionFile.toFile(), ManualRedactions.class));
+            request.setManualRedactions(parseManualRedactions(manualRedactionFile));
        } else {
            request.setManualRedactions(new ManualRedactions());
        }
@@ -262,6 +256,17 @@
    }
 
 
+   private ManualRedactions parseManualRedactions(Path manualRedactionFile) {
+
+        try {
+            return mapper.readValue(manualRedactionFile.toFile(), ManualRedactions.class);
+        } catch (IOException e) {
+            log.error("Could not parse manual redactions");
+            return new ManualRedactions();
+        }
+   }
+
+
    private static Optional parseFileTypeFromPath(Path path) {
 
        String fileType = path.getFileName().toString().split("\\.")[1];
@@ -280,7 +285,7 @@
        if (fileType.isEmpty()) {
            return Optional.empty();
        }
-        if (path.getFileName().endsWith(".gz")) {
+        if (path.getFileName().toString().endsWith(".gz")) {
            try (var fis = new FileInputStream(path.toFile()); var in = new GZIPInputStream(fis);) {
                storageService.storeObject(TENANT_ID, RedactionStorageService.StorageIdUtils.getStorageId(request.getDossierId(), request.getFileId(), fileType.get()), in);
            }
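
Note on the traversal introduced in Page and SemanticNode above: the following self-contained sketch illustrates the climbing rule that getMainBody(), getHighestParentOnlyOnPage() and SemanticNode#onlyOnPage(Page) implement. The Node and Page classes here are simplified stand-ins, not the real SemanticNode/Page types (whose full APIs are not visible in this patch), so treat this as an illustration of the rule rather than production code.

// Minimal sketch of the page-scoped climb (simplified stand-in types).
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

class PageScopedTraversalSketch {

    static class Page {
        final int number;
        Page(int number) { this.number = number; }
    }

    static class Node {
        final String name;
        final Node parent;        // null for the document root
        final Set<Page> pages;    // pages this node spans

        Node(String name, Node parent, Page... pages) {
            this.name = name;
            this.parent = parent;
            this.pages = new HashSet<>(Arrays.asList(pages));
        }

        boolean hasParent() {
            return parent != null;
        }

        // Mirrors SemanticNode#onlyOnPage: true only if the node spans exactly one
        // page and that page is the given one.
        boolean onlyOnPage(Page page) {
            return pages.size() == 1 && pages.contains(page);
        }
    }

    // Mirrors Page#getHighestParentOnlyOnPage: climb while the direct parent is
    // still exclusive to the page, then stop at the last page-exclusive ancestor.
    static Node highestParentOnlyOnPage(Node node, Page page) {
        Node current = node;
        while (current.hasParent() && current.parent.onlyOnPage(page)) {
            current = current.parent;
        }
        return current;
    }

    public static void main(String[] args) {
        Page p1 = new Page(1);
        Page p2 = new Page(2);

        Node document = new Node("document", null, p1, p2);
        Node section = new Node("section", document, p1, p2);   // spans both pages
        Node paragraph = new Node("paragraph", section, p1);     // only on page 1
        Node sentence = new Node("sentence", paragraph, p1);

        // Stops at "paragraph": its parent "section" also covers page 2.
        System.out.println(highestParentOnlyOnPage(sentence, p1).name);
    }
}

Starting from a leaf on page 1 this prints "paragraph", which is the kind of node getMainBody() collects (deduplicated via distinct()) for each page.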
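For the SectionFinderService change, a rough usage sketch of the new streamHighestSemanticNodesOnPage() call is shown below. Apart from that method name, the record types and helper names are hypothetical placeholders, not the service's real API; the point is how main-body roots, header and footer of each relevant page collapse into their top-level tree ids.

// Rough sketch of the reworked stream in SectionFinderService (placeholder types).
import java.util.List;
import java.util.stream.Stream;

class ReanalysisTreeIdSketch {

    record Node(List<Integer> treeId) { }

    record Page(int number, List<Node> highestNodes, Node header, Node footer) {

        Stream<Node> streamHighestSemanticNodesOnPage() {
            return highestNodes.stream();
        }
    }

    // Mirrors the new flatMap: for every relevant page, concatenate the highest
    // semantic nodes with the page's header and footer, then keep the first
    // element of each node's tree id.
    static List<Integer> topLevelTreeIds(List<Page> pages, List<Integer> relevantPages) {
        return pages.stream()
                .filter(page -> relevantPages.contains(page.number()))
                .flatMap(page -> Stream.concat(page.streamHighestSemanticNodesOnPage(),
                        Stream.of(page.header(), page.footer())))
                .map(node -> node.treeId().get(0))
                .toList();
    }

    public static void main(String[] args) {
        Node section = new Node(List.of(3, 1));
        Node header = new Node(List.of(1));
        Node footer = new Node(List.of(2));
        Page page = new Page(1, List.of(section), header, footer);

        System.out.println(topLevelTreeIds(List.of(page), List.of(1))); // [3, 1, 2]
    }
}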