Merge branch 'RED-9964-fp' into 'master'

RED-9964: refactor getMainBody() and getMainBodyTextBlock() in Page

Closes RED-9964

See merge request redactmanager/redaction-service!501
Kilian Schüttler 2024-09-02 16:51:13 +02:00
commit 895bc56590
5 changed files with 107 additions and 40 deletions

View File: Page.java

@@ -3,8 +3,10 @@ package com.iqser.red.service.redaction.v1.server.model.document.nodes;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Stream;
import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.AtomicTextBlock;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlockCollector;
@@ -35,7 +37,7 @@ public class Page {
Integer width;
Integer rotation;
List<SemanticNode> mainBody;
List<AtomicTextBlock> textBlocksOnPage;
Header header;
Footer footer;
@@ -53,13 +55,62 @@ public class Page {
*/
public TextBlock getMainBodyTextBlock() {
return mainBody.stream()
.filter(SemanticNode::isLeaf)
.map(SemanticNode::getTextBlock)
return textBlocksOnPage.stream()
.collect(new TextBlockCollector());
}
/**
* Retrieves the highest SemanticNodes that appear only on this page. This is achieved by traversing the DocumentTree upwards until a SemanticNode's direct parent is no longer exclusively on this page.
*
* @return A list containing the highest SemanticNodes that appear only on this page.
*/
public List<SemanticNode> getMainBody() {
return textBlocksOnPage.stream()
.map(AtomicTextBlock::getParent)
.map(this::getHighestParentOnlyOnPage)
.distinct()
.toList();
}
/**
* Retrieves the highest SemanticNodes present on this page. There may be more than one, as two or more Main Sections can start on the same page.
* This is achieved by traversing up the document tree and returning all SemanticNodes whose direct parent is the Document.
*
* @return A stream of the highest SemanticNodes present on this page.
*/
public Stream<SemanticNode> streamHighestSemanticNodesOnPage() {
return textBlocksOnPage.stream()
.map(AtomicTextBlock::getParent)
.map(this::getHighestSemanticNodeOnPage)
.distinct();
}
private SemanticNode getHighestParentOnlyOnPage(SemanticNode node) {
SemanticNode currentNode = node;
while (currentNode.hasParent() && currentNode.getParent().onlyOnPage(this)) {
currentNode = currentNode.getParent();
}
return currentNode;
}
private SemanticNode getHighestSemanticNodeOnPage(SemanticNode node) {
SemanticNode currentNode = node;
while (currentNode.hasParent() //
&& !currentNode.getParent().getType().equals(NodeType.DOCUMENT)) {
currentNode = currentNode.getParent();
}
return currentNode;
}
@Override
public String toString() {

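The two traversal helpers above differ only in their stopping condition: getHighestParentOnlyOnPage stops as soon as the parent spans more than the given page, while getHighestSemanticNodeOnPage climbs until the direct parent is the Document itself. A minimal, self-contained sketch of both climbs, using a hypothetical parent-linked Node in place of the project's SemanticNode, Page and NodeType:

```java
import java.util.Set;

// Hypothetical stand-ins: a plain parent-linked node instead of the
// project's SemanticNode/Page/NodeType.
class Node {
    final Node parent;        // null at the document root
    final boolean isDocument; // stands in for NodeType.DOCUMENT
    final Set<Integer> pages; // page numbers this node spans

    Node(Node parent, boolean isDocument, Set<Integer> pages) {
        this.parent = parent;
        this.isDocument = isDocument;
        this.pages = pages;
    }

    boolean hasParent() {
        return parent != null;
    }

    // Same check as SemanticNode#onlyOnPage, on plain page numbers.
    boolean onlyOnPage(int page) {
        return pages.size() == 1 && pages.contains(page);
    }
}

class TraversalSketch {

    // Mirrors getHighestParentOnlyOnPage: climb while the parent is
    // still confined to the given page.
    static Node highestParentOnlyOnPage(Node node, int page) {
        Node current = node;
        while (current.hasParent() && current.parent.onlyOnPage(page)) {
            current = current.parent;
        }
        return current;
    }

    // Mirrors getHighestSemanticNodeOnPage: climb until the direct
    // parent is the Document root.
    static Node highestNodeOnPage(Node node) {
        Node current = node;
        while (current.hasParent() && !current.parent.isDocument) {
            current = current.parent;
        }
        return current;
    }

    public static void main(String[] args) {
        Node document = new Node(null, true, Set.of(1, 2, 3));
        Node section = new Node(document, false, Set.of(1, 2)); // spans pages 1 and 2
        Node paragraph = new Node(section, false, Set.of(1));   // only on page 1

        // The paragraph is its own highest page-exclusive ancestor, because
        // its parent section also reaches page 2.
        System.out.println(highestParentOnlyOnPage(paragraph, 1) == paragraph); // true

        // Climbing to just below the Document yields the section.
        System.out.println(highestNodeOnPage(paragraph) == section); // true
    }
}
```

The stand-in keeps only what the climbs need: a parent pointer, a document flag and a page set.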
View File: SemanticNode.java

@@ -808,4 +808,17 @@ public interface SemanticNode {
streamChildren().forEach(childNode -> childNode.accept(visitor));
}
/**
* Checks whether this SemanticNode appears on a single page only, and whether that page is the provided one.
*
* @param page the page to check
* @return true if this SemanticNode is on a single page only and that page is the provided one; false otherwise.
*/
default boolean onlyOnPage(Page page) {
Set<Page> pages = getPages();
return pages.size() == 1 && pages.contains(page);
}
}
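A small usage sketch of this contract, with a hypothetical SimplePage/SimpleNode pair standing in for the real Page and SemanticNode:

```java
import java.util.Set;

// Hypothetical stand-ins, reduced to what the check needs:
// a page identity and a node's set of pages.
record SimplePage(int number) {}

@FunctionalInterface
interface SimpleNode {
    Set<SimplePage> getPages();

    // Same contract as SemanticNode#onlyOnPage.
    default boolean onlyOnPage(SimplePage page) {
        Set<SimplePage> pages = getPages();
        return pages.size() == 1 && pages.contains(page);
    }
}

class OnlyOnPageSketch {
    public static void main(String[] args) {
        SimplePage p1 = new SimplePage(1);
        SimplePage p2 = new SimplePage(2);
        SimpleNode confined = () -> Set.of(p1);     // appears on page 1 only
        SimpleNode spanning = () -> Set.of(p1, p2); // spread across two pages

        System.out.println(confined.onlyOnPage(p1)); // true
        System.out.println(confined.onlyOnPage(p2)); // false: wrong page
        System.out.println(spanning.onlyOnPage(p1)); // false: not on a single page
    }
}
```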

View File: DocumentGraphMapper.java

@@ -1,6 +1,5 @@
package com.iqser.red.service.redaction.v1.server.service.document;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
@@ -64,7 +63,7 @@ public class DocumentGraphMapper {
for (DocumentStructure.EntryData entryData : entries) {
List<Page> pages = Arrays.stream(entryData.getPageNumbers())
.map(pageNumber -> getPage(pageNumber, context))
.map(context::getPage)
.toList();
SemanticNode node = switch (entryData.getType()) {
@@ -83,6 +82,14 @@ public class DocumentGraphMapper {
if (entryData.getAtomicBlockIds().length > 0) {
TextBlock textBlock = toTextBlock(entryData.getAtomicBlockIds(), context, node);
node.setLeafTextBlock(textBlock);
switch (entryData.getType()) {
case HEADER -> pages.forEach(page -> page.setHeader((Header) node));
case FOOTER -> pages.forEach(page -> page.setFooter((Footer) node));
default -> textBlock.getAtomicTextBlocks()
.forEach(atb -> atb.getPage().getTextBlocksOnPage().add(atb));
}
}
List<Integer> treeId = Arrays.stream(entryData.getTreeId()).boxed()
.toList();
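The switch moved into the leaf branch above (out of the treeId block shown in the next hunk) means headers and footers are attached to their pages directly, while every other leaf feeds its atomic blocks into the page's textBlocksOnPage list, which Page now derives its main body from. A minimal sketch of that registration pattern, with hypothetical simplified types in place of Page, AtomicTextBlock, Header and Footer:

```java
import java.util.ArrayList;
import java.util.List;

// Hypothetical simplified types; the real code works with Page,
// AtomicTextBlock, Header, Footer and DocumentStructure.EntryData.
enum LeafKind { HEADER, FOOTER, PARAGRAPH }

class SketchPage {
    final List<String> textBlocksOnPage = new ArrayList<>(); // block ids, for brevity
    String header;
    String footer;
}

class LeafRegistrationSketch {
    // Mirrors the relocated switch: headers/footers go to dedicated page
    // slots, every other leaf's blocks feed the per-page block list.
    static void registerLeaf(LeafKind kind, String blockId, SketchPage page) {
        switch (kind) {
            case HEADER -> page.header = blockId;
            case FOOTER -> page.footer = blockId;
            default -> page.textBlocksOnPage.add(blockId);
        }
    }

    public static void main(String[] args) {
        SketchPage page = new SketchPage();
        registerLeaf(LeafKind.HEADER, "h1", page);
        registerLeaf(LeafKind.PARAGRAPH, "p1", page);
        registerLeaf(LeafKind.PARAGRAPH, "p2", page);
        System.out.println(page.header);           // h1
        System.out.println(page.textBlocksOnPage); // [p1, p2]
    }
}
```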
@@ -94,13 +101,8 @@ public class DocumentGraphMapper {
}
node.setTreeId(treeId);
switch (entryData.getType()) {
case HEADER -> pages.forEach(page -> page.setHeader((Header) node));
case FOOTER -> pages.forEach(page -> page.setFooter((Footer) node));
default -> pages.forEach(page -> page.getMainBody().add(node));
}
newEntries.add(DocumentTree.Entry.builder().treeId(treeId).children(buildEntries(entryData.getChildren(), context)).node(node).build());
}
return newEntries;
}
@@ -115,7 +117,7 @@ public class DocumentGraphMapper {
private Image buildImage(Context context, Map<String, String> properties, Long[] pageNumbers) {
assert pageNumbers.length == 1;
Page page = getPage(pageNumbers[0], context);
Page page = context.getPage(pageNumbers[0]);
var builder = Image.builder();
PropertiesMapper.parseImageProperties(properties, builder);
return builder.documentTree(context.documentTree).page(page).build();
@@ -161,6 +163,7 @@ public class DocumentGraphMapper {
return SuperSection.builder().documentTree(context.documentTree).build();
}
private Paragraph buildParagraph(Context context, Map<String, String> properties) {
if (PropertiesMapper.isDuplicateParagraph(properties)) {
@@ -189,21 +192,13 @@ public class DocumentGraphMapper {
return AtomicTextBlock.fromAtomicTextBlockData(context.documentTextData.get(Math.toIntExact(atomicTextBlockId)),
context.documentPositionData.get(Math.toIntExact(atomicTextBlockId)),
parent,
getPage(context.documentTextData.get(Math.toIntExact(atomicTextBlockId)).getPage(), context));
context.getPage(context.documentTextData.get(Math.toIntExact(atomicTextBlockId)).getPage()));
}
private Page buildPage(DocumentPage p) {
return Page.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).mainBody(new LinkedList<>()).build();
}
private Page getPage(Long pageIndex, Context context) {
Page page = context.pageData.get(Math.toIntExact(pageIndex) - 1);
assert page.getNumber() == Math.toIntExact(pageIndex);
return page;
return Page.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).textBlocksOnPage(new LinkedList<>()).build();
}
@@ -226,6 +221,14 @@ public class DocumentGraphMapper {
}
private Page getPage(Long pageIndex) {
Page page = pageData.get(Math.toIntExact(pageIndex) - 1);
assert page.getNumber() == Math.toIntExact(pageIndex);
return page;
}
}
}

View File: SectionFinderService.java

@@ -11,7 +11,6 @@ import java.util.stream.Stream;
import org.springframework.stereotype.Service;
import com.iqser.red.service.persistence.service.v1.api.shared.model.AnalyzeRequest;
import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.EntityLogEntry;
import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.Position;
import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.imported.ImportedRedaction;
import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.imported.ImportedRedactions;
@@ -27,7 +26,6 @@ import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryIncr
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryIncrementValue;
import com.iqser.red.service.redaction.v1.server.model.dictionary.SearchImplementation;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.NodeType;
import io.micrometer.core.annotation.Timed;
import lombok.AccessLevel;
@@ -51,7 +49,6 @@ public class SectionFinderService {
long start = System.currentTimeMillis();
Set<Integer> sectionsToReanalyse = new HashSet<>();
var dictionaryIncrementsSearch = new SearchImplementation(dictionaryIncrement.getValues()
.stream()
.map(DictionaryIncrementValue::getValue)
@@ -82,9 +79,7 @@
return document.getPages()
.stream()
.filter(page -> relevantPagesForReanalysis.contains(page.getNumber()))
.flatMap(page -> Stream.concat(page.getMainBody()
.stream()
.filter(node -> node.getType().equals(NodeType.SECTION)), Stream.of(page.getHeader(), page.getFooter())))
.flatMap(page -> Stream.concat(page.streamHighestSemanticNodesOnPage(), Stream.of(page.getHeader(), page.getFooter())))
.map(node -> node.getTreeId()
.get(0))
.toList();
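The reanalysis lookup now concatenates each relevant page's highest semantic nodes with its header and footer, instead of filtering the old mainBody list for SECTION nodes. A minimal sketch of that gathering step, with strings standing in for SemanticNodes:

```java
import java.util.List;
import java.util.stream.Stream;

class ReanalysisGatherSketch {
    // Hypothetical stand-in for a page: its highest nodes plus header/footer.
    record PageStub(List<String> highestNodes, String header, String footer) {
        Stream<String> streamHighestSemanticNodesOnPage() {
            return highestNodes.stream();
        }
    }

    public static void main(String[] args) {
        PageStub page = new PageStub(List.of("section-1", "section-2"), "header", "footer");

        // Mirrors the flatMap body: highest nodes first, then header and footer.
        List<String> toReanalyse = Stream.concat(page.streamHighestSemanticNodesOnPage(),
                        Stream.of(page.header(), page.footer()))
                .toList();

        System.out.println(toReanalyse); // [section-1, section-2, header, footer]
    }
}
```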

View File

@@ -9,6 +9,7 @@ import static org.mockito.Mockito.when;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.file.FileVisitOption;
import java.nio.file.Files;
import java.nio.file.Path;
@@ -16,7 +17,6 @@ import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
@@ -124,7 +124,7 @@ import lombok.extern.slf4j.Slf4j;
@SneakyThrows
public void runAnalysisEnd2End() {
String folder = "/home/kschuettler/Downloads/New Folder (4)/17a25133-e098-4610-b553-d1bf11a56d96/560e6ab1ab4754b9a62fd2e6d4d71327"; // Should contain all files from minio directly, still zipped. Can contain multiple files.
String folder = "/home/kschuettler/Dokumente/Ticket Related/RED-9964/17a25133-e098-4610-b553-d1bf11a56d96/560e6ab1ab4754b9a62fd2e6d4d71327"; // Should contain all files from minio directly, still zipped. Can contain multiple files.
Path absoluteFolderPath;
if (folder.startsWith("files")) { // if it starts with "files" it is most likely in the resources folder, else it should be an absolute path
@@ -140,13 +140,7 @@ import lombok.extern.slf4j.Slf4j;
for (int i = 0; i < analyzeRequests.size(); i++) {
AnalyzeRequest analyzeRequest = analyzeRequests.get(i);
log.info("{}/{}: Starting analysis for file {}", i + 1, analyzeRequests.size(), analyzeRequest.getFileId());
var times = new LinkedList<Long>();
for (int j = 1; j <= 10; j++) {
var start = System.currentTimeMillis();
analyzeService.analyze(analyzeRequest);
times.add(System.currentTimeMillis() - start);
}
System.out.println("times in ms for each analyze run: " + times);
analyzeService.analyze(analyzeRequest);
}
}
@@ -222,7 +216,7 @@ import lombok.extern.slf4j.Slf4j;
Path manualRedactionFile = folder.resolve(fileId + ".MANUAL_REDACTIONS.json");
if (Files.exists(manualRedactionFile)) {
request.setManualRedactions(mapper.readValue(manualRedactionFile.toFile(), ManualRedactions.class));
request.setManualRedactions(parseManualRedactions(manualRedactionFile));
} else {
request.setManualRedactions(new ManualRedactions());
}
@@ -262,6 +256,17 @@ import lombok.extern.slf4j.Slf4j;
}
private ManualRedactions parseManualRedactions(Path manualRedactionFile) {
try {
return mapper.readValue(manualRedactionFile.toFile(), ManualRedactions.class);
} catch (IOException e) {
log.error("Could not parse manual redactions", e);
return new ManualRedactions();
}
}
private static Optional<FileType> parseFileTypeFromPath(Path path) {
String fileType = path.getFileName().toString().split("\\.")[1];
@@ -280,7 +285,7 @@ import lombok.extern.slf4j.Slf4j;
if (fileType.isEmpty()) {
return Optional.empty();
}
if (path.getFileName().endsWith(".gz")) {
if (path.getFileName().toString().endsWith(".gz")) {
try (var fis = new FileInputStream(path.toFile()); var in = new GZIPInputStream(fis);) {
storageService.storeObject(TENANT_ID, RedactionStorageService.StorageIdUtils.getStorageId(request.getDossierId(), request.getFileId(), fileType.get()), in);
}