Merge branch 'RED-9964-fp' into 'master'

RED-9964: refactor getMainBody() and getMainBodyTextBlock() in Page

Closes RED-9964

See merge request redactmanager/redaction-service!501
commit 895bc56590
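Summary of the change, with a minimal caller sketch (not part of the diff): Page no longer stores its main body nodes directly; it keeps the AtomicTextBlocks that lie on the page and derives getMainBody() and the new streamHighestSemanticNodesOnPage() from them by walking up the document tree. The sketch below only uses the Page/SemanticNode signatures visible in the diff; the surrounding class and helper method are hypothetical scaffolding.

// Illustrative only: exercises the refactored Page API from this merge request.
// Page, SemanticNode, TextBlock and the three Page methods are taken from the diff;
// the PageApiSketch class and inspect(...) helper are hypothetical.
import java.util.List;

import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNode;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;

class PageApiSketch {

    void inspect(List<Page> pages) {

        for (Page page : pages) {
            // Text of the page body, collected from the AtomicTextBlocks on the page.
            TextBlock pageText = page.getMainBodyTextBlock();

            // Highest SemanticNodes that appear exclusively on this page.
            List<SemanticNode> mainBody = page.getMainBody();

            // Highest SemanticNodes present on this page, i.e. direct children of the Document.
            List<SemanticNode> highestNodes = page.streamHighestSemanticNodesOnPage().toList();

            System.out.printf("page %d: %d exclusive nodes, %d highest nodes%n",
                    page.getNumber(), mainBody.size(), highestNodes.size());
        }
    }
}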
@@ -3,8 +3,10 @@ package com.iqser.red.service.redaction.v1.server.model.document.nodes;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
+import java.util.stream.Stream;

import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity;
+import com.iqser.red.service.redaction.v1.server.model.document.textblock.AtomicTextBlock;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlockCollector;
@@ -35,7 +37,7 @@ public class Page {
    Integer width;
    Integer rotation;

-   List<SemanticNode> mainBody;
+   List<AtomicTextBlock> textBlocksOnPage;
    Header header;
    Footer footer;
@@ -53,13 +55,62 @@ public class Page {
     */
    public TextBlock getMainBodyTextBlock() {

-       return mainBody.stream()
-               .filter(SemanticNode::isLeaf)
-               .map(SemanticNode::getTextBlock)
+       return textBlocksOnPage.stream()
                .collect(new TextBlockCollector());
    }


+   /**
+    * Retrieves the highest SemanticNodes which appear only on this page. This is achieved by traversing the DocumentTree upwards until a SemanticNode's direct parent is no longer exclusively on this page.
+    *
+    * @return A list which contains the highest SemanticNodes which appear only on this page.
+    */
+   public List<SemanticNode> getMainBody() {
+
+       return textBlocksOnPage.stream()
+               .map(AtomicTextBlock::getParent)
+               .map(this::getHighestParentOnlyOnPage)
+               .distinct()
+               .toList();
+   }
+
+
+   /**
+    * Retrieves the highest SemanticNodes which are present on the page. There might be multiple, as two or more Main Sections can start on the same page.
+    * This is achieved by traversing up the document tree and returning all SemanticNodes whose direct parent is the Document.
+    *
+    * @return A stream of the highest SemanticNodes present on this page
+    */
+   public Stream<SemanticNode> streamHighestSemanticNodesOnPage() {
+
+       return textBlocksOnPage.stream()
+               .map(AtomicTextBlock::getParent)
+               .map(this::getHighestSemanticNodeOnPage)
+               .distinct();
+   }
+
+
+   private SemanticNode getHighestParentOnlyOnPage(SemanticNode node) {
+
+       SemanticNode currentNode = node;
+       while (currentNode.hasParent() && currentNode.getParent().onlyOnPage(this)) {
+           currentNode = currentNode.getParent();
+       }
+       return currentNode;
+   }
+
+
+   private SemanticNode getHighestSemanticNodeOnPage(SemanticNode node) {
+
+       SemanticNode currentNode = node;
+       while (currentNode.hasParent() //
+               && !currentNode.getParent().getType().equals(NodeType.DOCUMENT)) {
+           currentNode = currentNode.getParent();
+       }
+       return currentNode;
+   }
+
+
    @Override
    public String toString() {
@@ -808,4 +808,17 @@ public interface SemanticNode {
        streamChildren().forEach(childNode -> childNode.accept(visitor));
    }

+
+   /**
+    * Checks whether this SemanticNode appears on a single page only, and whether that page is the provided one.
+    *
+    * @param page the page to check
+    * @return true, when the SemanticNode is on a single page only and that page is the provided page. Otherwise, false.
+    */
+   default boolean onlyOnPage(Page page) {
+
+       Set<Page> pages = getPages();
+       return pages.size() == 1 && pages.contains(page);
+   }
+
}
@@ -1,6 +1,5 @@
package com.iqser.red.service.redaction.v1.server.service.document;

-
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
@@ -64,7 +63,7 @@ public class DocumentGraphMapper {
        for (DocumentStructure.EntryData entryData : entries) {

            List<Page> pages = Arrays.stream(entryData.getPageNumbers())
-                   .map(pageNumber -> getPage(pageNumber, context))
+                   .map(context::getPage)
                    .toList();

            SemanticNode node = switch (entryData.getType()) {
@@ -83,6 +82,14 @@ public class DocumentGraphMapper {
            if (entryData.getAtomicBlockIds().length > 0) {
                TextBlock textBlock = toTextBlock(entryData.getAtomicBlockIds(), context, node);
                node.setLeafTextBlock(textBlock);
+
+               switch (entryData.getType()) {
+                   case HEADER -> pages.forEach(page -> page.setHeader((Header) node));
+                   case FOOTER -> pages.forEach(page -> page.setFooter((Footer) node));
+                   default -> textBlock.getAtomicTextBlocks()
+                           .forEach(atb -> atb.getPage().getTextBlocksOnPage().add(atb));
+               }
+
            }
            List<Integer> treeId = Arrays.stream(entryData.getTreeId()).boxed()
                    .toList();
@@ -94,13 +101,8 @@ public class DocumentGraphMapper {
            }
            node.setTreeId(treeId);

-           switch (entryData.getType()) {
-               case HEADER -> pages.forEach(page -> page.setHeader((Header) node));
-               case FOOTER -> pages.forEach(page -> page.setFooter((Footer) node));
-               default -> pages.forEach(page -> page.getMainBody().add(node));
-           }
-
            newEntries.add(DocumentTree.Entry.builder().treeId(treeId).children(buildEntries(entryData.getChildren(), context)).node(node).build());

        }
        return newEntries;
    }
@@ -115,7 +117,7 @@ public class DocumentGraphMapper {
    private Image buildImage(Context context, Map<String, String> properties, Long[] pageNumbers) {

        assert pageNumbers.length == 1;
-       Page page = getPage(pageNumbers[0], context);
+       Page page = context.getPage(pageNumbers[0]);
        var builder = Image.builder();
        PropertiesMapper.parseImageProperties(properties, builder);
        return builder.documentTree(context.documentTree).page(page).build();
@@ -161,6 +163,7 @@ public class DocumentGraphMapper {
        return SuperSection.builder().documentTree(context.documentTree).build();
    }

+
    private Paragraph buildParagraph(Context context, Map<String, String> properties) {

        if (PropertiesMapper.isDuplicateParagraph(properties)) {
@@ -189,21 +192,13 @@ public class DocumentGraphMapper {
        return AtomicTextBlock.fromAtomicTextBlockData(context.documentTextData.get(Math.toIntExact(atomicTextBlockId)),
                context.documentPositionData.get(Math.toIntExact(atomicTextBlockId)),
                parent,
-               getPage(context.documentTextData.get(Math.toIntExact(atomicTextBlockId)).getPage(), context));
+               context.getPage(context.documentTextData.get(Math.toIntExact(atomicTextBlockId)).getPage()));
    }


    private Page buildPage(DocumentPage p) {

-       return Page.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).mainBody(new LinkedList<>()).build();
-   }
-
-
-   private Page getPage(Long pageIndex, Context context) {
-
-       Page page = context.pageData.get(Math.toIntExact(pageIndex) - 1);
-       assert page.getNumber() == Math.toIntExact(pageIndex);
-       return page;
+       return Page.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).textBlocksOnPage(new LinkedList<>()).build();
    }
@@ -226,6 +221,14 @@ public class DocumentGraphMapper {
        }

+
+       private Page getPage(Long pageIndex) {
+
+           Page page = pageData.get(Math.toIntExact(pageIndex) - 1);
+           assert page.getNumber() == Math.toIntExact(pageIndex);
+           return page;
+       }
+
    }

}
@@ -11,7 +11,6 @@ import java.util.stream.Stream;
import org.springframework.stereotype.Service;

import com.iqser.red.service.persistence.service.v1.api.shared.model.AnalyzeRequest;
-import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.EntityLogEntry;
import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.Position;
import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.imported.ImportedRedaction;
import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.imported.ImportedRedactions;
@@ -27,7 +26,6 @@ import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryIncr
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryIncrementValue;
import com.iqser.red.service.redaction.v1.server.model.dictionary.SearchImplementation;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
-import com.iqser.red.service.redaction.v1.server.model.document.nodes.NodeType;

import io.micrometer.core.annotation.Timed;
import lombok.AccessLevel;
@@ -51,7 +49,6 @@ public class SectionFinderService {
        long start = System.currentTimeMillis();
        Set<Integer> sectionsToReanalyse = new HashSet<>();

-
        var dictionaryIncrementsSearch = new SearchImplementation(dictionaryIncrement.getValues()
                .stream()
                .map(DictionaryIncrementValue::getValue)
@@ -82,9 +79,7 @@ public class SectionFinderService {
        return document.getPages()
                .stream()
                .filter(page -> relevantPagesForReanalysis.contains(page.getNumber()))
-               .flatMap(page -> Stream.concat(page.getMainBody()
-                       .stream()
-                       .filter(node -> node.getType().equals(NodeType.SECTION)), Stream.of(page.getHeader(), page.getFooter())))
+               .flatMap(page -> Stream.concat(page.streamHighestSemanticNodesOnPage(), Stream.of(page.getHeader(), page.getFooter())))
                .map(node -> node.getTreeId()
                        .get(0))
                .toList();
@@ -9,6 +9,7 @@ import static org.mockito.Mockito.when;

import java.io.File;
import java.io.FileInputStream;
+import java.io.IOException;
import java.nio.file.FileVisitOption;
import java.nio.file.Files;
import java.nio.file.Path;
@@ -16,7 +17,6 @@ import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
-import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
@@ -124,7 +124,7 @@ import lombok.extern.slf4j.Slf4j;
    @SneakyThrows
    public void runAnalysisEnd2End() {

-       String folder = "/home/kschuettler/Downloads/New Folder (4)/17a25133-e098-4610-b553-d1bf11a56d96/560e6ab1ab4754b9a62fd2e6d4d71327"; // Should contain all files from minio directly, still zipped. Can contain multiple files.
+       String folder = "/home/kschuettler/Dokumente/Ticket Related/RED-9964/17a25133-e098-4610-b553-d1bf11a56d96/560e6ab1ab4754b9a62fd2e6d4d71327"; // Should contain all files from minio directly, still zipped. Can contain multiple files.

        Path absoluteFolderPath;
        if (folder.startsWith("files")) { // if it starts with "files" it is most likely in the resources folder, else it should be an absolute path
@@ -140,13 +140,7 @@ import lombok.extern.slf4j.Slf4j;
        for (int i = 0; i < analyzeRequests.size(); i++) {
            AnalyzeRequest analyzeRequest = analyzeRequests.get(i);
            log.info("{}/{}: Starting analysis for file {}", i + 1, analyzeRequests.size(), analyzeRequest.getFileId());
-           var times = new LinkedList<Long>();
-           for (int j = 1; j <= 10; j++) {
-               var start = System.currentTimeMillis();
-               analyzeService.analyze(analyzeRequest);
-               times.add(System.currentTimeMillis() - start);
-           }
-           System.out.println("times in ms for each analyze run: " + times);
+           analyzeService.analyze(analyzeRequest);
        }
    }
@@ -222,7 +216,7 @@ import lombok.extern.slf4j.Slf4j;

        Path manualRedactionFile = folder.resolve(fileId + ".MANUAL_REDACTIONS.json");
        if (Files.exists(manualRedactionFile)) {
-           request.setManualRedactions(mapper.readValue(manualRedactionFile.toFile(), ManualRedactions.class));
+           request.setManualRedactions(parseManualRedactions(manualRedactionFile));
        } else {
            request.setManualRedactions(new ManualRedactions());
        }
@@ -262,6 +256,17 @@ import lombok.extern.slf4j.Slf4j;
    }


+   private ManualRedactions parseManualRedactions(Path manualRedactionFile) {
+
+       try {
+           return mapper.readValue(manualRedactionFile.toFile(), ManualRedactions.class);
+       } catch (IOException e) {
+           log.error("Could not parse manual redactions");
+           return new ManualRedactions();
+       }
+   }
+
+
    private static Optional<FileType> parseFileTypeFromPath(Path path) {

        String fileType = path.getFileName().toString().split("\\.")[1];
@@ -280,7 +285,7 @@ import lombok.extern.slf4j.Slf4j;
        if (fileType.isEmpty()) {
            return Optional.empty();
        }
-       if (path.getFileName().endsWith(".gz")) {
+       if (path.getFileName().toString().endsWith(".gz")) {
            try (var fis = new FileInputStream(path.toFile()); var in = new GZIPInputStream(fis);) {
                storageService.storeObject(TENANT_ID, RedactionStorageService.StorageIdUtils.getStorageId(request.getDossierId(), request.getFileId(), fileType.get()), in);
            }