Merge branch 'RED-9964-fp' into 'master'
RED-9964: refactor getMainBody() and getMainBodyTextBlock() in Page Closes RED-9964 See merge request redactmanager/redaction-service!501
This commit is contained in:
commit
895bc56590
@ -3,8 +3,10 @@ package com.iqser.red.service.redaction.v1.server.model.document.nodes;
|
|||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity;
|
import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.model.document.textblock.AtomicTextBlock;
|
||||||
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
|
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
|
||||||
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlockCollector;
|
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlockCollector;
|
||||||
|
|
||||||
@ -35,7 +37,7 @@ public class Page {
|
|||||||
Integer width;
|
Integer width;
|
||||||
Integer rotation;
|
Integer rotation;
|
||||||
|
|
||||||
List<SemanticNode> mainBody;
|
List<AtomicTextBlock> textBlocksOnPage;
|
||||||
Header header;
|
Header header;
|
||||||
Footer footer;
|
Footer footer;
|
||||||
|
|
||||||
@ -53,13 +55,62 @@ public class Page {
|
|||||||
*/
|
*/
|
||||||
public TextBlock getMainBodyTextBlock() {
|
public TextBlock getMainBodyTextBlock() {
|
||||||
|
|
||||||
return mainBody.stream()
|
return textBlocksOnPage.stream()
|
||||||
.filter(SemanticNode::isLeaf)
|
|
||||||
.map(SemanticNode::getTextBlock)
|
|
||||||
.collect(new TextBlockCollector());
|
.collect(new TextBlockCollector());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Retrieves the highest SemanticNodes, which appear only on this page. It is achieved by traversing the DocumentTree up, until a SemanticNode's direct parent is no longer exclusively on this page.
|
||||||
|
*
|
||||||
|
* @return A list which contains the highest SemanticNodes, which appear only on this page.
|
||||||
|
*/
|
||||||
|
public List<SemanticNode> getMainBody() {
|
||||||
|
|
||||||
|
return textBlocksOnPage.stream()
|
||||||
|
.map(AtomicTextBlock::getParent)
|
||||||
|
.map(this::getHighestParentOnlyOnPage)
|
||||||
|
.distinct()
|
||||||
|
.toList();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Retrieves the highest SemanticNodes which are present on the page. There might be more than one, as two or more main sections may start on the same page.
|
||||||
|
* This is achieved by traversing up the document tree and returning all SemanticNodes whose direct parent is the Document
|
||||||
|
*
|
||||||
|
* @return A stream of the highest SemanticNodes present on this page
|
||||||
|
*/
|
||||||
|
public Stream<SemanticNode> streamHighestSemanticNodesOnPage() {
|
||||||
|
|
||||||
|
return textBlocksOnPage.stream()
|
||||||
|
.map(AtomicTextBlock::getParent)
|
||||||
|
.map(this::getHighestSemanticNodeOnPage)
|
||||||
|
.distinct();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private SemanticNode getHighestParentOnlyOnPage(SemanticNode node) {
|
||||||
|
|
||||||
|
SemanticNode currentNode = node;
|
||||||
|
while (currentNode.hasParent() && currentNode.getParent().onlyOnPage(this)) {
|
||||||
|
currentNode = currentNode.getParent();
|
||||||
|
}
|
||||||
|
return currentNode;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private SemanticNode getHighestSemanticNodeOnPage(SemanticNode node) {
|
||||||
|
|
||||||
|
SemanticNode currentNode = node;
|
||||||
|
while (currentNode.hasParent() //
|
||||||
|
&& !currentNode.getParent().getType().equals(NodeType.DOCUMENT)) {
|
||||||
|
currentNode = currentNode.getParent();
|
||||||
|
}
|
||||||
|
return currentNode;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
|
|
||||||
|
|||||||
@ -808,4 +808,17 @@ public interface SemanticNode {
|
|||||||
streamChildren().forEach(childNode -> childNode.accept(visitor));
|
streamChildren().forEach(childNode -> childNode.accept(visitor));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Checks whether this SemanticNode appears on a single page only, and if that page is the provided one.
|
||||||
|
*
|
||||||
|
* @param page the page to check
|
||||||
|
* @return true, when SemanticNode is on a single page only and the page is the provided page. Otherwise, false.
|
||||||
|
*/
|
||||||
|
default boolean onlyOnPage(Page page) {
|
||||||
|
|
||||||
|
Set<Page> pages = getPages();
|
||||||
|
return pages.size() == 1 && pages.contains(page);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,6 +1,5 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.service.document;
|
package com.iqser.red.service.redaction.v1.server.service.document;
|
||||||
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
@ -64,7 +63,7 @@ public class DocumentGraphMapper {
|
|||||||
for (DocumentStructure.EntryData entryData : entries) {
|
for (DocumentStructure.EntryData entryData : entries) {
|
||||||
|
|
||||||
List<Page> pages = Arrays.stream(entryData.getPageNumbers())
|
List<Page> pages = Arrays.stream(entryData.getPageNumbers())
|
||||||
.map(pageNumber -> getPage(pageNumber, context))
|
.map(context::getPage)
|
||||||
.toList();
|
.toList();
|
||||||
|
|
||||||
SemanticNode node = switch (entryData.getType()) {
|
SemanticNode node = switch (entryData.getType()) {
|
||||||
@ -83,6 +82,14 @@ public class DocumentGraphMapper {
|
|||||||
if (entryData.getAtomicBlockIds().length > 0) {
|
if (entryData.getAtomicBlockIds().length > 0) {
|
||||||
TextBlock textBlock = toTextBlock(entryData.getAtomicBlockIds(), context, node);
|
TextBlock textBlock = toTextBlock(entryData.getAtomicBlockIds(), context, node);
|
||||||
node.setLeafTextBlock(textBlock);
|
node.setLeafTextBlock(textBlock);
|
||||||
|
|
||||||
|
switch (entryData.getType()) {
|
||||||
|
case HEADER -> pages.forEach(page -> page.setHeader((Header) node));
|
||||||
|
case FOOTER -> pages.forEach(page -> page.setFooter((Footer) node));
|
||||||
|
default -> textBlock.getAtomicTextBlocks()
|
||||||
|
.forEach(atb -> atb.getPage().getTextBlocksOnPage().add(atb));
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
List<Integer> treeId = Arrays.stream(entryData.getTreeId()).boxed()
|
List<Integer> treeId = Arrays.stream(entryData.getTreeId()).boxed()
|
||||||
.toList();
|
.toList();
|
||||||
@ -94,13 +101,8 @@ public class DocumentGraphMapper {
|
|||||||
}
|
}
|
||||||
node.setTreeId(treeId);
|
node.setTreeId(treeId);
|
||||||
|
|
||||||
switch (entryData.getType()) {
|
|
||||||
case HEADER -> pages.forEach(page -> page.setHeader((Header) node));
|
|
||||||
case FOOTER -> pages.forEach(page -> page.setFooter((Footer) node));
|
|
||||||
default -> pages.forEach(page -> page.getMainBody().add(node));
|
|
||||||
}
|
|
||||||
|
|
||||||
newEntries.add(DocumentTree.Entry.builder().treeId(treeId).children(buildEntries(entryData.getChildren(), context)).node(node).build());
|
newEntries.add(DocumentTree.Entry.builder().treeId(treeId).children(buildEntries(entryData.getChildren(), context)).node(node).build());
|
||||||
|
|
||||||
}
|
}
|
||||||
return newEntries;
|
return newEntries;
|
||||||
}
|
}
|
||||||
@ -115,7 +117,7 @@ public class DocumentGraphMapper {
|
|||||||
private Image buildImage(Context context, Map<String, String> properties, Long[] pageNumbers) {
|
private Image buildImage(Context context, Map<String, String> properties, Long[] pageNumbers) {
|
||||||
|
|
||||||
assert pageNumbers.length == 1;
|
assert pageNumbers.length == 1;
|
||||||
Page page = getPage(pageNumbers[0], context);
|
Page page = context.getPage(pageNumbers[0]);
|
||||||
var builder = Image.builder();
|
var builder = Image.builder();
|
||||||
PropertiesMapper.parseImageProperties(properties, builder);
|
PropertiesMapper.parseImageProperties(properties, builder);
|
||||||
return builder.documentTree(context.documentTree).page(page).build();
|
return builder.documentTree(context.documentTree).page(page).build();
|
||||||
@ -161,6 +163,7 @@ public class DocumentGraphMapper {
|
|||||||
return SuperSection.builder().documentTree(context.documentTree).build();
|
return SuperSection.builder().documentTree(context.documentTree).build();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private Paragraph buildParagraph(Context context, Map<String, String> properties) {
|
private Paragraph buildParagraph(Context context, Map<String, String> properties) {
|
||||||
|
|
||||||
if (PropertiesMapper.isDuplicateParagraph(properties)) {
|
if (PropertiesMapper.isDuplicateParagraph(properties)) {
|
||||||
@ -189,21 +192,13 @@ public class DocumentGraphMapper {
|
|||||||
return AtomicTextBlock.fromAtomicTextBlockData(context.documentTextData.get(Math.toIntExact(atomicTextBlockId)),
|
return AtomicTextBlock.fromAtomicTextBlockData(context.documentTextData.get(Math.toIntExact(atomicTextBlockId)),
|
||||||
context.documentPositionData.get(Math.toIntExact(atomicTextBlockId)),
|
context.documentPositionData.get(Math.toIntExact(atomicTextBlockId)),
|
||||||
parent,
|
parent,
|
||||||
getPage(context.documentTextData.get(Math.toIntExact(atomicTextBlockId)).getPage(), context));
|
context.getPage(context.documentTextData.get(Math.toIntExact(atomicTextBlockId)).getPage()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private Page buildPage(DocumentPage p) {
|
private Page buildPage(DocumentPage p) {
|
||||||
|
|
||||||
return Page.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).mainBody(new LinkedList<>()).build();
|
return Page.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).textBlocksOnPage(new LinkedList<>()).build();
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private Page getPage(Long pageIndex, Context context) {
|
|
||||||
|
|
||||||
Page page = context.pageData.get(Math.toIntExact(pageIndex) - 1);
|
|
||||||
assert page.getNumber() == Math.toIntExact(pageIndex);
|
|
||||||
return page;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -226,6 +221,14 @@ public class DocumentGraphMapper {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Page getPage(Long pageIndex) {
|
||||||
|
|
||||||
|
Page page = pageData.get(Math.toIntExact(pageIndex) - 1);
|
||||||
|
assert page.getNumber() == Math.toIntExact(pageIndex);
|
||||||
|
return page;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -11,7 +11,6 @@ import java.util.stream.Stream;
|
|||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.AnalyzeRequest;
|
import com.iqser.red.service.persistence.service.v1.api.shared.model.AnalyzeRequest;
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.EntityLogEntry;
|
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.Position;
|
import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.Position;
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.imported.ImportedRedaction;
|
import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.imported.ImportedRedaction;
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.imported.ImportedRedactions;
|
import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.imported.ImportedRedactions;
|
||||||
@ -27,7 +26,6 @@ import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryIncr
|
|||||||
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryIncrementValue;
|
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryIncrementValue;
|
||||||
import com.iqser.red.service.redaction.v1.server.model.dictionary.SearchImplementation;
|
import com.iqser.red.service.redaction.v1.server.model.dictionary.SearchImplementation;
|
||||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
|
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
|
||||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.NodeType;
|
|
||||||
|
|
||||||
import io.micrometer.core.annotation.Timed;
|
import io.micrometer.core.annotation.Timed;
|
||||||
import lombok.AccessLevel;
|
import lombok.AccessLevel;
|
||||||
@ -51,7 +49,6 @@ public class SectionFinderService {
|
|||||||
long start = System.currentTimeMillis();
|
long start = System.currentTimeMillis();
|
||||||
Set<Integer> sectionsToReanalyse = new HashSet<>();
|
Set<Integer> sectionsToReanalyse = new HashSet<>();
|
||||||
|
|
||||||
|
|
||||||
var dictionaryIncrementsSearch = new SearchImplementation(dictionaryIncrement.getValues()
|
var dictionaryIncrementsSearch = new SearchImplementation(dictionaryIncrement.getValues()
|
||||||
.stream()
|
.stream()
|
||||||
.map(DictionaryIncrementValue::getValue)
|
.map(DictionaryIncrementValue::getValue)
|
||||||
@ -82,9 +79,7 @@ public class SectionFinderService {
|
|||||||
return document.getPages()
|
return document.getPages()
|
||||||
.stream()
|
.stream()
|
||||||
.filter(page -> relevantPagesForReanalysis.contains(page.getNumber()))
|
.filter(page -> relevantPagesForReanalysis.contains(page.getNumber()))
|
||||||
.flatMap(page -> Stream.concat(page.getMainBody()
|
.flatMap(page -> Stream.concat(page.streamHighestSemanticNodesOnPage(), Stream.of(page.getHeader(), page.getFooter())))
|
||||||
.stream()
|
|
||||||
.filter(node -> node.getType().equals(NodeType.SECTION)), Stream.of(page.getHeader(), page.getFooter())))
|
|
||||||
.map(node -> node.getTreeId()
|
.map(node -> node.getTreeId()
|
||||||
.get(0))
|
.get(0))
|
||||||
.toList();
|
.toList();
|
||||||
|
|||||||
@ -9,6 +9,7 @@ import static org.mockito.Mockito.when;
|
|||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileInputStream;
|
import java.io.FileInputStream;
|
||||||
|
import java.io.IOException;
|
||||||
import java.nio.file.FileVisitOption;
|
import java.nio.file.FileVisitOption;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
@ -16,7 +17,6 @@ import java.util.Arrays;
|
|||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.LinkedList;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
@ -124,7 +124,7 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void runAnalysisEnd2End() {
|
public void runAnalysisEnd2End() {
|
||||||
|
|
||||||
String folder = "/home/kschuettler/Downloads/New Folder (4)/17a25133-e098-4610-b553-d1bf11a56d96/560e6ab1ab4754b9a62fd2e6d4d71327"; // Should contain all files from minio directly, still zipped. Can contain multiple files.
|
String folder = "/home/kschuettler/Dokumente/Ticket Related/RED-9964/17a25133-e098-4610-b553-d1bf11a56d96/560e6ab1ab4754b9a62fd2e6d4d71327"; // Should contain all files from minio directly, still zipped. Can contain multiple files.
|
||||||
|
|
||||||
Path absoluteFolderPath;
|
Path absoluteFolderPath;
|
||||||
if (folder.startsWith("files")) { // if it starts with "files" it is most likely in the resources folder, else it should be an absolute path
|
if (folder.startsWith("files")) { // if it starts with "files" it is most likely in the resources folder, else it should be an absolute path
|
||||||
@ -140,13 +140,7 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
for (int i = 0; i < analyzeRequests.size(); i++) {
|
for (int i = 0; i < analyzeRequests.size(); i++) {
|
||||||
AnalyzeRequest analyzeRequest = analyzeRequests.get(i);
|
AnalyzeRequest analyzeRequest = analyzeRequests.get(i);
|
||||||
log.info("{}/{}: Starting analysis for file {}", i + 1, analyzeRequests.size(), analyzeRequest.getFileId());
|
log.info("{}/{}: Starting analysis for file {}", i + 1, analyzeRequests.size(), analyzeRequest.getFileId());
|
||||||
var times = new LinkedList<Long>();
|
analyzeService.analyze(analyzeRequest);
|
||||||
for (int j = 1; j <= 10; j++) {
|
|
||||||
var start = System.currentTimeMillis();
|
|
||||||
analyzeService.analyze(analyzeRequest);
|
|
||||||
times.add(System.currentTimeMillis() - start);
|
|
||||||
}
|
|
||||||
System.out.println("times in ms for each analyze run: " + times);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -222,7 +216,7 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
|
|
||||||
Path manualRedactionFile = folder.resolve(fileId + ".MANUAL_REDACTIONS.json");
|
Path manualRedactionFile = folder.resolve(fileId + ".MANUAL_REDACTIONS.json");
|
||||||
if (Files.exists(manualRedactionFile)) {
|
if (Files.exists(manualRedactionFile)) {
|
||||||
request.setManualRedactions(mapper.readValue(manualRedactionFile.toFile(), ManualRedactions.class));
|
request.setManualRedactions(parseManualRedactions(manualRedactionFile));
|
||||||
} else {
|
} else {
|
||||||
request.setManualRedactions(new ManualRedactions());
|
request.setManualRedactions(new ManualRedactions());
|
||||||
}
|
}
|
||||||
@ -262,6 +256,17 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private ManualRedactions parseManualRedactions(Path manualRedactionFile) {
|
||||||
|
|
||||||
|
try {
|
||||||
|
return mapper.readValue(manualRedactionFile.toFile(), ManualRedactions.class);
|
||||||
|
} catch (IOException e) {
|
||||||
|
log.error("Could not parse manual redactions");
|
||||||
|
return new ManualRedactions();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private static Optional<FileType> parseFileTypeFromPath(Path path) {
|
private static Optional<FileType> parseFileTypeFromPath(Path path) {
|
||||||
|
|
||||||
String fileType = path.getFileName().toString().split("\\.")[1];
|
String fileType = path.getFileName().toString().split("\\.")[1];
|
||||||
@ -280,7 +285,7 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
if (fileType.isEmpty()) {
|
if (fileType.isEmpty()) {
|
||||||
return Optional.empty();
|
return Optional.empty();
|
||||||
}
|
}
|
||||||
if (path.getFileName().endsWith(".gz")) {
|
if (path.getFileName().toString().endsWith(".gz")) {
|
||||||
try (var fis = new FileInputStream(path.toFile()); var in = new GZIPInputStream(fis);) {
|
try (var fis = new FileInputStream(path.toFile()); var in = new GZIPInputStream(fis);) {
|
||||||
storageService.storeObject(TENANT_ID, RedactionStorageService.StorageIdUtils.getStorageId(request.getDossierId(), request.getFileId(), fileType.get()), in);
|
storageService.storeObject(TENANT_ID, RedactionStorageService.StorageIdUtils.getStorageId(request.getDossierId(), request.getFileId(), fileType.get()), in);
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user