Merge branch 'RED-8481-hotfix' into 'main'

RED-8481: Use visual layout parsing to detect signatures

See merge request fforesight/layout-parser!106
This commit is contained in:
Yannik Hampe 2024-02-29 09:39:17 +01:00
commit f146beeb44
2 changed files with 40 additions and 13 deletions

View File

@ -42,7 +42,7 @@ public class VisualLayoutParsingAdapter {
if (visualLayoutParsingResponse.getData() != null) {
visualLayoutParsingResponse.getData()
.forEach(tableData -> signatures.computeIfAbsent(tableData.getPage_idx() + 1, tableCell -> new ArrayList<>())
.addAll(convertSignatures(tableData.getPage_idx(), tableData.getBoxes())));
.addAll(convertSignatures(tableData.getPage_idx()+1, tableData.getBoxes())));
}
return signatures;

View File

@ -2,16 +2,20 @@ package com.knecon.fforesight.service.layoutparser.processor.services.mapper;
import java.awt.geom.Rectangle2D;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import javax.xml.parsers.DocumentBuilder;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
@ -29,20 +33,27 @@ public class DocumentDataMapper {
public DocumentData toDocumentData(Document document) {
List<DocumentTextData> documentTextData = document.streamTerminalTextBlocksInOrder()
.flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream())
.flatMap(textBlock -> textBlock.getAtomicTextBlocks()
.stream())
.distinct()
.map(DocumentDataMapper::toAtomicTextBlockData)
.toList();
List<DocumentPositionData> atomicPositionBlockData = document.streamTerminalTextBlocksInOrder()
.flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream())
.flatMap(textBlock -> textBlock.getAtomicTextBlocks()
.stream())
.distinct()
.map(DocumentDataMapper::toAtomicPositionBlockData)
.toList();
Set<Long> nonEmptyTextBlocks = documentTextData.stream().mapToLong(DocumentTextData::getId).boxed().collect(Collectors.toSet());
Set<Long> nonEmptyTextBlocks = documentTextData.stream()
.mapToLong(DocumentTextData::getId).boxed()
.collect(Collectors.toSet());
List<DocumentPage> documentPageData = document.getPages().stream().map(DocumentDataMapper::toPageData).toList();
List<DocumentPage> documentPageData = document.getPages()
.stream()
.map(DocumentDataMapper::toPageData)
.toList();
DocumentStructure tableOfContentsData = toDocumentTreeData(document.getDocumentTree());
return DocumentData.builder()
.documentTextData(documentTextData.toArray(new DocumentTextData[0]))
@ -76,21 +87,35 @@ public class DocumentDataMapper {
default -> new HashMap<>();
};
return DocumentStructure.EntryData.builder()
DocumentStructure.EntryData.EntryDataBuilder documentBuilder = DocumentStructure.EntryData.builder()
.treeId(toPrimitiveIntArray(entry.getTreeId()))
.children(entry.getChildren().stream().map(DocumentDataMapper::toEntryData).toList())
.children(entry.getChildren()
.stream()
.map(DocumentDataMapper::toEntryData)
.toList())
.type(entry.getType())
.engines(entry.getNode().getEngines())
.atomicBlockIds(atomicTextBlocks)
.pageNumbers(entry.getNode().getPages().stream().map(Page::getNumber).map(Integer::longValue).toArray(Long[]::new))
.properties(properties)
.build();
.pageNumbers(entry.getNode().getPages()
.stream()
.map(Page::getNumber)
.map(Integer::longValue)
.toArray(Long[]::new))
.properties(properties);
if (entry.getNode() != null) {
documentBuilder.engines(entry.getNode().getEngines());
} else {
documentBuilder.engines(new HashSet<>(Set.of(LayoutEngine.ALGORITHM)));
}
return documentBuilder.build();
}
/**
 * Collects the ids of the atomic text blocks of a text block into a boxed array.
 *
 * @param textBlock text block whose atomic text blocks are read via {@code getAtomicTextBlocks()}
 * @return a {@code Long[]} holding the value of {@code AtomicTextBlock::getId} for each atomic text block
 */
private Long[] toAtomicTextBlockIds(TextBlock textBlock) {
// NOTE(review): this listing is a diff rendering without +/- markers. The one-line return directly below
// looks like the pre-change line and the multi-line chain after it like its reformatted replacement;
// both express the same pipeline. Confirm against the repository before relying on either form.
return textBlock.getAtomicTextBlocks().stream().map(AtomicTextBlock::getId).toArray(Long[]::new);
return textBlock.getAtomicTextBlocks()
.stream()
.map(AtomicTextBlock::getId)
.toArray(Long[]::new);
}
@ -142,7 +167,9 @@ public class DocumentDataMapper {
/**
 * Converts a list of boxed integers into a primitive {@code int} array.
 *
 * @param list the boxed values to convert; each element is unboxed with {@code Integer::intValue}
 * @return an {@code int[]} produced by streaming {@code list} through {@code mapToInt(...).toArray()}
 */
private int[] toPrimitiveIntArray(List<Integer> list) {
// NOTE(review): this listing is a diff rendering without +/- markers. The one-line return directly below
// looks like the pre-change line and the multi-line chain after it like its reformatted replacement;
// both express the same pipeline. Confirm against the repository before relying on either form.
return list.stream().mapToInt(Integer::intValue).toArray();
return list.stream()
.mapToInt(Integer::intValue)
.toArray();
}
}