RED-7074: Design Subsection section tree structure algorithm

* temp
This commit is contained in:
maverickstuder 2024-05-07 14:25:54 +02:00
parent f7aeb9a406
commit d2dc369df3
5 changed files with 38 additions and 16 deletions

View File

@ -1,8 +1,10 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
@ -42,6 +44,10 @@ public class Section implements GenericSemanticNode {
@EqualsAndHashCode.Exclude
Map<Page, Rectangle2D> bBoxCache;
@EqualsAndHashCode.Exclude
boolean isMainSection;
@Override
public NodeType getType() {
@ -56,6 +62,11 @@ public class Section implements GenericSemanticNode {
.isPresent();
}
/**
 * Tells whether this section is a leaf, i.e. whether no sub node of type {@code NodeType.SECTION} is found below it.
 *
 * @return {@code true} if {@code streamAllSubNodesOfType(NodeType.SECTION)} yields no element, {@code false} otherwise
 */
public boolean isLeafSection() {
    // Look for at least one nested section; a leaf is a section that has none.
    boolean containsNestedSection = streamAllSubNodesOfType(NodeType.SECTION).findAny()
            .isPresent();
    return !containsNestedSection;
}
@Override
public TextBlock getTextBlock() {

View File

@ -50,14 +50,17 @@ public class ConcatenatedTextBlock implements TextBlock {
/**
 * Appends the atomic text blocks of {@code textBlock} to this block, moves the end of this block's boundary to the
 * end of {@code textBlock}'s boundary and resets {@code searchText} to {@code null}.
 *
 * NOTE(review): this span seems to show the removed and the added lines of a diff hunk side by side without +/- markers
 * (the boundary is set twice in the first branch, the same "not consecutive" condition is tested in two branches, and
 * the boundary end is set twice at the bottom) — confirm the effective statement order against the repository.
 *
 * @param textBlock the block whose atomic text blocks and boundary are taken over
 * @return this instance
 * @throws UnsupportedOperationException from the first "not consecutive" branch, if that throw is still part of the
 *                                       current code — TODO confirm, a commented-out copy of it appears further down
 */
public ConcatenatedTextBlock concat(TextBlock textBlock) {
// Read the incoming boundary once; the later statements using start/end rely on these locals.
int start = textBlock.getBoundary().start();
int end = textBlock.getBoundary().end();
// Nothing collected yet: this block takes over the boundary of the incoming block.
if (this.atomicTextBlocks.isEmpty()) {
boundary.setStart(textBlock.getBoundary().start());
boundary.setEnd(textBlock.getBoundary().end());
} else if (boundary.end() != textBlock.getBoundary().start()) {
throw new UnsupportedOperationException(format("Can only concat consecutive TextBlocks, trying to concat %s and %s", boundary, textBlock.getBoundary()));
boundary.setStart(start);
boundary.setEnd(end);
} else if (boundary.end() != start) {
// The incoming block does not start where this block ends: return unchanged, nothing is appended.
//throw new UnsupportedOperationException(format("Can only concat consecutive TextBlocks, trying to concat %s and %s", boundary, textBlock.getBoundary()));
return this;
}
this.atomicTextBlocks.addAll(textBlock.getAtomicTextBlocks());
boundary.setEnd(textBlock.getBoundary().end());
boundary.setEnd(end);
// presumably searchText is a cached value that has to be rebuilt after the content changed — verify
this.searchText = null;
return this;
}

View File

@ -51,7 +51,8 @@ public class OutlineExtractorService {
List<OutlineObjectTreeNode> rootNodes = new ArrayList<>();
if (documentOutline != null) {
for (PDOutlineItem child : documentOutline.children()) {
rootNodes.add(createOutlineObjectWithChildren(child, document, 1));
Optional<OutlineObjectTreeNode> outlineObjectWithChildren = createOutlineObjectWithChildren(child, document, 1);
outlineObjectWithChildren.ifPresent(rootNodes::add);
}
}
@ -60,12 +61,14 @@ public class OutlineExtractorService {
@SneakyThrows
private OutlineObjectTreeNode createOutlineObjectWithChildren(PDOutlineItem item, PDDocument document, int depth) {
private Optional<OutlineObjectTreeNode> createOutlineObjectWithChildren(PDOutlineItem item, PDDocument document, int depth) {
OutlineObjectTreeNode outlineObject = createOutlineObject(item, document, depth);
for (var child : item.children()) {
OutlineObjectTreeNode outlineObjectWithChildren = createOutlineObjectWithChildren(child, document, depth + 1);
outlineObject.addChild(outlineObjectWithChildren);
Optional<OutlineObjectTreeNode> outlineObject = createOutlineObject(item, document, depth);
if (outlineObject.isPresent()) {
for (var child : item.children()) {
Optional<OutlineObjectTreeNode> outlineObjectWithChildren = createOutlineObjectWithChildren(child, document, depth + 1);
outlineObjectWithChildren.ifPresent(outlineObjectTreeNode -> outlineObject.get().addChild(outlineObjectTreeNode));
}
}
return outlineObject;
@ -75,11 +78,14 @@ public class OutlineExtractorService {
// If the structure elements are processed beforehand, another case can be handled here as well:
// outline objects can reference structure elements (see the PDF documentation).
@SneakyThrows
private OutlineObjectTreeNode createOutlineObject(PDOutlineItem item, PDDocument document, int depth) {
private Optional<OutlineObjectTreeNode> createOutlineObject(PDOutlineItem item, PDDocument document, int depth) {
String title = item.getTitle();
PDPage page = item.findDestinationPage(document);
if (page == null) {
return Optional.empty();
}
int pageNumber = document.getPages().indexOf(page);
Optional<Point2D> outlinePosition = Optional.empty();
@ -109,7 +115,7 @@ public class OutlineExtractorService {
log.info(String.format("Error occurred during position resolution for outline item on page %s with title %s: " + e, pageNumber, title));
}
return new OutlineObjectTreeNode(new OutlineObject(title, pageNumber, outlinePosition.orElse(new Point2D.Float(0, 0)), depth));
return Optional.of(new OutlineObjectTreeNode(new OutlineObject(title, pageNumber, outlinePosition.orElse(new Point2D.Float(0, 0)), depth)));
}

View File

@ -50,8 +50,7 @@ public class SectionNodeFactory {
Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream()
.collect(groupingBy(AbstractPageBlock::getPage));
Section section = Section.builder().documentTree(context.getDocumentTree())
.build();
Section section = Section.builder().isMainSection(parentNode == null).documentTree(context.getDocumentTree()).build();
context.getSections().add(section);
blocksPerPage.keySet()

View File

@ -29,6 +29,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
@Test
@SneakyThrows
@Disabled
public void testViewerDocuments() {
String directory = "files/syngenta_190_deduplicated/";
@ -81,7 +82,9 @@ public class ViewerDocumentTest extends BuildDocumentTest {
@SneakyThrows
public void testViewerDocument() {
String fileName = "files/new/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
String fileName = "files/documine/20_TiltPlus_SensibilizacaoCutanea.pdf";
//String fileName = "files/new/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
//String fileName = "files/syngenta_190_deduplicated/1 Abamectin_prr.pdf";
//String fileName = "files/new/abschlussarbeiten-template-institut-fur-informatik-padagogische-hochschule-karlsruhe.pdf";
//String fileName = "files/new/kaust-official-thesis-template.pdf";