RED-7074: Design Subsection section tree structure algorithm

This commit is contained in:
Maverick Studer 2024-05-28 14:48:21 +02:00
parent 23985b14be
commit efb1a748af
7 changed files with 7 additions and 19 deletions

View File

@@ -43,9 +43,9 @@ public class Document extends AbstractSemanticNode {
} }
public List<Section> getMainSections() { public List<Section> getAllSections() {
return streamChildrenOfType(NodeType.SECTION).map(node -> (Section) node) return streamAllSubNodesOfType(NodeType.SECTION).map(node -> (Section) node)
.collect(Collectors.toList()); .collect(Collectors.toList());
} }

View File

@@ -82,9 +82,7 @@ public class TOCEnrichmentService {
if(iterator.hasNext()) { if(iterator.hasNext()) {
currentTOCItem = iterator.next(); currentTOCItem = iterator.next();
} }
} } else if (!foundFirstHeadline) {
if (!foundFirstHeadline) {
startBlocks.add(current); startBlocks.add(current);
} else { } else {
currentSection.getSectionBlocks().add(current); currentSection.getSectionBlocks().add(current);

View File

@@ -68,7 +68,7 @@ public class RedTextPosition extends BoundingBox {
// I guess if we start with the initial user space positions and transform them the same way we do the rulings it would work. // I guess if we start with the initial user space positions and transform them the same way we do the rulings it would work.
pos.setBBox(new Rectangle2D.Float(textPosition.getX(), textPosition.getY(), textPosition.getWidthDirAdj(), textPosition.getHeight())); pos.setBBox(new Rectangle2D.Float(textPosition.getX(), textPosition.getY(), textPosition.getWidthDirAdj(), textPosition.getHeight()));
float textHeight = textPosition.getHeight() + HEIGHT_PADDING; float textHeight = textPosition.getHeight() + 2 * HEIGHT_PADDING;
Rectangle2D.Float dirAdjPosition = new Rectangle2D.Float(textPosition.getXDirAdj(), Rectangle2D.Float dirAdjPosition = new Rectangle2D.Float(textPosition.getXDirAdj(),
textPosition.getYDirAdj() - textHeight, textPosition.getYDirAdj() - textHeight,
textPosition.getWidthDirAdj(), textPosition.getWidthDirAdj(),

View File

@@ -9,7 +9,6 @@ import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedSectionText; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedSectionText;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedText; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedText;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
@@ -20,7 +19,7 @@ public class SimplifiedSectionTextService {
public SimplifiedText toSimplifiedText(Document document) { public SimplifiedText toSimplifiedText(Document document) {
List<SimplifiedSectionText> simplifiedMainSectionsList = document.getMainSections() List<SimplifiedSectionText> simplifiedMainSectionsList = document.getAllSections()
.stream() .stream()
.map(this::toSimplifiedSectionText) .map(this::toSimplifiedSectionText)
.toList(); .toList();

View File

@@ -69,11 +69,6 @@ public class SectionNodeFactory {
addFirstHeadlineDirectlyToSection(layoutParsingType, pageBlocks, context, section, document); addFirstHeadlineDirectlyToSection(layoutParsingType, pageBlocks, context, section, document);
if (containsTablesAndTextBlocks(pageBlocks)) { if (containsTablesAndTextBlocks(pageBlocks)) {
if (pageBlocks.get(0).isHeadline()) {
pageBlocks.remove(0);
}
splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(layoutParsingType, splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(layoutParsingType,
section, section,
true, true,
@@ -82,10 +77,6 @@ public class SectionNodeFactory {
context, context,
document)); document));
} else if (!isLeaf) { } else if (!isLeaf) {
if (pageBlocks.get(0).isHeadline()) {
pageBlocks.remove(0);
}
addSection(layoutParsingType, section, true, pageBlocks, emptyList(), context, document); addSection(layoutParsingType, section, true, pageBlocks, emptyList(), context, document);
} else { } else {
addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, pageBlocks, context, section, document); addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, pageBlocks, context, section, document);

View File

@@ -31,7 +31,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
@SneakyThrows @SneakyThrows
public void testViewerDocument() { public void testViewerDocument() {
String fileName = "files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; String fileName = "files/syngenta/CustomerFiles/SinglePages/S4_S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";

View File

@@ -114,7 +114,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
ClassificationDocument classificationDocument = buildClassificationDocument(pdfFileResource.getFile()); ClassificationDocument classificationDocument = buildClassificationDocument(pdfFileResource.getFile());
assertThat(classificationDocument.getHeaders() assertThat(classificationDocument.getHeaders()
.get(0).getTextBlocks().size()).isEqualTo(3); .get(0).getTextBlocks().size()).isEqualTo(2);
assertThat(classificationDocument.getHeaders() assertThat(classificationDocument.getHeaders()
.get(0).getTextBlocks() .get(0).getTextBlocks()
.get(0).getSequences().size()).isEqualTo(8); .get(0).getSequences().size()).isEqualTo(8);