Merge branch 'RED-7074_2' into 'main'

RED-7074: Design Subsection section tree structure algorithm

See merge request fforesight/layout-parser!160
This commit is contained in:
Maverick Studer 2024-05-28 14:48:21 +02:00
commit b6742c1e89
7 changed files with 7 additions and 19 deletions

View File

@@ -43,9 +43,9 @@ public class Document extends AbstractSemanticNode {
  }
- public List<Section> getMainSections() {
+ public List<Section> getAllSections() {
- return streamChildrenOfType(NodeType.SECTION).map(node -> (Section) node)
+ return streamAllSubNodesOfType(NodeType.SECTION).map(node -> (Section) node)
  .collect(Collectors.toList());
  }

View File

@@ -82,9 +82,7 @@ public class TOCEnrichmentService {
  if(iterator.hasNext()) {
  currentTOCItem = iterator.next();
  }
- }
- if (!foundFirstHeadline) {
+ } else if (!foundFirstHeadline) {
  startBlocks.add(current);
  } else {
  currentSection.getSectionBlocks().add(current);

View File

@@ -68,7 +68,7 @@ public class RedTextPosition extends BoundingBox {
  // I guess if we start with the initial user space positions and transform them the same way we do the rulings it would work.
  pos.setBBox(new Rectangle2D.Float(textPosition.getX(), textPosition.getY(), textPosition.getWidthDirAdj(), textPosition.getHeight()));
- float textHeight = textPosition.getHeight() + HEIGHT_PADDING;
+ float textHeight = textPosition.getHeight() + 2 * HEIGHT_PADDING;
  Rectangle2D.Float dirAdjPosition = new Rectangle2D.Float(textPosition.getXDirAdj(),
  textPosition.getYDirAdj() - textHeight,
  textPosition.getWidthDirAdj(),

View File

@@ -9,7 +9,6 @@ import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedSectionText;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedText;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import lombok.extern.slf4j.Slf4j;
@@ -20,7 +19,7 @@ public class SimplifiedSectionTextService {
  public SimplifiedText toSimplifiedText(Document document) {
- List<SimplifiedSectionText> simplifiedMainSectionsList = document.getMainSections()
+ List<SimplifiedSectionText> simplifiedMainSectionsList = document.getAllSections()
  .stream()
  .map(this::toSimplifiedSectionText)
  .toList();

View File

@@ -69,11 +69,6 @@ public class SectionNodeFactory {
  addFirstHeadlineDirectlyToSection(layoutParsingType, pageBlocks, context, section, document);
  if (containsTablesAndTextBlocks(pageBlocks)) {
- if (pageBlocks.get(0).isHeadline()) {
- pageBlocks.remove(0);
- }
  splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(layoutParsingType,
  section,
  true,
@@ -82,10 +77,6 @@ public class SectionNodeFactory {
  context,
  document));
  } else if (!isLeaf) {
- if (pageBlocks.get(0).isHeadline()) {
- pageBlocks.remove(0);
- }
  addSection(layoutParsingType, section, true, pageBlocks, emptyList(), context, document);
  } else {
  addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, pageBlocks, context, section, document);

View File

@@ -31,7 +31,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
  @SneakyThrows
  public void testViewerDocument() {
- String fileName = "files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
+ String fileName = "files/syngenta/CustomerFiles/SinglePages/S4_S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
  String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";

View File

@@ -114,7 +114,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
  ClassificationDocument classificationDocument = buildClassificationDocument(pdfFileResource.getFile());
  assertThat(classificationDocument.getHeaders()
- .get(0).getTextBlocks().size()).isEqualTo(3);
+ .get(0).getTextBlocks().size()).isEqualTo(2);
  assertThat(classificationDocument.getHeaders()
  .get(0).getTextBlocks()
  .get(0).getSequences().size()).isEqualTo(8);