RED-7074: Design Subsection section tree structure algorithm

* fix all failing tests
This commit is contained in:
maverickstuder 2024-05-15 16:40:57 +02:00
parent 49f13d1f03
commit b50bfed69d
7 changed files with 116 additions and 75 deletions

View File

@ -254,7 +254,9 @@ public class LayoutParsingPipeline {
OutlineObject lastProcessedOutlineObject = null;
// parsing the structure elements could be useful as well
classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));
if(layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) {
classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));
}
long pageCount = originDocument.getNumberOfPages();
@ -330,16 +332,18 @@ public class LayoutParsingPipeline {
classificationPage.setPageWidth(cropbox.getWidth());
classificationPage.setPageHeight(cropbox.getHeight());
List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage().getOrDefault(pageNumber - 1, new ArrayList<>());
if(layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) {
List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage().getOrDefault(pageNumber - 1, new ArrayList<>());
OutlineObject notFoundOutlineObject = null;
if (lastProcessedOutlineObject != null && !lastProcessedOutlineObject.isFound()) {
lastProcessedOutlineObject.setPoint(new Point2D.Float(0, cropbox.getHeight()));
notFoundOutlineObject = lastProcessedOutlineObject;
}
if (!outlineObjects.isEmpty()) {
classificationPage.setOutlineObjects(outlineObjects);
lastProcessedOutlineObject = blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage, notFoundOutlineObject);
OutlineObject notFoundOutlineObject = null;
if (lastProcessedOutlineObject != null && !lastProcessedOutlineObject.isFound()) {
lastProcessedOutlineObject.setPoint(new Point2D.Float(0, cropbox.getHeight()));
notFoundOutlineObject = lastProcessedOutlineObject;
}
if (!outlineObjects.isEmpty()) {
classificationPage.setOutlineObjects(outlineObjects);
lastProcessedOutlineObject = blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage, notFoundOutlineObject);
}
}
classificationDocument.getVisualizations().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber);

View File

@ -12,6 +12,7 @@ import lombok.NoArgsConstructor;
@Data
@NoArgsConstructor
@Deprecated
public class ClassificationSection {
private List<AbstractPageBlock> pageBlocks = new ArrayList<>();

View File

@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.ToString;
import lombok.experimental.SuperBuilder;
@Data
@ -9,4 +10,10 @@ import lombok.experimental.SuperBuilder;
@EqualsAndHashCode(callSuper = true)
public class SuperSection extends Section {
/**
 * Returns the string representation produced by the superclass, unchanged.
 *
 * NOTE(review): presumably declared explicitly so that Lombok's {@code @Data} on this class
 * does not generate its own {@code toString()} — confirm against the Lombok configuration.
 *
 * @return the result of {@code super.toString()}
 */
@Override
public String toString() {
return super.toString();
}
}

View File

@ -59,11 +59,6 @@ public class DocumentGraphFactory {
document.getPages()
.forEach(context::buildAndAddPageWithCounter);
document.getSections()
.stream()
.flatMap(section -> section.getImages()
.stream())
.forEach(image -> context.getImages().add(image));
addSections(layoutParsingType, document, context, documentGraph);
addHeaderAndFooterToEachPage(document, context);

View File

@ -12,6 +12,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;

View File

@ -31,7 +31,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
@SneakyThrows
public void testViewerDocument() {
String fileName = "files/syngenta/CustomerFiles/90 Trinexapac-ethyl - Peer Review Report Syngenta - March 2018.pdf";
String fileName = "files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";

View File

@ -37,8 +37,6 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.ima
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
@ -62,6 +60,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
tableServiceResponse,
new VisualLayoutParsingResponse(),
Map.of("file", "document"));
}
@ -134,6 +133,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
@Test
@SneakyThrows
public void testTableAndCellRotations() {
String fileName = "files/Minimal Examples/simpleTablesRotated.pdf";
ClassPathResource pdfFileResource = new ClassPathResource(fileName);
@ -141,7 +141,6 @@ public class PdfSegmentationServiceTest extends AbstractTest {
}
@Disabled
@Test
public void testScanRotationBorderIsIgnored() throws IOException {
@ -151,15 +150,19 @@ public class PdfSegmentationServiceTest extends AbstractTest {
var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class);
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile(), tableServiceResponse);
assertThat(document.getSections()
assertThat(document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.collect(Collectors.toList())).isNotEmpty();
var tables = document.getSections()
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()).isNotEmpty();
var tables = document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList();
// Quality of the table parsing is not good, because the file is rotated at scanning.
@ -199,15 +202,19 @@ public class PdfSegmentationServiceTest extends AbstractTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/syngenta/CustomerFiles/SinglePages/Spanning Cells - Page131_S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
assertThat(document.getSections()
assertThat(document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.collect(Collectors.toList())).isNotEmpty();
TablePageBlock table = document.getSections()
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()).isNotEmpty();
TablePageBlock table = document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(0);
assertThat(table.getColCount()).isEqualTo(6);
@ -225,23 +232,29 @@ public class PdfSegmentationServiceTest extends AbstractTest {
"files/syngenta/CustomerFiles/SinglePages/Merge Table - Page5_26 A8637C - EU AIR3 - LCP Section 10 - Ecotoxicological studies on the plant protection product - Reference list.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
assertThat(document.getSections()
assertThat(document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.collect(Collectors.toList())).isNotEmpty();
TablePageBlock firstTable = document.getSections()
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()).isNotEmpty();
TablePageBlock firstTable = document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(0);
assertThat(firstTable.getColCount()).isEqualTo(8);
assertThat(firstTable.getRowCount()).isEqualTo(1);
TablePageBlock secondTable = document.getSections()
TablePageBlock secondTable = document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(1);
assertThat(secondTable.getColCount()).isEqualTo(8);
@ -266,23 +279,29 @@ public class PdfSegmentationServiceTest extends AbstractTest {
"files/syngenta/CustomerFiles/SinglePages/Merge Multi Page Table - Page4_Page5_51 Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
assertThat(document.getSections()
assertThat(document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.collect(Collectors.toList())).isNotEmpty();
TablePageBlock firstTable = document.getSections()
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()).isNotEmpty();
TablePageBlock firstTable = document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(0);
assertThat(firstTable.getColCount()).isEqualTo(9);
assertThat(firstTable.getRowCount()).isEqualTo(5);
TablePageBlock secondTable = document.getSections()
TablePageBlock secondTable = document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(1);
assertThat(secondTable.getColCount()).isEqualTo(9);
@ -307,23 +326,29 @@ public class PdfSegmentationServiceTest extends AbstractTest {
"files/syngenta/CustomerFiles/SinglePages/Rotated Table Headers - Page4_65 Mesotrione - EU AIR3 - LCA Section 1 Supplement Reference List.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
assertThat(document.getSections()
assertThat(document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.collect(Collectors.toList())).isNotEmpty();
TablePageBlock firstTable = document.getSections()
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()).isNotEmpty();
TablePageBlock firstTable = document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(0);
assertThat(firstTable.getColCount()).isEqualTo(8);
assertThat(firstTable.getRowCount()).isEqualTo(1);
TablePageBlock secondTable = document.getSections()
TablePageBlock secondTable = document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(1);
assertThat(secondTable.getColCount()).isEqualTo(8);
@ -818,10 +843,12 @@ public class PdfSegmentationServiceTest extends AbstractTest {
@SneakyThrows
private void toHtml(ClassificationDocument document, String filename) {
var tables = document.getSections()
var tables = document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList();
StringBuilder sb = new StringBuilder();
@ -843,12 +870,15 @@ public class PdfSegmentationServiceTest extends AbstractTest {
private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) {
TablePageBlock table = document.getSections()
TablePageBlock table = document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(tableIndex);
List<List<Cell>> rows = table.getRows();
int emptyCellsFoundFound = rows.stream()
.flatMap(List::stream)
@ -870,10 +900,12 @@ public class PdfSegmentationServiceTest extends AbstractTest {
private void validateTable(ClassificationDocument document, int tableIndex, List<List<String>> values) {
TablePageBlock table = document.getSections()
TablePageBlock table = document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(tableIndex);
List<List<Cell>> rows = table.getRows();
@ -896,10 +928,11 @@ public class PdfSegmentationServiceTest extends AbstractTest {
private void validateTableSize(ClassificationDocument document, int tableSize) {
assertThat(document.getSections()
assertThat(document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.toList().size()).isEqualTo(tableSize);
}