RED-7074: Design Subsection section tree structure algorithm
* fix all failing tests
This commit is contained in:
parent
49f13d1f03
commit
b50bfed69d
@ -254,7 +254,9 @@ public class LayoutParsingPipeline {
|
||||
OutlineObject lastProcessedOutlineObject = null;
|
||||
|
||||
// parsing the structure elements could be useful as well
|
||||
classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));
|
||||
if(layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) {
|
||||
classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));
|
||||
}
|
||||
|
||||
long pageCount = originDocument.getNumberOfPages();
|
||||
|
||||
@ -330,16 +332,18 @@ public class LayoutParsingPipeline {
|
||||
classificationPage.setPageWidth(cropbox.getWidth());
|
||||
classificationPage.setPageHeight(cropbox.getHeight());
|
||||
|
||||
List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage().getOrDefault(pageNumber - 1, new ArrayList<>());
|
||||
if(layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) {
|
||||
List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage().getOrDefault(pageNumber - 1, new ArrayList<>());
|
||||
|
||||
OutlineObject notFoundOutlineObject = null;
|
||||
if (lastProcessedOutlineObject != null && !lastProcessedOutlineObject.isFound()) {
|
||||
lastProcessedOutlineObject.setPoint(new Point2D.Float(0, cropbox.getHeight()));
|
||||
notFoundOutlineObject = lastProcessedOutlineObject;
|
||||
}
|
||||
if (!outlineObjects.isEmpty()) {
|
||||
classificationPage.setOutlineObjects(outlineObjects);
|
||||
lastProcessedOutlineObject = blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage, notFoundOutlineObject);
|
||||
OutlineObject notFoundOutlineObject = null;
|
||||
if (lastProcessedOutlineObject != null && !lastProcessedOutlineObject.isFound()) {
|
||||
lastProcessedOutlineObject.setPoint(new Point2D.Float(0, cropbox.getHeight()));
|
||||
notFoundOutlineObject = lastProcessedOutlineObject;
|
||||
}
|
||||
if (!outlineObjects.isEmpty()) {
|
||||
classificationPage.setOutlineObjects(outlineObjects);
|
||||
lastProcessedOutlineObject = blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage, notFoundOutlineObject);
|
||||
}
|
||||
}
|
||||
|
||||
classificationDocument.getVisualizations().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber);
|
||||
|
||||
@ -12,6 +12,7 @@ import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
@Deprecated
|
||||
public class ClassificationSection {
|
||||
|
||||
private List<AbstractPageBlock> pageBlocks = new ArrayList<>();
|
||||
|
||||
@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.ToString;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
@Data
|
||||
@ -9,4 +10,10 @@ import lombok.experimental.SuperBuilder;
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
public class SuperSection extends Section {
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return super.toString();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -59,11 +59,6 @@ public class DocumentGraphFactory {
|
||||
|
||||
document.getPages()
|
||||
.forEach(context::buildAndAddPageWithCounter);
|
||||
document.getSections()
|
||||
.stream()
|
||||
.flatMap(section -> section.getImages()
|
||||
.stream())
|
||||
.forEach(image -> context.getImages().add(image));
|
||||
addSections(layoutParsingType, document, context, documentGraph);
|
||||
addHeaderAndFooterToEachPage(document, context);
|
||||
|
||||
|
||||
@ -12,6 +12,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
|
||||
|
||||
@ -31,7 +31,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
@SneakyThrows
|
||||
public void testViewerDocument() {
|
||||
|
||||
String fileName = "files/syngenta/CustomerFiles/90 Trinexapac-ethyl - Peer Review Report Syngenta - March 2018.pdf";
|
||||
String fileName = "files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
||||
|
||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||
|
||||
|
||||
@ -37,8 +37,6 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.ima
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
|
||||
|
||||
@ -62,6 +60,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
tableServiceResponse,
|
||||
new VisualLayoutParsingResponse(),
|
||||
Map.of("file", "document"));
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -134,6 +133,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testTableAndCellRotations() {
|
||||
|
||||
String fileName = "files/Minimal Examples/simpleTablesRotated.pdf";
|
||||
ClassPathResource pdfFileResource = new ClassPathResource(fileName);
|
||||
|
||||
@ -141,7 +141,6 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Disabled
|
||||
@Test
|
||||
public void testScanRotationBorderIsIgnored() throws IOException {
|
||||
@ -151,15 +150,19 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class);
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile(), tableServiceResponse);
|
||||
assertThat(document.getSections()
|
||||
assertThat(document.getTableOfContents().getAllTableOfContentItems()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
var tables = document.getSections()
|
||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList()).isNotEmpty();
|
||||
var tables = document.getTableOfContents().getAllTableOfContentItems()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList();
|
||||
|
||||
// The table parsing quality is poor here because the file was rotated during scanning.
|
||||
@ -199,15 +202,19 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/syngenta/CustomerFiles/SinglePages/Spanning Cells - Page131_S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
assertThat(document.getSections()
|
||||
assertThat(document.getTableOfContents().getAllTableOfContentItems()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock table = document.getSections()
|
||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList()).isNotEmpty();
|
||||
TablePageBlock table = document.getTableOfContents().getAllTableOfContentItems()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList()
|
||||
.get(0);
|
||||
assertThat(table.getColCount()).isEqualTo(6);
|
||||
@ -225,23 +232,29 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
"files/syngenta/CustomerFiles/SinglePages/Merge Table - Page5_26 A8637C - EU AIR3 - LCP Section 10 - Ecotoxicological studies on the plant protection product - Reference list.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
assertThat(document.getSections()
|
||||
assertThat(document.getTableOfContents().getAllTableOfContentItems()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getSections()
|
||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList()).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getTableOfContents().getAllTableOfContentItems()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList()
|
||||
.get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(8);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(1);
|
||||
TablePageBlock secondTable = document.getSections()
|
||||
TablePageBlock secondTable = document.getTableOfContents().getAllTableOfContentItems()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList()
|
||||
.get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(8);
|
||||
@ -266,23 +279,29 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
"files/syngenta/CustomerFiles/SinglePages/Merge Multi Page Table - Page4_Page5_51 Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
assertThat(document.getSections()
|
||||
assertThat(document.getTableOfContents().getAllTableOfContentItems()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getSections()
|
||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList()).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getTableOfContents().getAllTableOfContentItems()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList()
|
||||
.get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(9);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(5);
|
||||
TablePageBlock secondTable = document.getSections()
|
||||
TablePageBlock secondTable = document.getTableOfContents().getAllTableOfContentItems()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList()
|
||||
.get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(9);
|
||||
@ -307,23 +326,29 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
"files/syngenta/CustomerFiles/SinglePages/Rotated Table Headers - Page4_65 Mesotrione - EU AIR3 - LCA Section 1 Supplement Reference List.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
assertThat(document.getSections()
|
||||
assertThat(document.getTableOfContents().getAllTableOfContentItems()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getSections()
|
||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList()).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getTableOfContents().getAllTableOfContentItems()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList()
|
||||
.get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(8);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(1);
|
||||
TablePageBlock secondTable = document.getSections()
|
||||
TablePageBlock secondTable = document.getTableOfContents().getAllTableOfContentItems()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList()
|
||||
.get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(8);
|
||||
@ -818,10 +843,12 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
@SneakyThrows
|
||||
private void toHtml(ClassificationDocument document, String filename) {
|
||||
|
||||
var tables = document.getSections()
|
||||
var tables = document.getTableOfContents().getAllTableOfContentItems()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList();
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
@ -843,12 +870,15 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) {
|
||||
|
||||
TablePageBlock table = document.getSections()
|
||||
TablePageBlock table = document.getTableOfContents().getAllTableOfContentItems()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList()
|
||||
.get(tableIndex);
|
||||
|
||||
List<List<Cell>> rows = table.getRows();
|
||||
int emptyCellsFoundFound = rows.stream()
|
||||
.flatMap(List::stream)
|
||||
@ -870,10 +900,12 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
private void validateTable(ClassificationDocument document, int tableIndex, List<List<String>> values) {
|
||||
|
||||
TablePageBlock table = document.getSections()
|
||||
TablePageBlock table = document.getTableOfContents().getAllTableOfContentItems()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList()
|
||||
.get(tableIndex);
|
||||
List<List<Cell>> rows = table.getRows();
|
||||
@ -896,10 +928,11 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
private void validateTableSize(ClassificationDocument document, int tableSize) {
|
||||
|
||||
assertThat(document.getSections()
|
||||
assertThat(document.getTableOfContents().getAllTableOfContentItems()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.toList().size()).isEqualTo(tableSize);
|
||||
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user