diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index 7921d37..de14ff4 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -53,6 +53,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.classificat import com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService; import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; +import com.knecon.fforesight.service.layoutparser.processor.services.graphics.Box; import com.knecon.fforesight.service.layoutparser.processor.services.graphics.GraphicExtractorService; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper; @@ -268,7 +269,7 @@ public class LayoutParsingPipeline { TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings); - var graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getTextPositionSequences(), false); + List graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getTextPositionSequences(), false); pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>()) .addAll(graphics.stream() diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java index 87db5dc..0fd096b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java @@ -164,10 +164,11 @@ public class RedactManagerBlockificationService { previous = block; } - - visualizations.addTextBlockVisualizations(chunkBlockList.stream() - .map(tb -> (TextPageBlock) tb) - .toList(), textPositions.get(0).getPage()); + if (!textPositions.isEmpty()) { + visualizations.addTextBlockVisualizations(chunkBlockList.stream() + .map(tb -> (TextPageBlock) tb) + .toList(), textPositions.get(0).getPage()); + } return new ClassificationPage(chunkBlockList); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java index d04af07..13db7b4 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java @@ -34,7 +34,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest { @Test public void testLayoutParserEndToEnd() { - String filePath = "/home/kschuettler/Dokumente/TestFiles/RotateTextWithRulingsTestFile.pdf"; + String filePath = "files/syngenta/CustomerFiles/54 Fludioxonil - EU AIR3 - Document E1 - Listing of Community and Member States MRLs.pdf"; runForFile(filePath); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java index b996124..7ea3b46 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java @@ -52,28 +52,16 @@ public class PdfSegmentationServiceTest extends AbstractTest { @Autowired private ObjectMapper objectMapper; - @Autowired - private RedactManagerClassificationService redactManagerClassificationService; - - @Autowired - private SectionsBuilderService sectionsBuilderService; - @SneakyThrows public ClassificationDocument buildClassificationDocument(File originDocument, TableServiceResponse tableServiceResponse) { - ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, - originDocument, - new ImageServiceResponse(), - tableServiceResponse, - new VisualLayoutParsingResponse(), - Map.of("file","document")); - - redactManagerClassificationService.classifyDocument(classificationDocument); - - sectionsBuilderService.buildSections(classificationDocument); - - return classificationDocument; + return layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, + originDocument, + new ImageServiceResponse(), + tableServiceResponse, + new VisualLayoutParsingResponse(), + Map.of("file", "document")); } @@ -127,13 +115,13 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassificationDocument classificationDocument = buildClassificationDocument(pdfFileResource.getFile()); assertThat(classificationDocument.getHeaders() - .get(0).getTextBlocks().size()).isEqualTo(3); + .get(0).getTextBlocks().size()).isEqualTo(1); assertThat(classificationDocument.getHeaders() .get(0).getTextBlocks() - .get(0).getSequences().size()).isEqualTo(8); + .get(0).getSequences().size()).isEqualTo(12); assertThat(classificationDocument.getHeaders() .get(0).getTextBlocks() - .get(0).toString()).isEqualTo(textToSearch); + .get(0).toString()).contains(textToSearch); Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, classificationDocument); @@ -157,7 +145,11 @@ public class PdfSegmentationServiceTest extends AbstractTest { .flatMap(paragraph -> paragraph.getTables() .stream()) .collect(Collectors.toList())).isNotEmpty(); - var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList(); + var tables = document.getSections() + .stream() + .flatMap(paragraph -> paragraph.getTables() + .stream()) + .toList(); // Quality of the table parsing is not good, because the file is rotated at scanning. // We only asset that the table border is not the page border. @@ -179,12 +171,12 @@ public class PdfSegmentationServiceTest extends AbstractTest { imageServiceResponse.getData() .forEach(imageMetadata -> images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>()) .add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(), - imageMetadata.getPosition().getY1(), - imageMetadata.getGeometry().getWidth(), - imageMetadata.getGeometry().getHeight()), - ImageType.valueOf(imageMetadata.getClassification().getLabel().toUpperCase(Locale.ROOT)), - imageMetadata.isAlpha(), - imageMetadata.getPosition().getPageNumber()))); + imageMetadata.getPosition().getY1(), + imageMetadata.getGeometry().getWidth(), + imageMetadata.getGeometry().getHeight()), + ImageType.valueOf(imageMetadata.getClassification().getLabel().toUpperCase(Locale.ROOT)), + imageMetadata.isAlpha(), + imageMetadata.getPosition().getPageNumber()))); System.out.println("object"); } @@ -196,11 +188,22 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/syngenta/CustomerFiles/SinglePages/Spanning Cells - Page131_S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"); ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); - assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty(); - TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0); + assertThat(document.getSections() + .stream() + .flatMap(paragraph -> paragraph.getTables() + .stream()) + .collect(Collectors.toList())).isNotEmpty(); + TablePageBlock table = document.getSections() + .stream() + .flatMap(paragraph -> paragraph.getTables() + .stream()) + .toList() + .get(0); assertThat(table.getColCount()).isEqualTo(6); assertThat(table.getRowCount()).isEqualTo(13); - assertThat(table.getRows().stream().mapToInt(List::size).sum()).isEqualTo(6 * 13); + assertThat(table.getRows() + .stream() + .mapToInt(List::size).sum()).isEqualTo(6 * 13); } @@ -373,29 +376,30 @@ public class PdfSegmentationServiceTest extends AbstractTest { validateTable(document, 0, 8, 8, 0, 0); List> values = Arrays.asList(Arrays.asList("Annex point Reference within DAR/RAR", - "Author, date", - "Study title", - "Analytical method Author, date, No.", - "Technique, LOQ of the method, validated working range", - "Method meets analytical validation criteria", - "Remarks (in case validation criteria are not met)", - "Acceptability of the method"), - Arrays.asList("Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", - "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", - "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", - "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", - "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", - "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", - "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", - "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies"), - Arrays.asList("CA 7.1.2.1.1 DAR (2009)", - "Evans P.G. 2001 TMJ4569B, VV-323245", - "Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom", - "Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845", - "LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02 mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD", - "Y", - "N/A", - "Y")); + "Author, date", + "Study title", + "Analytical method Author, date, No.", + "Technique, LOQ of the method, validated working range", + "Method meets analytical validation criteria", + "Remarks (in case validation criteria are not met)", + "Acceptability of the method"), + Arrays.asList( + "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", + "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", + "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", + "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", + "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", + "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", + "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", + "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies"), + Arrays.asList("CA 7.1.2.1.1 DAR (2009)", + "Evans P.G. 2001 TMJ4569B, VV-323245", + "Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom", + "Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845", + "LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02 mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD", + "Y", + "N/A", + "Y")); validateTable(document, 0, values); @@ -785,6 +789,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { } + @Test public void testMergedEntities_Page26() throws IOException { @@ -802,7 +807,11 @@ public class PdfSegmentationServiceTest extends AbstractTest { @SneakyThrows private void toHtml(ClassificationDocument document, String filename) { - var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList(); + var tables = document.getSections() + .stream() + .flatMap(paragraph -> paragraph.getTables() + .stream()) + .toList(); StringBuilder sb = new StringBuilder(); int currentPage = 1; @@ -823,9 +832,19 @@ public class PdfSegmentationServiceTest extends AbstractTest { private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) { - TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex); + TablePageBlock table = document.getSections() + .stream() + .flatMap(paragraph -> paragraph.getTables() + .stream()) + .toList() + .get(tableIndex); List> rows = table.getRows(); - int emptyCellsFoundFound = rows.stream().flatMap(List::stream).toList().stream().filter(f -> f.toString().isEmpty()).toList().size(); + int emptyCellsFoundFound = rows.stream() + .flatMap(List::stream) + .toList() + .stream() + .filter(f -> f.toString().isEmpty()) + .toList().size(); for (List row : table.getRows()) { row.forEach(r -> System.out.println(r.toString())); @@ -840,11 +859,20 @@ public class PdfSegmentationServiceTest extends AbstractTest { private void validateTable(ClassificationDocument document, int tableIndex, List> values) { - TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex); + TablePageBlock table = document.getSections() + .stream() + .flatMap(paragraph -> paragraph.getTables() + .stream()) + .toList() + .get(tableIndex); List> rows = table.getRows(); - List rowsFlattened = rows.stream().flatMap(List::stream).toList(); - List valuesFlattened = values.stream().flatMap(List::stream).toList(); + List rowsFlattened = rows.stream() + .flatMap(List::stream) + .toList(); + List valuesFlattened = values.stream() + .flatMap(List::stream) + .toList(); for (int i = 0; i < valuesFlattened.size(); i++) { Cell cell = rowsFlattened.get(i); @@ -857,7 +885,11 @@ public class PdfSegmentationServiceTest extends AbstractTest { private void validateTableSize(ClassificationDocument document, int tableSize) { - assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().size()).isEqualTo(tableSize); + assertThat(document.getSections() + .stream() + .flatMap(paragraph -> paragraph.getTables() + .stream()) + .toList().size()).isEqualTo(tableSize); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java index 467b6c0..631f643 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java @@ -81,6 +81,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest { @Test + @Disabled @SneakyThrows public void testTableExtraction() { @@ -97,6 +98,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest { } + @SneakyThrows private void writeJsons(Path filename) {