diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java index 3cfe5e5..1481776 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java @@ -180,7 +180,7 @@ public class DocumentGraphFactory { Page page = context.getPage(textBlocks.get(0).getPage()); Footer footer = Footer.builder().documentTree(context.getDocumentTree()) .build(); - AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), + AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), footer, context, page); @@ -196,7 +196,7 @@ public class DocumentGraphFactory { Page page = context.getPage(textBlocks.get(0).getPage()); Header header = Header.builder().documentTree(context.getDocumentTree()) .build(); - AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), header, 0, page); + AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), header, 0, page); List tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header); header.setTreeId(tocId); header.setLeafTextBlock(textBlock); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java index 9927685..5ab1843 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java @@ -22,4 +22,9 @@ public class TextPositionOperations { return sequence; } + public static List mergeTextPositionSequence(List textBlocks) { + + return textBlocks.stream().flatMap(tb -> tb.getSequences().stream()).collect(Collectors.toList()); + } + } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java index 8c6ea4e..4cb70fc 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java @@ -25,15 +25,21 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; +import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse; +import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor; import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService; import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService; +import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest; import lombok.SneakyThrows; @@ -103,6 +109,40 @@ public class PdfSegmentationServiceTest extends AbstractTest { } + @Test + public void testHeaderAndFooter() throws IOException { + + String fileName = "files/syngenta/CustomerFiles/54 Fludioxonil - EU AIR3 - Document E1 - Listing of Community and Member States MRLs.pdf"; + String textToSearch = "Annex to Regulation 283/2013 Annex to Regulation 284/2013"; + ClassPathResource pdfFileResource = new ClassPathResource(fileName); + + List textPositionPerPage = PageContentExtractor.getSortedPageContents(fileName); + var textPositions = textPositionPerPage.stream() + .flatMap(t -> t.getSortedTextPositionSequences() + .stream() + .map(TextPositionSequence::toString)) + .collect(Collectors.joining(" ")); + assertThat(textPositions.contains(textToSearch)).isFalse(); + + ClassificationDocument classificationDocument = buildClassificationDocument(pdfFileResource.getFile()); + + assertThat(classificationDocument.getHeaders() + .get(0).getTextBlocks().size()).isEqualTo(3); + assertThat(classificationDocument.getHeaders() + .get(0).getTextBlocks() + .get(0).getSequences().size()).isEqualTo(8); + assertThat(classificationDocument.getHeaders() + .get(0).getTextBlocks() + .get(0).toString()).isEqualTo(textToSearch); + + Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, classificationDocument); + + TextBlock leafTextBlock = document.getFirstPage().getHeader().getLeafTextBlock(); + assertThat(leafTextBlock.getSearchText().contains(textToSearch)).isTrue(); + + } + + @Disabled @Test public void testScanRotationBorderIsIgnored() throws IOException { @@ -112,7 +152,11 @@ public class PdfSegmentationServiceTest extends AbstractTest { var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class); ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile(), tableServiceResponse); - assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty(); + assertThat(document.getSections() + .stream() + .flatMap(paragraph -> paragraph.getTables() + .stream()) + .collect(Collectors.toList())).isNotEmpty(); var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList(); // Quality of the table parsing is not good, because the file is rotated at scanning.