diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java index b46fa23..638ac75 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java @@ -160,12 +160,8 @@ public class DocumentGraphFactory { private void addFooter(List textBlocks, Context context) { Page page = context.getPage(textBlocks.get(0).getPage()); - Footer footer = Footer.builder().documentTree(context.getDocumentTree()) - .build(); - AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), - footer, - context, - page); + Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build(); + AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), footer, context, page); List tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer); footer.setTreeId(tocId); footer.setLeafTextBlock(textBlock); @@ -176,9 +172,8 @@ public class DocumentGraphFactory { public void addHeader(List textBlocks, Context context) { Page page = context.getPage(textBlocks.get(0).getPage()); - Header header = Header.builder().documentTree(context.getDocumentTree()) - .build(); - AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), header, 0, page); + Header header = Header.builder().documentTree(context.getDocumentTree()).build(); + AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), header, 0, page); List tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header); header.setTreeId(tocId); header.setLeafTextBlock(textBlock); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java index 53e8c29..bfd30c6 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java @@ -23,4 +23,10 @@ public class TextPositionOperations { return sequence; } + + public static List mergeTextPositionSequence(List textBlocks) { + + return textBlocks.stream().flatMap(tb -> tb.getSequences().stream()).collect(Collectors.toList()); + } + } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java index 18e0e8c..0ea29b4 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java @@ -1,6 +1,7 @@ package com.knecon.fforesight.service.layoutparser.server.segmentation; import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.jupiter.api.Assertions.assertTrue; import java.awt.geom.Rectangle2D; import java.io.File; @@ -25,14 +26,20 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; +import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; +import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor; import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService; import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService; +import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest; import lombok.SneakyThrows; @@ -751,6 +758,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { } + @Test public void testMergedEntities_Page26() throws IOException { @@ -765,6 +773,40 @@ public class PdfSegmentationServiceTest extends AbstractTest { } + @Test + public void testHeaderAndFooter() throws IOException { + + String fileName = "files/SinglePages/Page1_54 Fludioxonil - EU AIR3 - Document E1 - Listing of Community and Member States MRLs.pdf"; + String textToSearch = "Annex to Regulation 283/2013 Annex to Regulation 284/2013"; + ClassPathResource pdfFileResource = new ClassPathResource(fileName); + + List textPositionPerPage = PageContentExtractor.getSortedPageContents(fileName); + var textPositions = textPositionPerPage.stream() + .flatMap(t -> t.getSortedTextPositionSequences() + .stream() + .map(TextPositionSequence::toString)) + .collect(Collectors.joining(" ")); + assertThat(textPositions.contains(textToSearch)).isFalse(); + + ClassificationDocument classificationDocument = buildClassificationDocument(pdfFileResource.getFile()); + + assertThat(classificationDocument.getHeaders() + .get(0).getTextBlocks().size()).isEqualTo(3); + assertThat(classificationDocument.getHeaders() + .get(0).getTextBlocks() + .get(0).getSequences().size()).isEqualTo(8); + assertThat(classificationDocument.getHeaders() + .get(0).getTextBlocks() + .get(0).toString()).isEqualTo(textToSearch); + + Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument); + + TextBlock leafTextBlock = document.getFirstPage().getHeader().getLeafTextBlock(); + assertTrue(leafTextBlock.getSearchText().contains(textToSearch)); + + } + + @SneakyThrows private void toHtml(ClassificationDocument document, String filename) { diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/Page1_54 Fludioxonil - EU AIR3 - Document E1 - Listing of Community and Member States MRLs.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/Page1_54 Fludioxonil - EU AIR3 - Document E1 - Listing of Community and Member States MRLs.pdf new file mode 100644 index 0000000..d8d947b Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/Page1_54 Fludioxonil - EU AIR3 - Document E1 - Listing of Community and Member States MRLs.pdf differ