Merge branch 'RED-8992-bp' into 'release/0.89.x'

RED-8992: Enable adding annotations on headers with line breaks

See merge request fforesight/layout-parser!142
This commit is contained in:
Corina Olariu 2024-04-24 13:51:31 +02:00
commit 45ff220d83
4 changed files with 52 additions and 9 deletions

View File

@ -160,12 +160,8 @@ public class DocumentGraphFactory {
private void addFooter(List<TextPageBlock> textBlocks, Context context) {
Page page = context.getPage(textBlocks.get(0).getPage());
Footer footer = Footer.builder().documentTree(context.getDocumentTree())
.build();
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks),
footer,
context,
page);
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), footer, context, page);
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
footer.setTreeId(tocId);
footer.setLeafTextBlock(textBlock);
@ -176,9 +172,8 @@ public class DocumentGraphFactory {
public void addHeader(List<TextPageBlock> textBlocks, Context context) {
Page page = context.getPage(textBlocks.get(0).getPage());
Header header = Header.builder().documentTree(context.getDocumentTree())
.build();
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), header, 0, page);
Header header = Header.builder().documentTree(context.getDocumentTree()).build();
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), header, 0, page);
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
header.setTreeId(tocId);
header.setLeafTextBlock(textBlock);

View File

@ -23,4 +23,10 @@ public class TextPositionOperations {
return sequence;
}
/**
 * Concatenates the text position sequences of all given blocks into one list.
 * Blocks are visited in list order and each block's sequences keep their own order;
 * no sorting is applied here.
 *
 * @param textBlocks blocks whose sequences are collected
 * @return a list holding every sequence of every block
 */
public static List<TextPositionSequence> mergeTextPositionSequence(List<TextPageBlock> textBlocks) {
    return textBlocks.stream()
            .map(TextPageBlock::getSequences)
            .flatMap(List::stream)
            .collect(Collectors.toList());
}
}

View File

@ -1,6 +1,7 @@
package com.knecon.fforesight.service.layoutparser.server.segmentation;
import static org.assertj.core.api.Assertions.assertThat;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.awt.geom.Rectangle2D;
import java.io.File;
@ -25,14 +26,20 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
import lombok.SneakyThrows;
@ -751,6 +758,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
}
@Test
public void testMergedEntities_Page26() throws IOException {
@ -765,6 +773,40 @@ public class PdfSegmentationServiceTest extends AbstractTest {
}
/**
 * Verifies that the header text of the sample page is available as one contiguous string
 * in the header's leaf text block after the document graph is built, even though it is
 * not contiguous in the page's sorted text position sequences.
 */
@Test
public void testHeaderAndFooter() throws IOException {

    String fileName = "files/SinglePages/Page1_54 Fludioxonil - EU AIR3 - Document E1 - Listing of Community and Member States MRLs.pdf";
    String textToSearch = "Annex to Regulation 283/2013 Annex to Regulation 284/2013";
    ClassPathResource pdfFileResource = new ClassPathResource(fileName);

    // Precondition: joining the sorted page sequences with spaces does not yield the header text.
    List<PageContents> textPositionPerPage = PageContentExtractor.getSortedPageContents(fileName);
    String textPositions = textPositionPerPage.stream()
            .flatMap(t -> t.getSortedTextPositionSequences()
                    .stream()
                    .map(TextPositionSequence::toString))
            .collect(Collectors.joining(" "));
    // String assertion instead of a wrapped boolean, so a failure shows the actual text.
    assertThat(textPositions).doesNotContain(textToSearch);

    // The classified header keeps the text in its first block.
    ClassificationDocument classificationDocument = buildClassificationDocument(pdfFileResource.getFile());
    var headerTextBlocks = classificationDocument.getHeaders()
            .get(0).getTextBlocks();
    assertThat(headerTextBlocks).hasSize(3);

    var firstHeaderBlock = headerTextBlocks.get(0);
    assertThat(firstHeaderBlock.getSequences()).hasSize(8);
    assertThat(firstHeaderBlock.toString()).isEqualTo(textToSearch);

    // After building the graph, the header's leaf text block must expose the text for searching.
    Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument);
    TextBlock leafTextBlock = document.getFirstPage().getHeader().getLeafTextBlock();
    assertThat(leafTextBlock.getSearchText()).contains(textToSearch);
}
@SneakyThrows
private void toHtml(ClassificationDocument document, String filename) {