Merge branch 'RED-8992-bp' into 'release/0.89.x'
RED-8992: Enable adding annotations on headers with line breaks. See merge request fforesight/layout-parser!142
This commit is contained in:
commit
45ff220d83
@ -160,12 +160,8 @@ public class DocumentGraphFactory {
|
||||
private void addFooter(List<TextPageBlock> textBlocks, Context context) {
|
||||
|
||||
Page page = context.getPage(textBlocks.get(0).getPage());
|
||||
Footer footer = Footer.builder().documentTree(context.getDocumentTree())
|
||||
.build();
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks),
|
||||
footer,
|
||||
context,
|
||||
page);
|
||||
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), footer, context, page);
|
||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
|
||||
footer.setTreeId(tocId);
|
||||
footer.setLeafTextBlock(textBlock);
|
||||
@ -176,9 +172,8 @@ public class DocumentGraphFactory {
|
||||
public void addHeader(List<TextPageBlock> textBlocks, Context context) {
|
||||
|
||||
Page page = context.getPage(textBlocks.get(0).getPage());
|
||||
Header header = Header.builder().documentTree(context.getDocumentTree())
|
||||
.build();
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), header, 0, page);
|
||||
Header header = Header.builder().documentTree(context.getDocumentTree()).build();
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), header, 0, page);
|
||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
|
||||
header.setTreeId(tocId);
|
||||
header.setLeafTextBlock(textBlock);
|
||||
|
||||
@ -23,4 +23,10 @@ public class TextPositionOperations {
|
||||
return sequence;
|
||||
}
|
||||
|
||||
|
||||
public static List<TextPositionSequence> mergeTextPositionSequence(List<TextPageBlock> textBlocks) {
|
||||
|
||||
return textBlocks.stream().flatMap(tb -> tb.getSequences().stream()).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.segmentation;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.File;
|
||||
@ -25,14 +26,20 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
@ -751,6 +758,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testMergedEntities_Page26() throws IOException {
|
||||
|
||||
@ -765,6 +773,40 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testHeaderAndFooter() throws IOException {
|
||||
|
||||
String fileName = "files/SinglePages/Page1_54 Fludioxonil - EU AIR3 - Document E1 - Listing of Community and Member States MRLs.pdf";
|
||||
String textToSearch = "Annex to Regulation 283/2013 Annex to Regulation 284/2013";
|
||||
ClassPathResource pdfFileResource = new ClassPathResource(fileName);
|
||||
|
||||
List<PageContents> textPositionPerPage = PageContentExtractor.getSortedPageContents(fileName);
|
||||
var textPositions = textPositionPerPage.stream()
|
||||
.flatMap(t -> t.getSortedTextPositionSequences()
|
||||
.stream()
|
||||
.map(TextPositionSequence::toString))
|
||||
.collect(Collectors.joining(" "));
|
||||
assertThat(textPositions.contains(textToSearch)).isFalse();
|
||||
|
||||
ClassificationDocument classificationDocument = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
assertThat(classificationDocument.getHeaders()
|
||||
.get(0).getTextBlocks().size()).isEqualTo(3);
|
||||
assertThat(classificationDocument.getHeaders()
|
||||
.get(0).getTextBlocks()
|
||||
.get(0).getSequences().size()).isEqualTo(8);
|
||||
assertThat(classificationDocument.getHeaders()
|
||||
.get(0).getTextBlocks()
|
||||
.get(0).toString()).isEqualTo(textToSearch);
|
||||
|
||||
Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument);
|
||||
|
||||
TextBlock leafTextBlock = document.getFirstPage().getHeader().getLeafTextBlock();
|
||||
assertTrue(leafTextBlock.getSearchText().contains(textToSearch));
|
||||
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void toHtml(ClassificationDocument document, String filename) {
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user