RED-8992 - Enable adding annotations on headers with line breaks

- don't reorder text blocks classified as headers and footers
- add unit test
This commit is contained in:
Corina Olariu 2024-04-25 11:23:10 +03:00
parent 84bdb4d1ed
commit 4e7c3f584b
3 changed files with 52 additions and 3 deletions

View File

@ -180,7 +180,7 @@ public class DocumentGraphFactory {
Page page = context.getPage(textBlocks.get(0).getPage());
Footer footer = Footer.builder().documentTree(context.getDocumentTree())
.build();
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks),
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks),
footer,
context,
page);
@ -196,7 +196,7 @@ public class DocumentGraphFactory {
Page page = context.getPage(textBlocks.get(0).getPage());
Header header = Header.builder().documentTree(context.getDocumentTree())
.build();
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), header, 0, page);
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), header, 0, page);
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
header.setTreeId(tocId);
header.setLeafTextBlock(textBlock);

View File

@ -22,4 +22,9 @@ public class TextPositionOperations {
return sequence;
}
/**
 * Flattens the sequences of the given text blocks into a single list.
 *
 * <p>Blocks are visited in list order and each block's sequences are appended in the order
 * returned by {@code getSequences()}; no sorting is applied by this method.
 *
 * @param textBlocks the text blocks whose sequences should be concatenated
 * @return a new list holding every sequence of every given block
 */
public static List<TextPositionSequence> mergeTextPositionSequence(List<TextPageBlock> textBlocks) {
    return textBlocks.stream()
            .map(TextPageBlock::getSequences)
            .flatMap(sequences -> sequences.stream())
            .collect(Collectors.toList());
}
}

View File

@ -25,15 +25,21 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
import lombok.SneakyThrows;
@ -103,6 +109,40 @@ public class PdfSegmentationServiceTest extends AbstractTest {
}
/**
 * Regression test for RED-8992: a header whose text spans several lines must stay searchable
 * as one contiguous string after the document graph has been built.
 *
 * <p>The test checks three stages:
 * <ol>
 *   <li>the sorted page contents do NOT contain the header text as one contiguous string,</li>
 *   <li>the classified header's first text block renders exactly that text,</li>
 *   <li>the header's leaf text block in the document graph contains that text.</li>
 * </ol>
 */
@Test
public void testHeaderAndFooter() throws IOException {

    String fileName = "files/syngenta/CustomerFiles/54 Fludioxonil - EU AIR3 - Document E1 - Listing of Community and Member States MRLs.pdf";
    String textToSearch = "Annex to Regulation 283/2013 Annex to Regulation 284/2013";
    ClassPathResource pdfFileResource = new ClassPathResource(fileName);

    // Stage 1: join every sorted sequence of every page and verify the header text is not
    // present in that order.
    List<PageContents> textPositionPerPage = PageContentExtractor.getSortedPageContents(fileName);
    String textPositions = textPositionPerPage.stream()
            .flatMap(pageContents -> pageContents.getSortedTextPositionSequences()
                    .stream()
                    .map(TextPositionSequence::toString))
            .collect(Collectors.joining(" "));
    assertThat(textPositions).doesNotContain(textToSearch);

    // Stage 2: the classified header keeps its text blocks; the first one reads as expected.
    ClassificationDocument classificationDocument = buildClassificationDocument(pdfFileResource.getFile());
    var firstHeader = classificationDocument.getHeaders().get(0);
    // Check the block count before indexing so an empty header fails with an assertion message.
    assertThat(firstHeader.getTextBlocks()).hasSize(3);

    var firstHeaderBlock = firstHeader.getTextBlocks().get(0);
    assertThat(firstHeaderBlock.getSequences()).hasSize(8);
    assertThat(firstHeaderBlock.toString()).isEqualTo(textToSearch);

    // Stage 3: the text must still be found in the header of the built document graph.
    Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, classificationDocument);
    TextBlock leafTextBlock = document.getFirstPage().getHeader().getLeafTextBlock();
    assertThat(leafTextBlock.getSearchText()).contains(textToSearch);
}
@Disabled
@Test
public void testScanRotationBorderIsIgnored() throws IOException {
@ -112,7 +152,11 @@ public class PdfSegmentationServiceTest extends AbstractTest {
var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class);
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile(), tableServiceResponse);
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
assertThat(document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.collect(Collectors.toList())).isNotEmpty();
var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList();
// Quality of the table parsing is not good, because the file is rotated at scanning.