RED-8992 - Enable to add annotation on header with line breaks
- don't reorder textblocks classified as headers and footers - add unit test
This commit is contained in:
parent
84bdb4d1ed
commit
4e7c3f584b
@ -180,7 +180,7 @@ public class DocumentGraphFactory {
|
||||
Page page = context.getPage(textBlocks.get(0).getPage());
|
||||
Footer footer = Footer.builder().documentTree(context.getDocumentTree())
|
||||
.build();
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks),
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks),
|
||||
footer,
|
||||
context,
|
||||
page);
|
||||
@ -196,7 +196,7 @@ public class DocumentGraphFactory {
|
||||
Page page = context.getPage(textBlocks.get(0).getPage());
|
||||
Header header = Header.builder().documentTree(context.getDocumentTree())
|
||||
.build();
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), header, 0, page);
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), header, 0, page);
|
||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
|
||||
header.setTreeId(tocId);
|
||||
header.setLeafTextBlock(textBlock);
|
||||
|
||||
@ -22,4 +22,9 @@ public class TextPositionOperations {
|
||||
return sequence;
|
||||
}
|
||||
|
||||
public static List<TextPositionSequence> mergeTextPositionSequence(List<TextPageBlock> textBlocks) {
|
||||
|
||||
return textBlocks.stream().flatMap(tb -> tb.getSequences().stream()).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -25,15 +25,21 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
@ -103,6 +109,40 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testHeaderAndFooter() throws IOException {
|
||||
|
||||
String fileName = "files/syngenta/CustomerFiles/54 Fludioxonil - EU AIR3 - Document E1 - Listing of Community and Member States MRLs.pdf";
|
||||
String textToSearch = "Annex to Regulation 283/2013 Annex to Regulation 284/2013";
|
||||
ClassPathResource pdfFileResource = new ClassPathResource(fileName);
|
||||
|
||||
List<PageContents> textPositionPerPage = PageContentExtractor.getSortedPageContents(fileName);
|
||||
var textPositions = textPositionPerPage.stream()
|
||||
.flatMap(t -> t.getSortedTextPositionSequences()
|
||||
.stream()
|
||||
.map(TextPositionSequence::toString))
|
||||
.collect(Collectors.joining(" "));
|
||||
assertThat(textPositions.contains(textToSearch)).isFalse();
|
||||
|
||||
ClassificationDocument classificationDocument = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
assertThat(classificationDocument.getHeaders()
|
||||
.get(0).getTextBlocks().size()).isEqualTo(3);
|
||||
assertThat(classificationDocument.getHeaders()
|
||||
.get(0).getTextBlocks()
|
||||
.get(0).getSequences().size()).isEqualTo(8);
|
||||
assertThat(classificationDocument.getHeaders()
|
||||
.get(0).getTextBlocks()
|
||||
.get(0).toString()).isEqualTo(textToSearch);
|
||||
|
||||
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, classificationDocument);
|
||||
|
||||
TextBlock leafTextBlock = document.getFirstPage().getHeader().getLeafTextBlock();
|
||||
assertThat(leafTextBlock.getSearchText().contains(textToSearch)).isTrue();
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Disabled
|
||||
@Test
|
||||
public void testScanRotationBorderIsIgnored() throws IOException {
|
||||
@ -112,7 +152,11 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class);
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile(), tableServiceResponse);
|
||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||
assertThat(document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList();
|
||||
|
||||
// Quality of the table parsing is not good, because the file is rotated at scanning.
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user