RED-8992 - Enable adding annotations on headers with line breaks
- Don't reorder text blocks classified as headers and footers; add a unit test
This commit is contained in:
parent
84bdb4d1ed
commit
4e7c3f584b
@ -180,7 +180,7 @@ public class DocumentGraphFactory {
|
|||||||
Page page = context.getPage(textBlocks.get(0).getPage());
|
Page page = context.getPage(textBlocks.get(0).getPage());
|
||||||
Footer footer = Footer.builder().documentTree(context.getDocumentTree())
|
Footer footer = Footer.builder().documentTree(context.getDocumentTree())
|
||||||
.build();
|
.build();
|
||||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks),
|
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks),
|
||||||
footer,
|
footer,
|
||||||
context,
|
context,
|
||||||
page);
|
page);
|
||||||
@ -196,7 +196,7 @@ public class DocumentGraphFactory {
|
|||||||
Page page = context.getPage(textBlocks.get(0).getPage());
|
Page page = context.getPage(textBlocks.get(0).getPage());
|
||||||
Header header = Header.builder().documentTree(context.getDocumentTree())
|
Header header = Header.builder().documentTree(context.getDocumentTree())
|
||||||
.build();
|
.build();
|
||||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), header, 0, page);
|
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), header, 0, page);
|
||||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
|
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
|
||||||
header.setTreeId(tocId);
|
header.setTreeId(tocId);
|
||||||
header.setLeafTextBlock(textBlock);
|
header.setLeafTextBlock(textBlock);
|
||||||
|
|||||||
@ -22,4 +22,9 @@ public class TextPositionOperations {
|
|||||||
return sequence;
|
return sequence;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static List<TextPositionSequence> mergeTextPositionSequence(List<TextPageBlock> textBlocks) {
|
||||||
|
|
||||||
|
return textBlocks.stream().flatMap(tb -> tb.getSequences().stream()).collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -25,15 +25,21 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
|||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
|
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||||
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
|
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
|
||||||
|
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
@ -103,6 +109,40 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testHeaderAndFooter() throws IOException {
|
||||||
|
|
||||||
|
String fileName = "files/syngenta/CustomerFiles/54 Fludioxonil - EU AIR3 - Document E1 - Listing of Community and Member States MRLs.pdf";
|
||||||
|
String textToSearch = "Annex to Regulation 283/2013 Annex to Regulation 284/2013";
|
||||||
|
ClassPathResource pdfFileResource = new ClassPathResource(fileName);
|
||||||
|
|
||||||
|
List<PageContents> textPositionPerPage = PageContentExtractor.getSortedPageContents(fileName);
|
||||||
|
var textPositions = textPositionPerPage.stream()
|
||||||
|
.flatMap(t -> t.getSortedTextPositionSequences()
|
||||||
|
.stream()
|
||||||
|
.map(TextPositionSequence::toString))
|
||||||
|
.collect(Collectors.joining(" "));
|
||||||
|
assertThat(textPositions.contains(textToSearch)).isFalse();
|
||||||
|
|
||||||
|
ClassificationDocument classificationDocument = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
|
|
||||||
|
assertThat(classificationDocument.getHeaders()
|
||||||
|
.get(0).getTextBlocks().size()).isEqualTo(3);
|
||||||
|
assertThat(classificationDocument.getHeaders()
|
||||||
|
.get(0).getTextBlocks()
|
||||||
|
.get(0).getSequences().size()).isEqualTo(8);
|
||||||
|
assertThat(classificationDocument.getHeaders()
|
||||||
|
.get(0).getTextBlocks()
|
||||||
|
.get(0).toString()).isEqualTo(textToSearch);
|
||||||
|
|
||||||
|
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, classificationDocument);
|
||||||
|
|
||||||
|
TextBlock leafTextBlock = document.getFirstPage().getHeader().getLeafTextBlock();
|
||||||
|
assertThat(leafTextBlock.getSearchText().contains(textToSearch)).isTrue();
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@Disabled
|
@Disabled
|
||||||
@Test
|
@Test
|
||||||
public void testScanRotationBorderIsIgnored() throws IOException {
|
public void testScanRotationBorderIsIgnored() throws IOException {
|
||||||
@ -112,7 +152,11 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class);
|
var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class);
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile(), tableServiceResponse);
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile(), tableServiceResponse);
|
||||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
assertThat(document.getSections()
|
||||||
|
.stream()
|
||||||
|
.flatMap(paragraph -> paragraph.getTables()
|
||||||
|
.stream())
|
||||||
|
.collect(Collectors.toList())).isNotEmpty();
|
||||||
var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList();
|
var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList();
|
||||||
|
|
||||||
// Quality of the table parsing is not good, because the file is rotated at scanning.
|
// Quality of the table parsing is not good, because the file is rotated at scanning.
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user