Merge branch 'RED-9524' into 'main'

RED-9524: File processing does not annotate images

See merge request fforesight/layout-parser!214
This commit is contained in:
Maverick Studer 2024-09-04 13:27:06 +02:00
commit b66afe135c

View File

@ -65,7 +65,7 @@ public class DocumentGraphFactory {
document.getPages()
.forEach(context::buildAndAddPageWithCounter);
addSections(layoutParsingType, document, context, documentGraph);
addSectionsAndImagesForToC(layoutParsingType, document, context, documentGraph);
addHeaderAndFooterToEachPage(document, context);
documentGraph.setNumberOfPages(context.pages.size());
@ -92,7 +92,22 @@ public class DocumentGraphFactory {
}
private void addSections(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument, Context context, Document document) {
private void addSectionsAndImagesForToC(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument, Context context, Document document) {
// When there are no main sections but the pages still contain images (e.g. a document without any text), add each distinct image to the document directly.
if (classificationDocument.getTableOfContents().getMainSections().isEmpty()) {
List<ClassifiedImage> images = classificationDocument.getPages()
.stream()
.flatMap(classificationPage -> classificationPage.getImages()
.stream())
.toList();
if (!images.isEmpty()) {
images.stream()
.distinct()
.forEach(image -> DocumentGraphFactory.addImage(document, image, context));
}
}
for (TableOfContentItem tocItem : classificationDocument.getTableOfContents()) {
var parent = tocItem.getParent() == null ? null : tocItem.getParent().getSection();
@ -260,7 +275,6 @@ public class DocumentGraphFactory {
DocumentTree documentTree;
Map<Page, Integer> pages;
List<AbstractSemanticNode> sections;
List<ClassifiedImage> images;
TextBlockFactory textBlockFactory;
@ -269,7 +283,6 @@ public class DocumentGraphFactory {
documentTree = new DocumentTree(document);
pages = new HashMap<>();
sections = new LinkedList<>();
images = new LinkedList<>();
textBlockFactory = new TextBlockFactory();
}