diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index 9c65a49..42da329 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -32,6 +32,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.image.Classifi import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService; import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject; import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineValidationService; +import com.knecon.fforesight.service.layoutparser.processor.model.outline.TOCEnrichmentService; import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; @@ -101,6 +102,7 @@ public class LayoutParsingPipeline { GraphicExtractorService graphicExtractorService; OutlineExtractorService outlineExtractorService; OutlineValidationService outlineValidationService; + TOCEnrichmentService tocEnrichmentService; public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException { @@ -279,17 +281,17 @@ public class LayoutParsingPipeline { List emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical()); var graphics = graphicExtractorService.extractPathElementGraphics(originDocument, - pdPage, - pageNumber, - cleanRulings, - stripper.getTextPositionSequences(), - emptyTableCells, - false); + pdPage, + pageNumber, + cleanRulings, + stripper.getTextPositionSequences(), + emptyTableCells, + false); pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>()) .addAll(graphics.stream() - .map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), ImageType.GRAPHIC, false, stripper.getPageNumber())) - .toList()); + .map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), ImageType.GRAPHIC, false, stripper.getPageNumber())) + .toList()); ClassificationPage classificationPage = switch (layoutParsingType) { case REDACT_MANAGER_OLD -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells); @@ -372,6 +374,8 @@ public class LayoutParsingPipeline { default -> { sectionsBuilderService.buildSections(classificationDocument); sectionsBuilderService.addImagesToSections(classificationDocument); + + tocEnrichmentService.assignSectionBlocksAndImages(classificationDocument); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java index e6ef1ad..54b8f81 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java @@ -1,11 +1,15 @@ package com.knecon.fforesight.service.layoutparser.processor.model; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree; +import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContentItem; import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents; import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText; import lombok.Data; @@ -17,6 +21,7 @@ public class ClassificationDocument { private List pages = new ArrayList<>(); private List sections = new ArrayList<>(); + //private Map> sectionsMap = new HashMap<>(); private List headers = new ArrayList<>(); private List footers = new ArrayList<>(); private List unclassifiedTexts = new ArrayList<>(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/DocumentTree.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/DocumentTree.java index dff1f4b..a95ee58 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/DocumentTree.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/DocumentTree.java @@ -140,8 +140,8 @@ public class DocumentTree { if (treeId.isEmpty()) { return root; } - Entry entry = root.children.get(treeId.get(0)); - for (int id : treeId.subList(1, treeId.size())) { + Entry entry = root; + for (int id : treeId) { entry = entry.children.get(id); } return entry; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineValidationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineValidationService.java index d52ba5e..a1d5838 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineValidationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineValidationService.java @@ -39,7 +39,7 @@ public class OutlineValidationService { private boolean containsBlock(TableOfContents toc, TextPageBlock block) { for (TableOfContentItem existingItem : toc.getMainSections()) { - if (existingItem.getTextPageBlock().equals(block) || existingItem.contains(block)) { + if (existingItem.getHeadline().equals(block) || existingItem.contains(block)) { return true; } } @@ -82,7 +82,7 @@ public class OutlineValidationService { assert (parent != null); while (parentDepth < currentDepth && parent.getParent() != null) { parent = parent.getParent(); - parentDepth = getDepth(parent.getTextPageBlock().getClassification()); + parentDepth = getDepth(parent.getHeadline().getClassification()); } parent.addChild(new TableOfContentItem(current)); } @@ -110,12 +110,12 @@ public class OutlineValidationService { } else { assert last != null; - int lastDepth = getDepth(last.getTextPageBlock().getClassification()); + int lastDepth = getDepth(last.getHeadline().getClassification()); if (lastDepth < parentDepth) { parentDepth = lastDepth; } else if (lastDepth == currentDepth && last.getParent() != null) { - parentDepth = getDepth(last.getParent().getTextPageBlock().getClassification()); + parentDepth = getDepth(last.getParent().getHeadline().getClassification()); } TableOfContentItem parent = lastItemsPerDepth.get(parentDepth); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TOCEnrichmentService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TOCEnrichmentService.java new file mode 100644 index 0000000..f5aa06f --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TOCEnrichmentService.java @@ -0,0 +1,266 @@ +package com.knecon.fforesight.service.layoutparser.processor.model.outline; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; +import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; +import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; + +import lombok.extern.slf4j.Slf4j; + +@Slf4j +@Service +public class TOCEnrichmentService { + + public void assignSectionBlocksAndImages(ClassificationDocument document) { + + TableOfContents toc = document.getTableOfContents(); + List startBlocks = new ArrayList<>(); + List startImages = new ArrayList<>(); + //Map> sectionsMap = new HashMap<>(); + TableOfContentItem currentSection = null; + boolean foundFirstHeadline = false; + + //for (TableOfContentItem item : toc.getAllTableOfContentItems()) { + // sectionsMap.put(item, new ArrayList<>()); + //} + + List headers = new ArrayList<>(); + List footers = new ArrayList<>(); + TablePageBlock previousTable = null; + List lastFoundTOCItems = new ArrayList<>(); + + for (ClassificationPage page : document.getPages()) { + List currentPageTOCItems = new ArrayList<>(); + List header = new ArrayList<>(); + List footer = new ArrayList<>(); + for (AbstractPageBlock current : page.getTextBlocks()) { + + if (current.getClassification() == null) { + continue; + } + + current.setPage(page.getPageNumber()); + + if (current.getClassification().equals(PageBlockType.HEADER)) { + header.add((TextPageBlock) current); + continue; + } + + if (current.getClassification().equals(PageBlockType.FOOTER)) { + footer.add((TextPageBlock) current); + continue; + } + + if (current instanceof TablePageBlock table) { + if (previousTable != null) { + mergeTableMetadata(table, previousTable); + } + previousTable = table; + } + boolean matched = false; + + for (TableOfContentItem tocItem : toc) { + if (current instanceof TextPageBlock && tocItem.getHeadline().getText().equals(current.getText())) { + if (!foundFirstHeadline) { + foundFirstHeadline = true; + } + currentSection = tocItem; + //sectionsMap.get(tocItem).add(current); + tocItem.getSectionBlocks().add(current); + currentPageTOCItems.add(tocItem); + matched = true; + break; + } + } + + if (!matched) { + if (!foundFirstHeadline) { + startBlocks.add(current); + } else { + currentSection.getSectionBlocks().add(current); + //sectionsMap.get(currentSection).add(current); + } + } + } + + if (!currentPageTOCItems.isEmpty()) { + lastFoundTOCItems = currentPageTOCItems; + } + + for (ClassifiedImage image : page.getImages()) { + + Float xMin = null; + Float yMin = null; + Float xMax = null; + Float yMax = null; + + for (TableOfContentItem tocItem : lastFoundTOCItems) { + var headline = tocItem.getHeadline(); + + if (headline.getPage() != page.getPageNumber()) { + continue; + } + + if (headline.getMinX() < headline.getMaxX()) { + if (xMin == null || headline.getMinX() < xMin) { + xMin = headline.getMinX(); + } + if (xMax == null || headline.getMaxX() > xMax) { + xMax = headline.getMaxX(); + } + } else { + if (xMin == null || headline.getMaxX() < xMin) { + xMin = headline.getMaxX(); + } + if (xMax == null || headline.getMinX() > xMax) { + xMax = headline.getMinX(); + } + } + + if (headline.getMinY() < headline.getMaxY()) { + if (yMin == null || headline.getMinY() < yMin) { + yMin = headline.getMinY(); + } + if (yMax == null || headline.getMaxY() > yMax) { + yMax = headline.getMaxY(); + } + } else { + if (yMin == null || headline.getMaxY() < yMin) { + yMin = headline.getMaxY(); + } + if (yMax == null || headline.getMinY() > yMax) { + yMax = headline.getMinY(); + } + } + + log.debug("Image position x: {}, y: {}", image.getPosition().getX(), image.getPosition().getY()); + log.debug("Headline position xMin: {}, xMax: {}, yMin: {}, yMax: {}", xMin, xMax, yMin, yMax); + + if (image.getPosition().getX() >= xMin && image.getPosition().getX() <= xMax && image.getPosition().getY() >= yMin && image.getPosition().getY() <= yMax) { + tocItem.getImages().add(image); + image.setAppendedToSection(true); + break; + } + } + if (!image.isAppendedToSection()) { + log.debug("Image uses first paragraph"); + if (!lastFoundTOCItems.isEmpty()) { + lastFoundTOCItems.get(0).getImages().add(image); + } else { + startImages.add(image); + } + image.setAppendedToSection(true); + } + } + + if (!header.isEmpty()) { + headers.add(new ClassificationHeader(header)); + } + if (!footer.isEmpty()) { + footers.add(new ClassificationFooter(footer)); + } + } + + if (!startBlocks.isEmpty()) { + TableOfContentItem unassigned = new TableOfContentItem(null); + unassigned.setSectionBlocks(startBlocks); + unassigned.setImages(startImages); + document.getTableOfContents().getMainSections().add(0, unassigned); + } + //document.setSectionsMap(sectionsMap); + document.setHeaders(headers); + document.setFooters(footers); + } + + + private void mergeTableMetadata(TablePageBlock currentTable, TablePageBlock previousTable) { + + // Distribute header information for subsequent tables + if (previousTable != null && hasInvalidHeaderInformation(currentTable) && hasValidHeaderInformation(previousTable)) { + List previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable); + List tableNonHeaderRow = getRowWithNonHeaderCells(currentTable); + // Allow merging of tables if header row is separated from first logical non-header row + if (previousTableNonHeaderRow.isEmpty() + && previousTable.getRowCount() == 1 + && previousTable.getRows() + .get(0).size() == tableNonHeaderRow.size()) { + previousTableNonHeaderRow = previousTable.getRows() + .get(0) + .stream() + .map(cell -> { + Cell fakeCell = new Cell(cell.getPoints()[0], cell.getPoints()[2]); + fakeCell.setHeaderCells(Collections.singletonList(cell)); + return fakeCell; + }) + .toList(); + } + if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) { + for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table + List row = currentTable.getRows() + .get(i); + if (row.size() == tableNonHeaderRow.size() && row.stream() + .allMatch(cell -> cell.getHeaderCells().isEmpty())) { + for (int j = 0; j < row.size(); j++) { + row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells()); + } + } + } + } + } + } + + + private boolean hasValidHeaderInformation(TablePageBlock table) { + + return !hasInvalidHeaderInformation(table); + } + + + private boolean hasInvalidHeaderInformation(TablePageBlock table) { + + return table.getRows() + .stream() + .flatMap(row -> row.stream() + .filter(cell -> !cell.getHeaderCells().isEmpty())) + .findAny().isEmpty(); + + } + + + private List getRowWithNonHeaderCells(TablePageBlock table) { + + for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table + List row = table.getRows() + .get(i); + if (row.size() == 1) { + continue; + } + boolean allNonHeader = true; + for (Cell cell : row) { + if (cell.isHeaderCell()) { + allNonHeader = false; + break; + } + } + if (allNonHeader) { + return row; + } + } + + return Collections.emptyList(); + + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TableOfContentItem.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TableOfContentItem.java index 2d57844..bbbbeac 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TableOfContentItem.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TableOfContentItem.java @@ -1,9 +1,13 @@ package com.knecon.fforesight.service.layoutparser.processor.model.outline; import java.util.ArrayList; +import java.util.Collection; import java.util.List; +import java.util.stream.Collectors; -import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section; +import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import lombok.Data; @@ -14,14 +18,19 @@ import lombok.EqualsAndHashCode; public class TableOfContentItem { @EqualsAndHashCode.Include - private TextPageBlock textPageBlock; + private TextPageBlock headline; private List children = new ArrayList<>(); private TableOfContentItem parent; + private List sectionBlocks = new ArrayList<>(); + private List images = new ArrayList<>(); - public TableOfContentItem(TextPageBlock textPageBlock) { + private Section section; - this.textPageBlock = textPageBlock; + + public TableOfContentItem(TextPageBlock headline) { + + this.headline = headline; } @@ -34,60 +43,68 @@ public class TableOfContentItem { public TableOfContentItem getSiblingBefore() { - try { - return parent.getChildren() - .get(parent.getChildren().indexOf(this) - 1); - } catch (IndexOutOfBoundsException indexOutOfBoundsException) { - return null; + if (parent != null) { + int index = parent.getChildren().indexOf(this); + if (index > 0) { + return parent.getChildren() + .get(index - 1); + } } + return null; } + + public TableOfContentItem getSiblingAfter() { - try { - return parent.getChildren() - .get(parent.getChildren().indexOf(this) + 1); - } catch (IndexOutOfBoundsException indexOutOfBoundsException) { - return null; + if (parent != null) { + int index = parent.getChildren().indexOf(this); + if (index >= 0 && index < parent.getChildren().size() - 1) { + return parent.getChildren() + .get(index + 1); + } } + return null; } public boolean contains(TextPageBlock block) { - boolean anyChildContains = false; - if (!children.isEmpty()) { - for (TableOfContentItem child : children) { - if (child.getTextPageBlock().equals(block)) { - return true; - } else { - anyChildContains = anyChildContains || child.contains(block); - } + if (headline.equals(block)) { + return true; + } + for (TableOfContentItem child : children) { + if (child.contains(block)) { + return true; } } - return anyChildContains; + return false; } public boolean contains(TableOfContentItem tocItem) { - boolean anyChildContains = false; - if (!children.isEmpty()) { - for (TableOfContentItem child : children) { - if (child.equals(tocItem)) { - return true; - } else { - anyChildContains = anyChildContains || child.contains(tocItem); - } + if (this.equals(tocItem)) { + return true; + } + for (TableOfContentItem child : children) { + if (child.contains(tocItem)) { + return true; } } - return anyChildContains; + return false; } + public List getNonEmptySectionBlocks() { + + return sectionBlocks.stream().filter(pageBlock -> !pageBlock.isEmpty()).collect(Collectors.toList()); + } @Override public String toString() { - return "OutlineObjectTreeNode{" + "textPageBlock=" + textPageBlock + '}'; + return "OutlineObjectTreeNode{" + "textPageBlock=" + headline + '}'; } + + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TableOfContents.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TableOfContents.java index bcffa89..769e5ac 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TableOfContents.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TableOfContents.java @@ -1,9 +1,11 @@ package com.knecon.fforesight.service.layoutparser.processor.model.outline; import java.util.ArrayList; -import java.util.HashMap; +import java.util.Iterator; import java.util.List; -import java.util.Map; +import java.util.Stack; + +import org.springframework.lang.NonNull; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; @@ -12,7 +14,7 @@ import lombok.RequiredArgsConstructor; @Data @RequiredArgsConstructor -public class TableOfContents { +public class TableOfContents implements Iterable { private List mainSections = new ArrayList<>(); @@ -35,7 +37,7 @@ public class TableOfContents { private void collectTextPageBlocks(TableOfContentItem item, List textPageBlocks) { - textPageBlocks.add(item.getTextPageBlock()); + textPageBlocks.add(item.getHeadline()); for (TableOfContentItem child : item.getChildren()) { collectTextPageBlocks(child, textPageBlocks); } @@ -56,4 +58,40 @@ public class TableOfContents { } } + + @Override + public @NonNull Iterator iterator() { + + return new TableOfContentItemIterator(mainSections); + } + + private static class TableOfContentItemIterator implements Iterator { + private final Stack> stack = new Stack<>(); + + public TableOfContentItemIterator(List mainSections) { + stack.push(mainSections.iterator()); + } + + @Override + public boolean hasNext() { + ensureStackTopIsCurrent(); + return !stack.isEmpty() && stack.peek().hasNext(); + } + + @Override + public TableOfContentItem next() { + ensureStackTopIsCurrent(); + TableOfContentItem currentItem = stack.peek().next(); + if (currentItem.getChildren() != null && !currentItem.getChildren().isEmpty()) { + stack.push(currentItem.getChildren().iterator()); + } + return currentItem; + } + + private void ensureStackTopIsCurrent() { + while (!stack.isEmpty() && !stack.peek().hasNext()) { + stack.pop(); + } + } + } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java index fec1b29..a0e9267 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java @@ -29,6 +29,7 @@ import lombok.extern.slf4j.Slf4j; @Service public class SectionsBuilderService { + public void buildSections(ClassificationDocument document) { List chunkWords = new ArrayList<>(); @@ -71,7 +72,8 @@ public class SectionsBuilderService { chunkBlockList.add(chunkBlock); chunkWords = new ArrayList<>(); if (!chunkBlock.getTables().isEmpty()) { - previousTable = chunkBlock.getTables().get(chunkBlock.getTables().size() - 1); + previousTable = chunkBlock.getTables() + .get(chunkBlock.getTables().size() - 1); } } if (current instanceof TablePageBlock table) { @@ -106,11 +108,12 @@ public class SectionsBuilderService { List sections = new ArrayList<>(); for (var page : document.getPages()) { - page.getTextBlocks().forEach(block -> { - block.setPage(page.getPageNumber()); - var section = buildTextBlock(List.of(block), Strings.EMPTY); - sections.add(section); - }); + page.getTextBlocks() + .forEach(block -> { + block.setPage(page.getPageNumber()); + var section = buildTextBlock(List.of(block), Strings.EMPTY); + sections.add(section); + }); } document.setSections(sections); } @@ -202,8 +205,14 @@ public class SectionsBuilderService { log.debug("Image position x: {}, y: {}", image.getPosition().getX(), image.getPosition().getY()); log.debug("Paragraph position xMin: {}, xMax: {}, yMin: {}, yMax: {}", xMin, xMax, yMin, yMax); - if (xMin != null && xMax != null && yMin != null && yMax != null && image.getPosition().getX() >= xMin && image.getPosition() - .getX() <= xMax && image.getPosition().getY() >= yMin && image.getPosition().getY() <= yMax) { + if (xMin != null + && xMax != null + && yMin != null + && yMax != null + && image.getPosition().getX() >= xMin + && image.getPosition().getX() <= xMax + && image.getPosition().getY() >= yMin + && image.getPosition().getY() <= yMax) { section.getImages().add(image); image.setAppendedToSection(true); break; @@ -226,17 +235,26 @@ public class SectionsBuilderService { List previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable); List tableNonHeaderRow = getRowWithNonHeaderCells(currentTable); // Allow merging of tables if header row is separated from first logical non-header row - if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1 && previousTable.getRows().get(0).size() == tableNonHeaderRow.size()) { - previousTableNonHeaderRow = previousTable.getRows().get(0).stream().map(cell -> { - Cell fakeCell = new Cell(cell.getPoints()[0], cell.getPoints()[2]); - fakeCell.setHeaderCells(Collections.singletonList(cell)); - return fakeCell; - }).collect(Collectors.toList()); + if (previousTableNonHeaderRow.isEmpty() + && previousTable.getRowCount() == 1 + && previousTable.getRows() + .get(0).size() == tableNonHeaderRow.size()) { + previousTableNonHeaderRow = previousTable.getRows() + .get(0) + .stream() + .map(cell -> { + Cell fakeCell = new Cell(cell.getPoints()[0], cell.getPoints()[2]); + fakeCell.setHeaderCells(Collections.singletonList(cell)); + return fakeCell; + }) + .collect(Collectors.toList()); } if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) { for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table - List row = currentTable.getRows().get(i); - if (row.size() == tableNonHeaderRow.size() && row.stream().allMatch(cell -> cell.getHeaderCells().isEmpty())) { + List row = currentTable.getRows() + .get(i); + if (row.size() == tableNonHeaderRow.size() && row.stream() + .allMatch(cell -> cell.getHeaderCells().isEmpty())) { for (int j = 0; j < row.size(); j++) { row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells()); } @@ -279,7 +297,11 @@ public class SectionsBuilderService { private boolean hasInvalidHeaderInformation(TablePageBlock table) { - return table.getRows().stream().flatMap(row -> row.stream().filter(cell -> !cell.getHeaderCells().isEmpty())).findAny().isEmpty(); + return table.getRows() + .stream() + .flatMap(row -> row.stream() + .filter(cell -> !cell.getHeaderCells().isEmpty())) + .findAny().isEmpty(); } @@ -287,7 +309,8 @@ public class SectionsBuilderService { private List getRowWithNonHeaderCells(TablePageBlock table) { for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table - List row = table.getRows().get(i); + List row = table.getRows() + .get(i); if (row.size() == 1) { continue; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java index 1481776..8b21ec0 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java @@ -12,6 +12,7 @@ import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.NoSuchElementException; +import java.util.Optional; import java.util.Set; import java.util.stream.Collectors; @@ -35,6 +36,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Pa import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContentItem; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder; import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations; @@ -74,8 +76,14 @@ public class DocumentGraphFactory { private void addSections(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument, Context context, Document document) { - classificationDocument.getSections() - .forEach(section -> SectionNodeFactory.addSection(layoutParsingType, null, section.getNonEmptyPageBlocks(), section.getImages(), context, document)); + //classificationDocument.getSections() + // .forEach(section -> SectionNodeFactory.addSection(layoutParsingType, null, section.getNonEmptyPageBlocks(), section.getImages(), context, document)); + + for (TableOfContentItem tocItem : classificationDocument.getTableOfContents()) { + var parent = tocItem.getParent() == null ? null : tocItem.getParent().getSection(); + Optional
section = SectionNodeFactory.addSection(layoutParsingType, parent, tocItem.getNonEmptySectionBlocks(), tocItem.getImages(), context, document); + tocItem.setSection(section.orElse(null)); + } } @@ -85,14 +93,11 @@ public class DocumentGraphFactory { GenericSemanticNode node; if (originalTextBlock.isHeadline()) { - node = Headline.builder().documentTree(context.getDocumentTree()) - .build(); + node = Headline.builder().documentTree(context.getDocumentTree()).build(); } else if (originalTextBlock.isToDuplicate()) { - node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree()) - .build(); + node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree()).build(); } else { - node = Paragraph.builder().documentTree(context.getDocumentTree()) - .build(); + node = Paragraph.builder().documentTree(context.getDocumentTree()).build(); } page.getMainBody().add(node); @@ -178,12 +183,8 @@ public class DocumentGraphFactory { private void addFooter(List textBlocks, Context context) { Page page = context.getPage(textBlocks.get(0).getPage()); - Footer footer = Footer.builder().documentTree(context.getDocumentTree()) - .build(); - AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), - footer, - context, - page); + Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build(); + AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), footer, context, page); List tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer); footer.setTreeId(tocId); footer.setLeafTextBlock(textBlock); @@ -194,8 +195,7 @@ public class DocumentGraphFactory { public void addHeader(List textBlocks, Context context) { Page page = context.getPage(textBlocks.get(0).getPage()); - Header header = Header.builder().documentTree(context.getDocumentTree()) - .build(); + Header header = Header.builder().documentTree(context.getDocumentTree()).build(); AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), header, 0, page); List tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header); header.setTreeId(tocId); @@ -207,8 +207,7 @@ public class DocumentGraphFactory { private void addEmptyFooter(int pageIndex, Context context) { Page page = context.getPage(pageIndex); - Footer footer = Footer.builder().documentTree(context.getDocumentTree()) - .build(); + Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build(); AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(footer, context, page); List tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer); footer.setTreeId(tocId); @@ -220,8 +219,7 @@ public class DocumentGraphFactory { private void addEmptyHeader(int pageIndex, Context context) { Page page = context.getPage(pageIndex); - Header header = Header.builder().documentTree(context.getDocumentTree()) - .build(); + Header header = Header.builder().documentTree(context.getDocumentTree()).build(); AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(header, 0, page); List tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header); header.setTreeId(tocId); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java index f4b26eb..90b2e8a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java @@ -9,6 +9,7 @@ import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.Set; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; @@ -27,12 +28,12 @@ import lombok.experimental.UtilityClass; @UtilityClass public class SectionNodeFactory { - public void addSection(LayoutParsingType layoutParsingType, - GenericSemanticNode parentNode, - List pageBlocks, - List images, - DocumentGraphFactory.Context context, - Document document) { + public Optional
addSection(LayoutParsingType layoutParsingType, + GenericSemanticNode parentNode, + List pageBlocks, + List images, + DocumentGraphFactory.Context context, + Document document) { // This is for the case where we have images on a page without any text/footer/header. // The pageBlocks list is empty, but we still need to add those images to the document. @@ -40,11 +41,11 @@ public class SectionNodeFactory { images.stream() .distinct() .forEach(image -> DocumentGraphFactory.addImage(document, image, context)); - return; + return Optional.empty(); } if (pageBlocks.isEmpty()) { - return; + return Optional.empty(); } Map> blocksPerPage = pageBlocks.stream() @@ -73,6 +74,8 @@ public class SectionNodeFactory { images.stream() .distinct() .forEach(image -> DocumentGraphFactory.addImage(section, image, context)); + + return Optional.of(section); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index 81cebbf..e9a091e 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -32,10 +32,10 @@ public class ViewerDocumentTest extends BuildDocumentTest { //String fileName = "files/new/abschlussarbeiten-template-institut-fur-informatik-padagogische-hochschule-karlsruhe.pdf"; //String fileName = "files/new/kaust-official-thesis-template.pdf"; //String fileName = "files/new/$100m Offers.pdf"; - //String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf"; - String fileName = "files/new/UTT-Books-53.pdf"; + String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf"; //String fileName = "files/new/mistitled_outlines_example.pdf"; //String fileName = "files/bdr/Plenarprotokoll 1 (keine Druchsache!) (1) 1.pdf"; + //String fileName = "files/new/UTT-Books-53.pdf"; String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; var documentFile = new ClassPathResource(fileName).getFile(); @@ -48,6 +48,32 @@ public class ViewerDocumentTest extends BuildDocumentTest { System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000); } + @Test + @SneakyThrows + public void testViewerDocumentWithImages() { + + String fileName = "files/new/UTT-Books-53.pdf"; + Path path = Path.of(fileName); + String tmpFileName = "/tmp/" + path.getFileName() + "_VIEWER.pdf"; + String imageFileName = "files/images/test_outlines.IMAGE_INFO.json"; + + var mapper = ObjectMapperFactory.create(); + var imageServiceResponse = mapper.readValue(new ClassPathResource(imageFileName).getInputStream(), ImageServiceResponse.class); + var documentFile = new ClassPathResource(fileName).getFile(); + + var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, + documentFile, + imageServiceResponse, + new TableServiceResponse(), + new VisualLayoutParsingResponse(), + Map.of("file", path.getFileName().toFile().toString())); + ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null); + LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService); + Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, classificationDocument); + + layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true); + } + @Test @Disabled @@ -56,7 +82,8 @@ public class ViewerDocumentTest extends BuildDocumentTest { String fileName = "files/cv_tables/brokenTablesOnOcr_ocred.pdf"; String tableFileName = "files/cv_tables/brokenTablesOnOcr_ocred.TABLES.json"; - String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; + Path path = Path.of(fileName); + String tmpFileName = "/tmp/" + path.getFileName() + "_VIEWER.pdf"; var mapper = ObjectMapperFactory.create(); var tableResponse = mapper.readValue(new ClassPathResource(tableFileName).getInputStream(), TableServiceResponse.class); @@ -67,7 +94,7 @@ public class ViewerDocumentTest extends BuildDocumentTest { new ImageServiceResponse(), tableResponse, new VisualLayoutParsingResponse(), - Map.of("file", Path.of(fileName).getFileName().toFile().toString())); + Map.of("file", path.getFileName().toFile().toString())); ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null); LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService); Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE, classificationDocument); diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/new/UTT-Books-53.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/new/UTT-Books-53.pdf index c56e8ac..1626c3f 100644 Binary files a/layoutparser-service/layoutparser-service-server/src/test/resources/files/new/UTT-Books-53.pdf and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/new/UTT-Books-53.pdf differ