RED-7074: Design Subsection section tree structure algorithm

* added toc enrichment logic and changed section computation to build upon created toc
This commit is contained in:
maverickstuder 2024-04-30 14:41:17 +02:00
parent 9f9ea68706
commit c071a133e6
12 changed files with 482 additions and 101 deletions

View File

@ -32,6 +32,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.image.Classifi
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineValidationService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TOCEnrichmentService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
@ -101,6 +102,7 @@ public class LayoutParsingPipeline {
GraphicExtractorService graphicExtractorService;
OutlineExtractorService outlineExtractorService;
OutlineValidationService outlineValidationService;
TOCEnrichmentService tocEnrichmentService;
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
@ -279,17 +281,17 @@ public class LayoutParsingPipeline {
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
var graphics = graphicExtractorService.extractPathElementGraphics(originDocument,
pdPage,
pageNumber,
cleanRulings,
stripper.getTextPositionSequences(),
emptyTableCells,
false);
pdPage,
pageNumber,
cleanRulings,
stripper.getTextPositionSequences(),
emptyTableCells,
false);
pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>())
.addAll(graphics.stream()
.map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), ImageType.GRAPHIC, false, stripper.getPageNumber()))
.toList());
.map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), ImageType.GRAPHIC, false, stripper.getPageNumber()))
.toList());
ClassificationPage classificationPage = switch (layoutParsingType) {
case REDACT_MANAGER_OLD -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells);
@ -372,6 +374,8 @@ public class LayoutParsingPipeline {
default -> {
sectionsBuilderService.buildSections(classificationDocument);
sectionsBuilderService.addImagesToSections(classificationDocument);
tocEnrichmentService.assignSectionBlocksAndImages(classificationDocument);
}
}

View File

@ -1,11 +1,15 @@
package com.knecon.fforesight.service.layoutparser.processor.model;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContentItem;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText;
import lombok.Data;
@ -17,6 +21,7 @@ public class ClassificationDocument {
private List<ClassificationPage> pages = new ArrayList<>();
private List<ClassificationSection> sections = new ArrayList<>();
//private Map<TextPageBlock, List<AbstractPageBlock>> sectionsMap = new HashMap<>();
private List<ClassificationHeader> headers = new ArrayList<>();
private List<ClassificationFooter> footers = new ArrayList<>();
private List<UnclassifiedText> unclassifiedTexts = new ArrayList<>();

View File

@ -140,8 +140,8 @@ public class DocumentTree {
if (treeId.isEmpty()) {
return root;
}
Entry entry = root.children.get(treeId.get(0));
for (int id : treeId.subList(1, treeId.size())) {
Entry entry = root;
for (int id : treeId) {
entry = entry.children.get(id);
}
return entry;

View File

@ -39,7 +39,7 @@ public class OutlineValidationService {
private boolean containsBlock(TableOfContents toc, TextPageBlock block) {
for (TableOfContentItem existingItem : toc.getMainSections()) {
if (existingItem.getTextPageBlock().equals(block) || existingItem.contains(block)) {
if (existingItem.getHeadline().equals(block) || existingItem.contains(block)) {
return true;
}
}
@ -82,7 +82,7 @@ public class OutlineValidationService {
assert (parent != null);
while (parentDepth < currentDepth && parent.getParent() != null) {
parent = parent.getParent();
parentDepth = getDepth(parent.getTextPageBlock().getClassification());
parentDepth = getDepth(parent.getHeadline().getClassification());
}
parent.addChild(new TableOfContentItem(current));
}
@ -110,12 +110,12 @@ public class OutlineValidationService {
} else {
assert last != null;
int lastDepth = getDepth(last.getTextPageBlock().getClassification());
int lastDepth = getDepth(last.getHeadline().getClassification());
if (lastDepth < parentDepth) {
parentDepth = lastDepth;
} else if (lastDepth == currentDepth && last.getParent() != null) {
parentDepth = getDepth(last.getParent().getTextPageBlock().getClassification());
parentDepth = getDepth(last.getParent().getHeadline().getClassification());
}
TableOfContentItem parent = lastItemsPerDepth.get(parentDepth);

View File

@ -0,0 +1,266 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
public class TOCEnrichmentService {
/**
 * Distributes the text blocks, tables and images of every page onto the items of the
 * document's table of contents and collects the page headers and footers.
 * <p>
 * A text block whose text equals the headline text of a TOC item opens that item's section;
 * every following block is appended to the most recently opened section. Blocks (and images)
 * encountered before the first headline are gathered in a synthetic TOC item without a
 * headline that is inserted as the first main section.
 * <p>
 * Side effects: sets the page number on every classified block, may copy header cells between
 * consecutive tables, marks images as appended and replaces the document's header and footer
 * lists.
 *
 * @param document the classified document whose table of contents is enriched in place
 */
public void assignSectionBlocksAndImages(ClassificationDocument document) {

    TableOfContents toc = document.getTableOfContents();

    List<AbstractPageBlock> startBlocks = new ArrayList<>();
    List<ClassifiedImage> startImages = new ArrayList<>();
    List<ClassificationHeader> headers = new ArrayList<>();
    List<ClassificationFooter> footers = new ArrayList<>();

    // null until the first headline has been found
    TableOfContentItem currentSection = null;
    TablePageBlock previousTable = null;
    List<TableOfContentItem> lastFoundTOCItems = new ArrayList<>();

    for (ClassificationPage page : document.getPages()) {
        List<TableOfContentItem> currentPageTOCItems = new ArrayList<>();
        List<TextPageBlock> header = new ArrayList<>();
        List<TextPageBlock> footer = new ArrayList<>();

        for (AbstractPageBlock current : page.getTextBlocks()) {
            if (current.getClassification() == null) {
                continue;
            }

            current.setPage(page.getPageNumber());

            if (current.getClassification().equals(PageBlockType.HEADER)) {
                header.add((TextPageBlock) current);
                continue;
            }
            if (current.getClassification().equals(PageBlockType.FOOTER)) {
                footer.add((TextPageBlock) current);
                continue;
            }

            if (current instanceof TablePageBlock table) {
                if (previousTable != null) {
                    mergeTableMetadata(table, previousTable);
                }
                previousTable = table;
            }

            TableOfContentItem matchedItem = findItemByHeadlineText(toc, current);
            if (matchedItem != null) {
                currentSection = matchedItem;
                currentPageTOCItems.add(matchedItem);
            }

            if (currentSection == null) {
                startBlocks.add(current);
            } else {
                currentSection.getSectionBlocks().add(current);
            }
        }

        // Pages without a headline of their own keep referring to the headlines found last.
        if (!currentPageTOCItems.isEmpty()) {
            lastFoundTOCItems = currentPageTOCItems;
        }

        for (ClassifiedImage image : page.getImages()) {
            assignImage(image, page, lastFoundTOCItems, startImages);
        }

        if (!header.isEmpty()) {
            headers.add(new ClassificationHeader(header));
        }
        if (!footer.isEmpty()) {
            footers.add(new ClassificationFooter(footer));
        }
    }

    // Images found before the first headline must not be lost when there is no leading text.
    if (!startBlocks.isEmpty() || !startImages.isEmpty()) {
        // NOTE(review): this item has no headline; consumers that dereference getHeadline() must cope with null.
        TableOfContentItem unassigned = new TableOfContentItem(null);
        unassigned.setSectionBlocks(startBlocks);
        unassigned.setImages(startImages);
        toc.getMainSections().add(0, unassigned);
    }

    document.setHeaders(headers);
    document.setFooters(footers);
}


/**
 * Looks up the first TOC item (in iteration order of the table of contents) whose headline
 * text equals the text of the given block.
 *
 * @param toc   the table of contents to search
 * @param block the block to match; only text blocks can match
 * @return the matching item, or {@code null} if the block is not a text block or nothing matches
 */
private TableOfContentItem findItemByHeadlineText(TableOfContents toc, AbstractPageBlock block) {

    if (!(block instanceof TextPageBlock)) {
        return null;
    }
    for (TableOfContentItem tocItem : toc) {
        TextPageBlock headline = tocItem.getHeadline();
        // Items without a headline (the synthetic leading section) can never match.
        if (headline != null && headline.getText().equals(block.getText())) {
            return tocItem;
        }
    }
    return null;
}


/**
 * Attaches an image to one of the candidate TOC items, or to the leading images when no
 * candidate exists.
 * <p>
 * The bounding box of the candidates' headlines located on the given page is grown item by
 * item; the image goes to the first item for which its position lies inside the box
 * accumulated so far. Otherwise it falls back to the first candidate.
 * <p>
 * The decision is tracked locally instead of via {@code image.isAppendedToSection()}, because
 * that flag may already have been set by an earlier processing step, which would otherwise
 * skip the fallback and leave the image without any TOC item.
 *
 * @param image       the image to assign
 * @param page        the page the image is located on
 * @param candidates  the TOC items found most recently
 * @param startImages receives the image when there is no candidate at all
 */
private void assignImage(ClassifiedImage image, ClassificationPage page, List<TableOfContentItem> candidates, List<ClassifiedImage> startImages) {

    float xMin = Float.POSITIVE_INFINITY;
    float yMin = Float.POSITIVE_INFINITY;
    float xMax = Float.NEGATIVE_INFINITY;
    float yMax = Float.NEGATIVE_INFINITY;

    for (TableOfContentItem tocItem : candidates) {
        TextPageBlock headline = tocItem.getHeadline();
        if (headline == null || headline.getPage() != page.getPageNumber()) {
            continue;
        }

        // The min/max getters are normalized first, so an inverted pair still yields a valid box.
        xMin = Math.min(xMin, Math.min(headline.getMinX(), headline.getMaxX()));
        xMax = Math.max(xMax, Math.max(headline.getMinX(), headline.getMaxX()));
        yMin = Math.min(yMin, Math.min(headline.getMinY(), headline.getMaxY()));
        yMax = Math.max(yMax, Math.max(headline.getMinY(), headline.getMaxY()));

        log.debug("Image position x: {}, y: {}", image.getPosition().getX(), image.getPosition().getY());
        log.debug("Headline position xMin: {}, xMax: {}, yMin: {}, yMax: {}", xMin, xMax, yMin, yMax);

        double imageX = image.getPosition().getX();
        double imageY = image.getPosition().getY();
        if (imageX >= xMin && imageX <= xMax && imageY >= yMin && imageY <= yMax) {
            tocItem.getImages().add(image);
            image.setAppendedToSection(true);
            return;
        }
    }

    log.debug("Image uses first paragraph");
    if (candidates.isEmpty()) {
        startImages.add(image);
    } else {
        candidates.get(0).getImages().add(image);
    }
    image.setAppendedToSection(true);
}
/**
 * Copies header-cell references from the previous table onto the rows of the current table
 * when the current table carries no header information but the previous one does.
 *
 * @param currentTable  the table that may receive header information
 * @param previousTable the table encountered before; may be {@code null}, in which case nothing happens
 */
private void mergeTableMetadata(TablePageBlock currentTable, TablePageBlock previousTable) {

    // Distribute header information for subsequent tables
    if (previousTable == null || !hasInvalidHeaderInformation(currentTable) || !hasValidHeaderInformation(previousTable)) {
        return;
    }

    List<Cell> donorRow = getRowWithNonHeaderCells(previousTable);
    List<Cell> referenceRow = getRowWithNonHeaderCells(currentTable);

    // Allow merging of tables if header row is separated from first logical non-header row
    boolean previousIsLoneHeaderRow = donorRow.isEmpty()
            && previousTable.getRowCount() == 1
            && previousTable.getRows().get(0).size() == referenceRow.size();
    if (previousIsLoneHeaderRow) {
        List<Cell> syntheticRow = new ArrayList<>();
        for (Cell headerCell : previousTable.getRows().get(0)) {
            // Stand-in cell that only exists to carry the original cell as its header.
            Cell fakeCell = new Cell(headerCell.getPoints()[0], headerCell.getPoints()[2]);
            fakeCell.setHeaderCells(Collections.singletonList(headerCell));
            syntheticRow.add(fakeCell);
        }
        donorRow = syntheticRow;
    }

    if (donorRow.size() != referenceRow.size()) {
        return;
    }

    // Non header rows are most likely at bottom of table
    for (int rowIndex = currentTable.getRowCount() - 1; rowIndex >= 0; rowIndex--) {
        List<Cell> targetRow = currentTable.getRows().get(rowIndex);
        if (targetRow.size() != referenceRow.size()) {
            continue;
        }
        boolean hasNoHeaders = targetRow.stream().noneMatch(cell -> !cell.getHeaderCells().isEmpty());
        if (!hasNoHeaders) {
            continue;
        }
        for (int column = 0; column < targetRow.size(); column++) {
            targetRow.get(column).setHeaderCells(donorRow.get(column).getHeaderCells());
        }
    }
}
/**
 * Negation of {@code hasInvalidHeaderInformation}.
 *
 * @param table the table to inspect
 * @return {@code true} if the table's header information is not considered invalid
 */
private boolean hasValidHeaderInformation(TablePageBlock table) {

    boolean invalid = hasInvalidHeaderInformation(table);
    return !invalid;
}
/**
 * Checks whether a table lacks header information, i.e. none of its cells references any
 * header cell.
 *
 * @param table the table to inspect
 * @return {@code true} if no cell of the table has header cells assigned
 */
private boolean hasInvalidHeaderInformation(TablePageBlock table) {

    for (List<Cell> row : table.getRows()) {
        for (Cell cell : row) {
            if (!cell.getHeaderCells().isEmpty()) {
                // A single cell with header cells is enough to count as header information.
                return false;
            }
        }
    }
    return true;
}
/**
 * Searches the table from the last row upwards for a row that has more than one or zero cells
 * (single-cell rows are skipped) and contains no header cell.
 *
 * @param table the table to search
 * @return the first such row found from the bottom, or an empty list if there is none
 */
private List<Cell> getRowWithNonHeaderCells(TablePageBlock table) {

    // Non header rows are most likely at bottom of table
    for (int rowIndex = table.getRowCount() - 1; rowIndex >= 0; rowIndex--) {
        List<Cell> candidate = table.getRows().get(rowIndex);
        if (candidate.size() == 1) {
            continue;
        }
        if (candidate.stream().noneMatch(Cell::isHeaderCell)) {
            return candidate;
        }
    }
    return Collections.emptyList();
}
}

View File

@ -1,9 +1,13 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import lombok.Data;
@ -14,14 +18,19 @@ import lombok.EqualsAndHashCode;
public class TableOfContentItem {
@EqualsAndHashCode.Include
private TextPageBlock textPageBlock;
private TextPageBlock headline;
private List<TableOfContentItem> children = new ArrayList<>();
private TableOfContentItem parent;
private List<AbstractPageBlock> sectionBlocks = new ArrayList<>();
private List<ClassifiedImage> images = new ArrayList<>();
public TableOfContentItem(TextPageBlock textPageBlock) {
private Section section;
this.textPageBlock = textPageBlock;
public TableOfContentItem(TextPageBlock headline) {
this.headline = headline;
}
@ -34,60 +43,68 @@ public class TableOfContentItem {
public TableOfContentItem getSiblingBefore() {
try {
return parent.getChildren()
.get(parent.getChildren().indexOf(this) - 1);
} catch (IndexOutOfBoundsException indexOutOfBoundsException) {
return null;
if (parent != null) {
int index = parent.getChildren().indexOf(this);
if (index > 0) {
return parent.getChildren()
.get(index - 1);
}
}
return null;
}
public TableOfContentItem getSiblingAfter() {
try {
return parent.getChildren()
.get(parent.getChildren().indexOf(this) + 1);
} catch (IndexOutOfBoundsException indexOutOfBoundsException) {
return null;
if (parent != null) {
int index = parent.getChildren().indexOf(this);
if (index >= 0 && index < parent.getChildren().size() - 1) {
return parent.getChildren()
.get(index + 1);
}
}
return null;
}
public boolean contains(TextPageBlock block) {
boolean anyChildContains = false;
if (!children.isEmpty()) {
for (TableOfContentItem child : children) {
if (child.getTextPageBlock().equals(block)) {
return true;
} else {
anyChildContains = anyChildContains || child.contains(block);
}
if (headline.equals(block)) {
return true;
}
for (TableOfContentItem child : children) {
if (child.contains(block)) {
return true;
}
}
return anyChildContains;
return false;
}
public boolean contains(TableOfContentItem tocItem) {
boolean anyChildContains = false;
if (!children.isEmpty()) {
for (TableOfContentItem child : children) {
if (child.equals(tocItem)) {
return true;
} else {
anyChildContains = anyChildContains || child.contains(tocItem);
}
if (this.equals(tocItem)) {
return true;
}
for (TableOfContentItem child : children) {
if (child.contains(tocItem)) {
return true;
}
}
return anyChildContains;
return false;
}
public List<AbstractPageBlock> getNonEmptySectionBlocks() {
return sectionBlocks.stream().filter(pageBlock -> !pageBlock.isEmpty()).collect(Collectors.toList());
}
@Override
public String toString() {
return "OutlineObjectTreeNode{" + "textPageBlock=" + textPageBlock + '}';
return "OutlineObjectTreeNode{" + "textPageBlock=" + headline + '}';
}
}

View File

@ -1,9 +1,11 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Stack;
import org.springframework.lang.NonNull;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
@ -12,7 +14,7 @@ import lombok.RequiredArgsConstructor;
@Data
@RequiredArgsConstructor
public class TableOfContents {
public class TableOfContents implements Iterable<TableOfContentItem> {
private List<TableOfContentItem> mainSections = new ArrayList<>();
@ -35,7 +37,7 @@ public class TableOfContents {
private void collectTextPageBlocks(TableOfContentItem item, List<TextPageBlock> textPageBlocks) {
textPageBlocks.add(item.getTextPageBlock());
textPageBlocks.add(item.getHeadline());
for (TableOfContentItem child : item.getChildren()) {
collectTextPageBlocks(child, textPageBlocks);
}
@ -56,4 +58,40 @@ public class TableOfContents {
}
}
/**
 * Creates an iterator that starts at the main sections and descends into the children of
 * each returned item before continuing with its siblings.
 *
 * @return a new iterator over the table of contents items
 */
@Override
public @NonNull Iterator<TableOfContentItem> iterator() {

    Iterator<TableOfContentItem> depthFirstIterator = new TableOfContentItemIterator(mainSections);
    return depthFirstIterator;
}
/**
 * Pre-order iterator over a forest of {@link TableOfContentItem}s: an item is returned before
 * its children, and its children before its next sibling.
 * <p>
 * The stack holds one iterator per tree level that is currently being traversed.
 */
private static final class TableOfContentItemIterator implements Iterator<TableOfContentItem> {

    private final Stack<Iterator<TableOfContentItem>> stack = new Stack<>();


    /**
     * @param mainSections the top-level items the traversal starts with
     */
    public TableOfContentItemIterator(List<TableOfContentItem> mainSections) {

        stack.push(mainSections.iterator());
    }


    @Override
    public boolean hasNext() {

        ensureStackTopIsCurrent();
        return !stack.isEmpty() && stack.peek().hasNext();
    }


    /**
     * Returns the next item and schedules its children to be visited next.
     *
     * @throws java.util.NoSuchElementException if the traversal is exhausted
     */
    @Override
    public TableOfContentItem next() {

        ensureStackTopIsCurrent();
        if (stack.isEmpty()) {
            // Honor the Iterator contract instead of leaking Stack's EmptyStackException.
            throw new java.util.NoSuchElementException("No more table of contents items");
        }
        TableOfContentItem currentItem = stack.peek().next();
        if (currentItem.getChildren() != null && !currentItem.getChildren().isEmpty()) {
            stack.push(currentItem.getChildren().iterator());
        }
        return currentItem;
    }


    /**
     * Drops exhausted level iterators so that the top of the stack, if any, has a next element.
     */
    private void ensureStackTopIsCurrent() {

        while (!stack.isEmpty() && !stack.peek().hasNext()) {
            stack.pop();
        }
    }

}
}

View File

@ -29,6 +29,7 @@ import lombok.extern.slf4j.Slf4j;
@Service
public class SectionsBuilderService {
public void buildSections(ClassificationDocument document) {
List<AbstractPageBlock> chunkWords = new ArrayList<>();
@ -71,7 +72,8 @@ public class SectionsBuilderService {
chunkBlockList.add(chunkBlock);
chunkWords = new ArrayList<>();
if (!chunkBlock.getTables().isEmpty()) {
previousTable = chunkBlock.getTables().get(chunkBlock.getTables().size() - 1);
previousTable = chunkBlock.getTables()
.get(chunkBlock.getTables().size() - 1);
}
}
if (current instanceof TablePageBlock table) {
@ -106,11 +108,12 @@ public class SectionsBuilderService {
List<ClassificationSection> sections = new ArrayList<>();
for (var page : document.getPages()) {
page.getTextBlocks().forEach(block -> {
block.setPage(page.getPageNumber());
var section = buildTextBlock(List.of(block), Strings.EMPTY);
sections.add(section);
});
page.getTextBlocks()
.forEach(block -> {
block.setPage(page.getPageNumber());
var section = buildTextBlock(List.of(block), Strings.EMPTY);
sections.add(section);
});
}
document.setSections(sections);
}
@ -202,8 +205,14 @@ public class SectionsBuilderService {
log.debug("Image position x: {}, y: {}", image.getPosition().getX(), image.getPosition().getY());
log.debug("Paragraph position xMin: {}, xMax: {}, yMin: {}, yMax: {}", xMin, xMax, yMin, yMax);
if (xMin != null && xMax != null && yMin != null && yMax != null && image.getPosition().getX() >= xMin && image.getPosition()
.getX() <= xMax && image.getPosition().getY() >= yMin && image.getPosition().getY() <= yMax) {
if (xMin != null
&& xMax != null
&& yMin != null
&& yMax != null
&& image.getPosition().getX() >= xMin
&& image.getPosition().getX() <= xMax
&& image.getPosition().getY() >= yMin
&& image.getPosition().getY() <= yMax) {
section.getImages().add(image);
image.setAppendedToSection(true);
break;
@ -226,17 +235,26 @@ public class SectionsBuilderService {
List<Cell> previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable);
List<Cell> tableNonHeaderRow = getRowWithNonHeaderCells(currentTable);
// Allow merging of tables if header row is separated from first logical non-header row
if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1 && previousTable.getRows().get(0).size() == tableNonHeaderRow.size()) {
previousTableNonHeaderRow = previousTable.getRows().get(0).stream().map(cell -> {
Cell fakeCell = new Cell(cell.getPoints()[0], cell.getPoints()[2]);
fakeCell.setHeaderCells(Collections.singletonList(cell));
return fakeCell;
}).collect(Collectors.toList());
if (previousTableNonHeaderRow.isEmpty()
&& previousTable.getRowCount() == 1
&& previousTable.getRows()
.get(0).size() == tableNonHeaderRow.size()) {
previousTableNonHeaderRow = previousTable.getRows()
.get(0)
.stream()
.map(cell -> {
Cell fakeCell = new Cell(cell.getPoints()[0], cell.getPoints()[2]);
fakeCell.setHeaderCells(Collections.singletonList(cell));
return fakeCell;
})
.collect(Collectors.toList());
}
if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) {
for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
List<Cell> row = currentTable.getRows().get(i);
if (row.size() == tableNonHeaderRow.size() && row.stream().allMatch(cell -> cell.getHeaderCells().isEmpty())) {
List<Cell> row = currentTable.getRows()
.get(i);
if (row.size() == tableNonHeaderRow.size() && row.stream()
.allMatch(cell -> cell.getHeaderCells().isEmpty())) {
for (int j = 0; j < row.size(); j++) {
row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells());
}
@ -279,7 +297,11 @@ public class SectionsBuilderService {
private boolean hasInvalidHeaderInformation(TablePageBlock table) {
return table.getRows().stream().flatMap(row -> row.stream().filter(cell -> !cell.getHeaderCells().isEmpty())).findAny().isEmpty();
return table.getRows()
.stream()
.flatMap(row -> row.stream()
.filter(cell -> !cell.getHeaderCells().isEmpty()))
.findAny().isEmpty();
}
@ -287,7 +309,8 @@ public class SectionsBuilderService {
private List<Cell> getRowWithNonHeaderCells(TablePageBlock table) {
for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
List<Cell> row = table.getRows().get(i);
List<Cell> row = table.getRows()
.get(i);
if (row.size() == 1) {
continue;
}

View File

@ -12,6 +12,7 @@ import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
@ -35,6 +36,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Pa
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContentItem;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
@ -74,8 +76,14 @@ public class DocumentGraphFactory {
private void addSections(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument, Context context, Document document) {
classificationDocument.getSections()
.forEach(section -> SectionNodeFactory.addSection(layoutParsingType, null, section.getNonEmptyPageBlocks(), section.getImages(), context, document));
//classificationDocument.getSections()
// .forEach(section -> SectionNodeFactory.addSection(layoutParsingType, null, section.getNonEmptyPageBlocks(), section.getImages(), context, document));
for (TableOfContentItem tocItem : classificationDocument.getTableOfContents()) {
var parent = tocItem.getParent() == null ? null : tocItem.getParent().getSection();
Optional<Section> section = SectionNodeFactory.addSection(layoutParsingType, parent, tocItem.getNonEmptySectionBlocks(), tocItem.getImages(), context, document);
tocItem.setSection(section.orElse(null));
}
}
@ -85,14 +93,11 @@ public class DocumentGraphFactory {
GenericSemanticNode node;
if (originalTextBlock.isHeadline()) {
node = Headline.builder().documentTree(context.getDocumentTree())
.build();
node = Headline.builder().documentTree(context.getDocumentTree()).build();
} else if (originalTextBlock.isToDuplicate()) {
node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree())
.build();
node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree()).build();
} else {
node = Paragraph.builder().documentTree(context.getDocumentTree())
.build();
node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
}
page.getMainBody().add(node);
@ -178,12 +183,8 @@ public class DocumentGraphFactory {
private void addFooter(List<TextPageBlock> textBlocks, Context context) {
Page page = context.getPage(textBlocks.get(0).getPage());
Footer footer = Footer.builder().documentTree(context.getDocumentTree())
.build();
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks),
footer,
context,
page);
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), footer, context, page);
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
footer.setTreeId(tocId);
footer.setLeafTextBlock(textBlock);
@ -194,8 +195,7 @@ public class DocumentGraphFactory {
public void addHeader(List<TextPageBlock> textBlocks, Context context) {
Page page = context.getPage(textBlocks.get(0).getPage());
Header header = Header.builder().documentTree(context.getDocumentTree())
.build();
Header header = Header.builder().documentTree(context.getDocumentTree()).build();
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), header, 0, page);
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
header.setTreeId(tocId);
@ -207,8 +207,7 @@ public class DocumentGraphFactory {
private void addEmptyFooter(int pageIndex, Context context) {
Page page = context.getPage(pageIndex);
Footer footer = Footer.builder().documentTree(context.getDocumentTree())
.build();
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(footer, context, page);
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
footer.setTreeId(tocId);
@ -220,8 +219,7 @@ public class DocumentGraphFactory {
private void addEmptyHeader(int pageIndex, Context context) {
Page page = context.getPage(pageIndex);
Header header = Header.builder().documentTree(context.getDocumentTree())
.build();
Header header = Header.builder().documentTree(context.getDocumentTree()).build();
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(header, 0, page);
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
header.setTreeId(tocId);

View File

@ -9,6 +9,7 @@ import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
@ -27,12 +28,12 @@ import lombok.experimental.UtilityClass;
@UtilityClass
public class SectionNodeFactory {
public void addSection(LayoutParsingType layoutParsingType,
GenericSemanticNode parentNode,
List<AbstractPageBlock> pageBlocks,
List<ClassifiedImage> images,
DocumentGraphFactory.Context context,
Document document) {
public Optional<Section> addSection(LayoutParsingType layoutParsingType,
GenericSemanticNode parentNode,
List<AbstractPageBlock> pageBlocks,
List<ClassifiedImage> images,
DocumentGraphFactory.Context context,
Document document) {
// This is for the case where we have images on a page without any text/footer/header.
// The pageBlocks list is empty, but we still need to add those images to the document.
@ -40,11 +41,11 @@ public class SectionNodeFactory {
images.stream()
.distinct()
.forEach(image -> DocumentGraphFactory.addImage(document, image, context));
return;
return Optional.empty();
}
if (pageBlocks.isEmpty()) {
return;
return Optional.empty();
}
Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream()
@ -73,6 +74,8 @@ public class SectionNodeFactory {
images.stream()
.distinct()
.forEach(image -> DocumentGraphFactory.addImage(section, image, context));
return Optional.of(section);
}

View File

@ -32,10 +32,10 @@ public class ViewerDocumentTest extends BuildDocumentTest {
//String fileName = "files/new/abschlussarbeiten-template-institut-fur-informatik-padagogische-hochschule-karlsruhe.pdf";
//String fileName = "files/new/kaust-official-thesis-template.pdf";
//String fileName = "files/new/$100m Offers.pdf";
//String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf";
String fileName = "files/new/UTT-Books-53.pdf";
String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf";
//String fileName = "files/new/mistitled_outlines_example.pdf";
//String fileName = "files/bdr/Plenarprotokoll 1 (keine Druchsache!) (1) 1.pdf";
//String fileName = "files/new/UTT-Books-53.pdf";
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
var documentFile = new ClassPathResource(fileName).getFile();
@ -48,6 +48,32 @@ public class ViewerDocumentTest extends BuildDocumentTest {
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
}
/**
 * Parses a sample PDF together with a stored image-service response, builds the document graph
 * and writes a viewer PDF with the layout grid to the temp directory for manual inspection.
 */
@Test
@SneakyThrows
public void testViewerDocumentWithImages() {

    String fileName = "files/new/UTT-Books-53.pdf";
    String imageFileName = "files/images/test_outlines.IMAGE_INFO.json";

    Path path = Path.of(fileName);
    String tmpFileName = "/tmp/" + path.getFileName() + "_VIEWER.pdf";

    // Image metadata comes from a recorded response instead of a live image service.
    ImageServiceResponse imageServiceResponse = ObjectMapperFactory.create()
            .readValue(new ClassPathResource(imageFileName).getInputStream(), ImageServiceResponse.class);

    File documentFile = new ClassPathResource(fileName).getFile();

    var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
            documentFile,
            imageServiceResponse,
            new TableServiceResponse(),
            new VisualLayoutParsingResponse(),
            Map.of("file", path.getFileName().toFile().toString()));

    LayoutGridService layoutGridService = new LayoutGridService(new ViewerDocumentService(null));
    Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, classificationDocument);
    layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
}
@Test
@Disabled
@ -56,7 +82,8 @@ public class ViewerDocumentTest extends BuildDocumentTest {
String fileName = "files/cv_tables/brokenTablesOnOcr_ocred.pdf";
String tableFileName = "files/cv_tables/brokenTablesOnOcr_ocred.TABLES.json";
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
Path path = Path.of(fileName);
String tmpFileName = "/tmp/" + path.getFileName() + "_VIEWER.pdf";
var mapper = ObjectMapperFactory.create();
var tableResponse = mapper.readValue(new ClassPathResource(tableFileName).getInputStream(), TableServiceResponse.class);
@ -67,7 +94,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
new ImageServiceResponse(),
tableResponse,
new VisualLayoutParsingResponse(),
Map.of("file", Path.of(fileName).getFileName().toFile().toString()));
Map.of("file", path.getFileName().toFile().toString()));
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE, classificationDocument);