RED-7074: Design Subsection section tree structure algorithm
* Added TOC enrichment logic and changed the section computation to build upon the created TOC
This commit is contained in:
parent
9f9ea68706
commit
c071a133e6
@ -32,6 +32,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.image.Classifi
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineValidationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TOCEnrichmentService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
@ -101,6 +102,7 @@ public class LayoutParsingPipeline {
|
||||
GraphicExtractorService graphicExtractorService;
|
||||
OutlineExtractorService outlineExtractorService;
|
||||
OutlineValidationService outlineValidationService;
|
||||
TOCEnrichmentService tocEnrichmentService;
|
||||
|
||||
|
||||
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
|
||||
@ -279,17 +281,17 @@ public class LayoutParsingPipeline {
|
||||
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
|
||||
var graphics = graphicExtractorService.extractPathElementGraphics(originDocument,
|
||||
pdPage,
|
||||
pageNumber,
|
||||
cleanRulings,
|
||||
stripper.getTextPositionSequences(),
|
||||
emptyTableCells,
|
||||
false);
|
||||
pdPage,
|
||||
pageNumber,
|
||||
cleanRulings,
|
||||
stripper.getTextPositionSequences(),
|
||||
emptyTableCells,
|
||||
false);
|
||||
|
||||
pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>())
|
||||
.addAll(graphics.stream()
|
||||
.map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), ImageType.GRAPHIC, false, stripper.getPageNumber()))
|
||||
.toList());
|
||||
.map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), ImageType.GRAPHIC, false, stripper.getPageNumber()))
|
||||
.toList());
|
||||
|
||||
ClassificationPage classificationPage = switch (layoutParsingType) {
|
||||
case REDACT_MANAGER_OLD -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells);
|
||||
@ -372,6 +374,8 @@ public class LayoutParsingPipeline {
|
||||
default -> {
|
||||
sectionsBuilderService.buildSections(classificationDocument);
|
||||
sectionsBuilderService.addImagesToSections(classificationDocument);
|
||||
|
||||
tocEnrichmentService.assignSectionBlocksAndImages(classificationDocument);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -1,11 +1,15 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContentItem;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText;
|
||||
|
||||
import lombok.Data;
|
||||
@ -17,6 +21,7 @@ public class ClassificationDocument {
|
||||
|
||||
private List<ClassificationPage> pages = new ArrayList<>();
|
||||
private List<ClassificationSection> sections = new ArrayList<>();
|
||||
//private Map<TextPageBlock, List<AbstractPageBlock>> sectionsMap = new HashMap<>();
|
||||
private List<ClassificationHeader> headers = new ArrayList<>();
|
||||
private List<ClassificationFooter> footers = new ArrayList<>();
|
||||
private List<UnclassifiedText> unclassifiedTexts = new ArrayList<>();
|
||||
|
||||
@ -140,8 +140,8 @@ public class DocumentTree {
|
||||
if (treeId.isEmpty()) {
|
||||
return root;
|
||||
}
|
||||
Entry entry = root.children.get(treeId.get(0));
|
||||
for (int id : treeId.subList(1, treeId.size())) {
|
||||
Entry entry = root;
|
||||
for (int id : treeId) {
|
||||
entry = entry.children.get(id);
|
||||
}
|
||||
return entry;
|
||||
|
||||
@ -39,7 +39,7 @@ public class OutlineValidationService {
|
||||
private boolean containsBlock(TableOfContents toc, TextPageBlock block) {
|
||||
|
||||
for (TableOfContentItem existingItem : toc.getMainSections()) {
|
||||
if (existingItem.getTextPageBlock().equals(block) || existingItem.contains(block)) {
|
||||
if (existingItem.getHeadline().equals(block) || existingItem.contains(block)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@ -82,7 +82,7 @@ public class OutlineValidationService {
|
||||
assert (parent != null);
|
||||
while (parentDepth < currentDepth && parent.getParent() != null) {
|
||||
parent = parent.getParent();
|
||||
parentDepth = getDepth(parent.getTextPageBlock().getClassification());
|
||||
parentDepth = getDepth(parent.getHeadline().getClassification());
|
||||
}
|
||||
parent.addChild(new TableOfContentItem(current));
|
||||
}
|
||||
@ -110,12 +110,12 @@ public class OutlineValidationService {
|
||||
|
||||
} else {
|
||||
assert last != null;
|
||||
int lastDepth = getDepth(last.getTextPageBlock().getClassification());
|
||||
int lastDepth = getDepth(last.getHeadline().getClassification());
|
||||
|
||||
if (lastDepth < parentDepth) {
|
||||
parentDepth = lastDepth;
|
||||
} else if (lastDepth == currentDepth && last.getParent() != null) {
|
||||
parentDepth = getDepth(last.getParent().getTextPageBlock().getClassification());
|
||||
parentDepth = getDepth(last.getParent().getHeadline().getClassification());
|
||||
}
|
||||
|
||||
TableOfContentItem parent = lastItemsPerDepth.get(parentDepth);
|
||||
|
||||
@ -0,0 +1,266 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
public class TOCEnrichmentService {
|
||||
|
||||
public void assignSectionBlocksAndImages(ClassificationDocument document) {
|
||||
|
||||
TableOfContents toc = document.getTableOfContents();
|
||||
List<AbstractPageBlock> startBlocks = new ArrayList<>();
|
||||
List<ClassifiedImage> startImages = new ArrayList<>();
|
||||
//Map<TableOfContentItem, List<AbstractPageBlock>> sectionsMap = new HashMap<>();
|
||||
TableOfContentItem currentSection = null;
|
||||
boolean foundFirstHeadline = false;
|
||||
|
||||
//for (TableOfContentItem item : toc.getAllTableOfContentItems()) {
|
||||
// sectionsMap.put(item, new ArrayList<>());
|
||||
//}
|
||||
|
||||
List<ClassificationHeader> headers = new ArrayList<>();
|
||||
List<ClassificationFooter> footers = new ArrayList<>();
|
||||
TablePageBlock previousTable = null;
|
||||
List<TableOfContentItem> lastFoundTOCItems = new ArrayList<>();
|
||||
|
||||
for (ClassificationPage page : document.getPages()) {
|
||||
List<TableOfContentItem> currentPageTOCItems = new ArrayList<>();
|
||||
List<TextPageBlock> header = new ArrayList<>();
|
||||
List<TextPageBlock> footer = new ArrayList<>();
|
||||
for (AbstractPageBlock current : page.getTextBlocks()) {
|
||||
|
||||
if (current.getClassification() == null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
current.setPage(page.getPageNumber());
|
||||
|
||||
if (current.getClassification().equals(PageBlockType.HEADER)) {
|
||||
header.add((TextPageBlock) current);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (current.getClassification().equals(PageBlockType.FOOTER)) {
|
||||
footer.add((TextPageBlock) current);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (current instanceof TablePageBlock table) {
|
||||
if (previousTable != null) {
|
||||
mergeTableMetadata(table, previousTable);
|
||||
}
|
||||
previousTable = table;
|
||||
}
|
||||
boolean matched = false;
|
||||
|
||||
for (TableOfContentItem tocItem : toc) {
|
||||
if (current instanceof TextPageBlock && tocItem.getHeadline().getText().equals(current.getText())) {
|
||||
if (!foundFirstHeadline) {
|
||||
foundFirstHeadline = true;
|
||||
}
|
||||
currentSection = tocItem;
|
||||
//sectionsMap.get(tocItem).add(current);
|
||||
tocItem.getSectionBlocks().add(current);
|
||||
currentPageTOCItems.add(tocItem);
|
||||
matched = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!matched) {
|
||||
if (!foundFirstHeadline) {
|
||||
startBlocks.add(current);
|
||||
} else {
|
||||
currentSection.getSectionBlocks().add(current);
|
||||
//sectionsMap.get(currentSection).add(current);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!currentPageTOCItems.isEmpty()) {
|
||||
lastFoundTOCItems = currentPageTOCItems;
|
||||
}
|
||||
|
||||
for (ClassifiedImage image : page.getImages()) {
|
||||
|
||||
Float xMin = null;
|
||||
Float yMin = null;
|
||||
Float xMax = null;
|
||||
Float yMax = null;
|
||||
|
||||
for (TableOfContentItem tocItem : lastFoundTOCItems) {
|
||||
var headline = tocItem.getHeadline();
|
||||
|
||||
if (headline.getPage() != page.getPageNumber()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (headline.getMinX() < headline.getMaxX()) {
|
||||
if (xMin == null || headline.getMinX() < xMin) {
|
||||
xMin = headline.getMinX();
|
||||
}
|
||||
if (xMax == null || headline.getMaxX() > xMax) {
|
||||
xMax = headline.getMaxX();
|
||||
}
|
||||
} else {
|
||||
if (xMin == null || headline.getMaxX() < xMin) {
|
||||
xMin = headline.getMaxX();
|
||||
}
|
||||
if (xMax == null || headline.getMinX() > xMax) {
|
||||
xMax = headline.getMinX();
|
||||
}
|
||||
}
|
||||
|
||||
if (headline.getMinY() < headline.getMaxY()) {
|
||||
if (yMin == null || headline.getMinY() < yMin) {
|
||||
yMin = headline.getMinY();
|
||||
}
|
||||
if (yMax == null || headline.getMaxY() > yMax) {
|
||||
yMax = headline.getMaxY();
|
||||
}
|
||||
} else {
|
||||
if (yMin == null || headline.getMaxY() < yMin) {
|
||||
yMin = headline.getMaxY();
|
||||
}
|
||||
if (yMax == null || headline.getMinY() > yMax) {
|
||||
yMax = headline.getMinY();
|
||||
}
|
||||
}
|
||||
|
||||
log.debug("Image position x: {}, y: {}", image.getPosition().getX(), image.getPosition().getY());
|
||||
log.debug("Headline position xMin: {}, xMax: {}, yMin: {}, yMax: {}", xMin, xMax, yMin, yMax);
|
||||
|
||||
if (image.getPosition().getX() >= xMin && image.getPosition().getX() <= xMax && image.getPosition().getY() >= yMin && image.getPosition().getY() <= yMax) {
|
||||
tocItem.getImages().add(image);
|
||||
image.setAppendedToSection(true);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!image.isAppendedToSection()) {
|
||||
log.debug("Image uses first paragraph");
|
||||
if (!lastFoundTOCItems.isEmpty()) {
|
||||
lastFoundTOCItems.get(0).getImages().add(image);
|
||||
} else {
|
||||
startImages.add(image);
|
||||
}
|
||||
image.setAppendedToSection(true);
|
||||
}
|
||||
}
|
||||
|
||||
if (!header.isEmpty()) {
|
||||
headers.add(new ClassificationHeader(header));
|
||||
}
|
||||
if (!footer.isEmpty()) {
|
||||
footers.add(new ClassificationFooter(footer));
|
||||
}
|
||||
}
|
||||
|
||||
if (!startBlocks.isEmpty()) {
|
||||
TableOfContentItem unassigned = new TableOfContentItem(null);
|
||||
unassigned.setSectionBlocks(startBlocks);
|
||||
unassigned.setImages(startImages);
|
||||
document.getTableOfContents().getMainSections().add(0, unassigned);
|
||||
}
|
||||
//document.setSectionsMap(sectionsMap);
|
||||
document.setHeaders(headers);
|
||||
document.setFooters(footers);
|
||||
}
|
||||
|
||||
|
||||
private void mergeTableMetadata(TablePageBlock currentTable, TablePageBlock previousTable) {
|
||||
|
||||
// Distribute header information for subsequent tables
|
||||
if (previousTable != null && hasInvalidHeaderInformation(currentTable) && hasValidHeaderInformation(previousTable)) {
|
||||
List<Cell> previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable);
|
||||
List<Cell> tableNonHeaderRow = getRowWithNonHeaderCells(currentTable);
|
||||
// Allow merging of tables if header row is separated from first logical non-header row
|
||||
if (previousTableNonHeaderRow.isEmpty()
|
||||
&& previousTable.getRowCount() == 1
|
||||
&& previousTable.getRows()
|
||||
.get(0).size() == tableNonHeaderRow.size()) {
|
||||
previousTableNonHeaderRow = previousTable.getRows()
|
||||
.get(0)
|
||||
.stream()
|
||||
.map(cell -> {
|
||||
Cell fakeCell = new Cell(cell.getPoints()[0], cell.getPoints()[2]);
|
||||
fakeCell.setHeaderCells(Collections.singletonList(cell));
|
||||
return fakeCell;
|
||||
})
|
||||
.toList();
|
||||
}
|
||||
if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) {
|
||||
for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
|
||||
List<Cell> row = currentTable.getRows()
|
||||
.get(i);
|
||||
if (row.size() == tableNonHeaderRow.size() && row.stream()
|
||||
.allMatch(cell -> cell.getHeaderCells().isEmpty())) {
|
||||
for (int j = 0; j < row.size(); j++) {
|
||||
row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private boolean hasValidHeaderInformation(TablePageBlock table) {
|
||||
|
||||
return !hasInvalidHeaderInformation(table);
|
||||
}
|
||||
|
||||
|
||||
private boolean hasInvalidHeaderInformation(TablePageBlock table) {
|
||||
|
||||
return table.getRows()
|
||||
.stream()
|
||||
.flatMap(row -> row.stream()
|
||||
.filter(cell -> !cell.getHeaderCells().isEmpty()))
|
||||
.findAny().isEmpty();
|
||||
|
||||
}
|
||||
|
||||
|
||||
private List<Cell> getRowWithNonHeaderCells(TablePageBlock table) {
|
||||
|
||||
for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
|
||||
List<Cell> row = table.getRows()
|
||||
.get(i);
|
||||
if (row.size() == 1) {
|
||||
continue;
|
||||
}
|
||||
boolean allNonHeader = true;
|
||||
for (Cell cell : row) {
|
||||
if (cell.isHeaderCell()) {
|
||||
allNonHeader = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (allNonHeader) {
|
||||
return row;
|
||||
}
|
||||
}
|
||||
|
||||
return Collections.emptyList();
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,9 +1,13 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
import lombok.Data;
|
||||
@ -14,14 +18,19 @@ import lombok.EqualsAndHashCode;
|
||||
public class TableOfContentItem {
|
||||
|
||||
@EqualsAndHashCode.Include
|
||||
private TextPageBlock textPageBlock;
|
||||
private TextPageBlock headline;
|
||||
private List<TableOfContentItem> children = new ArrayList<>();
|
||||
private TableOfContentItem parent;
|
||||
|
||||
private List<AbstractPageBlock> sectionBlocks = new ArrayList<>();
|
||||
private List<ClassifiedImage> images = new ArrayList<>();
|
||||
|
||||
public TableOfContentItem(TextPageBlock textPageBlock) {
|
||||
private Section section;
|
||||
|
||||
this.textPageBlock = textPageBlock;
|
||||
|
||||
public TableOfContentItem(TextPageBlock headline) {
|
||||
|
||||
this.headline = headline;
|
||||
}
|
||||
|
||||
|
||||
@ -34,60 +43,68 @@ public class TableOfContentItem {
|
||||
|
||||
public TableOfContentItem getSiblingBefore() {
|
||||
|
||||
try {
|
||||
return parent.getChildren()
|
||||
.get(parent.getChildren().indexOf(this) - 1);
|
||||
} catch (IndexOutOfBoundsException indexOutOfBoundsException) {
|
||||
return null;
|
||||
if (parent != null) {
|
||||
int index = parent.getChildren().indexOf(this);
|
||||
if (index > 0) {
|
||||
return parent.getChildren()
|
||||
.get(index - 1);
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
public TableOfContentItem getSiblingAfter() {
|
||||
|
||||
try {
|
||||
return parent.getChildren()
|
||||
.get(parent.getChildren().indexOf(this) + 1);
|
||||
} catch (IndexOutOfBoundsException indexOutOfBoundsException) {
|
||||
return null;
|
||||
if (parent != null) {
|
||||
int index = parent.getChildren().indexOf(this);
|
||||
if (index >= 0 && index < parent.getChildren().size() - 1) {
|
||||
return parent.getChildren()
|
||||
.get(index + 1);
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(TextPageBlock block) {
|
||||
|
||||
boolean anyChildContains = false;
|
||||
if (!children.isEmpty()) {
|
||||
for (TableOfContentItem child : children) {
|
||||
if (child.getTextPageBlock().equals(block)) {
|
||||
return true;
|
||||
} else {
|
||||
anyChildContains = anyChildContains || child.contains(block);
|
||||
}
|
||||
if (headline.equals(block)) {
|
||||
return true;
|
||||
}
|
||||
for (TableOfContentItem child : children) {
|
||||
if (child.contains(block)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return anyChildContains;
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(TableOfContentItem tocItem) {
|
||||
|
||||
boolean anyChildContains = false;
|
||||
if (!children.isEmpty()) {
|
||||
for (TableOfContentItem child : children) {
|
||||
if (child.equals(tocItem)) {
|
||||
return true;
|
||||
} else {
|
||||
anyChildContains = anyChildContains || child.contains(tocItem);
|
||||
}
|
||||
if (this.equals(tocItem)) {
|
||||
return true;
|
||||
}
|
||||
for (TableOfContentItem child : children) {
|
||||
if (child.contains(tocItem)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return anyChildContains;
|
||||
return false;
|
||||
}
|
||||
|
||||
public List<AbstractPageBlock> getNonEmptySectionBlocks() {
|
||||
|
||||
return sectionBlocks.stream().filter(pageBlock -> !pageBlock.isEmpty()).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return "OutlineObjectTreeNode{" + "textPageBlock=" + textPageBlock + '}';
|
||||
return "OutlineObjectTreeNode{" + "textPageBlock=" + headline + '}';
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
@ -1,9 +1,11 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Stack;
|
||||
|
||||
import org.springframework.lang.NonNull;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
@ -12,7 +14,7 @@ import lombok.RequiredArgsConstructor;
|
||||
|
||||
@Data
|
||||
@RequiredArgsConstructor
|
||||
public class TableOfContents {
|
||||
public class TableOfContents implements Iterable<TableOfContentItem> {
|
||||
|
||||
private List<TableOfContentItem> mainSections = new ArrayList<>();
|
||||
|
||||
@ -35,7 +37,7 @@ public class TableOfContents {
|
||||
|
||||
private void collectTextPageBlocks(TableOfContentItem item, List<TextPageBlock> textPageBlocks) {
|
||||
|
||||
textPageBlocks.add(item.getTextPageBlock());
|
||||
textPageBlocks.add(item.getHeadline());
|
||||
for (TableOfContentItem child : item.getChildren()) {
|
||||
collectTextPageBlocks(child, textPageBlocks);
|
||||
}
|
||||
@ -56,4 +58,40 @@ public class TableOfContents {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public @NonNull Iterator<TableOfContentItem> iterator() {
|
||||
|
||||
return new TableOfContentItemIterator(mainSections);
|
||||
}
|
||||
|
||||
private static class TableOfContentItemIterator implements Iterator<TableOfContentItem> {
|
||||
private final Stack<Iterator<TableOfContentItem>> stack = new Stack<>();
|
||||
|
||||
public TableOfContentItemIterator(List<TableOfContentItem> mainSections) {
|
||||
stack.push(mainSections.iterator());
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
ensureStackTopIsCurrent();
|
||||
return !stack.isEmpty() && stack.peek().hasNext();
|
||||
}
|
||||
|
||||
@Override
|
||||
public TableOfContentItem next() {
|
||||
ensureStackTopIsCurrent();
|
||||
TableOfContentItem currentItem = stack.peek().next();
|
||||
if (currentItem.getChildren() != null && !currentItem.getChildren().isEmpty()) {
|
||||
stack.push(currentItem.getChildren().iterator());
|
||||
}
|
||||
return currentItem;
|
||||
}
|
||||
|
||||
private void ensureStackTopIsCurrent() {
|
||||
while (!stack.isEmpty() && !stack.peek().hasNext()) {
|
||||
stack.pop();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -29,6 +29,7 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@Service
|
||||
public class SectionsBuilderService {
|
||||
|
||||
|
||||
public void buildSections(ClassificationDocument document) {
|
||||
|
||||
List<AbstractPageBlock> chunkWords = new ArrayList<>();
|
||||
@ -71,7 +72,8 @@ public class SectionsBuilderService {
|
||||
chunkBlockList.add(chunkBlock);
|
||||
chunkWords = new ArrayList<>();
|
||||
if (!chunkBlock.getTables().isEmpty()) {
|
||||
previousTable = chunkBlock.getTables().get(chunkBlock.getTables().size() - 1);
|
||||
previousTable = chunkBlock.getTables()
|
||||
.get(chunkBlock.getTables().size() - 1);
|
||||
}
|
||||
}
|
||||
if (current instanceof TablePageBlock table) {
|
||||
@ -106,11 +108,12 @@ public class SectionsBuilderService {
|
||||
|
||||
List<ClassificationSection> sections = new ArrayList<>();
|
||||
for (var page : document.getPages()) {
|
||||
page.getTextBlocks().forEach(block -> {
|
||||
block.setPage(page.getPageNumber());
|
||||
var section = buildTextBlock(List.of(block), Strings.EMPTY);
|
||||
sections.add(section);
|
||||
});
|
||||
page.getTextBlocks()
|
||||
.forEach(block -> {
|
||||
block.setPage(page.getPageNumber());
|
||||
var section = buildTextBlock(List.of(block), Strings.EMPTY);
|
||||
sections.add(section);
|
||||
});
|
||||
}
|
||||
document.setSections(sections);
|
||||
}
|
||||
@ -202,8 +205,14 @@ public class SectionsBuilderService {
|
||||
log.debug("Image position x: {}, y: {}", image.getPosition().getX(), image.getPosition().getY());
|
||||
log.debug("Paragraph position xMin: {}, xMax: {}, yMin: {}, yMax: {}", xMin, xMax, yMin, yMax);
|
||||
|
||||
if (xMin != null && xMax != null && yMin != null && yMax != null && image.getPosition().getX() >= xMin && image.getPosition()
|
||||
.getX() <= xMax && image.getPosition().getY() >= yMin && image.getPosition().getY() <= yMax) {
|
||||
if (xMin != null
|
||||
&& xMax != null
|
||||
&& yMin != null
|
||||
&& yMax != null
|
||||
&& image.getPosition().getX() >= xMin
|
||||
&& image.getPosition().getX() <= xMax
|
||||
&& image.getPosition().getY() >= yMin
|
||||
&& image.getPosition().getY() <= yMax) {
|
||||
section.getImages().add(image);
|
||||
image.setAppendedToSection(true);
|
||||
break;
|
||||
@ -226,17 +235,26 @@ public class SectionsBuilderService {
|
||||
List<Cell> previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable);
|
||||
List<Cell> tableNonHeaderRow = getRowWithNonHeaderCells(currentTable);
|
||||
// Allow merging of tables if header row is separated from first logical non-header row
|
||||
if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1 && previousTable.getRows().get(0).size() == tableNonHeaderRow.size()) {
|
||||
previousTableNonHeaderRow = previousTable.getRows().get(0).stream().map(cell -> {
|
||||
Cell fakeCell = new Cell(cell.getPoints()[0], cell.getPoints()[2]);
|
||||
fakeCell.setHeaderCells(Collections.singletonList(cell));
|
||||
return fakeCell;
|
||||
}).collect(Collectors.toList());
|
||||
if (previousTableNonHeaderRow.isEmpty()
|
||||
&& previousTable.getRowCount() == 1
|
||||
&& previousTable.getRows()
|
||||
.get(0).size() == tableNonHeaderRow.size()) {
|
||||
previousTableNonHeaderRow = previousTable.getRows()
|
||||
.get(0)
|
||||
.stream()
|
||||
.map(cell -> {
|
||||
Cell fakeCell = new Cell(cell.getPoints()[0], cell.getPoints()[2]);
|
||||
fakeCell.setHeaderCells(Collections.singletonList(cell));
|
||||
return fakeCell;
|
||||
})
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) {
|
||||
for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
|
||||
List<Cell> row = currentTable.getRows().get(i);
|
||||
if (row.size() == tableNonHeaderRow.size() && row.stream().allMatch(cell -> cell.getHeaderCells().isEmpty())) {
|
||||
List<Cell> row = currentTable.getRows()
|
||||
.get(i);
|
||||
if (row.size() == tableNonHeaderRow.size() && row.stream()
|
||||
.allMatch(cell -> cell.getHeaderCells().isEmpty())) {
|
||||
for (int j = 0; j < row.size(); j++) {
|
||||
row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells());
|
||||
}
|
||||
@ -279,7 +297,11 @@ public class SectionsBuilderService {
|
||||
|
||||
private boolean hasInvalidHeaderInformation(TablePageBlock table) {
|
||||
|
||||
return table.getRows().stream().flatMap(row -> row.stream().filter(cell -> !cell.getHeaderCells().isEmpty())).findAny().isEmpty();
|
||||
return table.getRows()
|
||||
.stream()
|
||||
.flatMap(row -> row.stream()
|
||||
.filter(cell -> !cell.getHeaderCells().isEmpty()))
|
||||
.findAny().isEmpty();
|
||||
|
||||
}
|
||||
|
||||
@ -287,7 +309,8 @@ public class SectionsBuilderService {
|
||||
private List<Cell> getRowWithNonHeaderCells(TablePageBlock table) {
|
||||
|
||||
for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
|
||||
List<Cell> row = table.getRows().get(i);
|
||||
List<Cell> row = table.getRows()
|
||||
.get(i);
|
||||
if (row.size() == 1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -12,6 +12,7 @@ import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@ -35,6 +36,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Pa
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContentItem;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
|
||||
@ -74,8 +76,14 @@ public class DocumentGraphFactory {
|
||||
|
||||
private void addSections(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument, Context context, Document document) {
|
||||
|
||||
classificationDocument.getSections()
|
||||
.forEach(section -> SectionNodeFactory.addSection(layoutParsingType, null, section.getNonEmptyPageBlocks(), section.getImages(), context, document));
|
||||
//classificationDocument.getSections()
|
||||
// .forEach(section -> SectionNodeFactory.addSection(layoutParsingType, null, section.getNonEmptyPageBlocks(), section.getImages(), context, document));
|
||||
|
||||
for (TableOfContentItem tocItem : classificationDocument.getTableOfContents()) {
|
||||
var parent = tocItem.getParent() == null ? null : tocItem.getParent().getSection();
|
||||
Optional<Section> section = SectionNodeFactory.addSection(layoutParsingType, parent, tocItem.getNonEmptySectionBlocks(), tocItem.getImages(), context, document);
|
||||
tocItem.setSection(section.orElse(null));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -85,14 +93,11 @@ public class DocumentGraphFactory {
|
||||
|
||||
GenericSemanticNode node;
|
||||
if (originalTextBlock.isHeadline()) {
|
||||
node = Headline.builder().documentTree(context.getDocumentTree())
|
||||
.build();
|
||||
node = Headline.builder().documentTree(context.getDocumentTree()).build();
|
||||
} else if (originalTextBlock.isToDuplicate()) {
|
||||
node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree())
|
||||
.build();
|
||||
node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree()).build();
|
||||
} else {
|
||||
node = Paragraph.builder().documentTree(context.getDocumentTree())
|
||||
.build();
|
||||
node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
|
||||
}
|
||||
|
||||
page.getMainBody().add(node);
|
||||
@ -178,12 +183,8 @@ public class DocumentGraphFactory {
|
||||
private void addFooter(List<TextPageBlock> textBlocks, Context context) {
|
||||
|
||||
Page page = context.getPage(textBlocks.get(0).getPage());
|
||||
Footer footer = Footer.builder().documentTree(context.getDocumentTree())
|
||||
.build();
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks),
|
||||
footer,
|
||||
context,
|
||||
page);
|
||||
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), footer, context, page);
|
||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
|
||||
footer.setTreeId(tocId);
|
||||
footer.setLeafTextBlock(textBlock);
|
||||
@ -194,8 +195,7 @@ public class DocumentGraphFactory {
|
||||
public void addHeader(List<TextPageBlock> textBlocks, Context context) {
|
||||
|
||||
Page page = context.getPage(textBlocks.get(0).getPage());
|
||||
Header header = Header.builder().documentTree(context.getDocumentTree())
|
||||
.build();
|
||||
Header header = Header.builder().documentTree(context.getDocumentTree()).build();
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), header, 0, page);
|
||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
|
||||
header.setTreeId(tocId);
|
||||
@ -207,8 +207,7 @@ public class DocumentGraphFactory {
|
||||
private void addEmptyFooter(int pageIndex, Context context) {
|
||||
|
||||
Page page = context.getPage(pageIndex);
|
||||
Footer footer = Footer.builder().documentTree(context.getDocumentTree())
|
||||
.build();
|
||||
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(footer, context, page);
|
||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
|
||||
footer.setTreeId(tocId);
|
||||
@ -220,8 +219,7 @@ public class DocumentGraphFactory {
|
||||
private void addEmptyHeader(int pageIndex, Context context) {
|
||||
|
||||
Page page = context.getPage(pageIndex);
|
||||
Header header = Header.builder().documentTree(context.getDocumentTree())
|
||||
.build();
|
||||
Header header = Header.builder().documentTree(context.getDocumentTree()).build();
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(header, 0, page);
|
||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
|
||||
header.setTreeId(tocId);
|
||||
|
||||
@ -9,6 +9,7 @@ import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
@ -27,12 +28,12 @@ import lombok.experimental.UtilityClass;
|
||||
@UtilityClass
|
||||
public class SectionNodeFactory {
|
||||
|
||||
public void addSection(LayoutParsingType layoutParsingType,
|
||||
GenericSemanticNode parentNode,
|
||||
List<AbstractPageBlock> pageBlocks,
|
||||
List<ClassifiedImage> images,
|
||||
DocumentGraphFactory.Context context,
|
||||
Document document) {
|
||||
public Optional<Section> addSection(LayoutParsingType layoutParsingType,
|
||||
GenericSemanticNode parentNode,
|
||||
List<AbstractPageBlock> pageBlocks,
|
||||
List<ClassifiedImage> images,
|
||||
DocumentGraphFactory.Context context,
|
||||
Document document) {
|
||||
|
||||
// This is for the case where we have images on a page without any text/footer/header.
|
||||
// The pageBlocks list is empty, but we still need to add those images to the document.
|
||||
@ -40,11 +41,11 @@ public class SectionNodeFactory {
|
||||
images.stream()
|
||||
.distinct()
|
||||
.forEach(image -> DocumentGraphFactory.addImage(document, image, context));
|
||||
return;
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
if (pageBlocks.isEmpty()) {
|
||||
return;
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream()
|
||||
@ -73,6 +74,8 @@ public class SectionNodeFactory {
|
||||
images.stream()
|
||||
.distinct()
|
||||
.forEach(image -> DocumentGraphFactory.addImage(section, image, context));
|
||||
|
||||
return Optional.of(section);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -32,10 +32,10 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
//String fileName = "files/new/abschlussarbeiten-template-institut-fur-informatik-padagogische-hochschule-karlsruhe.pdf";
|
||||
//String fileName = "files/new/kaust-official-thesis-template.pdf";
|
||||
//String fileName = "files/new/$100m Offers.pdf";
|
||||
//String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf";
|
||||
String fileName = "files/new/UTT-Books-53.pdf";
|
||||
String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf";
|
||||
//String fileName = "files/new/mistitled_outlines_example.pdf";
|
||||
//String fileName = "files/bdr/Plenarprotokoll 1 (keine Druchsache!) (1) 1.pdf";
|
||||
//String fileName = "files/new/UTT-Books-53.pdf";
|
||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||
|
||||
var documentFile = new ClassPathResource(fileName).getFile();
|
||||
@ -48,6 +48,32 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
|
||||
}
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testViewerDocumentWithImages() {
|
||||
|
||||
String fileName = "files/new/UTT-Books-53.pdf";
|
||||
Path path = Path.of(fileName);
|
||||
String tmpFileName = "/tmp/" + path.getFileName() + "_VIEWER.pdf";
|
||||
String imageFileName = "files/images/test_outlines.IMAGE_INFO.json";
|
||||
|
||||
var mapper = ObjectMapperFactory.create();
|
||||
var imageServiceResponse = mapper.readValue(new ClassPathResource(imageFileName).getInputStream(), ImageServiceResponse.class);
|
||||
var documentFile = new ClassPathResource(fileName).getFile();
|
||||
|
||||
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
documentFile,
|
||||
imageServiceResponse,
|
||||
new TableServiceResponse(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
Map.of("file", path.getFileName().toFile().toString()));
|
||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
||||
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, classificationDocument);
|
||||
|
||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@Disabled
|
||||
@ -56,7 +82,8 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
|
||||
String fileName = "files/cv_tables/brokenTablesOnOcr_ocred.pdf";
|
||||
String tableFileName = "files/cv_tables/brokenTablesOnOcr_ocred.TABLES.json";
|
||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||
Path path = Path.of(fileName);
|
||||
String tmpFileName = "/tmp/" + path.getFileName() + "_VIEWER.pdf";
|
||||
|
||||
var mapper = ObjectMapperFactory.create();
|
||||
var tableResponse = mapper.readValue(new ClassPathResource(tableFileName).getInputStream(), TableServiceResponse.class);
|
||||
@ -67,7 +94,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
new ImageServiceResponse(),
|
||||
tableResponse,
|
||||
new VisualLayoutParsingResponse(),
|
||||
Map.of("file", Path.of(fileName).getFileName().toFile().toString()));
|
||||
Map.of("file", path.getFileName().toFile().toString()));
|
||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
||||
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE, classificationDocument);
|
||||
|
||||
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user