RED-9139: add new TableOfContents Node

* rename previous TableOfContent to SectionTree
* added protobuf compile script
This commit is contained in:
Kilian Schuettler 2024-11-08 14:57:46 +01:00
parent 4b86307936
commit 621ebd7378
4 changed files with 305 additions and 307 deletions

View File

@ -39,8 +39,8 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Do
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineValidationService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeEnhancementService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTree;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
@ -105,8 +105,8 @@ public class LayoutParsingPipeline {
VisualLayoutParsingAdapter visualLayoutParsingAdapter;
GraphicExtractorService graphicExtractorService;
OutlineExtractorService outlineExtractorService;
OutlineValidationService outlineValidationService;
SectionTreeBuilderService sectionTreeBuilderService;
SectionTreeEnhancementService sectionTreeEnhancementService;
LayoutparserSettings settings;
ClassificationService classificationService;
@ -344,14 +344,14 @@ public class LayoutParsingPipeline {
classificationService.classify(classificationDocument, layoutParsingType, identifier);
SectionTree sectionTree = outlineValidationService.createSectionTree(classificationDocument);
SectionTree sectionTree = sectionTreeBuilderService.createSectionTree(classificationDocument);
classificationDocument.setSectionTree(sectionTree);
log.info("Building Sections for {}", identifier);
switch (layoutParsingType) {
case CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_PARAGRAPH_DEBUG -> sectionsBuilderService.buildParagraphDebugSections(classificationDocument);
default -> sectionTreeBuilderService.assignSectionBlocksAndImages(classificationDocument);
default -> sectionTreeEnhancementService.assignSectionBlocksAndImages(classificationDocument);
}
return classificationDocument;

View File

@ -1,84 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.TABLE_OF_CONTENTS_HEADLINE;
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.getHeadlineNumber;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import io.micrometer.observation.annotation.Observed;
import lombok.extern.slf4j.Slf4j;
/**
 * Builds a {@link SectionTree} out of the headline blocks found on the pages of a
 * {@link ClassificationDocument}.
 */
@Service
@Slf4j
public class OutlineValidationService {

    /**
     * Arranges all headline blocks of the document, in page order, into a tree.
     *
     * <p>For every headline the depth is taken from {@code getHeadlineNumber}. The greatest depth seen so far
     * that is strictly smaller than the headline's depth is the candidate parent depth. If no such depth
     * exists, the headline becomes a main section and the depth bookkeeping starts over. Otherwise the
     * candidate is adjusted by {@link #resolveParentDepth} and the headline is added as a child of the most
     * recent entry registered at the resulting depth.
     *
     * @param classificationDocument document whose pages provide the headline blocks
     * @return a section tree holding the main sections that were found
     */
    @Observed(name = "OutlineValidationService", contextualName = "create-toc")
    public SectionTree createSectionTree(ClassificationDocument classificationDocument) {

        List<SectionTreeEntry> roots = new ArrayList<>();
        Map<Integer, SectionTreeEntry> latestEntryByDepth = new HashMap<>();
        TreeSet<Integer> seenDepths = new TreeSet<>();
        SectionTreeEntry previous = null;

        for (TextPageBlock headline : extractHeadlines(classificationDocument)) {
            int depth = getHeadlineNumber(headline.getClassification());
            // greatest depth seen so far that is strictly smaller than the current one, if any
            Integer candidateParentDepth = seenDepths.floor(depth - 1);
            SectionTreeEntry entry = new SectionTreeEntry(headline);

            if (candidateParentDepth != null) {
                assert previous != null;
                int parentDepth = resolveParentDepth(previous, headline, depth, candidateParentDepth);
                latestEntryByDepth.get(parentDepth).addChild(entry);
            } else {
                // nothing shallower is known: start a new main section and forget the old depths
                roots.add(entry);
                latestEntryByDepth.clear();
                seenDepths.clear();
            }

            previous = entry;
            latestEntryByDepth.put(depth, entry);
            seenDepths.add(depth);
        }

        return new SectionTree(roots);
    }


    /**
     * Decides at which depth the parent of {@code headline} is looked up.
     *
     * @param previous       entry created for the preceding headline
     * @param headline       headline currently being placed
     * @param depth          headline number of {@code headline}
     * @param candidateDepth greatest previously seen depth below {@code depth}
     * @return the depth whose most recent entry becomes the parent
     */
    private static int resolveParentDepth(SectionTreeEntry previous, TextPageBlock headline, int depth, int candidateDepth) {

        int previousDepth = getHeadlineNumber(previous.getHeadline().getClassification());
        boolean previousIsTocHeadline = previous.getHeadline().getClassification().equals(TABLE_OF_CONTENTS_HEADLINE);

        if (previousIsTocHeadline && !headline.getClassification().equals(TABLE_OF_CONTENTS_HEADLINE)) {
            // headline after toc should always start a main section
            return 1;
        }
        if (previousDepth < candidateDepth) {
            return previousDepth;
        }
        if (previousDepth == depth && previous.getParent() != null) {
            // same depth as the preceding headline: reuse the depth of that headline's parent
            return getHeadlineNumber(previous.getParent().getHeadline().getClassification());
        }
        return candidateDepth;
    }


    /**
     * Collects, in page order, every text block that is a {@link TextPageBlock} and carries a headline
     * classification.
     */
    private static List<TextPageBlock> extractHeadlines(ClassificationDocument classificationDocument) {

        List<TextPageBlock> headlines = new ArrayList<>();
        for (var page : classificationDocument.getPages()) {
            for (var block : page.getTextBlocks()) {
                if (block instanceof TextPageBlock textBlock && block.getClassification() != null && block.getClassification().isHeadline()) {
                    headlines.add(textBlock);
                }
            }
        }
        return headlines;
    }

}

View File

@ -1,252 +1,82 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.TABLE_OF_CONTENTS_HEADLINE;
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.getHeadlineNumber;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import io.micrometer.observation.annotation.Observed;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@Slf4j
public class SectionTreeBuilderService {
public void assignSectionBlocksAndImages(ClassificationDocument document) {
@Observed(name = "OutlineValidationService", contextualName = "create-toc")
public SectionTree createSectionTree(ClassificationDocument classificationDocument) {
SectionTree toc = document.getSectionTree();
Iterator<SectionTreeEntry> iterator = toc.iterator();
SectionTreeEntry currentTOCItem = null;
if (iterator.hasNext()) {
currentTOCItem = iterator.next();
}
List<AbstractPageBlock> startBlocks = new ArrayList<>();
List<ClassifiedImage> startImages = new ArrayList<>();
SectionTreeEntry currentSection = null;
boolean foundFirstHeadline = false;
List<TextPageBlock> headlines = extractHeadlines(classificationDocument);
List<ClassificationHeader> headers = new ArrayList<>();
List<ClassificationFooter> footers = new ArrayList<>();
TablePageBlock previousTable = null;
List<SectionTreeEntry> lastFoundTOCItems = new ArrayList<>();
List<SectionTreeEntry> mainSections = new ArrayList<>();
Map<Integer, SectionTreeEntry> lastItemsPerDepth = new HashMap<>();
SectionTreeEntry last = null;
TreeSet<Integer> depths = new TreeSet<>();
for (ClassificationPage page : document.getPages()) {
List<SectionTreeEntry> currentPageTOCItems = new ArrayList<>();
List<TextPageBlock> header = new ArrayList<>();
List<TextPageBlock> footer = new ArrayList<>();
for (AbstractPageBlock current : page.getTextBlocks()) {
for (TextPageBlock current : headlines) {
int currentDepth = getHeadlineNumber(current.getClassification());
Integer parentDepth = depths.floor(currentDepth - 1);
if (current.getClassification() == null) {
continue;
var tocItem = new SectionTreeEntry(current);
if (parentDepth == null) {
mainSections.add(tocItem);
lastItemsPerDepth = new HashMap<>();
depths = new TreeSet<>();
} else {
assert last != null;
int lastDepth = getHeadlineNumber(last.getHeadline().getClassification());
if (last.getHeadline().getClassification().equals(TABLE_OF_CONTENTS_HEADLINE) && !current.getClassification().equals(TABLE_OF_CONTENTS_HEADLINE)) {
// headline after toc should always start a main section
parentDepth = 1;
} else if (lastDepth < parentDepth) {
parentDepth = lastDepth;
} else if (lastDepth == currentDepth && last.getParent() != null) {
parentDepth = getHeadlineNumber(last.getParent().getHeadline().getClassification());
}
current.setPage(page.getPageNumber());
if (current.getClassification().equals(PageBlockType.HEADER)) {
header.add((TextPageBlock) current);
continue;
}
if (current.getClassification().equals(PageBlockType.FOOTER)) {
footer.add((TextPageBlock) current);
continue;
}
if (current instanceof TablePageBlock table) {
if (previousTable != null) {
mergeTableMetadata(table, previousTable);
}
previousTable = table;
}
if (current instanceof TextPageBlock && currentTOCItem != null && currentTOCItem.getHeadline().getText().equals(current.getText())) {
if (!foundFirstHeadline) {
foundFirstHeadline = true;
}
currentSection = currentTOCItem;
currentTOCItem.getSectionBlocks().add(current);
currentPageTOCItems.add(currentTOCItem);
if (iterator.hasNext()) {
currentTOCItem = iterator.next();
}
} else if (!foundFirstHeadline) {
startBlocks.add(current);
} else {
currentSection.getSectionBlocks().add(current);
}
SectionTreeEntry parent = lastItemsPerDepth.get(parentDepth);
parent.addChild(tocItem);
}
if (!currentPageTOCItems.isEmpty()) {
lastFoundTOCItems = currentPageTOCItems;
}
for (ClassifiedImage image : page.getImages()) {
Double xMin = null;
Double yMin = null;
Double xMax = null;
Double yMax = null;
for (SectionTreeEntry tocItem : lastFoundTOCItems) {
var headline = tocItem.getHeadline();
if (headline.getPage() != page.getPageNumber()) {
continue;
}
if (headline.getMinX() < headline.getMaxX()) {
if (xMin == null || headline.getMinX() < xMin) {
xMin = headline.getMinX();
}
if (xMax == null || headline.getMaxX() > xMax) {
xMax = headline.getMaxX();
}
} else {
if (xMin == null || headline.getMaxX() < xMin) {
xMin = headline.getMaxX();
}
if (xMax == null || headline.getMinX() > xMax) {
xMax = headline.getMinX();
}
}
if (headline.getMinY() < headline.getMaxY()) {
if (yMin == null || headline.getMinY() < yMin) {
yMin = headline.getMinY();
}
if (yMax == null || headline.getMaxY() > yMax) {
yMax = headline.getMaxY();
}
} else {
if (yMin == null || headline.getMaxY() < yMin) {
yMin = headline.getMaxY();
}
if (yMax == null || headline.getMinY() > yMax) {
yMax = headline.getMinY();
}
}
log.debug("Image position x: {}, y: {}", image.getPosition().getX(), image.getPosition().getY());
log.debug("Headline position xMin: {}, xMax: {}, yMin: {}, yMax: {}", xMin, xMax, yMin, yMax);
if (image.getPosition().getX() >= xMin && image.getPosition().getX() <= xMax && image.getPosition().getY() >= yMin && image.getPosition().getY() <= yMax) {
tocItem.getImages().add(image);
image.setAppendedToSection(true);
break;
}
}
if (!image.isAppendedToSection()) {
log.debug("Image uses last found section");
if (!lastFoundTOCItems.isEmpty()) {
lastFoundTOCItems.get(lastFoundTOCItems.size() - 1).getImages().add(image);
} else {
startImages.add(image);
}
image.setAppendedToSection(true);
}
}
if (!header.isEmpty()) {
headers.add(new ClassificationHeader(header));
}
if (!footer.isEmpty()) {
footers.add(new ClassificationFooter(footer));
}
last = tocItem;
lastItemsPerDepth.put(currentDepth, tocItem);
depths.add(currentDepth);
}
if (!startBlocks.isEmpty() || !startImages.isEmpty()) {
SectionTreeEntry unassigned = new SectionTreeEntry(null);
unassigned.setSectionBlocks(startBlocks);
unassigned.setImages(startImages);
document.getSectionTree().getMainSections().add(0, unassigned);
}
document.setHeaders(headers);
document.setFooters(footers);
return new SectionTree(mainSections);
}
private void mergeTableMetadata(TablePageBlock currentTable, TablePageBlock previousTable) {
private static List<TextPageBlock> extractHeadlines(ClassificationDocument classificationDocument) {
// Distribute header information for subsequent tables
if (previousTable != null && hasInvalidHeaderInformation(currentTable) && hasValidHeaderInformation(previousTable)) {
List<Cell> previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable);
List<Cell> tableNonHeaderRow = getRowWithNonHeaderCells(currentTable);
// Allow merging of tables if header row is separated from first logical non-header row
if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1 && previousTable.getRows().get(0).size() == tableNonHeaderRow.size()) {
previousTableNonHeaderRow = previousTable.getRows().get(0)
.stream()
.map(cell -> {
Cell fakeCell = Cell.copy(cell);
fakeCell.setHeaderCells(Collections.singletonList(cell));
return fakeCell;
})
.toList();
}
if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) {
for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
List<Cell> row = currentTable.getRows().get(i);
if (row.size() == tableNonHeaderRow.size() && row.stream()
.allMatch(cell -> cell.getHeaderCells().isEmpty())) {
for (int j = 0; j < row.size(); j++) {
row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells());
}
}
}
}
}
}
private boolean hasValidHeaderInformation(TablePageBlock table) {
return !hasInvalidHeaderInformation(table);
}
private boolean hasInvalidHeaderInformation(TablePageBlock table) {
return table.getRows()
return classificationDocument.getPages()
.stream()
.flatMap(Collection::stream)
.allMatch(cell -> cell.getHeaderCells().isEmpty());
}
private List<Cell> getRowWithNonHeaderCells(TablePageBlock table) {
for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
List<Cell> row = table.getRows().get(i);
if (row.size() == 1) {
continue;
}
boolean allNonHeader = true;
for (Cell cell : row) {
if (cell.isHeaderCell()) {
allNonHeader = false;
break;
}
}
if (allNonHeader) {
return row;
}
}
return Collections.emptyList();
.flatMap(classificationPage -> classificationPage.getTextBlocks()
.stream()
.filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline())
.map(tb -> (TextPageBlock) tb))
.toList();
}
}

View File

@ -0,0 +1,252 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import lombok.extern.slf4j.Slf4j;
/**
 * Fills the entries of a document's {@link SectionTree} with the page blocks and images that belong to
 * them and collects the header and footer blocks of each page.
 */
@Slf4j
@Service
public class SectionTreeEnhancementService {

    /**
     * Walks over all pages of the document and distributes their content.
     *
     * <ul>
     * <li>Blocks without a classification are ignored.</li>
     * <li>Blocks classified as HEADER or FOOTER are gathered per page and stored on the document.</li>
     * <li>A text block whose text equals the headline text of the next expected tree entry opens that
     * entry; each following block is appended to the entry that is currently open.</li>
     * <li>Blocks and images seen before any entry was opened end up in an extra entry without headline
     * that is inserted as the first main section.</li>
     * <li>Consecutive tables are passed to {@link #mergeTableMetadata}.</li>
     * </ul>
     *
     * @param document document whose section tree has already been set
     */
    public void assignSectionBlocksAndImages(ClassificationDocument document) {

        Iterator<SectionTreeEntry> pendingEntries = document.getSectionTree().iterator();
        SectionTreeEntry expectedEntry = pendingEntries.hasNext() ? pendingEntries.next() : null;

        // stays null until the first headline of the tree has been matched
        SectionTreeEntry openSection = null;
        TablePageBlock lastTable = null;

        List<AbstractPageBlock> leadingBlocks = new ArrayList<>();
        List<ClassifiedImage> leadingImages = new ArrayList<>();
        List<ClassificationHeader> documentHeaders = new ArrayList<>();
        List<ClassificationFooter> documentFooters = new ArrayList<>();
        List<SectionTreeEntry> recentEntries = new ArrayList<>();

        for (ClassificationPage page : document.getPages()) {
            List<SectionTreeEntry> entriesOnPage = new ArrayList<>();
            List<TextPageBlock> headerBlocks = new ArrayList<>();
            List<TextPageBlock> footerBlocks = new ArrayList<>();

            for (AbstractPageBlock block : page.getTextBlocks()) {
                var type = block.getClassification();
                if (type == null) {
                    continue;
                }
                block.setPage(page.getPageNumber());

                if (type.equals(PageBlockType.HEADER)) {
                    headerBlocks.add((TextPageBlock) block);
                    continue;
                }
                if (type.equals(PageBlockType.FOOTER)) {
                    footerBlocks.add((TextPageBlock) block);
                    continue;
                }

                if (block instanceof TablePageBlock table) {
                    if (lastTable != null) {
                        mergeTableMetadata(table, lastTable);
                    }
                    lastTable = table;
                }

                if (opensEntry(block, expectedEntry)) {
                    openSection = expectedEntry;
                    openSection.getSectionBlocks().add(block);
                    entriesOnPage.add(openSection);
                    // when the tree is exhausted the last entry stays the expected one
                    if (pendingEntries.hasNext()) {
                        expectedEntry = pendingEntries.next();
                    }
                } else if (openSection == null) {
                    leadingBlocks.add(block);
                } else {
                    openSection.getSectionBlocks().add(block);
                }
            }

            // pages without any matched headline keep using the entries of an earlier page
            if (!entriesOnPage.isEmpty()) {
                recentEntries = entriesOnPage;
            }

            assignImages(page, recentEntries, leadingImages);

            if (!headerBlocks.isEmpty()) {
                documentHeaders.add(new ClassificationHeader(headerBlocks));
            }
            if (!footerBlocks.isEmpty()) {
                documentFooters.add(new ClassificationFooter(footerBlocks));
            }
        }

        if (!leadingBlocks.isEmpty() || !leadingImages.isEmpty()) {
            SectionTreeEntry unassigned = new SectionTreeEntry(null);
            unassigned.setSectionBlocks(leadingBlocks);
            unassigned.setImages(leadingImages);
            document.getSectionTree().getMainSections().add(0, unassigned);
        }

        document.setHeaders(documentHeaders);
        document.setFooters(documentFooters);
    }


    /**
     * Tells whether {@code block} is a text block whose text equals the headline text of {@code expected}.
     */
    private static boolean opensEntry(AbstractPageBlock block, SectionTreeEntry expected) {

        return block instanceof TextPageBlock && expected != null && expected.getHeadline().getText().equals(block.getText());
    }


    /**
     * Attaches every image of the page to a section entry. An image that could not be placed via the
     * headline areas goes to the last of {@code candidates}, or to {@code leadingImages} when there are none.
     */
    private void assignImages(ClassificationPage page, List<SectionTreeEntry> candidates, List<ClassifiedImage> leadingImages) {

        for (ClassifiedImage image : page.getImages()) {
            attachToHeadlineArea(image, page, candidates);
            if (image.isAppendedToSection()) {
                continue;
            }

            log.debug("Image uses last found section");
            if (candidates.isEmpty()) {
                leadingImages.add(image);
            } else {
                candidates.get(candidates.size() - 1).getImages().add(image);
            }
            image.setAppendedToSection(true);
        }
    }


    /**
     * Goes through the candidates whose headline lies on the given page, growing a rectangle around the
     * headlines visited so far. The image is added to the first candidate for which the image position
     * lies inside that rectangle.
     */
    private void attachToHeadlineArea(ClassifiedImage image, ClassificationPage page, List<SectionTreeEntry> candidates) {

        Double left = null;
        Double bottom = null;
        Double right = null;
        Double top = null;

        for (SectionTreeEntry candidate : candidates) {
            var headline = candidate.getHeadline();
            if (headline.getPage() != page.getPageNumber()) {
                continue;
            }

            // min and max may be swapped, so order them before widening the rectangle
            boolean xInOrder = headline.getMinX() < headline.getMaxX();
            double lowX = xInOrder ? headline.getMinX() : headline.getMaxX();
            double highX = xInOrder ? headline.getMaxX() : headline.getMinX();
            if (left == null || lowX < left) {
                left = lowX;
            }
            if (right == null || highX > right) {
                right = highX;
            }

            boolean yInOrder = headline.getMinY() < headline.getMaxY();
            double lowY = yInOrder ? headline.getMinY() : headline.getMaxY();
            double highY = yInOrder ? headline.getMaxY() : headline.getMinY();
            if (bottom == null || lowY < bottom) {
                bottom = lowY;
            }
            if (top == null || highY > top) {
                top = highY;
            }

            var position = image.getPosition();
            log.debug("Image position x: {}, y: {}", position.getX(), position.getY());
            log.debug("Headline position xMin: {}, xMax: {}, yMin: {}, yMax: {}", left, right, bottom, top);

            boolean insideX = position.getX() >= left && position.getX() <= right;
            if (insideX && position.getY() >= bottom && position.getY() <= top) {
                candidate.getImages().add(image);
                image.setAppendedToSection(true);
                return;
            }
        }
    }


    /**
     * Copies header cell references from {@code previousTable} onto the rows of {@code currentTable}.
     * This only happens when no cell of the current table has header cells, at least one cell of the
     * previous table has them, and the reference rows of both tables have the same number of cells.
     */
    private void mergeTableMetadata(TablePageBlock currentTable, TablePageBlock previousTable) {

        // Distribute header information for subsequent tables
        if (previousTable == null || !hasInvalidHeaderInformation(currentTable) || !hasValidHeaderInformation(previousTable)) {
            return;
        }

        List<Cell> donorRow = getRowWithNonHeaderCells(previousTable);
        List<Cell> receiverRow = getRowWithNonHeaderCells(currentTable);
        int columnCount = receiverRow.size();

        // Allow merging of tables if header row is separated from first logical non-header row
        if (donorRow.isEmpty() && previousTable.getRowCount() == 1 && previousTable.getRows().get(0).size() == columnCount) {
            donorRow = previousTable.getRows().get(0)
                    .stream()
                    .map(SectionTreeEnhancementService::copyWithSelfAsHeader)
                    .toList();
        }

        if (donorRow.size() != columnCount) {
            return;
        }

        // Non header rows are most likely at bottom of table
        for (int rowIndex = currentTable.getRowCount() - 1; rowIndex >= 0; rowIndex--) {
            List<Cell> row = currentTable.getRows().get(rowIndex);
            boolean eligible = row.size() == columnCount && row.stream()
                    .allMatch(cell -> cell.getHeaderCells().isEmpty());
            if (!eligible) {
                continue;
            }
            for (int column = 0; column < row.size(); column++) {
                row.get(column).setHeaderCells(donorRow.get(column).getHeaderCells());
            }
        }
    }


    /**
     * Creates a copy of {@code cell} whose only header cell is the original {@code cell}.
     */
    private static Cell copyWithSelfAsHeader(Cell cell) {

        Cell copy = Cell.copy(cell);
        copy.setHeaderCells(Collections.singletonList(cell));
        return copy;
    }


    /**
     * @return {@code true} if at least one cell of the table has header cells
     */
    private boolean hasValidHeaderInformation(TablePageBlock table) {

        return !hasInvalidHeaderInformation(table);
    }


    /**
     * @return {@code true} if no cell of the table has any header cells (also for a table without cells)
     */
    private boolean hasInvalidHeaderInformation(TablePageBlock table) {

        return table.getRows()
                .stream()
                .flatMap(Collection::stream)
                .noneMatch(cell -> !cell.getHeaderCells().isEmpty());
    }


    /**
     * Searches the rows from the bottom up, skipping rows with exactly one cell, and returns the first
     * row that contains no header cell. Returns an empty list if there is no such row.
     */
    private List<Cell> getRowWithNonHeaderCells(TablePageBlock table) {

        // Non header rows are most likely at bottom of table
        for (int rowIndex = table.getRowCount() - 1; rowIndex >= 0; rowIndex--) {
            List<Cell> row = table.getRows().get(rowIndex);
            if (row.size() != 1 && row.stream()
                    .noneMatch(cell -> cell.isHeaderCell())) {
                return row;
            }
        }
        return Collections.emptyList();
    }

}