RED-9139: add new TableOfContents Node
* rename previous TableOfContent to SectionTree * added protobuf compile script
This commit is contained in:
parent
4b86307936
commit
621ebd7378
@ -39,8 +39,8 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Do
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineValidationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeEnhancementService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
@ -105,8 +105,8 @@ public class LayoutParsingPipeline {
|
||||
VisualLayoutParsingAdapter visualLayoutParsingAdapter;
|
||||
GraphicExtractorService graphicExtractorService;
|
||||
OutlineExtractorService outlineExtractorService;
|
||||
OutlineValidationService outlineValidationService;
|
||||
SectionTreeBuilderService sectionTreeBuilderService;
|
||||
SectionTreeEnhancementService sectionTreeEnhancementService;
|
||||
LayoutparserSettings settings;
|
||||
ClassificationService classificationService;
|
||||
|
||||
@ -344,14 +344,14 @@ public class LayoutParsingPipeline {
|
||||
|
||||
classificationService.classify(classificationDocument, layoutParsingType, identifier);
|
||||
|
||||
SectionTree sectionTree = outlineValidationService.createSectionTree(classificationDocument);
|
||||
SectionTree sectionTree = sectionTreeBuilderService.createSectionTree(classificationDocument);
|
||||
classificationDocument.setSectionTree(sectionTree);
|
||||
|
||||
log.info("Building Sections for {}", identifier);
|
||||
|
||||
switch (layoutParsingType) {
|
||||
case CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_PARAGRAPH_DEBUG -> sectionsBuilderService.buildParagraphDebugSections(classificationDocument);
|
||||
default -> sectionTreeBuilderService.assignSectionBlocksAndImages(classificationDocument);
|
||||
default -> sectionTreeEnhancementService.assignSectionBlocksAndImages(classificationDocument);
|
||||
}
|
||||
|
||||
return classificationDocument;
|
||||
|
||||
@ -1,84 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.TABLE_OF_CONTENTS_HEADLINE;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.getHeadlineNumber;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
import io.micrometer.observation.annotation.Observed;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Service
|
||||
@Slf4j
|
||||
public class OutlineValidationService {
|
||||
|
||||
@Observed(name = "OutlineValidationService", contextualName = "create-toc")
|
||||
public SectionTree createSectionTree(ClassificationDocument classificationDocument) {
|
||||
|
||||
List<TextPageBlock> headlines = extractHeadlines(classificationDocument);
|
||||
|
||||
List<SectionTreeEntry> mainSections = new ArrayList<>();
|
||||
Map<Integer, SectionTreeEntry> lastItemsPerDepth = new HashMap<>();
|
||||
SectionTreeEntry last = null;
|
||||
TreeSet<Integer> depths = new TreeSet<>();
|
||||
|
||||
for (TextPageBlock current : headlines) {
|
||||
int currentDepth = getHeadlineNumber(current.getClassification());
|
||||
Integer parentDepth = depths.floor(currentDepth - 1);
|
||||
|
||||
var tocItem = new SectionTreeEntry(current);
|
||||
|
||||
if (parentDepth == null) {
|
||||
mainSections.add(tocItem);
|
||||
lastItemsPerDepth = new HashMap<>();
|
||||
depths = new TreeSet<>();
|
||||
|
||||
} else {
|
||||
assert last != null;
|
||||
int lastDepth = getHeadlineNumber(last.getHeadline().getClassification());
|
||||
if (last.getHeadline().getClassification().equals(TABLE_OF_CONTENTS_HEADLINE) && !current.getClassification().equals(TABLE_OF_CONTENTS_HEADLINE)) {
|
||||
// headline after toc should always start a main section
|
||||
parentDepth = 1;
|
||||
} else if (lastDepth < parentDepth) {
|
||||
parentDepth = lastDepth;
|
||||
} else if (lastDepth == currentDepth && last.getParent() != null) {
|
||||
parentDepth = getHeadlineNumber(last.getParent().getHeadline().getClassification());
|
||||
}
|
||||
|
||||
SectionTreeEntry parent = lastItemsPerDepth.get(parentDepth);
|
||||
parent.addChild(tocItem);
|
||||
}
|
||||
|
||||
last = tocItem;
|
||||
lastItemsPerDepth.put(currentDepth, tocItem);
|
||||
depths.add(currentDepth);
|
||||
}
|
||||
|
||||
return new
|
||||
|
||||
SectionTree(mainSections);
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static List<TextPageBlock> extractHeadlines(ClassificationDocument classificationDocument) {
|
||||
|
||||
return classificationDocument.getPages()
|
||||
.stream()
|
||||
.flatMap(classificationPage -> classificationPage.getTextBlocks()
|
||||
.stream()
|
||||
.filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline())
|
||||
.map(tb -> (TextPageBlock) tb))
|
||||
.toList();
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,252 +1,82 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.TABLE_OF_CONTENTS_HEADLINE;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.getHeadlineNumber;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Iterator;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
import io.micrometer.observation.annotation.Observed;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@Slf4j
|
||||
public class SectionTreeBuilderService {
|
||||
|
||||
public void assignSectionBlocksAndImages(ClassificationDocument document) {
|
||||
@Observed(name = "OutlineValidationService", contextualName = "create-toc")
|
||||
public SectionTree createSectionTree(ClassificationDocument classificationDocument) {
|
||||
|
||||
SectionTree toc = document.getSectionTree();
|
||||
Iterator<SectionTreeEntry> iterator = toc.iterator();
|
||||
SectionTreeEntry currentTOCItem = null;
|
||||
if (iterator.hasNext()) {
|
||||
currentTOCItem = iterator.next();
|
||||
}
|
||||
List<AbstractPageBlock> startBlocks = new ArrayList<>();
|
||||
List<ClassifiedImage> startImages = new ArrayList<>();
|
||||
SectionTreeEntry currentSection = null;
|
||||
boolean foundFirstHeadline = false;
|
||||
List<TextPageBlock> headlines = extractHeadlines(classificationDocument);
|
||||
|
||||
List<ClassificationHeader> headers = new ArrayList<>();
|
||||
List<ClassificationFooter> footers = new ArrayList<>();
|
||||
TablePageBlock previousTable = null;
|
||||
List<SectionTreeEntry> lastFoundTOCItems = new ArrayList<>();
|
||||
List<SectionTreeEntry> mainSections = new ArrayList<>();
|
||||
Map<Integer, SectionTreeEntry> lastItemsPerDepth = new HashMap<>();
|
||||
SectionTreeEntry last = null;
|
||||
TreeSet<Integer> depths = new TreeSet<>();
|
||||
|
||||
for (ClassificationPage page : document.getPages()) {
|
||||
List<SectionTreeEntry> currentPageTOCItems = new ArrayList<>();
|
||||
List<TextPageBlock> header = new ArrayList<>();
|
||||
List<TextPageBlock> footer = new ArrayList<>();
|
||||
for (AbstractPageBlock current : page.getTextBlocks()) {
|
||||
for (TextPageBlock current : headlines) {
|
||||
int currentDepth = getHeadlineNumber(current.getClassification());
|
||||
Integer parentDepth = depths.floor(currentDepth - 1);
|
||||
|
||||
if (current.getClassification() == null) {
|
||||
continue;
|
||||
var tocItem = new SectionTreeEntry(current);
|
||||
|
||||
if (parentDepth == null) {
|
||||
mainSections.add(tocItem);
|
||||
lastItemsPerDepth = new HashMap<>();
|
||||
depths = new TreeSet<>();
|
||||
|
||||
} else {
|
||||
assert last != null;
|
||||
int lastDepth = getHeadlineNumber(last.getHeadline().getClassification());
|
||||
if (last.getHeadline().getClassification().equals(TABLE_OF_CONTENTS_HEADLINE) && !current.getClassification().equals(TABLE_OF_CONTENTS_HEADLINE)) {
|
||||
// headline after toc should always start a main section
|
||||
parentDepth = 1;
|
||||
} else if (lastDepth < parentDepth) {
|
||||
parentDepth = lastDepth;
|
||||
} else if (lastDepth == currentDepth && last.getParent() != null) {
|
||||
parentDepth = getHeadlineNumber(last.getParent().getHeadline().getClassification());
|
||||
}
|
||||
|
||||
current.setPage(page.getPageNumber());
|
||||
|
||||
if (current.getClassification().equals(PageBlockType.HEADER)) {
|
||||
header.add((TextPageBlock) current);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (current.getClassification().equals(PageBlockType.FOOTER)) {
|
||||
footer.add((TextPageBlock) current);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (current instanceof TablePageBlock table) {
|
||||
if (previousTable != null) {
|
||||
mergeTableMetadata(table, previousTable);
|
||||
}
|
||||
previousTable = table;
|
||||
}
|
||||
|
||||
if (current instanceof TextPageBlock && currentTOCItem != null && currentTOCItem.getHeadline().getText().equals(current.getText())) {
|
||||
if (!foundFirstHeadline) {
|
||||
foundFirstHeadline = true;
|
||||
}
|
||||
currentSection = currentTOCItem;
|
||||
currentTOCItem.getSectionBlocks().add(current);
|
||||
currentPageTOCItems.add(currentTOCItem);
|
||||
|
||||
if (iterator.hasNext()) {
|
||||
currentTOCItem = iterator.next();
|
||||
}
|
||||
} else if (!foundFirstHeadline) {
|
||||
startBlocks.add(current);
|
||||
} else {
|
||||
currentSection.getSectionBlocks().add(current);
|
||||
}
|
||||
SectionTreeEntry parent = lastItemsPerDepth.get(parentDepth);
|
||||
parent.addChild(tocItem);
|
||||
}
|
||||
|
||||
if (!currentPageTOCItems.isEmpty()) {
|
||||
lastFoundTOCItems = currentPageTOCItems;
|
||||
}
|
||||
|
||||
for (ClassifiedImage image : page.getImages()) {
|
||||
|
||||
Double xMin = null;
|
||||
Double yMin = null;
|
||||
Double xMax = null;
|
||||
Double yMax = null;
|
||||
|
||||
for (SectionTreeEntry tocItem : lastFoundTOCItems) {
|
||||
var headline = tocItem.getHeadline();
|
||||
|
||||
if (headline.getPage() != page.getPageNumber()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (headline.getMinX() < headline.getMaxX()) {
|
||||
if (xMin == null || headline.getMinX() < xMin) {
|
||||
xMin = headline.getMinX();
|
||||
}
|
||||
if (xMax == null || headline.getMaxX() > xMax) {
|
||||
xMax = headline.getMaxX();
|
||||
}
|
||||
} else {
|
||||
if (xMin == null || headline.getMaxX() < xMin) {
|
||||
xMin = headline.getMaxX();
|
||||
}
|
||||
if (xMax == null || headline.getMinX() > xMax) {
|
||||
xMax = headline.getMinX();
|
||||
}
|
||||
}
|
||||
|
||||
if (headline.getMinY() < headline.getMaxY()) {
|
||||
if (yMin == null || headline.getMinY() < yMin) {
|
||||
yMin = headline.getMinY();
|
||||
}
|
||||
if (yMax == null || headline.getMaxY() > yMax) {
|
||||
yMax = headline.getMaxY();
|
||||
}
|
||||
} else {
|
||||
if (yMin == null || headline.getMaxY() < yMin) {
|
||||
yMin = headline.getMaxY();
|
||||
}
|
||||
if (yMax == null || headline.getMinY() > yMax) {
|
||||
yMax = headline.getMinY();
|
||||
}
|
||||
}
|
||||
|
||||
log.debug("Image position x: {}, y: {}", image.getPosition().getX(), image.getPosition().getY());
|
||||
log.debug("Headline position xMin: {}, xMax: {}, yMin: {}, yMax: {}", xMin, xMax, yMin, yMax);
|
||||
|
||||
if (image.getPosition().getX() >= xMin && image.getPosition().getX() <= xMax && image.getPosition().getY() >= yMin && image.getPosition().getY() <= yMax) {
|
||||
tocItem.getImages().add(image);
|
||||
image.setAppendedToSection(true);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!image.isAppendedToSection()) {
|
||||
log.debug("Image uses last found section");
|
||||
if (!lastFoundTOCItems.isEmpty()) {
|
||||
lastFoundTOCItems.get(lastFoundTOCItems.size() - 1).getImages().add(image);
|
||||
} else {
|
||||
startImages.add(image);
|
||||
}
|
||||
image.setAppendedToSection(true);
|
||||
}
|
||||
}
|
||||
|
||||
if (!header.isEmpty()) {
|
||||
headers.add(new ClassificationHeader(header));
|
||||
}
|
||||
if (!footer.isEmpty()) {
|
||||
footers.add(new ClassificationFooter(footer));
|
||||
}
|
||||
last = tocItem;
|
||||
lastItemsPerDepth.put(currentDepth, tocItem);
|
||||
depths.add(currentDepth);
|
||||
}
|
||||
|
||||
if (!startBlocks.isEmpty() || !startImages.isEmpty()) {
|
||||
SectionTreeEntry unassigned = new SectionTreeEntry(null);
|
||||
unassigned.setSectionBlocks(startBlocks);
|
||||
unassigned.setImages(startImages);
|
||||
document.getSectionTree().getMainSections().add(0, unassigned);
|
||||
}
|
||||
document.setHeaders(headers);
|
||||
document.setFooters(footers);
|
||||
return new SectionTree(mainSections);
|
||||
|
||||
}
|
||||
|
||||
|
||||
private void mergeTableMetadata(TablePageBlock currentTable, TablePageBlock previousTable) {
|
||||
private static List<TextPageBlock> extractHeadlines(ClassificationDocument classificationDocument) {
|
||||
|
||||
// Distribute header information for subsequent tables
|
||||
if (previousTable != null && hasInvalidHeaderInformation(currentTable) && hasValidHeaderInformation(previousTable)) {
|
||||
List<Cell> previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable);
|
||||
List<Cell> tableNonHeaderRow = getRowWithNonHeaderCells(currentTable);
|
||||
// Allow merging of tables if header row is separated from first logical non-header row
|
||||
if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1 && previousTable.getRows().get(0).size() == tableNonHeaderRow.size()) {
|
||||
previousTableNonHeaderRow = previousTable.getRows().get(0)
|
||||
.stream()
|
||||
.map(cell -> {
|
||||
Cell fakeCell = Cell.copy(cell);
|
||||
fakeCell.setHeaderCells(Collections.singletonList(cell));
|
||||
return fakeCell;
|
||||
})
|
||||
.toList();
|
||||
}
|
||||
if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) {
|
||||
for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
|
||||
List<Cell> row = currentTable.getRows().get(i);
|
||||
if (row.size() == tableNonHeaderRow.size() && row.stream()
|
||||
.allMatch(cell -> cell.getHeaderCells().isEmpty())) {
|
||||
for (int j = 0; j < row.size(); j++) {
|
||||
row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private boolean hasValidHeaderInformation(TablePageBlock table) {
|
||||
|
||||
return !hasInvalidHeaderInformation(table);
|
||||
}
|
||||
|
||||
|
||||
private boolean hasInvalidHeaderInformation(TablePageBlock table) {
|
||||
|
||||
return table.getRows()
|
||||
return classificationDocument.getPages()
|
||||
.stream()
|
||||
.flatMap(Collection::stream)
|
||||
.allMatch(cell -> cell.getHeaderCells().isEmpty());
|
||||
}
|
||||
|
||||
|
||||
private List<Cell> getRowWithNonHeaderCells(TablePageBlock table) {
|
||||
|
||||
for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
|
||||
List<Cell> row = table.getRows().get(i);
|
||||
if (row.size() == 1) {
|
||||
continue;
|
||||
}
|
||||
boolean allNonHeader = true;
|
||||
for (Cell cell : row) {
|
||||
if (cell.isHeaderCell()) {
|
||||
allNonHeader = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (allNonHeader) {
|
||||
return row;
|
||||
}
|
||||
}
|
||||
|
||||
return Collections.emptyList();
|
||||
|
||||
.flatMap(classificationPage -> classificationPage.getTextBlocks()
|
||||
.stream()
|
||||
.filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline())
|
||||
.map(tb -> (TextPageBlock) tb))
|
||||
.toList();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,252 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
public class SectionTreeEnhancementService {
|
||||
|
||||
public void assignSectionBlocksAndImages(ClassificationDocument document) {
|
||||
|
||||
SectionTree toc = document.getSectionTree();
|
||||
Iterator<SectionTreeEntry> iterator = toc.iterator();
|
||||
SectionTreeEntry currentTOCItem = null;
|
||||
if (iterator.hasNext()) {
|
||||
currentTOCItem = iterator.next();
|
||||
}
|
||||
List<AbstractPageBlock> startBlocks = new ArrayList<>();
|
||||
List<ClassifiedImage> startImages = new ArrayList<>();
|
||||
SectionTreeEntry currentSection = null;
|
||||
boolean foundFirstHeadline = false;
|
||||
|
||||
List<ClassificationHeader> headers = new ArrayList<>();
|
||||
List<ClassificationFooter> footers = new ArrayList<>();
|
||||
TablePageBlock previousTable = null;
|
||||
List<SectionTreeEntry> lastFoundTOCItems = new ArrayList<>();
|
||||
|
||||
for (ClassificationPage page : document.getPages()) {
|
||||
List<SectionTreeEntry> currentPageTOCItems = new ArrayList<>();
|
||||
List<TextPageBlock> header = new ArrayList<>();
|
||||
List<TextPageBlock> footer = new ArrayList<>();
|
||||
for (AbstractPageBlock current : page.getTextBlocks()) {
|
||||
|
||||
if (current.getClassification() == null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
current.setPage(page.getPageNumber());
|
||||
|
||||
if (current.getClassification().equals(PageBlockType.HEADER)) {
|
||||
header.add((TextPageBlock) current);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (current.getClassification().equals(PageBlockType.FOOTER)) {
|
||||
footer.add((TextPageBlock) current);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (current instanceof TablePageBlock table) {
|
||||
if (previousTable != null) {
|
||||
mergeTableMetadata(table, previousTable);
|
||||
}
|
||||
previousTable = table;
|
||||
}
|
||||
|
||||
if (current instanceof TextPageBlock && currentTOCItem != null && currentTOCItem.getHeadline().getText().equals(current.getText())) {
|
||||
if (!foundFirstHeadline) {
|
||||
foundFirstHeadline = true;
|
||||
}
|
||||
currentSection = currentTOCItem;
|
||||
currentTOCItem.getSectionBlocks().add(current);
|
||||
currentPageTOCItems.add(currentTOCItem);
|
||||
|
||||
if (iterator.hasNext()) {
|
||||
currentTOCItem = iterator.next();
|
||||
}
|
||||
} else if (!foundFirstHeadline) {
|
||||
startBlocks.add(current);
|
||||
} else {
|
||||
currentSection.getSectionBlocks().add(current);
|
||||
}
|
||||
}
|
||||
|
||||
if (!currentPageTOCItems.isEmpty()) {
|
||||
lastFoundTOCItems = currentPageTOCItems;
|
||||
}
|
||||
|
||||
for (ClassifiedImage image : page.getImages()) {
|
||||
|
||||
Double xMin = null;
|
||||
Double yMin = null;
|
||||
Double xMax = null;
|
||||
Double yMax = null;
|
||||
|
||||
for (SectionTreeEntry tocItem : lastFoundTOCItems) {
|
||||
var headline = tocItem.getHeadline();
|
||||
|
||||
if (headline.getPage() != page.getPageNumber()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (headline.getMinX() < headline.getMaxX()) {
|
||||
if (xMin == null || headline.getMinX() < xMin) {
|
||||
xMin = headline.getMinX();
|
||||
}
|
||||
if (xMax == null || headline.getMaxX() > xMax) {
|
||||
xMax = headline.getMaxX();
|
||||
}
|
||||
} else {
|
||||
if (xMin == null || headline.getMaxX() < xMin) {
|
||||
xMin = headline.getMaxX();
|
||||
}
|
||||
if (xMax == null || headline.getMinX() > xMax) {
|
||||
xMax = headline.getMinX();
|
||||
}
|
||||
}
|
||||
|
||||
if (headline.getMinY() < headline.getMaxY()) {
|
||||
if (yMin == null || headline.getMinY() < yMin) {
|
||||
yMin = headline.getMinY();
|
||||
}
|
||||
if (yMax == null || headline.getMaxY() > yMax) {
|
||||
yMax = headline.getMaxY();
|
||||
}
|
||||
} else {
|
||||
if (yMin == null || headline.getMaxY() < yMin) {
|
||||
yMin = headline.getMaxY();
|
||||
}
|
||||
if (yMax == null || headline.getMinY() > yMax) {
|
||||
yMax = headline.getMinY();
|
||||
}
|
||||
}
|
||||
|
||||
log.debug("Image position x: {}, y: {}", image.getPosition().getX(), image.getPosition().getY());
|
||||
log.debug("Headline position xMin: {}, xMax: {}, yMin: {}, yMax: {}", xMin, xMax, yMin, yMax);
|
||||
|
||||
if (image.getPosition().getX() >= xMin && image.getPosition().getX() <= xMax && image.getPosition().getY() >= yMin && image.getPosition().getY() <= yMax) {
|
||||
tocItem.getImages().add(image);
|
||||
image.setAppendedToSection(true);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!image.isAppendedToSection()) {
|
||||
log.debug("Image uses last found section");
|
||||
if (!lastFoundTOCItems.isEmpty()) {
|
||||
lastFoundTOCItems.get(lastFoundTOCItems.size() - 1).getImages().add(image);
|
||||
} else {
|
||||
startImages.add(image);
|
||||
}
|
||||
image.setAppendedToSection(true);
|
||||
}
|
||||
}
|
||||
|
||||
if (!header.isEmpty()) {
|
||||
headers.add(new ClassificationHeader(header));
|
||||
}
|
||||
if (!footer.isEmpty()) {
|
||||
footers.add(new ClassificationFooter(footer));
|
||||
}
|
||||
}
|
||||
|
||||
if (!startBlocks.isEmpty() || !startImages.isEmpty()) {
|
||||
SectionTreeEntry unassigned = new SectionTreeEntry(null);
|
||||
unassigned.setSectionBlocks(startBlocks);
|
||||
unassigned.setImages(startImages);
|
||||
document.getSectionTree().getMainSections().add(0, unassigned);
|
||||
}
|
||||
document.setHeaders(headers);
|
||||
document.setFooters(footers);
|
||||
}
|
||||
|
||||
|
||||
private void mergeTableMetadata(TablePageBlock currentTable, TablePageBlock previousTable) {
|
||||
|
||||
// Distribute header information for subsequent tables
|
||||
if (previousTable != null && hasInvalidHeaderInformation(currentTable) && hasValidHeaderInformation(previousTable)) {
|
||||
List<Cell> previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable);
|
||||
List<Cell> tableNonHeaderRow = getRowWithNonHeaderCells(currentTable);
|
||||
// Allow merging of tables if header row is separated from first logical non-header row
|
||||
if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1 && previousTable.getRows().get(0).size() == tableNonHeaderRow.size()) {
|
||||
previousTableNonHeaderRow = previousTable.getRows().get(0)
|
||||
.stream()
|
||||
.map(cell -> {
|
||||
Cell fakeCell = Cell.copy(cell);
|
||||
fakeCell.setHeaderCells(Collections.singletonList(cell));
|
||||
return fakeCell;
|
||||
})
|
||||
.toList();
|
||||
}
|
||||
if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) {
|
||||
for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
|
||||
List<Cell> row = currentTable.getRows().get(i);
|
||||
if (row.size() == tableNonHeaderRow.size() && row.stream()
|
||||
.allMatch(cell -> cell.getHeaderCells().isEmpty())) {
|
||||
for (int j = 0; j < row.size(); j++) {
|
||||
row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private boolean hasValidHeaderInformation(TablePageBlock table) {
|
||||
|
||||
return !hasInvalidHeaderInformation(table);
|
||||
}
|
||||
|
||||
|
||||
private boolean hasInvalidHeaderInformation(TablePageBlock table) {
|
||||
|
||||
return table.getRows()
|
||||
.stream()
|
||||
.flatMap(Collection::stream)
|
||||
.allMatch(cell -> cell.getHeaderCells().isEmpty());
|
||||
}
|
||||
|
||||
|
||||
private List<Cell> getRowWithNonHeaderCells(TablePageBlock table) {
|
||||
|
||||
for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
|
||||
List<Cell> row = table.getRows().get(i);
|
||||
if (row.size() == 1) {
|
||||
continue;
|
||||
}
|
||||
boolean allNonHeader = true;
|
||||
for (Cell cell : row) {
|
||||
if (cell.isHeaderCell()) {
|
||||
allNonHeader = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (allNonHeader) {
|
||||
return row;
|
||||
}
|
||||
}
|
||||
|
||||
return Collections.emptyList();
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user