RED-10380: wip add table of contents node type

This commit is contained in:
Kilian Schuettler 2024-11-04 16:39:08 +01:00
parent 124afb3623
commit 8e4cfa2047
11 changed files with 183 additions and 17 deletions

View File

@ -14,6 +14,7 @@ public enum PageBlockType {
PARAGRAPH_ITALIC,
PARAGRAPH_UNKNOWN,
OTHER,
TABLE_OF_CONTENTS_HEADLINE,
TABLE_OF_CONTENTS_ITEM,
LIST_ITEM,
TABLE;
@ -35,7 +36,7 @@ public enum PageBlockType {
public static int getHeadlineNumber(PageBlockType pageBlockType) {
return switch (pageBlockType) {
case H1 -> 1;
case H1, TABLE_OF_CONTENTS_HEADLINE -> 1;
case H2 -> 2;
case H3 -> 3;
case H4 -> 4;
@ -47,6 +48,6 @@ public enum PageBlockType {
public boolean isHeadline() {
return this.equals(H1) || this.equals(H2) || this.equals(H3) || this.equals(H4) || this.equals(H5) || this.equals(H6);
return this.equals(H1) || this.equals(H2) || this.equals(H3) || this.equals(H4) || this.equals(H5) || this.equals(H6) || this.equals(TABLE_OF_CONTENTS_HEADLINE);
}
}

View File

@ -11,6 +11,8 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Se
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableOfContents;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableOfContentsItem;
public abstract class AbstractNodeVisitor implements NodeVisitor {
@ -83,6 +85,18 @@ public abstract class AbstractNodeVisitor implements NodeVisitor {
visitChildren(tableCell);
}
/**
 * Default handling for a {@link TableOfContents} node: hands the node to
 * {@code visitChildren}.
 *
 * @param tableOfContents the table-of-contents node being visited
 */
@Override
public void visit(TableOfContents tableOfContents) {

    visitChildren(tableOfContents);
}
/**
 * Default handling for a {@link TableOfContentsItem} node: hands the node to
 * {@code visitChildren}.
 *
 * @param tableOfContentsItem the table-of-contents item being visited
 */
@Override
public void visit(TableOfContentsItem tableOfContentsItem) {

    visitChildren(tableOfContentsItem);
}
protected void visitChildren(SemanticNode semanticNode) {

View File

@ -10,6 +10,10 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Se
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableOfContents;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableOfContentsItem;
import software.amazon.awssdk.utils.builder.ToCopyableBuilder;
public interface NodeVisitor {
@ -42,4 +46,10 @@ public interface NodeVisitor {
void visit(TableCell tableCell);
void visit(TableOfContents tableOfContents);
void visit(TableOfContentsItem tableOfContentsItem);
}

View File

@ -0,0 +1,41 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
/**
 * Semantic node reporting the node type {@code TABLE_OF_CONTENTS}.
 */
@Data
@SuperBuilder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(callSuper = true)
public class TableOfContents extends AbstractSemanticNode {

    /**
     * Dispatches this node to the visitor's {@code visit(TableOfContents)} overload.
     *
     * @param visitor the visitor to call back
     */
    @Override
    public void accept(NodeVisitor visitor) {

        visitor.visit(this);
    }


    /**
     * @return always {@link NodeTypeProto.NodeType#TABLE_OF_CONTENTS}
     */
    @Override
    public NodeTypeProto.NodeType getType() {

        return NodeTypeProto.NodeType.TABLE_OF_CONTENTS;
    }


    /**
     * Resolves the headline of this node.
     *
     * @return the first child of type {@code HEADLINE}, cast to {@link Headline};
     *         if there is no such child, the headline reported by the parent node
     */
    public Headline getHeadline() {

        return streamChildrenOfType(NodeTypeProto.NodeType.HEADLINE)
                .map(Headline.class::cast)
                .findFirst()
                // The parent is only consulted when no own headline child exists.
                .orElseGet(() -> getParent().getHeadline());
    }

}

View File

@ -0,0 +1,51 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
/**
 * Leaf semantic node reporting the node type {@code TABLE_OF_CONTENTS_ITEM}.
 * Its text is held directly in {@code leafTextBlock}.
 */
@Data
@SuperBuilder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(callSuper = true)
public class TableOfContentsItem extends AbstractSemanticNode {

    // Text block returned by getTextBlock().
    TextBlock leafTextBlock;


    /**
     * Dispatches this node to the visitor's {@code visit(TableOfContentsItem)} overload.
     *
     * @param visitor the visitor to call back
     */
    @Override
    public void accept(NodeVisitor visitor) {

        visitor.visit(this);
    }


    /**
     * @return always {@link NodeTypeProto.NodeType#TABLE_OF_CONTENTS_ITEM}
     */
    @Override
    public NodeTypeProto.NodeType getType() {

        return NodeTypeProto.NodeType.TABLE_OF_CONTENTS_ITEM;
    }


    /**
     * @return the {@code leafTextBlock} stored on this node
     */
    @Override
    public TextBlock getTextBlock() {

        return leafTextBlock;
    }


    /**
     * @return always {@code true}
     */
    @Override
    public boolean isLeaf() {

        return true;
    }

}

View File

@ -59,7 +59,7 @@ public class TableOfContentsClassificationService {
if (end > i + 1) {
if (textBlock.textBlock().getClassification() == null) {
textBlock.textBlock().setClassification(PageBlockType.H1);
textBlock.textBlock().setClassification(PageBlockType.TABLE_OF_CONTENTS_HEADLINE);
}
i = end;
}
@ -71,9 +71,9 @@ public class TableOfContentsClassificationService {
ClassificationPage startPage = textBlocks.get(start).page();
List<TextBlockOnPage> initialLookAhead = textBlocks.subList(start, Math.min(start + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()));
HashMap<Word, TextBlockOnPage> lookup = new HashMap<>();
List<Word> numbers = extractNumbers(initialLookAhead, lookup, document.getPages().size());
TocNumberFinder tocNumberFinder = new TocNumberFinder(numbers, lookup);
HashMap<Word, TextBlockOnPage> numberToBlockLookup = new HashMap<>();
List<Word> numbers = extractNumbers(initialLookAhead, numberToBlockLookup, document.getPages().size());
TocNumberFinder tocNumberFinder = new TocNumberFinder(numbers, numberToBlockLookup);
int lastCandidate = start;
for (int i = start; i < Math.min(lastCandidate + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()); i++) {
@ -93,7 +93,7 @@ public class TableOfContentsClassificationService {
break;
}
List<Word> numbersFromBlock = extractNumbers(textBlockOnPage, lookup, document.getPages().size());
List<Word> numbersFromBlock = extractNumbers(textBlockOnPage, numberToBlockLookup, document.getPages().size());
List<Word> currentRightmostCluster = tocNumberFinder.getCurrentRightmostCluster();
@ -102,19 +102,19 @@ public class TableOfContentsClassificationService {
return start;
}
if (anyIntersection(currentRightmostCluster, numbersFromBlock, lookup)) {
if (anyIntersection(currentRightmostCluster, numbersFromBlock, numberToBlockLookup)) {
lastCandidate = i;
numbersFromBlock.forEach(tocNumberFinder::add);
}
}
addVisualization(document.getLayoutDebugLayer(), tocNumberFinder, lookup);
Set<TextBlockOnPage> blocksWithNumberInCluster = tocNumberFinder.getCurrentRightmostCluster()
.stream()
.map(lookup::get)
.map(numberToBlockLookup::get)
.collect(Collectors.toSet());
addVisualization(document.getLayoutDebugLayer(), tocNumberFinder, numberToBlockLookup, blocksWithNumberInCluster, textBlocks.get(start - 1));
int lastConfirmed = start;
for (int i = start; i < lastCandidate + 1; i++) {
TextBlockOnPage textBlockOnPage = textBlocks.get(i);
@ -132,18 +132,22 @@ public class TableOfContentsClassificationService {
}
private static void addVisualization(LayoutDebugLayer layoutDebugLayer, TocNumberFinder tocNumberFinder, Map<Word, TextBlockOnPage> lookup) {
/**
 * Records the table-of-contents detection result on the layout debug layer.
 *
 * @param layoutDebugLayer          debug layer receiving the visualizations
 * @param tocNumberFinder           source of the current rightmost number cluster
 * @param lookup                    maps each number word to the text block (and page) it came from
 * @param blocksWithNumberInCluster text blocks to add as TOC blocks
 * @param startingHeadline          additional text block added as a TOC block after the others
 */
private static void addVisualization(LayoutDebugLayer layoutDebugLayer,
                                     TocNumberFinder tocNumberFinder,
                                     Map<Word, TextBlockOnPage> lookup,
                                     Set<TextBlockOnPage> blocksWithNumberInCluster,
                                     TextBlockOnPage startingHeadline) {

    List<Word> rightmostCluster = tocNumberFinder.getCurrentRightmostCluster();

    // Group the cluster's words by the page number of their text block; one addTocPages call per page.
    rightmostCluster.stream()
            .collect(Collectors.groupingBy(word -> lookup.get(word).page().getPageNumber()))
            .forEach((pageNumber, wordsOnPage) -> layoutDebugLayer.addTocPages(wordsOnPage, pageNumber));

    layoutDebugLayer.addTocBlocks(blocksWithNumberInCluster);
    layoutDebugLayer.addTocBlocks(Set.of(startingHeadline));
}
private static boolean anyIntersection(Collection<Word> numbers1,
Collection<Word> numbers2,
Map<Word, TextBlockOnPage> lookup) {
private static boolean anyIntersection(Collection<Word> numbers1, Collection<Word> numbers2, Map<Word, TextBlockOnPage> lookup) {
return numbers1.stream()
.anyMatch(numberFromCluster -> numbers2.stream()

View File

@ -23,6 +23,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.AbstractSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
@ -35,6 +36,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Im
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableOfContentsItem;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
@ -96,6 +98,7 @@ public class DocumentGraphFactory {
for (TableOfContentItem tocItem : classificationDocument.getTableOfContents()) {
GenericSemanticNode parent = tocItem.getParent() == null ? null : tocItem.getParent().getSection();
Optional<GenericSemanticNode> section = SectionNodeFactory.addSection(layoutParsingType,
parent,
tocItem.getChildren().isEmpty(),
@ -121,6 +124,8 @@ public class DocumentGraphFactory {
node = Headline.builder().documentTree(context.getDocumentTree()).build();
} else if (originalTextBlock.isToDuplicate() && layoutParsingType.equals(LayoutParsingType.REDACT_MANAGER)) {
node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree()).build();
} else if (originalTextBlock.getClassification().equals(PageBlockType.TABLE_OF_CONTENTS_ITEM)) {
node = TableOfContentsItem.builder().documentTree(context.getDocumentTree()).build();
} else {
node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
}
@ -142,7 +147,9 @@ public class DocumentGraphFactory {
List<Integer> treeId = context.documentTree.createNewChildEntryAndReturnId(parentNode, node);
node.setLeafTextBlock(textBlock);
node.setTreeId(treeId);
node.getEngines().addAll(originalTextBlock.getEngines());
node.getEngines().
addAll(originalTextBlock.getEngines());
}

View File

@ -12,11 +12,13 @@ import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.AbstractSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableOfContents;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
@ -27,6 +29,20 @@ import lombok.experimental.UtilityClass;
@UtilityClass
public class SectionNodeFactory {
/**
 * Builds a {@link TableOfContents} node, adds it to the context's sections, assigns its tree id
 * and adds every {@link TextPageBlock} among the given page blocks via
 * {@code DocumentGraphFactory.addParagraphOrHeadline}. Page blocks of any other type are skipped.
 *
 * @param layoutParsingType passed through to {@code addParagraphOrHeadline}
 * @param pageBlocks        page blocks to add to the new node
 * @param context           graph-building context providing the document tree and the section list
 * @param document          not read by this method
 * @return the newly created table-of-contents node
 */
public GenericSemanticNode addTocSection(LayoutParsingType layoutParsingType, List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Document document) {

    AbstractSemanticNode tocNode = TableOfContents.builder()
            .documentTree(context.getDocumentTree())
            .build();
    context.getSections().add(tocNode);
    // A null parent is passed when requesting the tree id.
    tocNode.setTreeId(getTreeId(null, context, tocNode));

    for (AbstractPageBlock block : pageBlocks) {
        if (!(block instanceof TextPageBlock textBlock)) {
            continue;
        }
        DocumentGraphFactory.addParagraphOrHeadline(tocNode, textBlock, context, new ArrayList<>(), layoutParsingType);
    }
    return tocNode;
}
public Optional<GenericSemanticNode> addSection(LayoutParsingType layoutParsingType,
GenericSemanticNode parentNode,
boolean isLeaf,

View File

@ -27,6 +27,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.outline.Outlin
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.ListIdentifier;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextBlockOnPage;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
@ -332,8 +333,10 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
private void addOutlineObject(OutlineObject outlineObject, PageInformation pageInformation) {
if (!active) {
return;
}
int rectSize = 5;
Point2D point2D;
if (outlineObject.getPoint().isPresent()) {
point2D = outlineObject.getPoint().get();
@ -357,10 +360,25 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
/**
 * Adds one colored rectangle per list identifier to the list-identifier visualizations of the
 * identifier's page. Does nothing while the layer is not active.
 *
 * @param listIdentifiers the list identifiers to visualize
 */
public void addListIdentifiers(List<ListIdentifier> listIdentifiers) {

    if (!active) {
        return;
    }

    for (ListIdentifier identifier : listIdentifiers) {
        // this.listIdentifiers is the visualization field; the parameter of the same name shadows it.
        var rectanglesOnPage = getOrCreateVisualizationsOnPage(identifier.getPage(), this.listIdentifiers).getColoredRectangles();
        rectanglesOnPage.add(new ColoredRectangle(identifier.getWord().getBBoxPdf(), WORDS_COLOR, LINE_WIDTH));
    }
}
/**
 * Adds one colored rectangle per given text block to the TOC-block visualizations of the
 * block's page. Does nothing while the layer is not active.
 *
 * @param blocksWithNumberInCluster the text blocks to visualize
 */
public void addTocBlocks(Set<TextBlockOnPage> blocksWithNumberInCluster) {

    if (!active) {
        return;
    }

    for (TextBlockOnPage block : blocksWithNumberInCluster) {
        var rectanglesOnPage = getOrCreateVisualizationsOnPage(block.page().getPageNumber(), this.tocBlocks).getColoredRectangles();
        rectanglesOnPage.add(new ColoredRectangle(block.textBlock().getBBoxPdf(), TOC_COLOR, LINE_WIDTH));
    }
}
}

View File

@ -57,6 +57,7 @@ public record LayerIdentifier(String name, String markedContentName) {
public static final LayerIdentifier OUTLINE_OBJECTS = new LayerIdentifier("Outline Positions", "OUTLINE_OBJECTS");
public static final LayerIdentifier SENTENCES = new LayerIdentifier("Sentences", "SENTENCES");
public static final LayerIdentifier TOC_PAGES = new LayerIdentifier("TOC pages", "TOC_PAGES");
public static final LayerIdentifier TOC_BLOCKS = new LayerIdentifier("TOC blocks", "TOC_BLOCKS");
public static final LayerIdentifier LIST_IDENTIFIERS = new LayerIdentifier("List identifiers", "LIST_IDENTIFIERS");
// Visual layout parser

View File

@ -22,6 +22,7 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
protected static final Color WORDS_COLOR = new Color(68, 84, 147);
protected static final Color LINES_COLOR = new Color(152, 45, 179);
protected static final Color TOC_COLOR = new Color(33, 159, 144);
protected static final Color ZONES_COLOR = new Color(131, 38, 38);
protected static final Color RULINGS_COLOR = new Color(21, 221, 174);
@ -59,6 +60,7 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
protected final Visualizations outlineObjects = Visualizations.builder().layer(LayerIdentifier.OUTLINE_OBJECTS).build();
protected final Visualizations sentences = Visualizations.builder().layer(LayerIdentifier.SENTENCES).build();
protected final Visualizations tocPages = Visualizations.builder().layer(LayerIdentifier.TOC_PAGES).build();
protected final Visualizations tocBlocks = Visualizations.builder().layer(LayerIdentifier.TOC_BLOCKS).build();
protected final Visualizations listIdentifiers = Visualizations.builder().layer(LayerIdentifier.LIST_IDENTIFIERS).build();
@ -77,6 +79,7 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
markedContent, //
outlineObjects, //
tocPages, //
tocBlocks, //
listIdentifiers //
);
}