RED-10380: wip add table of contents node type
This commit is contained in:
parent
124afb3623
commit
8e4cfa2047
@ -14,6 +14,7 @@ public enum PageBlockType {
|
||||
PARAGRAPH_ITALIC,
|
||||
PARAGRAPH_UNKNOWN,
|
||||
OTHER,
|
||||
TABLE_OF_CONTENTS_HEADLINE,
|
||||
TABLE_OF_CONTENTS_ITEM,
|
||||
LIST_ITEM,
|
||||
TABLE;
|
||||
@ -35,7 +36,7 @@ public enum PageBlockType {
|
||||
public static int getHeadlineNumber(PageBlockType pageBlockType) {
|
||||
|
||||
return switch (pageBlockType) {
|
||||
case H1 -> 1;
|
||||
case H1, TABLE_OF_CONTENTS_HEADLINE -> 1;
|
||||
case H2 -> 2;
|
||||
case H3 -> 3;
|
||||
case H4 -> 4;
|
||||
@ -47,6 +48,6 @@ public enum PageBlockType {
|
||||
|
||||
public boolean isHeadline() {
|
||||
|
||||
return this.equals(H1) || this.equals(H2) || this.equals(H3) || this.equals(H4) || this.equals(H5) || this.equals(H6);
|
||||
return this.equals(H1) || this.equals(H2) || this.equals(H3) || this.equals(H4) || this.equals(H5) || this.equals(H6) || this.equals(TABLE_OF_CONTENTS_HEADLINE);
|
||||
}
|
||||
}
|
||||
|
||||
@ -11,6 +11,8 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Se
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableOfContentsItem;
|
||||
|
||||
public abstract class AbstractNodeVisitor implements NodeVisitor {
|
||||
|
||||
@ -83,6 +85,18 @@ public abstract class AbstractNodeVisitor implements NodeVisitor {
|
||||
visitChildren(tableCell);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void visit(TableOfContents toc) {
|
||||
|
||||
visitChildren(toc);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void visit(TableOfContentsItem toci) {
|
||||
|
||||
visitChildren(toci);
|
||||
}
|
||||
|
||||
|
||||
protected void visitChildren(SemanticNode semanticNode) {
|
||||
|
||||
|
||||
@ -10,6 +10,10 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Se
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableOfContentsItem;
|
||||
|
||||
import software.amazon.awssdk.utils.builder.ToCopyableBuilder;
|
||||
|
||||
public interface NodeVisitor {
|
||||
|
||||
@ -42,4 +46,10 @@ public interface NodeVisitor {
|
||||
|
||||
/**
 * Visits a {@link TableCell} node.
 *
 * @param tableCell the node handed to this visitor
 */
void visit(TableCell tableCell);
|
||||
|
||||
|
||||
/**
 * Visits a {@link TableOfContents} node.
 *
 * @param tableOfContents the node handed to this visitor
 */
void visit(TableOfContents tableOfContents);
|
||||
|
||||
|
||||
/**
 * Visits a {@link TableOfContentsItem} node.
 *
 * @param tableOfContentsItem the node handed to this visitor
 */
void visit(TableOfContentsItem tableOfContentsItem);
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,41 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
@Data
|
||||
@SuperBuilder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
public class TableOfContents extends AbstractSemanticNode {
|
||||
|
||||
@Override
|
||||
public NodeTypeProto.NodeType getType() {
|
||||
|
||||
return NodeTypeProto.NodeType.TABLE_OF_CONTENTS;
|
||||
}
|
||||
|
||||
|
||||
public Headline getHeadline() {
|
||||
|
||||
return streamChildrenOfType(NodeTypeProto.NodeType.HEADLINE).map(node -> (Headline) node)
|
||||
.findFirst()
|
||||
.orElseGet(() -> getParent().getHeadline());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void accept(NodeVisitor visitor) {
|
||||
|
||||
visitor.visit(this);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,51 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
@Data
|
||||
@SuperBuilder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
public class TableOfContentsItem extends AbstractSemanticNode {
|
||||
|
||||
TextBlock leafTextBlock;
|
||||
|
||||
|
||||
@Override
|
||||
public NodeTypeProto.NodeType getType() {
|
||||
|
||||
return NodeTypeProto.NodeType.TABLE_OF_CONTENTS_ITEM;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean isLeaf() {
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void accept(NodeVisitor visitor) {
|
||||
|
||||
visitor.visit(this);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
return leafTextBlock;
|
||||
}
|
||||
|
||||
}
|
||||
@ -59,7 +59,7 @@ public class TableOfContentsClassificationService {
|
||||
|
||||
if (end > i + 1) {
|
||||
if (textBlock.textBlock().getClassification() == null) {
|
||||
textBlock.textBlock().setClassification(PageBlockType.H1);
|
||||
textBlock.textBlock().setClassification(PageBlockType.TABLE_OF_CONTENTS_HEADLINE);
|
||||
}
|
||||
i = end;
|
||||
}
|
||||
@ -71,9 +71,9 @@ public class TableOfContentsClassificationService {
|
||||
|
||||
ClassificationPage startPage = textBlocks.get(start).page();
|
||||
List<TextBlockOnPage> initialLookAhead = textBlocks.subList(start, Math.min(start + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()));
|
||||
HashMap<Word, TextBlockOnPage> lookup = new HashMap<>();
|
||||
List<Word> numbers = extractNumbers(initialLookAhead, lookup, document.getPages().size());
|
||||
TocNumberFinder tocNumberFinder = new TocNumberFinder(numbers, lookup);
|
||||
HashMap<Word, TextBlockOnPage> numberToBlockLookup = new HashMap<>();
|
||||
List<Word> numbers = extractNumbers(initialLookAhead, numberToBlockLookup, document.getPages().size());
|
||||
TocNumberFinder tocNumberFinder = new TocNumberFinder(numbers, numberToBlockLookup);
|
||||
|
||||
int lastCandidate = start;
|
||||
for (int i = start; i < Math.min(lastCandidate + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()); i++) {
|
||||
@ -93,7 +93,7 @@ public class TableOfContentsClassificationService {
|
||||
break;
|
||||
}
|
||||
|
||||
List<Word> numbersFromBlock = extractNumbers(textBlockOnPage, lookup, document.getPages().size());
|
||||
List<Word> numbersFromBlock = extractNumbers(textBlockOnPage, numberToBlockLookup, document.getPages().size());
|
||||
|
||||
List<Word> currentRightmostCluster = tocNumberFinder.getCurrentRightmostCluster();
|
||||
|
||||
@ -102,19 +102,19 @@ public class TableOfContentsClassificationService {
|
||||
return start;
|
||||
}
|
||||
|
||||
if (anyIntersection(currentRightmostCluster, numbersFromBlock, lookup)) {
|
||||
if (anyIntersection(currentRightmostCluster, numbersFromBlock, numberToBlockLookup)) {
|
||||
lastCandidate = i;
|
||||
numbersFromBlock.forEach(tocNumberFinder::add);
|
||||
}
|
||||
}
|
||||
|
||||
addVisualization(document.getLayoutDebugLayer(), tocNumberFinder, lookup);
|
||||
|
||||
Set<TextBlockOnPage> blocksWithNumberInCluster = tocNumberFinder.getCurrentRightmostCluster()
|
||||
.stream()
|
||||
.map(lookup::get)
|
||||
.map(numberToBlockLookup::get)
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
addVisualization(document.getLayoutDebugLayer(), tocNumberFinder, numberToBlockLookup, blocksWithNumberInCluster, textBlocks.get(start - 1));
|
||||
|
||||
int lastConfirmed = start;
|
||||
for (int i = start; i < lastCandidate + 1; i++) {
|
||||
TextBlockOnPage textBlockOnPage = textBlocks.get(i);
|
||||
@ -132,18 +132,22 @@ public class TableOfContentsClassificationService {
|
||||
}
|
||||
|
||||
|
||||
private static void addVisualization(LayoutDebugLayer layoutDebugLayer, TocNumberFinder tocNumberFinder, Map<Word, TextBlockOnPage> lookup) {
|
||||
private static void addVisualization(LayoutDebugLayer layoutDebugLayer,
|
||||
TocNumberFinder tocNumberFinder,
|
||||
Map<Word, TextBlockOnPage> lookup,
|
||||
Set<TextBlockOnPage> blocksWithNumberInCluster,
|
||||
TextBlockOnPage startingHeadline) {
|
||||
|
||||
tocNumberFinder.getCurrentRightmostCluster()
|
||||
.stream()
|
||||
.collect(Collectors.groupingBy(key -> lookup.get(key).page().getPageNumber()))
|
||||
.forEach((pageNumber, number) -> layoutDebugLayer.addTocPages(number, pageNumber));
|
||||
layoutDebugLayer.addTocBlocks(blocksWithNumberInCluster);
|
||||
layoutDebugLayer.addTocBlocks(Set.of(startingHeadline));
|
||||
}
|
||||
|
||||
|
||||
private static boolean anyIntersection(Collection<Word> numbers1,
|
||||
Collection<Word> numbers2,
|
||||
Map<Word, TextBlockOnPage> lookup) {
|
||||
private static boolean anyIntersection(Collection<Word> numbers1, Collection<Word> numbers2, Map<Word, TextBlockOnPage> lookup) {
|
||||
|
||||
return numbers1.stream()
|
||||
.anyMatch(numberFromCluster -> numbers2.stream()
|
||||
|
||||
@ -23,6 +23,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.AbstractSemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
@ -35,6 +36,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Im
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableOfContentsItem;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
@ -96,6 +98,7 @@ public class DocumentGraphFactory {
|
||||
|
||||
for (TableOfContentItem tocItem : classificationDocument.getTableOfContents()) {
|
||||
GenericSemanticNode parent = tocItem.getParent() == null ? null : tocItem.getParent().getSection();
|
||||
|
||||
Optional<GenericSemanticNode> section = SectionNodeFactory.addSection(layoutParsingType,
|
||||
parent,
|
||||
tocItem.getChildren().isEmpty(),
|
||||
@ -121,6 +124,8 @@ public class DocumentGraphFactory {
|
||||
node = Headline.builder().documentTree(context.getDocumentTree()).build();
|
||||
} else if (originalTextBlock.isToDuplicate() && layoutParsingType.equals(LayoutParsingType.REDACT_MANAGER)) {
|
||||
node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree()).build();
|
||||
} else if (originalTextBlock.getClassification().equals(PageBlockType.TABLE_OF_CONTENTS_ITEM)) {
|
||||
node = TableOfContentsItem.builder().documentTree(context.getDocumentTree()).build();
|
||||
} else {
|
||||
node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
|
||||
}
|
||||
@ -142,7 +147,9 @@ public class DocumentGraphFactory {
|
||||
List<Integer> treeId = context.documentTree.createNewChildEntryAndReturnId(parentNode, node);
|
||||
node.setLeafTextBlock(textBlock);
|
||||
node.setTreeId(treeId);
|
||||
node.getEngines().addAll(originalTextBlock.getEngines());
|
||||
node.getEngines().
|
||||
|
||||
addAll(originalTextBlock.getEngines());
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -12,11 +12,13 @@ import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.AbstractSemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
@ -27,6 +29,20 @@ import lombok.experimental.UtilityClass;
|
||||
@UtilityClass
|
||||
public class SectionNodeFactory {
|
||||
|
||||
public GenericSemanticNode addTocSection(LayoutParsingType layoutParsingType, List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Document document) {
|
||||
|
||||
AbstractSemanticNode section = TableOfContents.builder().documentTree(context.getDocumentTree()).build();
|
||||
context.getSections().add(section);
|
||||
section.setTreeId(getTreeId(null, context, section));
|
||||
for (AbstractPageBlock pageBlock : pageBlocks) {
|
||||
if (pageBlock instanceof TextPageBlock textPageBlock) {
|
||||
DocumentGraphFactory.addParagraphOrHeadline(section, textPageBlock, context, new ArrayList<>(), layoutParsingType);
|
||||
}
|
||||
}
|
||||
return section;
|
||||
}
|
||||
|
||||
|
||||
public Optional<GenericSemanticNode> addSection(LayoutParsingType layoutParsingType,
|
||||
GenericSemanticNode parentNode,
|
||||
boolean isLeaf,
|
||||
|
||||
@ -27,6 +27,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.outline.Outlin
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.ListIdentifier;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextBlockOnPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
@ -332,8 +333,10 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
||||
|
||||
private void addOutlineObject(OutlineObject outlineObject, PageInformation pageInformation) {
|
||||
|
||||
if (!active) {
|
||||
return;
|
||||
}
|
||||
int rectSize = 5;
|
||||
|
||||
Point2D point2D;
|
||||
if (outlineObject.getPoint().isPresent()) {
|
||||
point2D = outlineObject.getPoint().get();
|
||||
@ -357,10 +360,25 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
||||
|
||||
public void addListIdentifiers(List<ListIdentifier> listIdentifiers) {
|
||||
|
||||
if (!active) {
|
||||
return;
|
||||
}
|
||||
for (ListIdentifier listIdentifier : listIdentifiers) {
|
||||
getOrCreateVisualizationsOnPage(listIdentifier.getPage(), this.listIdentifiers).getColoredRectangles()
|
||||
.add(new ColoredRectangle(listIdentifier.getWord().getBBoxPdf(), WORDS_COLOR, LINE_WIDTH));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void addTocBlocks(Set<TextBlockOnPage> blocksWithNumberInCluster) {
|
||||
|
||||
if (!active) {
|
||||
return;
|
||||
}
|
||||
for (TextBlockOnPage textBlockOnPage : blocksWithNumberInCluster) {
|
||||
getOrCreateVisualizationsOnPage(textBlockOnPage.page().getPageNumber(), this.tocBlocks).getColoredRectangles()
|
||||
.add(new ColoredRectangle(textBlockOnPage.textBlock().getBBoxPdf(), TOC_COLOR, LINE_WIDTH));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -57,6 +57,7 @@ public record LayerIdentifier(String name, String markedContentName) {
|
||||
public static final LayerIdentifier OUTLINE_OBJECTS = new LayerIdentifier("Outline Positions", "OUTLINE_OBJECTS");
|
||||
public static final LayerIdentifier SENTENCES = new LayerIdentifier("Sentences", "SENTENCES");
|
||||
public static final LayerIdentifier TOC_PAGES = new LayerIdentifier("TOC pages", "TOC_PAGES");
|
||||
public static final LayerIdentifier TOC_BLOCKS = new LayerIdentifier("TOC blocks", "TOC_BLOCKS");
|
||||
public static final LayerIdentifier LIST_IDENTIFIERS = new LayerIdentifier("List identifiers", "LIST_IDENTIFIERS");
|
||||
|
||||
// Visual layout parser
|
||||
|
||||
@ -22,6 +22,7 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
|
||||
|
||||
protected static final Color WORDS_COLOR = new Color(68, 84, 147);
|
||||
protected static final Color LINES_COLOR = new Color(152, 45, 179);
|
||||
protected static final Color TOC_COLOR = new Color(33, 159, 144);
|
||||
protected static final Color ZONES_COLOR = new Color(131, 38, 38);
|
||||
|
||||
protected static final Color RULINGS_COLOR = new Color(21, 221, 174);
|
||||
@ -59,6 +60,7 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
|
||||
protected final Visualizations outlineObjects = Visualizations.builder().layer(LayerIdentifier.OUTLINE_OBJECTS).build();
|
||||
protected final Visualizations sentences = Visualizations.builder().layer(LayerIdentifier.SENTENCES).build();
|
||||
protected final Visualizations tocPages = Visualizations.builder().layer(LayerIdentifier.TOC_PAGES).build();
|
||||
protected final Visualizations tocBlocks = Visualizations.builder().layer(LayerIdentifier.TOC_BLOCKS).build();
|
||||
protected final Visualizations listIdentifiers = Visualizations.builder().layer(LayerIdentifier.LIST_IDENTIFIERS).build();
|
||||
|
||||
|
||||
@ -77,6 +79,7 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
|
||||
markedContent, //
|
||||
outlineObjects, //
|
||||
tocPages, //
|
||||
tocBlocks, //
|
||||
listIdentifiers //
|
||||
);
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user