RED-7074: Design Subsection section tree structure algorithm

* added abstract class SectionNode
* Section and SuperSection now both extend SectionNode, so neither inherits from the other and no fields are duplicated
This commit is contained in:
maverickstuder 2024-05-22 13:02:16 +02:00
parent b08ed2037e
commit 0c8b2e6d44
10 changed files with 144 additions and 80 deletions

View File

@ -6,6 +6,7 @@ import java.util.Locale;
public enum NodeType implements Serializable { public enum NodeType implements Serializable {
DOCUMENT, DOCUMENT,
SECTION, SECTION,
SUPER_SECTION,
HEADLINE, HEADLINE,
PARAGRAPH, PARAGRAPH,
TABLE, TABLE,

View File

@ -21,81 +21,17 @@ import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder; import lombok.experimental.SuperBuilder;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
@Slf4j
@Data @Data
@SuperBuilder @SuperBuilder
@AllArgsConstructor @AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE) @FieldDefaults(level = AccessLevel.PRIVATE)
public class Section implements GenericSemanticNode { @EqualsAndHashCode(callSuper = true)
public class Section extends SectionNode {
@Builder.Default
Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
List<Integer> treeId;
TextBlock textBlock;
@EqualsAndHashCode.Exclude
DocumentTree documentTree;
@Builder.Default
@EqualsAndHashCode.Exclude
Set<RedactionEntity> entities = new HashSet<>();
@EqualsAndHashCode.Exclude
Map<Page, Rectangle2D> bBoxCache;
@Override
public NodeType getType() {
return NodeType.SECTION;
}
public boolean hasTables() {
return streamAllSubNodesOfType(NodeType.TABLE).findAny()
.isPresent();
}
public boolean isLeafSection() {
return streamAllSubNodesOfType(NodeType.SECTION).findAny()
.isEmpty();
}
@Override
public TextBlock getTextBlock() {
if (textBlock == null) {
textBlock = GenericSemanticNode.super.getTextBlock();
}
return textBlock;
}
@Override @Override
public String toString() { public String toString() {
return treeId.toString() + ": " + NodeType.SECTION + ": " + this.getTextBlock().buildSummary(); return super.toString();
}
public Headline getHeadline() {
return streamChildrenOfType(NodeType.HEADLINE)//
.map(node -> (Headline) node)//
.findFirst()//
.orElseGet(() -> getParent().getHeadline());
}
@Override
public Map<Page, Rectangle2D> getBBox() {
if (bBoxCache == null) {
bBoxCache = GenericSemanticNode.super.getBBox();
}
return bBoxCache;
} }
} }

View File

@ -0,0 +1,103 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
import lombok.extern.slf4j.Slf4j;
/**
 * Common base class for section-like nodes of the document tree.
 *
 * <p>Holds the state shared by all section nodes (layout engines, tree id, text block,
 * owning document tree, entities and a bounding-box cache) and lazily caches the text
 * block and bounding boxes computed by the {@link GenericSemanticNode} default methods.
 */
@Slf4j
@Data
@SuperBuilder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public abstract class SectionNode implements GenericSemanticNode {

    @Builder.Default
    Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));

    List<Integer> treeId;

    // Lazily populated by getTextBlock().
    TextBlock textBlock;

    @EqualsAndHashCode.Exclude
    DocumentTree documentTree;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<RedactionEntity> entities = new HashSet<>();

    // Lazily populated by getBBox().
    @EqualsAndHashCode.Exclude
    Map<Page, Rectangle2D> bBoxCache;


    /**
     * Returns the node type of this node.
     *
     * @return {@link NodeType#SECTION} unless a subclass overrides this method
     */
    @Override
    public NodeType getType() {

        return NodeType.SECTION;
    }


    /**
     * Checks whether any sub node of this node is of type {@link NodeType#TABLE}.
     *
     * @return {@code true} if at least one TABLE sub node is found
     */
    public boolean hasTables() {

        return streamAllSubNodesOfType(NodeType.TABLE).findAny()
                .isPresent();
    }


    /**
     * Checks whether this node has no sub node of type {@link NodeType#SECTION}.
     *
     * <p>NOTE(review): only SECTION sub nodes are considered here; sub nodes of other
     * section-like types are not looked at - confirm this is intended.
     *
     * @return {@code true} if no SECTION sub node is found
     */
    public boolean isLeafSection() {

        return streamAllSubNodesOfType(NodeType.SECTION).findAny()
                .isEmpty();
    }


    /**
     * Returns the text block of this node, computing it through the
     * {@link GenericSemanticNode} default implementation on first access and
     * reusing the stored value afterwards.
     *
     * @return the cached text block
     */
    @Override
    public TextBlock getTextBlock() {

        if (textBlock == null) {
            textBlock = GenericSemanticNode.super.getTextBlock();
        }
        return textBlock;
    }


    /**
     * Builds a short description of the form {@code <treeId>: <type>: <text summary>}.
     *
     * <p>The type is taken from {@link #getType()} so that subclasses reporting a different
     * node type are labelled correctly, and the tree id is concatenated null-safely so that
     * a node whose id has not been assigned yet does not fail on the id lookup.
     *
     * @return the description string
     */
    @Override
    public String toString() {

        return treeId + ": " + getType() + ": " + this.getTextBlock().buildSummary();
    }


    /**
     * Returns the first direct child of type {@link NodeType#HEADLINE}; if there is none,
     * the headline of the parent node is returned instead.
     *
     * @return the headline found on this node or, failing that, the parent's headline
     */
    public Headline getHeadline() {

        return streamChildrenOfType(NodeType.HEADLINE)//
                .map(node -> (Headline) node)//
                .findFirst()//
                .orElseGet(() -> getParent().getHeadline());
    }


    /**
     * Returns the bounding boxes of this node per page, computing them through the
     * {@link GenericSemanticNode} default implementation on first access and
     * reusing the stored map afterwards.
     *
     * @return the cached map from page to bounding box
     */
    @Override
    public Map<Page, Rectangle2D> getBBox() {

        if (bBoxCache == null) {
            bBoxCache = GenericSemanticNode.super.getBBox();
        }
        return bBoxCache;
    }

}

View File

@ -1,14 +1,35 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes; package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.util.List;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data; import lombok.Data;
import lombok.EqualsAndHashCode; import lombok.EqualsAndHashCode;
import lombok.ToString; import lombok.ToString;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder; import lombok.experimental.SuperBuilder;
@Data @Data
@SuperBuilder @SuperBuilder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(callSuper = true) @EqualsAndHashCode(callSuper = true)
public class SuperSection extends Section { public class SuperSection extends SectionNode {
@Override
public NodeType getType() {
return NodeType.SUPER_SECTION;
}
@Override @Override
public String toString() { public String toString() {

View File

@ -6,6 +6,7 @@ import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SectionNode;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
@ -24,7 +25,7 @@ public class TableOfContentItem {
private List<AbstractPageBlock> sectionBlocks = new ArrayList<>(); private List<AbstractPageBlock> sectionBlocks = new ArrayList<>();
private List<ClassifiedImage> images = new ArrayList<>(); private List<ClassifiedImage> images = new ArrayList<>();
private Section section; private SectionNode section;
public TableOfContentItem(TextPageBlock headline) { public TableOfContentItem(TextPageBlock headline) {

View File

@ -32,6 +32,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Im
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SectionNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContentItem; import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContentItem;
@ -74,7 +75,7 @@ public class DocumentGraphFactory {
for (TableOfContentItem tocItem : classificationDocument.getTableOfContents()) { for (TableOfContentItem tocItem : classificationDocument.getTableOfContents()) {
var parent = tocItem.getParent() == null ? null : tocItem.getParent().getSection(); var parent = tocItem.getParent() == null ? null : tocItem.getParent().getSection();
Optional<Section> section = SectionNodeFactory.addSection(layoutParsingType, Optional<SectionNode> section = SectionNodeFactory.addSection(layoutParsingType,
parent, parent,
tocItem.getChildren().isEmpty(), tocItem.getChildren().isEmpty(),
tocItem.getNonEmptySectionBlocks(), tocItem.getNonEmptySectionBlocks(),
@ -239,7 +240,7 @@ public class DocumentGraphFactory {
DocumentTree documentTree; DocumentTree documentTree;
Map<Page, Integer> pages; Map<Page, Integer> pages;
List<Section> sections; List<SectionNode> sections;
List<ClassifiedImage> images; List<ClassifiedImage> images;
TextBlockFactory textBlockFactory; TextBlockFactory textBlockFactory;

View File

@ -18,6 +18,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Do
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SectionNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
@ -29,7 +30,7 @@ import lombok.experimental.UtilityClass;
@UtilityClass @UtilityClass
public class SectionNodeFactory { public class SectionNodeFactory {
public Optional<Section> addSection(LayoutParsingType layoutParsingType, public Optional<SectionNode> addSection(LayoutParsingType layoutParsingType,
GenericSemanticNode parentNode, GenericSemanticNode parentNode,
boolean isLeaf, boolean isLeaf,
List<AbstractPageBlock> pageBlocks, List<AbstractPageBlock> pageBlocks,
@ -53,7 +54,7 @@ public class SectionNodeFactory {
Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream() Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream()
.collect(groupingBy(AbstractPageBlock::getPage)); .collect(groupingBy(AbstractPageBlock::getPage));
Section section; SectionNode section;
if (isLeaf) { if (isLeaf) {
section = Section.builder().documentTree(context.getDocumentTree()).build(); section = Section.builder().documentTree(context.getDocumentTree()).build();
} else { } else {
@ -98,7 +99,7 @@ public class SectionNodeFactory {
} }
private List<Integer> getTreeId(GenericSemanticNode parentNode, DocumentGraphFactory.Context context, Section section) { private List<Integer> getTreeId(GenericSemanticNode parentNode, DocumentGraphFactory.Context context, SectionNode section) {
if (parentNode == null) { if (parentNode == null) {
return context.getDocumentTree().createNewMainEntryAndReturnId(section); return context.getDocumentTree().createNewMainEntryAndReturnId(section);
@ -111,7 +112,7 @@ public class SectionNodeFactory {
private void addFirstHeadlineDirectlyToSection(LayoutParsingType layoutParsingType, private void addFirstHeadlineDirectlyToSection(LayoutParsingType layoutParsingType,
List<AbstractPageBlock> pageBlocks, List<AbstractPageBlock> pageBlocks,
DocumentGraphFactory.Context context, DocumentGraphFactory.Context context,
Section section, SectionNode section,
Document document) { Document document) {
if (pageBlocks.get(0).isHeadline()) { if (pageBlocks.get(0).isHeadline()) {
@ -124,7 +125,7 @@ public class SectionNodeFactory {
private void addTablesAndParagraphsAndHeadlinesToSection(LayoutParsingType layoutParsingType, private void addTablesAndParagraphsAndHeadlinesToSection(LayoutParsingType layoutParsingType,
List<AbstractPageBlock> pageBlocks, List<AbstractPageBlock> pageBlocks,
DocumentGraphFactory.Context context, DocumentGraphFactory.Context context,
Section section, SectionNode section,
Document document) { Document document) {
Set<AbstractPageBlock> alreadyMerged = new HashSet<>(); Set<AbstractPageBlock> alreadyMerged = new HashSet<>();
@ -249,7 +250,7 @@ public class SectionNodeFactory {
} }
private void addSectionNodeToPageNode(DocumentGraphFactory.Context context, Section section, Integer pageNumber) { private void addSectionNodeToPageNode(DocumentGraphFactory.Context context, SectionNode section, Integer pageNumber) {
Page page = context.getPage(pageNumber); Page page = context.getPage(pageNumber);
page.getMainBody().add(section); page.getMainBody().add(section);

View File

@ -61,7 +61,7 @@ public class DocumentGraphMapper {
List<Page> pages = Arrays.stream(entryData.getPageNumbers()).map(pageNumber -> getPage(pageNumber, context)).toList(); List<Page> pages = Arrays.stream(entryData.getPageNumbers()).map(pageNumber -> getPage(pageNumber, context)).toList();
SemanticNode node = switch (entryData.getType()) { SemanticNode node = switch (entryData.getType()) {
case SECTION -> buildSection(context); case SECTION, SUPER_SECTION -> buildSection(context);
case PARAGRAPH -> buildParagraph(context, entryData.getProperties()); case PARAGRAPH -> buildParagraph(context, entryData.getProperties());
case HEADLINE -> buildHeadline(context); case HEADLINE -> buildHeadline(context);
case HEADER -> buildHeader(context); case HEADER -> buildHeader(context);

View File

@ -112,8 +112,8 @@ public class PdfVisualisationUtility {
case DOCUMENT -> Color.LIGHT_GRAY; case DOCUMENT -> Color.LIGHT_GRAY;
case HEADER, FOOTER -> Color.GREEN; case HEADER, FOOTER -> Color.GREEN;
case PARAGRAPH -> Color.BLUE; case PARAGRAPH -> Color.BLUE;
case SUPER_SECTION, SECTION -> Color.BLACK;
case HEADLINE -> Color.RED; case HEADLINE -> Color.RED;
case SECTION -> Color.BLACK;
case TABLE -> Color.ORANGE; case TABLE -> Color.ORANGE;
case TABLE_CELL -> Color.GRAY; case TABLE_CELL -> Color.GRAY;
case IMAGE -> Color.MAGENTA; case IMAGE -> Color.MAGENTA;

View File

@ -229,7 +229,7 @@ public class PdfDraw {
case HEADER, FOOTER -> Color.GREEN; case HEADER, FOOTER -> Color.GREEN;
case PARAGRAPH -> Color.BLUE; case PARAGRAPH -> Color.BLUE;
case HEADLINE -> Color.RED; case HEADLINE -> Color.RED;
case SECTION -> Color.BLACK; case SECTION, SUPER_SECTION -> Color.BLACK;
case TABLE -> Color.ORANGE; case TABLE -> Color.ORANGE;
case TABLE_CELL -> Color.GRAY; case TABLE_CELL -> Color.GRAY;
case IMAGE -> Color.MAGENTA; case IMAGE -> Color.MAGENTA;