RED-7074: Design Subsection section tree structure algorithm
* added abstract class SectionNode * both Section and SuperSection extend the SectionNode class, so that there is no inheritance between Section and SuperSection as well as no field duplication
This commit is contained in:
parent
b08ed2037e
commit
0c8b2e6d44
@ -6,6 +6,7 @@ import java.util.Locale;
|
||||
public enum NodeType implements Serializable {
|
||||
DOCUMENT,
|
||||
SECTION,
|
||||
SUPER_SECTION,
|
||||
HEADLINE,
|
||||
PARAGRAPH,
|
||||
TABLE,
|
||||
|
||||
@ -21,81 +21,17 @@ import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Data
|
||||
@SuperBuilder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class Section implements GenericSemanticNode {
|
||||
|
||||
@Builder.Default
|
||||
Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
|
||||
List<Integer> treeId;
|
||||
|
||||
TextBlock textBlock;
|
||||
@EqualsAndHashCode.Exclude
|
||||
DocumentTree documentTree;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<RedactionEntity> entities = new HashSet<>();
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
Map<Page, Rectangle2D> bBoxCache;
|
||||
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
|
||||
return NodeType.SECTION;
|
||||
}
|
||||
|
||||
|
||||
public boolean hasTables() {
|
||||
|
||||
return streamAllSubNodesOfType(NodeType.TABLE).findAny()
|
||||
.isPresent();
|
||||
}
|
||||
|
||||
public boolean isLeafSection() {
|
||||
|
||||
return streamAllSubNodesOfType(NodeType.SECTION).findAny()
|
||||
.isEmpty();
|
||||
}
|
||||
|
||||
@Override
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = GenericSemanticNode.super.getTextBlock();
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
public class Section extends SectionNode {
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return treeId.toString() + ": " + NodeType.SECTION + ": " + this.getTextBlock().buildSummary();
|
||||
}
|
||||
|
||||
|
||||
public Headline getHeadline() {
|
||||
|
||||
return streamChildrenOfType(NodeType.HEADLINE)//
|
||||
.map(node -> (Headline) node)//
|
||||
.findFirst()//
|
||||
.orElseGet(() -> getParent().getHeadline());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Map<Page, Rectangle2D> getBBox() {
|
||||
|
||||
if (bBoxCache == null) {
|
||||
bBoxCache = GenericSemanticNode.super.getBBox();
|
||||
}
|
||||
return bBoxCache;
|
||||
return super.toString();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,103 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Data
|
||||
@SuperBuilder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public abstract class SectionNode implements GenericSemanticNode {
|
||||
|
||||
@Builder.Default
|
||||
Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
|
||||
List<Integer> treeId;
|
||||
|
||||
TextBlock textBlock;
|
||||
@EqualsAndHashCode.Exclude
|
||||
DocumentTree documentTree;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<RedactionEntity> entities = new HashSet<>();
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
Map<Page, Rectangle2D> bBoxCache;
|
||||
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
|
||||
return NodeType.SECTION;
|
||||
}
|
||||
|
||||
|
||||
public boolean hasTables() {
|
||||
|
||||
return streamAllSubNodesOfType(NodeType.TABLE).findAny()
|
||||
.isPresent();
|
||||
}
|
||||
|
||||
public boolean isLeafSection() {
|
||||
|
||||
return streamAllSubNodesOfType(NodeType.SECTION).findAny()
|
||||
.isEmpty();
|
||||
}
|
||||
|
||||
@Override
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = GenericSemanticNode.super.getTextBlock();
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return treeId.toString() + ": " + NodeType.SECTION + ": " + this.getTextBlock().buildSummary();
|
||||
}
|
||||
|
||||
|
||||
public Headline getHeadline() {
|
||||
|
||||
return streamChildrenOfType(NodeType.HEADLINE)//
|
||||
.map(node -> (Headline) node)//
|
||||
.findFirst()//
|
||||
.orElseGet(() -> getParent().getHeadline());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Map<Page, Rectangle2D> getBBox() {
|
||||
|
||||
if (bBoxCache == null) {
|
||||
bBoxCache = GenericSemanticNode.super.getBBox();
|
||||
}
|
||||
return bBoxCache;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,14 +1,35 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.ToString;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
@Data
|
||||
@SuperBuilder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
public class SuperSection extends Section {
|
||||
public class SuperSection extends SectionNode {
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
|
||||
return NodeType.SUPER_SECTION;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
@ -6,6 +6,7 @@ import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SectionNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
@ -24,7 +25,7 @@ public class TableOfContentItem {
|
||||
private List<AbstractPageBlock> sectionBlocks = new ArrayList<>();
|
||||
private List<ClassifiedImage> images = new ArrayList<>();
|
||||
|
||||
private Section section;
|
||||
private SectionNode section;
|
||||
|
||||
|
||||
public TableOfContentItem(TextPageBlock headline) {
|
||||
|
||||
@ -32,6 +32,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Im
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SectionNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContentItem;
|
||||
@ -74,7 +75,7 @@ public class DocumentGraphFactory {
|
||||
|
||||
for (TableOfContentItem tocItem : classificationDocument.getTableOfContents()) {
|
||||
var parent = tocItem.getParent() == null ? null : tocItem.getParent().getSection();
|
||||
Optional<Section> section = SectionNodeFactory.addSection(layoutParsingType,
|
||||
Optional<SectionNode> section = SectionNodeFactory.addSection(layoutParsingType,
|
||||
parent,
|
||||
tocItem.getChildren().isEmpty(),
|
||||
tocItem.getNonEmptySectionBlocks(),
|
||||
@ -239,7 +240,7 @@ public class DocumentGraphFactory {
|
||||
|
||||
DocumentTree documentTree;
|
||||
Map<Page, Integer> pages;
|
||||
List<Section> sections;
|
||||
List<SectionNode> sections;
|
||||
List<ClassifiedImage> images;
|
||||
TextBlockFactory textBlockFactory;
|
||||
|
||||
|
||||
@ -18,6 +18,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Do
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SectionNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
@ -29,7 +30,7 @@ import lombok.experimental.UtilityClass;
|
||||
@UtilityClass
|
||||
public class SectionNodeFactory {
|
||||
|
||||
public Optional<Section> addSection(LayoutParsingType layoutParsingType,
|
||||
public Optional<SectionNode> addSection(LayoutParsingType layoutParsingType,
|
||||
GenericSemanticNode parentNode,
|
||||
boolean isLeaf,
|
||||
List<AbstractPageBlock> pageBlocks,
|
||||
@ -53,7 +54,7 @@ public class SectionNodeFactory {
|
||||
Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream()
|
||||
.collect(groupingBy(AbstractPageBlock::getPage));
|
||||
|
||||
Section section;
|
||||
SectionNode section;
|
||||
if (isLeaf) {
|
||||
section = Section.builder().documentTree(context.getDocumentTree()).build();
|
||||
} else {
|
||||
@ -98,7 +99,7 @@ public class SectionNodeFactory {
|
||||
}
|
||||
|
||||
|
||||
private List<Integer> getTreeId(GenericSemanticNode parentNode, DocumentGraphFactory.Context context, Section section) {
|
||||
private List<Integer> getTreeId(GenericSemanticNode parentNode, DocumentGraphFactory.Context context, SectionNode section) {
|
||||
|
||||
if (parentNode == null) {
|
||||
return context.getDocumentTree().createNewMainEntryAndReturnId(section);
|
||||
@ -111,7 +112,7 @@ public class SectionNodeFactory {
|
||||
private void addFirstHeadlineDirectlyToSection(LayoutParsingType layoutParsingType,
|
||||
List<AbstractPageBlock> pageBlocks,
|
||||
DocumentGraphFactory.Context context,
|
||||
Section section,
|
||||
SectionNode section,
|
||||
Document document) {
|
||||
|
||||
if (pageBlocks.get(0).isHeadline()) {
|
||||
@ -124,7 +125,7 @@ public class SectionNodeFactory {
|
||||
private void addTablesAndParagraphsAndHeadlinesToSection(LayoutParsingType layoutParsingType,
|
||||
List<AbstractPageBlock> pageBlocks,
|
||||
DocumentGraphFactory.Context context,
|
||||
Section section,
|
||||
SectionNode section,
|
||||
Document document) {
|
||||
|
||||
Set<AbstractPageBlock> alreadyMerged = new HashSet<>();
|
||||
@ -249,7 +250,7 @@ public class SectionNodeFactory {
|
||||
}
|
||||
|
||||
|
||||
private void addSectionNodeToPageNode(DocumentGraphFactory.Context context, Section section, Integer pageNumber) {
|
||||
private void addSectionNodeToPageNode(DocumentGraphFactory.Context context, SectionNode section, Integer pageNumber) {
|
||||
|
||||
Page page = context.getPage(pageNumber);
|
||||
page.getMainBody().add(section);
|
||||
|
||||
@ -61,7 +61,7 @@ public class DocumentGraphMapper {
|
||||
List<Page> pages = Arrays.stream(entryData.getPageNumbers()).map(pageNumber -> getPage(pageNumber, context)).toList();
|
||||
|
||||
SemanticNode node = switch (entryData.getType()) {
|
||||
case SECTION -> buildSection(context);
|
||||
case SECTION, SUPER_SECTION -> buildSection(context);
|
||||
case PARAGRAPH -> buildParagraph(context, entryData.getProperties());
|
||||
case HEADLINE -> buildHeadline(context);
|
||||
case HEADER -> buildHeader(context);
|
||||
|
||||
@ -112,8 +112,8 @@ public class PdfVisualisationUtility {
|
||||
case DOCUMENT -> Color.LIGHT_GRAY;
|
||||
case HEADER, FOOTER -> Color.GREEN;
|
||||
case PARAGRAPH -> Color.BLUE;
|
||||
case SUPER_SECTION, SECTION -> Color.BLACK;
|
||||
case HEADLINE -> Color.RED;
|
||||
case SECTION -> Color.BLACK;
|
||||
case TABLE -> Color.ORANGE;
|
||||
case TABLE_CELL -> Color.GRAY;
|
||||
case IMAGE -> Color.MAGENTA;
|
||||
|
||||
@ -229,7 +229,7 @@ public class PdfDraw {
|
||||
case HEADER, FOOTER -> Color.GREEN;
|
||||
case PARAGRAPH -> Color.BLUE;
|
||||
case HEADLINE -> Color.RED;
|
||||
case SECTION -> Color.BLACK;
|
||||
case SECTION, SUPER_SECTION -> Color.BLACK;
|
||||
case TABLE -> Color.ORANGE;
|
||||
case TABLE_CELL -> Color.GRAY;
|
||||
case IMAGE -> Color.MAGENTA;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user