RED-7074: Design Subsection section tree structure algorithm
This commit is contained in:
parent
546341ee75
commit
48b7a22e2b
@ -6,6 +6,7 @@ import java.util.Locale;
|
||||
public enum NodeType implements Serializable {
|
||||
DOCUMENT,
|
||||
SECTION,
|
||||
SUPER_SECTION,
|
||||
HEADLINE,
|
||||
PARAGRAPH,
|
||||
TABLE,
|
||||
|
||||
@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
@ -29,6 +30,11 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineValidationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TOCEnrichmentService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
@ -45,6 +51,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.RulingClean
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
|
||||
@ -90,12 +97,16 @@ public class LayoutParsingPipeline {
|
||||
TableExtractionService tableExtractionService;
|
||||
DocuMineBlockificationService docuMineBlockificationService;
|
||||
RedactManagerBlockificationService redactManagerBlockificationService;
|
||||
BlockificationPostprocessingService blockificationPostprocessingService;
|
||||
DocstrumBlockificationService docstrumBlockificationService;
|
||||
LayoutGridService layoutGridService;
|
||||
ObservationRegistry observationRegistry;
|
||||
VisualLayoutParsingAdapter visualLayoutParsingAdapter;
|
||||
ClarifyndClassificationService clarifyndClassificationService;
|
||||
GraphicExtractorService graphicExtractorService;
|
||||
OutlineExtractorService outlineExtractorService;
|
||||
OutlineValidationService outlineValidationService;
|
||||
TOCEnrichmentService tocEnrichmentService;
|
||||
LayoutparserSettings settings;
|
||||
|
||||
|
||||
@ -123,8 +134,10 @@ public class LayoutParsingPipeline {
|
||||
}
|
||||
|
||||
TableServiceResponse tableServiceResponse = new TableServiceResponse();
|
||||
if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
|
||||
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
|
||||
if (layoutParsingRequest.tablesFileStorageId()
|
||||
.isPresent()) {
|
||||
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId()
|
||||
.get());
|
||||
}
|
||||
|
||||
ClassificationDocument classificationDocument = parseLayout(settings.getLayoutParsingTypeOverride() == null //
|
||||
@ -204,15 +217,15 @@ public class LayoutParsingPipeline {
|
||||
|
||||
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {
|
||||
|
||||
return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
|
||||
numberOfPages,
|
||||
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
|
||||
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
|
||||
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
|
||||
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
|
||||
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
|
||||
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
|
||||
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
|
||||
return format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
|
||||
numberOfPages,
|
||||
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
|
||||
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
|
||||
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
|
||||
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
|
||||
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
|
||||
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
|
||||
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
|
||||
}
|
||||
|
||||
|
||||
@ -227,6 +240,7 @@ public class LayoutParsingPipeline {
|
||||
|
||||
PDDocument originDocument = openDocument(originFile);
|
||||
addNumberOfPagesToTrace(originDocument.getNumberOfPages(), Files.size(originFile.toPath()));
|
||||
|
||||
Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
|
||||
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
|
||||
Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);
|
||||
@ -237,6 +251,12 @@ public class LayoutParsingPipeline {
|
||||
}
|
||||
|
||||
List<ClassificationPage> classificationPages = new ArrayList<>();
|
||||
OutlineObject lastProcessedOutlineObject = null;
|
||||
|
||||
// parsing the structure elements could be useful as well
|
||||
if(layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) {
|
||||
classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));
|
||||
}
|
||||
|
||||
long pageCount = originDocument.getNumberOfPages();
|
||||
|
||||
@ -282,7 +302,13 @@ public class LayoutParsingPipeline {
|
||||
|
||||
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings);
|
||||
|
||||
List<Box> graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getTextPositionSequences(), false);
|
||||
List<Box> graphics = graphicExtractorService.extractPathElementGraphics(originDocument,
|
||||
pdPage,
|
||||
pageNumber,
|
||||
cleanRulings,
|
||||
stripper.getTextPositionSequences(),
|
||||
|
||||
false);
|
||||
|
||||
pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>())
|
||||
.addAll(graphics.stream()
|
||||
@ -306,6 +332,20 @@ public class LayoutParsingPipeline {
|
||||
classificationPage.setPageWidth(cropbox.getWidth());
|
||||
classificationPage.setPageHeight(cropbox.getHeight());
|
||||
|
||||
if(layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) {
|
||||
List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage().getOrDefault(pageNumber - 1, new ArrayList<>());
|
||||
|
||||
OutlineObject notFoundOutlineObject = null;
|
||||
if (lastProcessedOutlineObject != null && !lastProcessedOutlineObject.isFound()) {
|
||||
lastProcessedOutlineObject.setPoint(new Point2D.Float(0, cropbox.getHeight()));
|
||||
notFoundOutlineObject = lastProcessedOutlineObject;
|
||||
}
|
||||
if (!outlineObjects.isEmpty()) {
|
||||
classificationPage.setOutlineObjects(outlineObjects);
|
||||
lastProcessedOutlineObject = blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage, notFoundOutlineObject);
|
||||
}
|
||||
}
|
||||
|
||||
classificationDocument.getVisualizations().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber);
|
||||
// MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox.
|
||||
classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents()));
|
||||
@ -347,14 +387,21 @@ public class LayoutParsingPipeline {
|
||||
case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument);
|
||||
}
|
||||
|
||||
List<TextPageBlock> headlines = classificationDocument.getPages()
|
||||
.stream()
|
||||
.flatMap(classificationPage -> classificationPage.getTextBlocks()
|
||||
.stream()
|
||||
.filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline())
|
||||
.map(tb -> (TextPageBlock) tb))
|
||||
.toList();
|
||||
TableOfContents tableOfContents = outlineValidationService.createToC(headlines);
|
||||
classificationDocument.setTableOfContents(tableOfContents);
|
||||
|
||||
log.info("Building Sections for {}", identifier);
|
||||
|
||||
switch (layoutParsingType) {
|
||||
case CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_PARAGRAPH_DEBUG -> sectionsBuilderService.buildParagraphDebugSections(classificationDocument);
|
||||
default -> {
|
||||
sectionsBuilderService.buildSections(classificationDocument);
|
||||
sectionsBuilderService.addImagesToSections(classificationDocument);
|
||||
}
|
||||
default -> tocEnrichmentService.assignSectionBlocksAndImages(classificationDocument);
|
||||
}
|
||||
|
||||
return classificationDocument;
|
||||
|
||||
@ -3,6 +3,8 @@ package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
|
||||
@ -28,4 +30,7 @@ public class ClassificationDocument {
|
||||
|
||||
private long rulesVersion;
|
||||
|
||||
private OutlineObjectTree outlineObjectTree;
|
||||
private TableOfContents tableOfContents;
|
||||
|
||||
}
|
||||
|
||||
@ -8,13 +8,13 @@ import java.util.Map;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NonNull;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
||||
|
||||
@Data
|
||||
@RequiredArgsConstructor
|
||||
@ -23,6 +23,10 @@ public class ClassificationPage {
|
||||
@NonNull
|
||||
private List<AbstractPageBlock> textBlocks;
|
||||
|
||||
private List<OutlineObject> outlineObjects = new ArrayList<>();
|
||||
|
||||
private List<AbstractPageBlock> headlines = new ArrayList<>();
|
||||
|
||||
private List<ClassifiedImage> images = new ArrayList<>();
|
||||
|
||||
private Rectangle bodyTextFrame;
|
||||
|
||||
@ -12,6 +12,7 @@ import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
@Deprecated
|
||||
public class ClassificationSection {
|
||||
|
||||
private List<AbstractPageBlock> pageBlocks = new ArrayList<>();
|
||||
|
||||
@ -31,6 +31,19 @@ public enum PageBlockType {
|
||||
}
|
||||
|
||||
|
||||
public static int getHeadlineNumber(PageBlockType pageBlockType) {
|
||||
|
||||
return switch (pageBlockType) {
|
||||
case H1 -> 1;
|
||||
case H2 -> 2;
|
||||
case H3 -> 3;
|
||||
case H4 -> 4;
|
||||
case H5 -> 5;
|
||||
default -> 6;
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
public boolean isHeadline() {
|
||||
|
||||
return this.equals(H1) || this.equals(H2) || this.equals(H3) || this.equals(H4) || this.equals(H5) || this.equals(H6);
|
||||
|
||||
@ -8,6 +8,7 @@ import java.util.regex.Pattern;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@AllArgsConstructor
|
||||
@ -16,13 +17,15 @@ public class SectionIdentifier {
|
||||
|
||||
static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?");
|
||||
|
||||
private enum Format {
|
||||
public enum Format {
|
||||
EMPTY,
|
||||
NUMERICAL,
|
||||
DOCUMENT
|
||||
}
|
||||
|
||||
@Getter
|
||||
Format format;
|
||||
@Getter
|
||||
String identifierString;
|
||||
List<Integer> identifiers;
|
||||
boolean asChild;
|
||||
|
||||
@ -140,8 +140,8 @@ public class DocumentTree {
|
||||
if (treeId.isEmpty()) {
|
||||
return root;
|
||||
}
|
||||
Entry entry = root.children.get(treeId.get(0));
|
||||
for (int id : treeId.subList(1, treeId.size())) {
|
||||
Entry entry = root;
|
||||
for (int id : treeId) {
|
||||
entry = entry.children.get(id);
|
||||
}
|
||||
return entry;
|
||||
|
||||
@ -0,0 +1,74 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Data
|
||||
@SuperBuilder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public abstract class AbstractSemanticNode implements GenericSemanticNode {
|
||||
|
||||
@Builder.Default
|
||||
Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
|
||||
List<Integer> treeId;
|
||||
|
||||
TextBlock textBlock;
|
||||
@EqualsAndHashCode.Exclude
|
||||
DocumentTree documentTree;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<RedactionEntity> entities = new HashSet<>();
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
Map<Page, Rectangle2D> bBoxCache;
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = GenericSemanticNode.super.getTextBlock();
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return treeId.toString() + ": " + getType() + ": " + this.getTextBlock().buildSummary();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Map<Page, Rectangle2D> getBBox() {
|
||||
|
||||
if (bBoxCache == null) {
|
||||
bBoxCache = GenericSemanticNode.super.getBBox();
|
||||
}
|
||||
return bBoxCache;
|
||||
}
|
||||
|
||||
}
|
||||
@ -3,43 +3,35 @@ package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@SuperBuilder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class Document implements GenericSemanticNode {
|
||||
|
||||
@Builder.Default
|
||||
Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
|
||||
public class Document extends AbstractSemanticNode {
|
||||
|
||||
Set<Page> pages;
|
||||
DocumentTree documentTree;
|
||||
Integer numberOfPages;
|
||||
TextBlock textBlock;
|
||||
@Builder.Default
|
||||
Set<RedactionEntity> entities = new HashSet<>();
|
||||
|
||||
LayoutparsingVisualizations visualizations;
|
||||
|
||||
@ -51,15 +43,6 @@ public class Document implements GenericSemanticNode {
|
||||
}
|
||||
|
||||
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = GenericSemanticNode.super.getTextBlock();
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
|
||||
public List<Section> getMainSections() {
|
||||
|
||||
return streamChildrenOfType(NodeType.SECTION).map(node -> (Section) node)
|
||||
@ -81,6 +64,15 @@ public class Document implements GenericSemanticNode {
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Headline getHeadline() {
|
||||
|
||||
return streamAllSubNodesOfType(NodeType.HEADLINE).map(node -> (Headline) node)
|
||||
.findFirst()
|
||||
.orElse(Headline.builder().build());
|
||||
}
|
||||
|
||||
|
||||
public Stream<TextBlock> streamTerminalTextBlocksInOrder() {
|
||||
|
||||
return streamAllNodes().filter(SemanticNode::isLeaf)
|
||||
@ -102,18 +94,9 @@ public class Document implements GenericSemanticNode {
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Headline getHeadline() {
|
||||
|
||||
return streamAllSubNodesOfType(NodeType.HEADLINE).map(node -> (Headline) node)
|
||||
.findFirst()
|
||||
.orElse(Headline.builder().build());
|
||||
}
|
||||
|
||||
|
||||
private Stream<SemanticNode> streamAllNodes() {
|
||||
|
||||
return documentTree.allEntriesInOrder()
|
||||
return getDocumentTree().allEntriesInOrder()
|
||||
.map(DocumentTree.Entry::getNode);
|
||||
}
|
||||
|
||||
|
||||
@ -20,7 +20,8 @@ public class DuplicatedParagraph extends Paragraph {
|
||||
@Override
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
return Stream.of(leafTextBlock, unsortedLeafTextBlock).collect(new TextBlockCollector());
|
||||
return Stream.of(super.getLeafTextBlock(), unsortedLeafTextBlock)
|
||||
.collect(new TextBlockCollector());
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -1,48 +1,24 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@SuperBuilder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class Footer implements GenericSemanticNode {
|
||||
public class Footer extends AbstractSemanticNode {
|
||||
|
||||
@Builder.Default
|
||||
Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
|
||||
|
||||
List<Integer> treeId;
|
||||
TextBlock leafTextBlock;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
DocumentTree documentTree;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<RedactionEntity> entities = new HashSet<>();
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
Map<Page, Rectangle2D> bBoxCache;
|
||||
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
@ -68,17 +44,7 @@ public class Footer implements GenericSemanticNode {
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return treeId + ": " + NodeType.FOOTER + ": " + leafTextBlock.buildSummary();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Map<Page, Rectangle2D> getBBox() {
|
||||
|
||||
if (bBoxCache == null) {
|
||||
bBoxCache = GenericSemanticNode.super.getBBox();
|
||||
}
|
||||
return bBoxCache;
|
||||
return getTreeId() + ": " + NodeType.FOOTER + ": " + leafTextBlock.buildSummary();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,47 +1,24 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@SuperBuilder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class Header implements GenericSemanticNode {
|
||||
public class Header extends AbstractSemanticNode {
|
||||
|
||||
@Builder.Default
|
||||
Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
|
||||
List<Integer> treeId;
|
||||
TextBlock leafTextBlock;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
DocumentTree documentTree;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<RedactionEntity> entities = new HashSet<>();
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
Map<Page, Rectangle2D> bBoxCache;
|
||||
|
||||
|
||||
@Override
|
||||
public boolean isLeaf() {
|
||||
@ -67,17 +44,7 @@ public class Header implements GenericSemanticNode {
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return treeId + ": " + NodeType.HEADER + ": " + leafTextBlock.buildSummary();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Map<Page, Rectangle2D> getBBox() {
|
||||
|
||||
if (bBoxCache == null) {
|
||||
bBoxCache = GenericSemanticNode.super.getBBox();
|
||||
}
|
||||
return bBoxCache;
|
||||
return getTreeId() + ": " + NodeType.HEADER + ": " + leafTextBlock.buildSummary();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,47 +1,24 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@SuperBuilder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class Headline implements GenericSemanticNode {
|
||||
public class Headline extends AbstractSemanticNode {
|
||||
|
||||
@Builder.Default
|
||||
Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
|
||||
List<Integer> treeId;
|
||||
TextBlock leafTextBlock;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
DocumentTree documentTree;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<RedactionEntity> entities = new HashSet<>();
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
Map<Page, Rectangle2D> bBoxCache;
|
||||
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
@ -67,7 +44,7 @@ public class Headline implements GenericSemanticNode {
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return treeId + ": " + NodeType.HEADLINE + ": " + leafTextBlock.buildSummary();
|
||||
return getTreeId() + ": " + NodeType.HEADLINE + ": " + leafTextBlock.buildSummary();
|
||||
}
|
||||
|
||||
|
||||
@ -77,14 +54,4 @@ public class Headline implements GenericSemanticNode {
|
||||
return this;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Map<Page, Rectangle2D> getBBox() {
|
||||
|
||||
if (bBoxCache == null) {
|
||||
bBoxCache = GenericSemanticNode.super.getBBox();
|
||||
}
|
||||
return bBoxCache;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -3,15 +3,10 @@ package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
@ -21,18 +16,16 @@ import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@SuperBuilder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class Image implements GenericSemanticNode {
|
||||
public class Image extends AbstractSemanticNode {
|
||||
|
||||
@Builder.Default
|
||||
Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
|
||||
|
||||
List<Integer> treeId;
|
||||
String id;
|
||||
|
||||
ImageType imageType;
|
||||
@ -53,13 +46,6 @@ public class Image implements GenericSemanticNode {
|
||||
@EqualsAndHashCode.Exclude
|
||||
Page page;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
DocumentTree documentTree;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<RedactionEntity> entities = new HashSet<>();
|
||||
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
@ -85,7 +71,7 @@ public class Image implements GenericSemanticNode {
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return treeId + ": " + NodeType.IMAGE + ": " + imageType.toString() + " " + position;
|
||||
return getTreeId() + ": " + NodeType.IMAGE + ": " + imageType.toString() + " " + position;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -1,20 +1,10 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
@ -23,25 +13,12 @@ import lombok.experimental.SuperBuilder;
|
||||
@Data
|
||||
@SuperBuilder
|
||||
@AllArgsConstructor
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
@FieldDefaults(level = AccessLevel.PROTECTED)
|
||||
public class Paragraph implements GenericSemanticNode {
|
||||
public class Paragraph extends AbstractSemanticNode {
|
||||
|
||||
@Builder.Default
|
||||
Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
|
||||
|
||||
List<Integer> treeId;
|
||||
TextBlock leafTextBlock;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
DocumentTree documentTree;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<RedactionEntity> entities = new HashSet<>();
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
Map<Page, Rectangle2D> bBoxCache;
|
||||
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
@ -63,21 +40,4 @@ public class Paragraph implements GenericSemanticNode {
|
||||
return leafTextBlock;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return treeId + ": " + NodeType.PARAGRAPH + ": " + leafTextBlock.buildSummary();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Map<Page, Rectangle2D> getBBox() {
|
||||
|
||||
if (bBoxCache == null) {
|
||||
bBoxCache = GenericSemanticNode.super.getBBox();
|
||||
}
|
||||
return bBoxCache;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,47 +1,20 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
@Slf4j
|
||||
@Data
|
||||
@Builder
|
||||
@SuperBuilder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class Section implements GenericSemanticNode {
|
||||
|
||||
@Builder.Default
|
||||
Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
|
||||
List<Integer> treeId;
|
||||
|
||||
TextBlock textBlock;
|
||||
@EqualsAndHashCode.Exclude
|
||||
DocumentTree documentTree;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<RedactionEntity> entities = new HashSet<>();
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
Map<Page, Rectangle2D> bBoxCache;
|
||||
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
public class Section extends AbstractSemanticNode {
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
@ -50,6 +23,14 @@ public class Section implements GenericSemanticNode {
|
||||
}
|
||||
|
||||
|
||||
public Headline getHeadline() {
|
||||
|
||||
return streamChildrenOfType(NodeType.HEADLINE).map(node -> (Headline) node)
|
||||
.findFirst()
|
||||
.orElseGet(() -> getParent().getHeadline());
|
||||
}
|
||||
|
||||
|
||||
public boolean hasTables() {
|
||||
|
||||
return streamAllSubNodesOfType(NodeType.TABLE).findAny()
|
||||
@ -57,39 +38,10 @@ public class Section implements GenericSemanticNode {
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = GenericSemanticNode.super.getTextBlock();
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return treeId.toString() + ": " + NodeType.SECTION + ": " + this.getTextBlock().buildSummary();
|
||||
}
|
||||
|
||||
|
||||
public Headline getHeadline() {
|
||||
|
||||
return streamChildrenOfType(NodeType.HEADLINE)//
|
||||
.map(node -> (Headline) node)//
|
||||
.findFirst()//
|
||||
.orElseGet(() -> getParent().getHeadline());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Map<Page, Rectangle2D> getBBox() {
|
||||
|
||||
if (bBoxCache == null) {
|
||||
bBoxCache = GenericSemanticNode.super.getBBox();
|
||||
}
|
||||
return bBoxCache;
|
||||
return getTreeId() + ": " + NodeType.SECTION + ": " + this.getTextBlock().buildSummary();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,40 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
@Data
|
||||
@SuperBuilder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
public class SuperSection extends AbstractSemanticNode {
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
|
||||
return NodeType.SUPER_SECTION;
|
||||
}
|
||||
|
||||
|
||||
public Headline getHeadline() {
|
||||
|
||||
return streamChildrenOfType(NodeType.HEADLINE).map(node -> (Headline) node)
|
||||
.findFirst()
|
||||
.orElseGet(() -> getParent().getHeadline());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return getTreeId() + ": " + NodeType.SUPER_SECTION + ": " + this.getTextBlock().buildSummary();
|
||||
}
|
||||
|
||||
}
|
||||
@ -2,34 +2,26 @@ package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@SuperBuilder
|
||||
@AllArgsConstructor
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class TableCell implements GenericSemanticNode {
|
||||
public class TableCell extends AbstractSemanticNode {
|
||||
|
||||
@Builder.Default
|
||||
Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
|
||||
List<Integer> treeId;
|
||||
int row;
|
||||
int col;
|
||||
boolean header;
|
||||
@ -40,13 +32,6 @@ public class TableCell implements GenericSemanticNode {
|
||||
|
||||
TextBlock textBlock;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
DocumentTree documentTree;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<RedactionEntity> entities = new HashSet<>();
|
||||
|
||||
|
||||
@Override
|
||||
public Map<Page, Rectangle2D> getBBox() {
|
||||
@ -96,7 +81,7 @@ public class TableCell implements GenericSemanticNode {
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return treeId + ": " + NodeType.TABLE_CELL + ": " + this.getTextBlock().buildSummary();
|
||||
return getTreeId() + ": " + NodeType.TABLE_CELL + ": " + this.getTextBlock().buildSummary();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -50,14 +50,16 @@ public class ConcatenatedTextBlock implements TextBlock {
|
||||
|
||||
public ConcatenatedTextBlock concat(TextBlock textBlock) {
|
||||
|
||||
int start = textBlock.getBoundary().start();
|
||||
int end = textBlock.getBoundary().end();
|
||||
if (this.atomicTextBlocks.isEmpty()) {
|
||||
boundary.setStart(textBlock.getBoundary().start());
|
||||
boundary.setEnd(textBlock.getBoundary().end());
|
||||
} else if (boundary.end() != textBlock.getBoundary().start()) {
|
||||
boundary.setStart(start);
|
||||
boundary.setEnd(end);
|
||||
} else if (boundary.end() != start) {
|
||||
throw new UnsupportedOperationException(format("Can only concat consecutive TextBlocks, trying to concat %s and %s", boundary, textBlock.getBoundary()));
|
||||
}
|
||||
this.atomicTextBlocks.addAll(textBlock.getAtomicTextBlocks());
|
||||
boundary.setEnd(textBlock.getBoundary().end());
|
||||
boundary.setEnd(end);
|
||||
this.searchText = null;
|
||||
return this;
|
||||
}
|
||||
|
||||
@ -0,0 +1,209 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.pdfbox.cos.COSArray;
|
||||
import org.apache.pdfbox.cos.COSBase;
|
||||
import org.apache.pdfbox.cos.COSDictionary;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.cos.COSString;
|
||||
import org.apache.pdfbox.pdmodel.PDDestinationNameTreeNode;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
|
||||
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDDestination;
|
||||
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageFitHeightDestination;
|
||||
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageFitRectangleDestination;
|
||||
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageFitWidthDestination;
|
||||
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageXYZDestination;
|
||||
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
|
||||
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Service
|
||||
@Slf4j
|
||||
public class OutlineExtractorService {
|
||||
|
||||
private static final String PDDESTINATION_TYPE_FIT = "Fit";
|
||||
private static final String PDDESTINATION_TYPE_FIT_B = "FitB";
|
||||
private static final String PDDESTINATION_TYPE_FIT_H = "FitH";
|
||||
private static final String PDDESTINATION_TYPE_FIT_V = "FitV";
|
||||
private static final String PDDESTINATION_TYPE_FIT_R = "FitR";
|
||||
private static final String PDDESTINATION_TYPE_FIT_BH = "FitBH";
|
||||
private static final String PDDESTINATION_TYPE_FIT_BV = "FitBV";
|
||||
private static final String PDDESTINATION_TYPE_XYZ = "XYZ";
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public OutlineObjectTree getOutlineObjectTree(PDDocument document) {
|
||||
|
||||
PDDocumentOutline documentOutline = document.getDocumentCatalog().getDocumentOutline();
|
||||
|
||||
List<OutlineObjectTreeNode> rootNodes = new ArrayList<>();
|
||||
if (documentOutline != null) {
|
||||
for (PDOutlineItem child : documentOutline.children()) {
|
||||
Optional<OutlineObjectTreeNode> outlineObjectWithChildren = createOutlineObjectWithChildren(child, document, 1);
|
||||
outlineObjectWithChildren.ifPresent(rootNodes::add);
|
||||
}
|
||||
}
|
||||
|
||||
return new OutlineObjectTree(rootNodes);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private Optional<OutlineObjectTreeNode> createOutlineObjectWithChildren(PDOutlineItem item, PDDocument document, int depth) {
|
||||
|
||||
Optional<OutlineObjectTreeNode> outlineObject = createOutlineObject(item, document, depth);
|
||||
if (outlineObject.isPresent()) {
|
||||
for (var child : item.children()) {
|
||||
Optional<OutlineObjectTreeNode> outlineObjectWithChildren = createOutlineObjectWithChildren(child, document, depth + 1);
|
||||
outlineObjectWithChildren.ifPresent(outlineObjectTreeNode -> outlineObject.get().addChild(outlineObjectTreeNode));
|
||||
}
|
||||
}
|
||||
|
||||
return outlineObject;
|
||||
}
|
||||
|
||||
|
||||
// if the structure elements are processed beforehand, another case can be handled here as well:
|
||||
// outline objects can reference structure elements (see pdf documentation)
|
||||
@SneakyThrows
|
||||
private Optional<OutlineObjectTreeNode> createOutlineObject(PDOutlineItem item, PDDocument document, int depth) {
|
||||
|
||||
String title = item.getTitle();
|
||||
|
||||
PDPage page = item.findDestinationPage(document);
|
||||
if (page == null) {
|
||||
return Optional.empty();
|
||||
}
|
||||
int pageNumber = document.getPages().indexOf(page);
|
||||
|
||||
Optional<Point2D> outlinePosition = Optional.empty();
|
||||
|
||||
try {
|
||||
PDDocumentNameDictionary names = document.getDocumentCatalog().getNames();
|
||||
PDDestinationNameTreeNode destinations = null;
|
||||
if (names != null) {
|
||||
destinations = names.getDests();
|
||||
}
|
||||
|
||||
PDDestination destination = item.getDestination();
|
||||
if (destination != null) {
|
||||
outlinePosition = getLocationFromCOSBase(destinations, destination.getCOSObject());
|
||||
}
|
||||
|
||||
if (outlinePosition.isEmpty()) {
|
||||
|
||||
PDAction action = item.getAction();
|
||||
if (action != null) {
|
||||
outlinePosition = extractOutlineLocationGoTo(destinations, action.getCOSObject());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
} catch (Exception e) {
|
||||
log.info(String.format("Error occurred during position resolution for outline item on page %s with title %s: " + e, pageNumber, title));
|
||||
}
|
||||
|
||||
return Optional.of(new OutlineObjectTreeNode(new OutlineObject(title, pageNumber, outlinePosition.orElse(new Point2D.Float(0, 0)), depth)));
|
||||
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private static Optional<Point2D> extractOutlineLocationGoTo(PDDestinationNameTreeNode destinations, COSDictionary cosDictionary) {
|
||||
|
||||
if (isGoToAction(cosDictionary)) {
|
||||
COSBase cosBase = cosDictionary.getItem(COSName.D);
|
||||
return getLocationFromCOSBase(destinations, cosBase);
|
||||
}
|
||||
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
|
||||
private static Optional<Point2D> getLocationFromCOSBase(PDDestinationNameTreeNode destinations, COSBase cosBase) throws IOException {
|
||||
|
||||
if (cosBase != null) {
|
||||
if (cosBase instanceof COSArray cosArray) {
|
||||
return getLocationFromCosArray(cosArray);
|
||||
}
|
||||
|
||||
if (cosBase instanceof COSString cosString) {
|
||||
String destinationName = cosString.getString();
|
||||
COSArray cosArray = destinations.getValue(destinationName).getCOSObject();
|
||||
return getLocationFromCosArray(cosArray);
|
||||
}
|
||||
|
||||
}
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
|
||||
private static Optional<Point2D> getLocationFromCosArray(COSArray cosArray) {
|
||||
|
||||
boolean located = false;
|
||||
float x = 0;
|
||||
float y = 0;
|
||||
|
||||
try {
|
||||
|
||||
PDDestination destination = PDDestination.create(cosArray);
|
||||
COSName type = (COSName) cosArray.getObject(1);
|
||||
String typeString = type.getName();
|
||||
|
||||
switch (typeString) {
|
||||
case PDDESTINATION_TYPE_FIT_V:
|
||||
case PDDESTINATION_TYPE_FIT_BV:
|
||||
PDPageFitHeightDestination fitHeightDestination = (PDPageFitHeightDestination) destination;
|
||||
x = fitHeightDestination.getLeft();
|
||||
located = true;
|
||||
break;
|
||||
case PDDESTINATION_TYPE_FIT_R:
|
||||
PDPageFitRectangleDestination fitRectangleDestination = (PDPageFitRectangleDestination) destination;
|
||||
x = fitRectangleDestination.getLeft();
|
||||
y = fitRectangleDestination.getTop();
|
||||
located = true;
|
||||
break;
|
||||
case PDDESTINATION_TYPE_FIT_H:
|
||||
case PDDESTINATION_TYPE_FIT_BH:
|
||||
PDPageFitWidthDestination fitWidthDestination = (PDPageFitWidthDestination) destination;
|
||||
y = fitWidthDestination.getTop();
|
||||
located = true;
|
||||
break;
|
||||
case PDDESTINATION_TYPE_XYZ:
|
||||
PDPageXYZDestination xyzDestination = (PDPageXYZDestination) destination;
|
||||
x = xyzDestination.getLeft();
|
||||
y = xyzDestination.getTop();
|
||||
located = true;
|
||||
break;
|
||||
case PDDESTINATION_TYPE_FIT:
|
||||
case PDDESTINATION_TYPE_FIT_B:
|
||||
default:
|
||||
}
|
||||
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
return located ? Optional.of(new Point2D.Float(x, y)) : Optional.empty();
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static boolean isGoToAction(COSDictionary cosDictionary) {
|
||||
|
||||
return cosDictionary.getNameAsString("S").toLowerCase(Locale.ROOT).equals("goto");
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,35 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@Data
|
||||
@RequiredArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class OutlineObject {
|
||||
|
||||
private final String title;
|
||||
private final int pageNumber;
|
||||
private Point2D point;
|
||||
private final int treeDepth;
|
||||
|
||||
private boolean found;
|
||||
|
||||
|
||||
public OutlineObject(String title, int pageNumber, Point2D point2D, int depth) {
|
||||
|
||||
this(title, pageNumber, depth);
|
||||
this.point = point2D;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return "OutlineObject{" + "title='" + title + '\'' + '}';
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,42 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@Data
|
||||
@RequiredArgsConstructor
|
||||
public class OutlineObjectTree {
|
||||
|
||||
private List<OutlineObjectTreeNode> rootNodes = new ArrayList<>();
|
||||
|
||||
private Map<Integer, List<OutlineObject>> outlineObjectsPerPage = new HashMap<>();
|
||||
|
||||
|
||||
public OutlineObjectTree(List<OutlineObjectTreeNode> rootNodes) {
|
||||
|
||||
this.rootNodes = rootNodes;
|
||||
flattenNodesAndGroupByPage(rootNodes);
|
||||
}
|
||||
|
||||
|
||||
private void flattenNodesAndGroupByPage(List<OutlineObjectTreeNode> outlineObjectTreeNodes) {
|
||||
|
||||
for (OutlineObjectTreeNode node : outlineObjectTreeNodes) {
|
||||
int pageNumber = node.getOutlineObject().getPageNumber();
|
||||
if (!this.outlineObjectsPerPage.containsKey(pageNumber)) {
|
||||
outlineObjectsPerPage.put(pageNumber, new ArrayList<>());
|
||||
}
|
||||
outlineObjectsPerPage.get(pageNumber).add(node.getOutlineObject());
|
||||
|
||||
if (!node.getChildren().isEmpty()) {
|
||||
flattenNodesAndGroupByPage(node.getChildren());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,34 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class OutlineObjectTreeNode {
|
||||
|
||||
private OutlineObject outlineObject;
|
||||
|
||||
private List<OutlineObjectTreeNode> children = new ArrayList<>();
|
||||
|
||||
|
||||
public OutlineObjectTreeNode(OutlineObject outlineObject) {
|
||||
|
||||
this.outlineObject = outlineObject;
|
||||
}
|
||||
|
||||
|
||||
public void addChild(OutlineObjectTreeNode outlineObject) {
|
||||
|
||||
children.add(outlineObject);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return "OutlineObjectTreeNode{" + "outlineObject=" + outlineObject + '}';
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,61 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.getHeadlineNumber;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Service
|
||||
@Slf4j
|
||||
public class OutlineValidationService {
|
||||
|
||||
public TableOfContents createToC(List<TextPageBlock> headlines) {
|
||||
|
||||
List<TableOfContentItem> mainSections = new ArrayList<>();
|
||||
Map<Integer, TableOfContentItem> lastItemsPerDepth = new HashMap<>();
|
||||
TableOfContentItem last = null;
|
||||
TreeSet<Integer> depths = new TreeSet<>();
|
||||
|
||||
for (TextPageBlock current : headlines) {
|
||||
int currentDepth = getHeadlineNumber(current.getClassification());
|
||||
Integer parentDepth = depths.floor(currentDepth - 1);
|
||||
|
||||
var tocItem = new TableOfContentItem(current);
|
||||
|
||||
if (parentDepth == null) {
|
||||
mainSections.add(tocItem);
|
||||
lastItemsPerDepth = new HashMap<>();
|
||||
depths = new TreeSet<>();
|
||||
|
||||
} else {
|
||||
assert last != null;
|
||||
int lastDepth = getHeadlineNumber(last.getHeadline().getClassification());
|
||||
|
||||
if (lastDepth < parentDepth) {
|
||||
parentDepth = lastDepth;
|
||||
} else if (lastDepth == currentDepth && last.getParent() != null) {
|
||||
parentDepth = getHeadlineNumber(last.getParent().getHeadline().getClassification());
|
||||
}
|
||||
|
||||
TableOfContentItem parent = lastItemsPerDepth.get(parentDepth);
|
||||
parent.addChild(tocItem);
|
||||
}
|
||||
|
||||
last = tocItem;
|
||||
lastItemsPerDepth.put(currentDepth, tocItem);
|
||||
depths.add(currentDepth);
|
||||
}
|
||||
|
||||
return new TableOfContents(mainSections);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,261 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
public class TOCEnrichmentService {
|
||||
|
||||
public void assignSectionBlocksAndImages(ClassificationDocument document) {
|
||||
|
||||
TableOfContents toc = document.getTableOfContents();
|
||||
Iterator<TableOfContentItem> iterator = toc.iterator();
|
||||
TableOfContentItem currentTOCItem = null;
|
||||
if(iterator.hasNext()) {
|
||||
currentTOCItem = iterator.next();
|
||||
}
|
||||
List<AbstractPageBlock> startBlocks = new ArrayList<>();
|
||||
List<ClassifiedImage> startImages = new ArrayList<>();
|
||||
TableOfContentItem currentSection = null;
|
||||
boolean foundFirstHeadline = false;
|
||||
|
||||
List<ClassificationHeader> headers = new ArrayList<>();
|
||||
List<ClassificationFooter> footers = new ArrayList<>();
|
||||
TablePageBlock previousTable = null;
|
||||
List<TableOfContentItem> lastFoundTOCItems = new ArrayList<>();
|
||||
|
||||
for (ClassificationPage page : document.getPages()) {
|
||||
List<TableOfContentItem> currentPageTOCItems = new ArrayList<>();
|
||||
List<TextPageBlock> header = new ArrayList<>();
|
||||
List<TextPageBlock> footer = new ArrayList<>();
|
||||
for (AbstractPageBlock current : page.getTextBlocks()) {
|
||||
|
||||
if (current.getClassification() == null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
current.setPage(page.getPageNumber());
|
||||
|
||||
if (current.getClassification().equals(PageBlockType.HEADER)) {
|
||||
header.add((TextPageBlock) current);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (current.getClassification().equals(PageBlockType.FOOTER)) {
|
||||
footer.add((TextPageBlock) current);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (current instanceof TablePageBlock table) {
|
||||
if (previousTable != null) {
|
||||
mergeTableMetadata(table, previousTable);
|
||||
}
|
||||
previousTable = table;
|
||||
}
|
||||
|
||||
if (current instanceof TextPageBlock && currentTOCItem != null && currentTOCItem.getHeadline().getText().equals(current.getText())) {
|
||||
if (!foundFirstHeadline) {
|
||||
foundFirstHeadline = true;
|
||||
}
|
||||
currentSection = currentTOCItem;
|
||||
currentTOCItem.getSectionBlocks().add(current);
|
||||
currentPageTOCItems.add(currentTOCItem);
|
||||
|
||||
if(iterator.hasNext()) {
|
||||
currentTOCItem = iterator.next();
|
||||
}
|
||||
}
|
||||
|
||||
if (!foundFirstHeadline) {
|
||||
startBlocks.add(current);
|
||||
} else {
|
||||
currentSection.getSectionBlocks().add(current);
|
||||
}
|
||||
}
|
||||
|
||||
if (!currentPageTOCItems.isEmpty()) {
|
||||
lastFoundTOCItems = currentPageTOCItems;
|
||||
}
|
||||
|
||||
for (ClassifiedImage image : page.getImages()) {
|
||||
|
||||
Double xMin = null;
|
||||
Double yMin = null;
|
||||
Double xMax = null;
|
||||
Double yMax = null;
|
||||
|
||||
for (TableOfContentItem tocItem : lastFoundTOCItems) {
|
||||
var headline = tocItem.getHeadline();
|
||||
|
||||
if (headline.getPage() != page.getPageNumber()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (headline.getMinX() < headline.getMaxX()) {
|
||||
if (xMin == null || headline.getMinX() < xMin) {
|
||||
xMin = headline.getMinX();
|
||||
}
|
||||
if (xMax == null || headline.getMaxX() > xMax) {
|
||||
xMax = headline.getMaxX();
|
||||
}
|
||||
} else {
|
||||
if (xMin == null || headline.getMaxX() < xMin) {
|
||||
xMin = headline.getMaxX();
|
||||
}
|
||||
if (xMax == null || headline.getMinX() > xMax) {
|
||||
xMax = headline.getMinX();
|
||||
}
|
||||
}
|
||||
|
||||
if (headline.getMinY() < headline.getMaxY()) {
|
||||
if (yMin == null || headline.getMinY() < yMin) {
|
||||
yMin = headline.getMinY();
|
||||
}
|
||||
if (yMax == null || headline.getMaxY() > yMax) {
|
||||
yMax = headline.getMaxY();
|
||||
}
|
||||
} else {
|
||||
if (yMin == null || headline.getMaxY() < yMin) {
|
||||
yMin = headline.getMaxY();
|
||||
}
|
||||
if (yMax == null || headline.getMinY() > yMax) {
|
||||
yMax = headline.getMinY();
|
||||
}
|
||||
}
|
||||
|
||||
log.debug("Image position x: {}, y: {}", image.getPosition().getX(), image.getPosition().getY());
|
||||
log.debug("Headline position xMin: {}, xMax: {}, yMin: {}, yMax: {}", xMin, xMax, yMin, yMax);
|
||||
|
||||
if (image.getPosition().getX() >= xMin && image.getPosition().getX() <= xMax && image.getPosition().getY() >= yMin && image.getPosition().getY() <= yMax) {
|
||||
tocItem.getImages().add(image);
|
||||
image.setAppendedToSection(true);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!image.isAppendedToSection()) {
|
||||
log.debug("Image uses first paragraph");
|
||||
if (!lastFoundTOCItems.isEmpty()) {
|
||||
lastFoundTOCItems.get(0).getImages().add(image);
|
||||
} else {
|
||||
startImages.add(image);
|
||||
}
|
||||
image.setAppendedToSection(true);
|
||||
}
|
||||
}
|
||||
|
||||
if (!header.isEmpty()) {
|
||||
headers.add(new ClassificationHeader(header));
|
||||
}
|
||||
if (!footer.isEmpty()) {
|
||||
footers.add(new ClassificationFooter(footer));
|
||||
}
|
||||
}
|
||||
|
||||
if (!startBlocks.isEmpty()) {
|
||||
TableOfContentItem unassigned = new TableOfContentItem(null);
|
||||
unassigned.setSectionBlocks(startBlocks);
|
||||
unassigned.setImages(startImages);
|
||||
document.getTableOfContents().getMainSections().add(0, unassigned);
|
||||
}
|
||||
document.setHeaders(headers);
|
||||
document.setFooters(footers);
|
||||
}
|
||||
|
||||
|
||||
private void mergeTableMetadata(TablePageBlock currentTable, TablePageBlock previousTable) {
|
||||
|
||||
// Distribute header information for subsequent tables
|
||||
if (previousTable != null && hasInvalidHeaderInformation(currentTable) && hasValidHeaderInformation(previousTable)) {
|
||||
List<Cell> previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable);
|
||||
List<Cell> tableNonHeaderRow = getRowWithNonHeaderCells(currentTable);
|
||||
// Allow merging of tables if header row is separated from first logical non-header row
|
||||
if (previousTableNonHeaderRow.isEmpty()
|
||||
&& previousTable.getRowCount() == 1
|
||||
&& previousTable.getRows()
|
||||
.get(0).size() == tableNonHeaderRow.size()) {
|
||||
previousTableNonHeaderRow = previousTable.getRows()
|
||||
.get(0)
|
||||
.stream()
|
||||
.map(cell -> {
|
||||
Cell fakeCell = Cell.copy(cell);
|
||||
fakeCell.setHeaderCells(Collections.singletonList(cell));
|
||||
return fakeCell;
|
||||
})
|
||||
.toList();
|
||||
}
|
||||
if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) {
|
||||
for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
|
||||
List<Cell> row = currentTable.getRows()
|
||||
.get(i);
|
||||
if (row.size() == tableNonHeaderRow.size() && row.stream()
|
||||
.allMatch(cell -> cell.getHeaderCells().isEmpty())) {
|
||||
for (int j = 0; j < row.size(); j++) {
|
||||
row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private boolean hasValidHeaderInformation(TablePageBlock table) {
|
||||
|
||||
return !hasInvalidHeaderInformation(table);
|
||||
}
|
||||
|
||||
|
||||
private boolean hasInvalidHeaderInformation(TablePageBlock table) {
|
||||
|
||||
return table.getRows()
|
||||
.stream()
|
||||
.flatMap(row -> row.stream()
|
||||
.filter(cell -> !cell.getHeaderCells().isEmpty()))
|
||||
.findAny().isEmpty();
|
||||
|
||||
}
|
||||
|
||||
|
||||
private List<Cell> getRowWithNonHeaderCells(TablePageBlock table) {
|
||||
|
||||
for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
|
||||
List<Cell> row = table.getRows()
|
||||
.get(i);
|
||||
if (row.size() == 1) {
|
||||
continue;
|
||||
}
|
||||
boolean allNonHeader = true;
|
||||
for (Cell cell : row) {
|
||||
if (cell.isHeaderCell()) {
|
||||
allNonHeader = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (allNonHeader) {
|
||||
return row;
|
||||
}
|
||||
}
|
||||
|
||||
return Collections.emptyList();
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,109 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.AbstractSemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
|
||||
@Data
|
||||
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
|
||||
public class TableOfContentItem {
|
||||
|
||||
@EqualsAndHashCode.Include
|
||||
private TextPageBlock headline;
|
||||
private List<TableOfContentItem> children = new ArrayList<>();
|
||||
private TableOfContentItem parent;
|
||||
|
||||
private List<AbstractPageBlock> sectionBlocks = new ArrayList<>();
|
||||
private List<ClassifiedImage> images = new ArrayList<>();
|
||||
|
||||
private AbstractSemanticNode section;
|
||||
|
||||
|
||||
public TableOfContentItem(TextPageBlock headline) {
|
||||
|
||||
this.headline = headline;
|
||||
}
|
||||
|
||||
|
||||
public void addChild(TableOfContentItem tableOfContentItem) {
|
||||
|
||||
children.add(tableOfContentItem);
|
||||
tableOfContentItem.setParent(this);
|
||||
}
|
||||
|
||||
|
||||
public TableOfContentItem getSiblingBefore() {
|
||||
|
||||
if (parent != null) {
|
||||
int index = parent.getChildren().indexOf(this);
|
||||
if (index > 0) {
|
||||
return parent.getChildren()
|
||||
.get(index - 1);
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
public TableOfContentItem getSiblingAfter() {
|
||||
|
||||
if (parent != null) {
|
||||
int index = parent.getChildren().indexOf(this);
|
||||
if (index >= 0 && index < parent.getChildren().size() - 1) {
|
||||
return parent.getChildren()
|
||||
.get(index + 1);
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(TextPageBlock block) {
|
||||
|
||||
if (headline.equals(block)) {
|
||||
return true;
|
||||
}
|
||||
for (TableOfContentItem child : children) {
|
||||
if (child.contains(block)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(TableOfContentItem tocItem) {
|
||||
|
||||
if (this.equals(tocItem)) {
|
||||
return true;
|
||||
}
|
||||
for (TableOfContentItem child : children) {
|
||||
if (child.contains(tocItem)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public List<AbstractPageBlock> getNonEmptySectionBlocks() {
|
||||
|
||||
return sectionBlocks.stream().filter(pageBlock -> !pageBlock.isEmpty()).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return "OutlineObjectTreeNode{" + "textPageBlock=" + headline + '}';
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
@ -0,0 +1,136 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Stack;
|
||||
|
||||
import org.springframework.lang.NonNull;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@Data
|
||||
@RequiredArgsConstructor
|
||||
public class TableOfContents implements Iterable<TableOfContentItem> {
|
||||
|
||||
private List<TableOfContentItem> mainSections = new ArrayList<>();
|
||||
|
||||
|
||||
public TableOfContents(List<TableOfContentItem> mainSections) {
|
||||
|
||||
this.mainSections = mainSections;
|
||||
}
|
||||
|
||||
|
||||
public List<TextPageBlock> getAllTextPageBlocks() {
|
||||
|
||||
List<TextPageBlock> allTextPageBlocks = new ArrayList<>();
|
||||
for (TableOfContentItem item : mainSections) {
|
||||
collectTextPageBlocks(item, allTextPageBlocks);
|
||||
}
|
||||
return allTextPageBlocks;
|
||||
}
|
||||
|
||||
|
||||
private void collectTextPageBlocks(TableOfContentItem item, List<TextPageBlock> textPageBlocks) {
|
||||
|
||||
textPageBlocks.add(item.getHeadline());
|
||||
for (TableOfContentItem child : item.getChildren()) {
|
||||
collectTextPageBlocks(child, textPageBlocks);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public List<TableOfContentItem> getAllTableOfContentItems() {
|
||||
|
||||
List<TableOfContentItem> allItems = new ArrayList<>();
|
||||
for (TableOfContentItem item : mainSections) {
|
||||
collectTableOfContentItems(item, allItems);
|
||||
}
|
||||
return allItems;
|
||||
}
|
||||
|
||||
|
||||
private void collectTableOfContentItems(TableOfContentItem item, List<TableOfContentItem> allItems) {
|
||||
|
||||
allItems.add(item);
|
||||
for (TableOfContentItem child : item.getChildren()) {
|
||||
collectTableOfContentItems(child, allItems);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private boolean containsBlock(TextPageBlock block) {
|
||||
|
||||
for (TableOfContentItem existingItem : this.getMainSections()) {
|
||||
if (existingItem.getHeadline().equals(block) || existingItem.contains(block)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private boolean containsItem(TableOfContentItem tocItem) {
|
||||
|
||||
for (TableOfContentItem existingItem : this.getMainSections()) {
|
||||
if (existingItem.equals(tocItem) || existingItem.contains(tocItem)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public @NonNull Iterator<TableOfContentItem> iterator() {
|
||||
|
||||
return new TableOfContentItemIterator(mainSections);
|
||||
}
|
||||
|
||||
|
||||
private static class TableOfContentItemIterator implements Iterator<TableOfContentItem> {
|
||||
|
||||
private final Stack<Iterator<TableOfContentItem>> stack = new Stack<>();
|
||||
|
||||
|
||||
TableOfContentItemIterator(List<TableOfContentItem> mainSections) {
|
||||
|
||||
stack.push(mainSections.iterator());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
|
||||
ensureStackTopIsCurrent();
|
||||
return !stack.isEmpty() && stack.peek().hasNext();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TableOfContentItem next() {
|
||||
|
||||
ensureStackTopIsCurrent();
|
||||
TableOfContentItem currentItem = stack.peek().next();
|
||||
if (currentItem.getChildren() != null && !currentItem.getChildren().isEmpty()) {
|
||||
stack.push(currentItem.getChildren()
|
||||
.iterator());
|
||||
}
|
||||
return currentItem;
|
||||
}
|
||||
|
||||
|
||||
private void ensureStackTopIsCurrent() {
|
||||
|
||||
while (!stack.isEmpty() && !stack.peek().hasNext()) {
|
||||
stack.pop();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -58,6 +58,20 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getPageHeight() {
|
||||
|
||||
return sequences.get(0).getPageHeight();
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getPageWidth() {
|
||||
|
||||
return sequences.get(0).getPageWidth();
|
||||
}
|
||||
|
||||
|
||||
private void calculateBBox() {
|
||||
|
||||
if (sequences == null) {
|
||||
@ -69,6 +83,12 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
}
|
||||
|
||||
|
||||
/**
 * Recomputes this block's bounding box from its current sequences. Public entry point so
 * callers can refresh the bbox after mutating the sequences (e.g. after splitting or
 * merging blocks, as BlockificationPostprocessingService does).
 */
public void recalculateBBox() {

    calculateBBox();
}
|
||||
|
||||
|
||||
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
|
||||
|
||||
if (textBlocksToMerge.isEmpty()) {
|
||||
|
||||
@ -27,8 +27,10 @@ import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@Deprecated
|
||||
public class SectionsBuilderService {
|
||||
|
||||
|
||||
public void buildSections(ClassificationDocument document) {
|
||||
|
||||
List<AbstractPageBlock> chunkWords = new ArrayList<>();
|
||||
|
||||
@ -0,0 +1,525 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
|
||||
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService.buildTextBlock;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.ListIterator;
|
||||
import java.util.Locale;
|
||||
import java.util.function.Function;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentifier;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
/**
 * Post-processes the text blocks produced by blockification against the PDF outline
 * (bookmarks): for each outline entry it searches the page's text blocks for the entry's
 * title and classifies the matching block as a headline, splitting or merging blocks
 * where the title does not align with block boundaries.
 */
@Service
public class BlockificationPostprocessingService {

    // Vertical slack (in page units) when locating the first block at/below an outline's
    // destination point.
    private static final float BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD = 5.0f;

    // Unions the bounding boxes of all text positions of a block in initial user space.
    // NOTE(review): appears unused within this class — verify external/planned use before removing.
    private static final Function<TextPageBlock, Rectangle2D> blockToBoundingBox = (abstractPageBlock) -> abstractPageBlock.getSequences()
            .stream()
            .map(textPositionSequence -> textPositionSequence.getTextPositions()
                    .stream()
                    .map(tp -> SearchTextWithTextPositionFactory.mapRedTextPositionToInitialUserSpace(tp, textPositionSequence))
                    .collect(RectangleTransformations.collectBBox()))
            .collect(RectangleTransformations.collectBBox());


    /**
     * Matches every outline entry of {@code classificationPage} against the page's text
     * blocks, marking each entry found/not-found and classifying matched blocks as headlines.
     *
     * @param classificationPage     the page whose text blocks are inspected and possibly mutated
     * @param notFoundOutlineObject  an entry carried over from a previous page that was not
     *                               found there; may be null
     * @return the last outline entry of this page (to be carried forward by the caller), the
     *         carried-over entry if the page has none, or null when there is nothing to match
     */
    public OutlineObject sanitizeOutlineBlocks(ClassificationPage classificationPage, OutlineObject notFoundOutlineObject) {

        List<OutlineObject> outlineObjects = classificationPage.getOutlineObjects();

        // Nothing to match: no text blocks on the page or no outline entries.
        if (getTextPageBlocks(classificationPage).isEmpty() || outlineObjects.isEmpty()) {
            return null;
        }

        float pageHeight = classificationPage.getPageHeight();

        ListIterator<OutlineObject> outlineObjectListIterator = outlineObjects.listIterator();

        if (notFoundOutlineObject != null) {
            // Build match candidates for the carried-over entry on this page.
            OutlineProcessionContext notFoundOutlineObjectProcessionContext = new OutlineProcessionContext(notFoundOutlineObject);
            processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, notFoundOutlineObjectProcessionContext);

            // Also build candidates for the first entry of this page, to detect collisions.
            OutlineObject firstOutlineObject = null;
            OutlineProcessionContext firstOutlineObjectProcessionContext = null;
            if (outlineObjectListIterator.hasNext()) {
                firstOutlineObject = outlineObjectListIterator.next();
                firstOutlineObjectProcessionContext = new OutlineProcessionContext(firstOutlineObject);
                processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, firstOutlineObjectProcessionContext);
            }

            // Resolve the carried-over entry only when it does not compete for the same
            // blocks as the first entry of this page (prefix-title collision, see contextsOverlap).
            if (!contextsOverlap(notFoundOutlineObjectProcessionContext, firstOutlineObjectProcessionContext)) {
                notFoundOutlineObject.setFound(selectMatch(classificationPage, notFoundOutlineObjectProcessionContext));
            }
            if (firstOutlineObject != null) {
                // re-create the context for the updated blocks
                firstOutlineObjectProcessionContext = new OutlineProcessionContext(firstOutlineObject);
                processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, firstOutlineObjectProcessionContext);
                firstOutlineObject.setFound(selectMatch(classificationPage, firstOutlineObjectProcessionContext));
            }

        }

        // Resolve all remaining outline entries of this page independently.
        outlineObjectListIterator.forEachRemaining(outlineObject -> {
            OutlineProcessionContext outlineObjectProcessionContext = new OutlineProcessionContext(outlineObject);
            processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, outlineObjectProcessionContext);
            outlineObject.setFound(selectMatch(classificationPage, outlineObjectProcessionContext));
        });

        // Carry the last entry forward regardless of whether it was found on this page.
        if (!outlineObjects.isEmpty()) {
            return outlineObjects.get(outlineObjects.size() - 1);
        } else {
            return notFoundOutlineObject;
        }
    }


    /** Narrows the page's blocks to the TextPageBlock instances (tables etc. are skipped). */
    private static List<TextPageBlock> getTextPageBlocks(ClassificationPage classificationPage) {

        return classificationPage.getTextBlocks()
                .stream()
                .filter(block -> block instanceof TextPageBlock)
                .map(block -> (TextPageBlock) block)
                .toList();
    }


    /**
     * Detects the case where the carried-over entry's title is a prefix of the first entry's
     * title AND their candidate blocks overlap vertically — resolving both would fight over
     * the same blocks, so the carried-over entry is skipped by the caller.
     */
    private boolean contextsOverlap(OutlineProcessionContext notFoundOutlineObjectProcessionContext, OutlineProcessionContext firstOutlineObjectProcessionContext) {

        if (firstOutlineObjectProcessionContext == null) {
            return false;
        }

        String notFoundTitle = notFoundOutlineObjectProcessionContext.getOutlineObject().getTitle();
        String firstTitle = firstOutlineObjectProcessionContext.getOutlineObject().getTitle();

        // Titles unrelated -> the entries cannot be competing for the same blocks.
        if (!firstTitle.startsWith(notFoundTitle)) {
            return false;
        }

        var blocksOfNotFoundOutline = getAllMatchingBlocks(notFoundOutlineObjectProcessionContext);
        var blocksOfFirstOutline = getAllMatchingBlocks(firstOutlineObjectProcessionContext);

        double maxYFirst = blocksOfFirstOutline.stream()
                .mapToDouble(TextPageBlock::getPdfMaxY)
                .max()
                .orElse(Double.NEGATIVE_INFINITY);

        // Overlap if any of the carried-over entry's candidate blocks sits at or above
        // the highest candidate block of the first entry.
        return blocksOfNotFoundOutline.stream()
                .mapToDouble(TextPageBlock::getPdfMaxY)
                .anyMatch(y -> y >= maxYFirst);
    }


    /** Collects every candidate block referenced by a context (direct, split, merge). */
    private List<TextPageBlock> getAllMatchingBlocks(OutlineProcessionContext context) {

        List<TextPageBlock> blocks = new ArrayList<>();
        if (context.getDirectMatch() != null) {
            blocks.add(context.getDirectMatch());
        }
        if (context.getSplitCandidate() != null) {
            blocks.add(context.getSplitCandidate());
        }
        blocks.addAll(context.getMergeCandidates());
        return blocks;
    }


    /**
     * Scans {@code textBlocks} for candidates matching the context's outline entry and
     * records them in the context. Blocks clearly above the outline's destination point
     * (with a small threshold) are skipped first.
     */
    private void processTextBlocks(List<TextPageBlock> textBlocks, float pageHeight, OutlineProcessionContext context) {

        OutlineObject outlineObject = context.getOutlineObject();
        ListIterator<TextPageBlock> iterator = textBlocks.listIterator();
        // Fast-forward to the first block at or below the outline's y destination.
        // (Outline y is in PDF coordinates, hence the pageHeight conversion.)
        while (iterator.hasNext()) {
            TextPageBlock pageBlock = iterator.next();
            if (pageHeight - outlineObject.getPoint().getY() - BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD <= pageBlock.getMaxY()) {
                break;
            }
        }
        // Step back so the block that triggered the break is processed as well.
        if (iterator.hasPrevious()) {
            iterator.previous();
        }
        boolean earlyStop = false;
        while (iterator.hasNext() && !earlyStop) {
            TextPageBlock pageBlock = iterator.next();
            earlyStop = processOutlineForTextBlock(pageBlock, context);
        }
    }


    /**
     * Picks the best candidate (direct match, split candidate, or a merge combination) by
     * smallest distance to the outline's destination point, applies the corresponding
     * block mutation, and classifies the result as a headline.
     *
     * @return true if a match was selected and applied, false if no candidate existed
     */
    private boolean selectMatch(ClassificationPage classificationPage, OutlineProcessionContext context) {

        OutlineObject outlineObject = context.outlineObject;
        TextPageBlock directMatch = context.directMatch;
        List<TextPageBlock> mergeCandidates = context.mergeCandidates;
        TextPageBlock splitCandidate = context.splitCandidate;
        PageBlockType headlineType = PageBlockType.getHeadlineType(outlineObject.getTreeDepth());

        double distanceToDirectMatch = directMatch != null ? calculateDistance(outlineObject, directMatch) : Double.MAX_VALUE;
        double distanceToSplitCandidate = splitCandidate != null ? calculateDistance(outlineObject, splitCandidate) : Double.MAX_VALUE;

        // Among all block combinations whose concatenated text spells the title, keep
        // the one with the smallest average distance to the outline point.
        double distanceToBestMergeCandidates = Double.MAX_VALUE;
        List<TextPageBlock> bestMergeCandidateCombination = new ArrayList<>();
        if (!mergeCandidates.isEmpty()) {

            // with this code adjacent blocks to the first and last merge candidate get added, this could be useful for some edge cases:
            //List<TextPageBlock> allMergeCandidates = new ArrayList<>(mergeCandidates);
            //addNeighborsOfCandidate(kdTree, mergeCandidates.get(0), allMergeCandidates);
            //if (mergeCandidates.size() > 1) {
            //    addNeighborsOfCandidate(kdTree, mergeCandidates.get(mergeCandidates.size() - 1), allMergeCandidates);
            //}
            //allMergeCandidates = allMergeCandidates.stream()
            //        .distinct()
            //        .toList();

            List<List<TextPageBlock>> combinations = findCombinations(outlineObject.getTitle(), mergeCandidates);

            for (List<TextPageBlock> combination : combinations) {
                double averageDistance = combination.stream()
                        .map(block -> calculateDistance(outlineObject, block))
                        .mapToDouble(Double::doubleValue).average()
                        .orElse(Double.MAX_VALUE);
                if (distanceToBestMergeCandidates > averageDistance) {
                    distanceToBestMergeCandidates = averageDistance;
                    bestMergeCandidateCombination = combination;
                }
            }
        }

        double minDistance = Math.min(distanceToDirectMatch, Math.min(distanceToSplitCandidate, distanceToBestMergeCandidates));

        // No candidate of any kind -> entry stays not-found.
        if (minDistance == Double.MAX_VALUE) {
            return false;
        }
        if (minDistance == distanceToDirectMatch) {
            directMatch.setClassification(headlineType);
        } else if (minDistance == distanceToSplitCandidate) {
            // Cut the title out of the larger block; leftover text becomes unclassified blocks.
            SplitBlockResult splitBlockResult = splitBlock(classificationPage, splitCandidate, context.sectionIdentifier, outlineObject.getTitle());
            if (splitBlockResult.modifiedBlockToSplit) {
                splitCandidate.setClassification(headlineType);
            }
            splitBlockResult.otherBlocks.forEach(other -> other.setClassification(null));
        } else {
            var merged = mergeBlocks(classificationPage, bestMergeCandidateCombination);
            merged.setClassification(headlineType);
        }
        return true;
    }


    /**
     * Splits {@code blockToSplit} so that only the sequences spelling the headline remain in
     * it; text before/after the headline is moved into new blocks inserted around it on the page.
     *
     * @return which blocks were created and whether the original block was modified
     */
    private SplitBlockResult splitBlock(ClassificationPage classificationPage, TextPageBlock blockToSplit, SectionIdentifier sectionIdentifier, String title) {

        List<TextPageBlock> otherBlocks = new ArrayList<>();
        int blockToSplitIdx = classificationPage.getTextBlocks().indexOf(blockToSplit);

        // Outline titles may omit the numeric section prefix (e.g. "1.2 ") that the block
        // text carries; try the prefixed form first.
        String headline = title;
        if (!sectionIdentifier.getFormat().equals(SectionIdentifier.Format.EMPTY) && !title.startsWith(sectionIdentifier.getIdentifierString())) {
            headline = sectionIdentifier + headline;
        }

        WordSequenceResult wordSequenceResult = findWordSequence(blockToSplit.getSequences(), headline);
        if (wordSequenceResult.inSequence.isEmpty() && !headline.equals(title)) {
            // Prefixed form not found -> fall back to the bare title.
            wordSequenceResult = findWordSequence(blockToSplit.getSequences(), title);
        }

        boolean modifiedBlockToSplit = false;
        if (!wordSequenceResult.inSequence.isEmpty()) {
            blockToSplit.setSequences(wordSequenceResult.inSequence);
            blockToSplit.recalculateBBox();
            modifiedBlockToSplit = true;
        }

        // Re-insert the text preceding/following the headline as separate blocks,
        // preserving reading order around the (shrunken) original block.
        if (!wordSequenceResult.preSequence.isEmpty()) {
            TextPageBlock block = buildTextBlock(wordSequenceResult.preSequence, 0);
            classificationPage.getTextBlocks().add(blockToSplitIdx, block);
            otherBlocks.add(block);
            blockToSplitIdx++;
        }
        if (!wordSequenceResult.postSequence.isEmpty()) {
            TextPageBlock block = buildTextBlock(wordSequenceResult.postSequence, 0);
            classificationPage.getTextBlocks().add(blockToSplitIdx + 1, block);
            otherBlocks.add(block);
        }

        return new SplitBlockResult(modifiedBlockToSplit, otherBlocks);
    }


    /**
     * Finds the run of sequences whose sanitized concatenation equals {@code text}, splitting
     * individual sequences at the boundaries when the match starts/ends mid-sequence.
     *
     * @return the sequences inside the match plus those before and after it; all three lists
     *         are empty when no match exists
     */
    private static WordSequenceResult findWordSequence(List<TextPositionSequence> textPositionSequences, String text) {

        String target = sanitizeString(text);
        List<TextPositionSequence> inSequence = new ArrayList<>();
        List<TextPositionSequence> preSequence = new ArrayList<>();
        List<TextPositionSequence> postSequence = new ArrayList<>();
        StringBuilder currentSequence = new StringBuilder();

        for (TextPositionSequence sequence : textPositionSequences) {

            // Sliding window over the sanitized text of the sequences consumed so far.
            currentSequence.append(sanitizeString(sequence.toString()));
            inSequence.add(sequence);

            if (currentSequence.length() >= target.length()) {

                if (currentSequence.toString().endsWith(target)) {

                    // Match ends here: peel leading sequences (and possibly split one)
                    // off the front until the window equals the target.
                    int index = 0;
                    String toRemove = currentSequence.substring(0, currentSequence.length() - target.length());

                    TextPositionSequence next = inSequence.get(index);
                    while (currentSequence.length() - next.length() >= target.length()) {

                        TextPositionSequence removed = inSequence.remove(index);
                        currentSequence.delete(0, removed.toString().length());
                        preSequence.add(removed);

                        next = inSequence.get(index);
                        toRemove = toRemove.substring(removed.length());
                    }

                    if (!toRemove.isEmpty()) {
                        // Match starts mid-sequence: split the first remaining sequence.
                        SplitSequenceResult splitSequenceResult = splitSequence(inSequence.remove(index), toRemove);

                        currentSequence.delete(0, splitSequenceResult.out.length());
                        preSequence.add(splitSequenceResult.out);
                        inSequence.add(index, splitSequenceResult.in);
                    }

                } else if (currentSequence.toString().startsWith(target)) {

                    // Match ends mid-sequence: split the last sequence and push the
                    // trailing part into postSequence.
                    int index = inSequence.size() - 1;
                    String toRemove = currentSequence.substring(target.length());

                    SplitSequenceResult splitSequenceResult = splitSequence(inSequence.remove(index), toRemove);
                    currentSequence.delete(currentSequence.length() - splitSequenceResult.out.length(), currentSequence.length());

                    inSequence.add(index, splitSequenceResult.in);
                    postSequence.add(splitSequenceResult.out);
                }

                if (currentSequence.toString().equals(target)) {
                    // Everything after the matched run goes to postSequence untouched.
                    postSequence.addAll(textPositionSequences.subList(textPositionSequences.indexOf(sequence) + 1, textPositionSequences.size()));
                    return new WordSequenceResult(inSequence, preSequence, postSequence);
                }
            }
        }

        // No match: empty result (all three lists empty).
        return new WordSequenceResult();
    }


    /**
     * Splits {@code sequence} around the substring {@code toRemove}: {@code out} covers the
     * removed text, {@code in} the kept part (before it when the removal is at the end,
     * after it when the removal is at the start).
     * NOTE(review): indexes computed on the string are applied to the text-position list —
     * assumes one text position per character; confirm.
     */
    private static SplitSequenceResult splitSequence(TextPositionSequence sequence, String toRemove) {

        TextPositionSequence in = null;
        TextPositionSequence out;

        String currentSequence = sequence.toString();
        int index = currentSequence.indexOf(toRemove);
        int endIndex = index + toRemove.length();

        out = createSubSequence(sequence, index, endIndex);

        if (index > 0) {
            in = createSubSequence(sequence, 0, index);
        } else if (endIndex < sequence.getTextPositions().size()) {
            in = createSubSequence(sequence, endIndex, sequence.getTextPositions().size());
        }

        return new SplitSequenceResult(in, out);
    }


    /** Builds a new sequence from the text positions in [start, end), keeping the paragraph-start flag. */
    private static TextPositionSequence createSubSequence(TextPositionSequence sequence, int start, int end) {

        TextPositionSequence newSeq = new TextPositionSequence(new ArrayList<>(sequence.getTextPositions().subList(start, end)), sequence.getPage());
        newSeq.setParagraphStart(sequence.isParagraphStart());
        return newSeq;
    }


    /**
     * Folds {@code blocksToMerge} into the first block (same text direction only) and removes
     * the absorbed blocks from the page.
     *
     * @return the merged (first) block
     */
    private TextPageBlock mergeBlocks(ClassificationPage classificationPage, List<TextPageBlock> blocksToMerge) {

        TextPageBlock firstBlock = blocksToMerge.get(0);

        if (blocksToMerge.size() > 1) {

            List<TextPageBlock> mergedBlocks = new ArrayList<>();
            for (TextPageBlock textPageBlock : blocksToMerge.subList(1, blocksToMerge.size())) {

                if (firstBlock != null && !firstBlock.getSequences().isEmpty()) {

                    // Only blocks with the same text direction can be merged.
                    if (textPageBlock.getDir() == firstBlock.getDir()) {
                        firstBlock.getSequences().addAll(textPageBlock.getSequences());
                        mergedBlocks.add(textPageBlock);
                    }
                }
            }

            assert firstBlock != null;
            firstBlock.setToDuplicate(false);
            firstBlock.recalculateBBox();
            classificationPage.getTextBlocks().removeAll(mergedBlocks);
        }

        return firstBlock;
    }


    /** Entry point for the recursive combination search below. */
    private static List<List<TextPageBlock>> findCombinations(String title, List<TextPageBlock> blocks) {

        List<List<TextPageBlock>> combinations = new ArrayList<>();
        findCombinations(title, blocks, new ArrayList<>(), combinations);
        return combinations;
    }


    /**
     * Recursively finds every ordered combination of blocks whose sanitized texts concatenate
     * exactly to {@code title} (backtracking on {@code current}).
     */
    private static void findCombinations(String title, List<TextPageBlock> blocks, List<TextPageBlock> current, List<List<TextPageBlock>> combinations) {

        String target = sanitizeString(title);
        // Whole title consumed -> current is a complete combination.
        if (target.isEmpty()) {
            combinations.add(new ArrayList<>(current));
            return;
        }

        List<TextPageBlock> remaining = blocks.stream()
                .filter(block -> !current.contains(block))
                .toList();
        for (TextPageBlock block : remaining) {
            String prefix = sanitizeString(block.getText());
            if (target.startsWith(prefix)) {
                current.add(block);
                // Recurse on the rest of the title with only later blocks (keeps order).
                findCombinations(target.substring(prefix.length()), blocks.subList(blocks.indexOf(block) + 1, blocks.size()), current, combinations);
                current.remove(current.size() - 1);
            }
        }
    }


    /** Euclidean distance between the outline's destination point and the block's top-left corner. */
    private double calculateDistance(OutlineObject outlineObject, TextPageBlock pageBlock) {

        double deltaX = outlineObject.getPoint().getX() - pageBlock.getMinX();
        // Outline y is in PDF coordinates (origin bottom-left), block y in page coordinates.
        double deltaY = pageBlock.getPageHeight() - outlineObject.getPoint().getY() - pageBlock.getMinY();
        return Math.sqrt(deltaX * deltaX + deltaY * deltaY);
    }


    // currently only three cases are handled here:
    // 1. equality
    // 2. outline title contains block text
    // 3. block text contains outline title
    // another possible case is an intersection, meaning a title is split up between two different blocks
    // this should not happen with how docstrum creates the blocks
    // if it is indeed necessary, a splitting has to be done with a follow-up merge
    /**
     * Classifies {@code pageBlock} as a direct match, merge candidate, or split candidate for
     * the context's outline entry.
     *
     * @return true to stop scanning further blocks (a direct match was recorded)
     */
    private boolean processOutlineForTextBlock(TextPageBlock pageBlock, OutlineProcessionContext context) {

        OutlineObject outlineObject = context.getOutlineObject();
        String blockText = sanitizeString(pageBlock.getText());
        String outlineTitle = sanitizeString(outlineObject.getTitle());

        boolean blockTextContainsOutlineTitle = blockText.contains(outlineTitle);
        boolean outlineTitleContainsBlockText = outlineTitle.contains(blockText);

        // Unrelated texts: not a candidate of any kind.
        if (!blockTextContainsOutlineTitle && !outlineTitleContainsBlockText) {
            return false;
        }

        // Case 1: exact (sanitized) equality — best possible match, stop scanning.
        if (blockText.equals(outlineTitle) && context.directMatch == null) {
            context.directMatch = pageBlock;
            return true;
        }

        // Case 2: block text is a fragment of the title — a merge candidate.
        if (outlineTitleContainsBlockText) {
            context.mergeCandidates.add(pageBlock);
        }

        // Case 3: the block contains more than the title — a split candidate; also handle
        // the block carrying a numeric section prefix that the title lacks.
        if (blockTextContainsOutlineTitle) {
            SectionIdentifier sectionIdentifier = SectionIdentifier.fromSearchText(blockText);

            if (sectionIdentifier.getFormat() != SectionIdentifier.Format.EMPTY && !outlineTitle.startsWith(sectionIdentifier.getIdentifierString())) {

                // "<prefix><title>" exactly -> treat as a direct match despite the prefix.
                if (blockText.startsWith(sectionIdentifier.getIdentifierString()) && blockText.endsWith(outlineTitle) && context.directMatch == null) {
                    context.directMatch = pageBlock;
                    return true;
                } else if (context.splitCandidate == null) {
                    context.sectionIdentifier = sectionIdentifier;
                }
            }
            if (context.splitCandidate == null) {
                context.splitCandidate = pageBlock;
            }
        }
        return false;
    }


    /** Canonical form used for all text comparisons: no whitespace, lower-cased (locale-independent). */
    private static String sanitizeString(String text) {

        return StringUtils.deleteWhitespace(text).toLowerCase(Locale.ROOT);
    }


    /** Mutable accumulator for the match candidates of one outline entry. */
    @Data
    private static class OutlineProcessionContext {

        private TextPageBlock directMatch;
        private OutlineObject outlineObject;
        private List<TextPageBlock> mergeCandidates;
        private TextPageBlock splitCandidate;
        private SectionIdentifier sectionIdentifier;


        OutlineProcessionContext(OutlineObject outlineObject) {

            this.outlineObject = outlineObject;
            this.directMatch = null;
            this.mergeCandidates = new ArrayList<>();
            this.splitCandidate = null;
            this.sectionIdentifier = SectionIdentifier.empty();
        }

    }

    /** Sequences inside / before / after a matched word run (all empty when no match). */
    public static class WordSequenceResult {

        public List<TextPositionSequence> inSequence;
        public List<TextPositionSequence> preSequence;
        public List<TextPositionSequence> postSequence;


        public WordSequenceResult(List<TextPositionSequence> inSequence, List<TextPositionSequence> preSequence, List<TextPositionSequence> postSequence) {

            this.inSequence = inSequence;
            this.preSequence = preSequence;
            this.postSequence = postSequence;
        }


        public WordSequenceResult() {

            this.inSequence = new ArrayList<>();
            this.preSequence = new ArrayList<>();
            this.postSequence = new ArrayList<>();
        }

    }

    /** Result of splitBlock: whether the original block was shrunk, plus the spill-over blocks. */
    public record SplitBlockResult(boolean modifiedBlockToSplit, List<TextPageBlock> otherBlocks) {

    }

    /** Result of splitSequence: the kept part (in, may be null) and the removed part (out). */
    public record SplitSequenceResult(TextPositionSequence in, TextPositionSequence out) {

    }

}
|
||||
@ -126,6 +126,16 @@ public class DocstrumBlockificationService {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (current.isHeadline() || previous.isHeadline()) {
|
||||
if (intersectsYWithPreviousHavingMaxOneLine(previous, current, page)) {
|
||||
previous = combineBlocksAndResetIterator(previous, current, itty, false);
|
||||
} else {
|
||||
previous = current;
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
if (areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(previous, current, page)) {
|
||||
previous = combineBlocksAndResetIterator(previous, current, itty, true);
|
||||
continue;
|
||||
@ -172,6 +182,12 @@ public class DocstrumBlockificationService {
|
||||
}
|
||||
|
||||
|
||||
/**
 * True when the two blocks overlap vertically and the previous block is a single line
 * (the current block may have any number of lines >= 1) — the headline-merge condition.
 * NOTE(review): the 'page' parameter is unused; presumably kept for signature symmetry
 * with the sibling predicates — confirm before removing.
 */
private boolean intersectsYWithPreviousHavingMaxOneLine(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {

    return previous.intersectsY(current) && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1);
}
|
||||
|
||||
|
||||
private boolean areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {
|
||||
|
||||
return previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 //
|
||||
@ -185,6 +201,9 @@ public class DocstrumBlockificationService {
|
||||
previous.getSequences().addAll(current.getSequences());
|
||||
previous = buildTextBlock(previous.getSequences(), 0);
|
||||
previous.setToDuplicate(toDuplicate);
|
||||
if (current.getClassification() != null && previous.getClassification() == null) {
|
||||
previous.setClassification(current.getClassification());
|
||||
}
|
||||
itty.remove();
|
||||
itty.previous();
|
||||
itty.set(previous);
|
||||
@ -244,21 +263,30 @@ public class DocstrumBlockificationService {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (block.getClassification() != null && block.getClassification().isHeadline()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
TextPageBlock current = (TextPageBlock) block;
|
||||
|
||||
for (int i = 0; i < blocks.size(); i++) {
|
||||
|
||||
if (blocks.get(i) == null) {
|
||||
AbstractPageBlock abstractPageBlock = blocks.get(i);
|
||||
if (abstractPageBlock == null) {
|
||||
continue;
|
||||
}
|
||||
if (blocks.get(i) == current) {
|
||||
if (abstractPageBlock == current) {
|
||||
continue;
|
||||
}
|
||||
if (blocks.get(i) instanceof TablePageBlock) {
|
||||
if (abstractPageBlock instanceof TablePageBlock) {
|
||||
continue;
|
||||
}
|
||||
|
||||
TextPageBlock inner = (TextPageBlock) blocks.get(i);
|
||||
if (abstractPageBlock.getClassification() != null && abstractPageBlock.getClassification().isHeadline()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
TextPageBlock inner = (TextPageBlock) abstractPageBlock;
|
||||
|
||||
if (usedRulings.lineBetween(current, blocks.get(i))) {
|
||||
continue;
|
||||
@ -285,7 +313,7 @@ public class DocstrumBlockificationService {
|
||||
}
|
||||
|
||||
|
||||
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
|
||||
public static TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
|
||||
|
||||
return new TextPageBlock(wordBlockList);
|
||||
}
|
||||
|
||||
@ -21,12 +21,16 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@RequiredArgsConstructor
|
||||
public class ClarifyndClassificationService {
|
||||
|
||||
private final HeadlineClassificationService headlineClassificationService;
|
||||
|
||||
public void classifyDocument(ClassificationDocument document) {
|
||||
|
||||
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
|
||||
|
||||
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
||||
|
||||
headlineClassificationService.resetContext();
|
||||
|
||||
for (ClassificationPage page : document.getPages()) {
|
||||
classifyPage(page, document, headlineFontSizes);
|
||||
}
|
||||
@ -47,6 +51,10 @@ public class ClarifyndClassificationService {
|
||||
|
||||
var bodyTextFrame = page.getBodyTextFrame();
|
||||
|
||||
if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
|
||||
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
|
||||
return;
|
||||
}
|
||||
if (document.getFontSizeCounter().getMostPopular() == null) {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||
return;
|
||||
@ -79,7 +87,8 @@ public class ClarifyndClassificationService {
|
||||
|
||||
for (int i = 1; i <= headlineFontSizes.size(); i++) {
|
||||
if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
|
||||
textBlock.setClassification(PageBlockType.getHeadlineType(i));
|
||||
PageBlockType headlineType = PageBlockType.getHeadlineType(i);
|
||||
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
||||
document.setHeadlines(true);
|
||||
}
|
||||
}
|
||||
@ -89,7 +98,8 @@ public class ClarifyndClassificationService {
|
||||
.getTextPositions()
|
||||
.get(0)
|
||||
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
|
||||
textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1));
|
||||
PageBlockType headlineType = PageBlockType.getHeadlineType(headlineFontSizes.size() + 1);
|
||||
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
||||
document.setHeadlines(true);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
||||
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
|
||||
|
||||
@ -24,6 +24,7 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@RequiredArgsConstructor
|
||||
public class DocuMineClassificationService {
|
||||
|
||||
private final HeadlineClassificationService headlineClassificationService;
|
||||
private static final Pattern pattern = Pattern.compile("^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
|
||||
private static final Pattern pattern2 = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
|
||||
private static final Pattern pattern3 = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
|
||||
@ -35,6 +36,8 @@ public class DocuMineClassificationService {
|
||||
|
||||
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
||||
|
||||
headlineClassificationService.resetContext();
|
||||
|
||||
for (ClassificationPage page : document.getPages()) {
|
||||
classifyPage(page, document, headlineFontSizes);
|
||||
}
|
||||
@ -60,6 +63,10 @@ public class DocuMineClassificationService {
|
||||
Matcher matcher2 = pattern2.matcher(textBlock.toString());
|
||||
Matcher matcher3 = pattern3.matcher(textBlock.toString());
|
||||
|
||||
if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
|
||||
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
|
||||
return;
|
||||
}
|
||||
if (document.getFontSizeCounter().getMostPopular() == null) {
|
||||
textBlock.setClassification(PageBlockType.OTHER);
|
||||
return;
|
||||
@ -94,6 +101,7 @@ public class DocuMineClassificationService {
|
||||
&& (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter().getMostPopular()
|
||||
|| textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular())
|
||||
&& PositionUtils.getApproxLineCount(textBlock) < 5.9
|
||||
|
||||
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && matcher2.reset().find() && !textBlock.toString()
|
||||
.contains(":")
|
||||
|| textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && matcher2.reset().find() && !textBlock.toString().contains(":")
|
||||
@ -102,11 +110,13 @@ public class DocuMineClassificationService {
|
||||
|| textBlock.toString().startsWith("TABLE"))
|
||||
&& !textBlock.toString().endsWith(":")
|
||||
&& matcher2.reset().find()) {
|
||||
textBlock.setClassification(PageBlockType.getHeadlineType(1));
|
||||
PageBlockType headlineType = PageBlockType.getHeadlineType(1);
|
||||
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
||||
document.setHeadlines(true);
|
||||
|
||||
} else if (matcher.reset().find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.reset().find() && !matcher3.reset().matches()) {
|
||||
textBlock.setClassification(PageBlockType.getHeadlineType(2));
|
||||
PageBlockType headlineType = PageBlockType.getHeadlineType(2);
|
||||
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
||||
document.setHeadlines(true);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
|
||||
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
|
||||
|
||||
@ -0,0 +1,62 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
|
||||
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.getHeadlineNumber;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
|
||||
@Service
|
||||
@Getter
|
||||
@Setter
|
||||
public class HeadlineClassificationService {
|
||||
|
||||
TextPageBlock lastHeadline;
|
||||
PageBlockType originalClassifiedBlockType;
|
||||
TextPageBlock lastHeadlineFromOutline;
|
||||
|
||||
public void resetContext() {
|
||||
setLastHeadline(null);
|
||||
setOriginalClassifiedBlockType(null);
|
||||
setLastHeadlineFromOutline(null);
|
||||
}
|
||||
|
||||
|
||||
public void setLastHeadlineFromOutline(TextPageBlock lastHeadlineFromOutline) {
|
||||
|
||||
this.lastHeadlineFromOutline = lastHeadlineFromOutline;
|
||||
this.setLastHeadline(lastHeadlineFromOutline);
|
||||
}
|
||||
|
||||
|
||||
public void classifyHeadline(TextPageBlock textBlock, PageBlockType initialHeadlineType) {
|
||||
|
||||
TextPageBlock lastHeadline = getLastHeadline();
|
||||
TextPageBlock lastHeadlineFromOutline = getLastHeadlineFromOutline();
|
||||
PageBlockType originalClassifiedBlockType = getOriginalClassifiedBlockType();
|
||||
PageBlockType finalHeadlineType = initialHeadlineType;
|
||||
|
||||
if (lastHeadline != null) {
|
||||
|
||||
if (lastHeadline.equals(lastHeadlineFromOutline)) {
|
||||
|
||||
finalHeadlineType = PageBlockType.getHeadlineType(getHeadlineNumber(lastHeadline.getClassification()) + 1);
|
||||
|
||||
} else if (originalClassifiedBlockType != null && lastHeadline.getClassification() != originalClassifiedBlockType) {
|
||||
|
||||
PageBlockType lastHeadlineType = lastHeadline.getClassification();
|
||||
int difference = getHeadlineNumber(originalClassifiedBlockType) - getHeadlineNumber(lastHeadlineType);
|
||||
finalHeadlineType = PageBlockType.getHeadlineType(getHeadlineNumber(initialHeadlineType) - difference);
|
||||
}
|
||||
}
|
||||
|
||||
setOriginalClassifiedBlockType(initialHeadlineType);
|
||||
textBlock.setClassification(finalHeadlineType);
|
||||
setLastHeadline(textBlock);
|
||||
}
|
||||
|
||||
}
|
||||
@ -22,12 +22,17 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@RequiredArgsConstructor
|
||||
public class RedactManagerClassificationService {
|
||||
|
||||
private final HeadlineClassificationService headlineClassificationService;
|
||||
|
||||
|
||||
public void classifyDocument(ClassificationDocument document) {
|
||||
|
||||
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
|
||||
|
||||
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
||||
|
||||
headlineClassificationService.resetContext();
|
||||
|
||||
for (ClassificationPage page : document.getPages()) {
|
||||
classifyPage(page, document, headlineFontSizes);
|
||||
}
|
||||
@ -48,6 +53,10 @@ public class RedactManagerClassificationService {
|
||||
|
||||
var bodyTextFrame = page.getBodyTextFrame();
|
||||
|
||||
if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
|
||||
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
|
||||
return;
|
||||
}
|
||||
if (document.getFontSizeCounter().getMostPopular() == null) {
|
||||
textBlock.setClassification(PageBlockType.OTHER);
|
||||
return;
|
||||
@ -60,58 +69,64 @@ public class RedactManagerClassificationService {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||
return;
|
||||
}
|
||||
|
||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || PositionUtils.isOverBodyTextFrame(bodyTextFrame,
|
||||
textBlock,
|
||||
page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
||||
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|
||||
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
||||
.getMostPopular())) {
|
||||
textBlock.setClassification(PageBlockType.HEADER);
|
||||
|
||||
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) || PositionUtils.isUnderBodyTextFrame(bodyTextFrame,
|
||||
textBlock,
|
||||
page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
||||
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|
||||
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|
||||
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
||||
.getMostPopular())) {
|
||||
textBlock.setClassification(PageBlockType.FOOTER);
|
||||
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
|
||||
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
|
||||
.size() == 1)) {
|
||||
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
|
||||
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
|
||||
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
||||
textBlock.setClassification(PageBlockType.TITLE);
|
||||
}
|
||||
} else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter()
|
||||
.getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 && (textBlock.getMostPopularWordStyle().equals("bold") || !document.getFontStyleCounter()
|
||||
.getCountPerValue()
|
||||
.containsKey("bold") && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1) && textBlock.getSequences()
|
||||
.get(0)
|
||||
.getTextPositions()
|
||||
.get(0)
|
||||
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
|
||||
} else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()
|
||||
&& PositionUtils.getApproxLineCount(textBlock) < 4.9
|
||||
&& (textBlock.getMostPopularWordStyle().equals("bold")
|
||||
|| !document.getFontStyleCounter().getCountPerValue().containsKey("bold")
|
||||
&& textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1)
|
||||
&& textBlock.getSequences()
|
||||
.get(0).getTextPositions()
|
||||
.get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
|
||||
|
||||
for (int i = 1; i <= headlineFontSizes.size(); i++) {
|
||||
if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
|
||||
textBlock.setClassification(PageBlockType.getHeadlineType(i));
|
||||
PageBlockType headlineType = PageBlockType.getHeadlineType(i);
|
||||
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
||||
document.setHeadlines(true);
|
||||
}
|
||||
}
|
||||
} else if (!textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordStyle()
|
||||
.equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 && textBlock.getSequences()
|
||||
.get(0)
|
||||
.getTextPositions()
|
||||
.get(0)
|
||||
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
|
||||
textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1));
|
||||
} else if (!textBlock.getText().startsWith("Figure ")
|
||||
&& PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
|
||||
&& textBlock.getMostPopularWordStyle().equals("bold")
|
||||
&& !document.getFontStyleCounter().getMostPopular().equals("bold")
|
||||
&& PositionUtils.getApproxLineCount(textBlock) < 2.9
|
||||
&& textBlock.getSequences()
|
||||
.get(0).getTextPositions()
|
||||
.get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
|
||||
PageBlockType headlineType = PageBlockType.getHeadlineType(headlineFontSizes.size() + 1);
|
||||
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
||||
document.setHeadlines(true);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
||||
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
|
||||
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
|
||||
&& textBlock.getMostPopularWordStyle().equals("bold")
|
||||
&& !document.getFontStyleCounter().getMostPopular().equals("bold")) {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont()
|
||||
.equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle()
|
||||
.equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
|
||||
&& textBlock.getMostPopularWordFont().equals(document.getFontCounter().getMostPopular())
|
||||
&& textBlock.getMostPopularWordStyle().equals(document.getFontStyleCounter().getMostPopular())
|
||||
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
||||
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter()
|
||||
.getMostPopular()
|
||||
.equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
|
||||
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
|
||||
&& textBlock.getMostPopularWordStyle().equals("italic")
|
||||
&& !document.getFontStyleCounter().getMostPopular().equals("italic")
|
||||
&& PositionUtils.getApproxLineCount(textBlock) < 2.9) {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
|
||||
|
||||
@ -11,6 +11,7 @@ import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@ -30,9 +31,10 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.He
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.AbstractSemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContentItem;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
|
||||
@ -57,11 +59,6 @@ public class DocumentGraphFactory {
|
||||
|
||||
document.getPages()
|
||||
.forEach(context::buildAndAddPageWithCounter);
|
||||
document.getSections()
|
||||
.stream()
|
||||
.flatMap(section -> section.getImages()
|
||||
.stream())
|
||||
.forEach(image -> context.getImages().add(image));
|
||||
addSections(layoutParsingType, document, context, documentGraph);
|
||||
addHeaderAndFooterToEachPage(document, context);
|
||||
|
||||
@ -75,8 +72,17 @@ public class DocumentGraphFactory {
|
||||
|
||||
private void addSections(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument, Context context, Document document) {
|
||||
|
||||
classificationDocument.getSections()
|
||||
.forEach(section -> SectionNodeFactory.addSection(layoutParsingType, null, section.getNonEmptyPageBlocks(), section.getImages(), context, document));
|
||||
for (TableOfContentItem tocItem : classificationDocument.getTableOfContents()) {
|
||||
var parent = tocItem.getParent() == null ? null : tocItem.getParent().getSection();
|
||||
Optional<AbstractSemanticNode> section = SectionNodeFactory.addSection(layoutParsingType,
|
||||
parent,
|
||||
tocItem.getChildren().isEmpty(),
|
||||
tocItem.getNonEmptySectionBlocks(),
|
||||
tocItem.getImages(),
|
||||
context,
|
||||
document);
|
||||
tocItem.setSection(section.orElse(null));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -181,10 +187,7 @@ public class DocumentGraphFactory {
|
||||
|
||||
Page page = context.getPage(textBlocks.get(0).getPage());
|
||||
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks),
|
||||
footer,
|
||||
context,
|
||||
page);
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), footer, context, page);
|
||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
|
||||
footer.setTreeId(tocId);
|
||||
footer.setLeafTextBlock(textBlock);
|
||||
@ -236,7 +239,7 @@ public class DocumentGraphFactory {
|
||||
|
||||
DocumentTree documentTree;
|
||||
Map<Page, Integer> pages;
|
||||
List<Section> sections;
|
||||
List<AbstractSemanticNode> sections;
|
||||
List<ClassifiedImage> images;
|
||||
TextBlockFactory textBlockFactory;
|
||||
|
||||
|
||||
@ -9,6 +9,7 @@ import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
@ -17,6 +18,8 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Do
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.AbstractSemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
@ -27,12 +30,13 @@ import lombok.experimental.UtilityClass;
|
||||
@UtilityClass
|
||||
public class SectionNodeFactory {
|
||||
|
||||
public void addSection(LayoutParsingType layoutParsingType,
|
||||
GenericSemanticNode parentNode,
|
||||
List<AbstractPageBlock> pageBlocks,
|
||||
List<ClassifiedImage> images,
|
||||
DocumentGraphFactory.Context context,
|
||||
Document document) {
|
||||
public Optional<AbstractSemanticNode> addSection(LayoutParsingType layoutParsingType,
|
||||
GenericSemanticNode parentNode,
|
||||
boolean isLeaf,
|
||||
List<AbstractPageBlock> pageBlocks,
|
||||
List<ClassifiedImage> images,
|
||||
DocumentGraphFactory.Context context,
|
||||
Document document) {
|
||||
|
||||
// This is for the case where we have images on a page without any text/footer/header.
|
||||
// The pageBlocks list is empty, but we still need to add those images to the document.
|
||||
@ -40,16 +44,22 @@ public class SectionNodeFactory {
|
||||
images.stream()
|
||||
.distinct()
|
||||
.forEach(image -> DocumentGraphFactory.addImage(document, image, context));
|
||||
return;
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
if (pageBlocks.isEmpty()) {
|
||||
return;
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream()
|
||||
.collect(groupingBy(AbstractPageBlock::getPage));
|
||||
Section section = Section.builder().documentTree(context.getDocumentTree()).build();
|
||||
|
||||
AbstractSemanticNode section;
|
||||
if (isLeaf) {
|
||||
section = Section.builder().documentTree(context.getDocumentTree()).build();
|
||||
} else {
|
||||
section = SuperSection.builder().documentTree(context.getDocumentTree()).build();
|
||||
}
|
||||
|
||||
context.getSections().add(section);
|
||||
blocksPerPage.keySet()
|
||||
@ -59,12 +69,24 @@ public class SectionNodeFactory {
|
||||
|
||||
addFirstHeadlineDirectlyToSection(layoutParsingType, pageBlocks, context, section, document);
|
||||
if (containsTablesAndTextBlocks(pageBlocks)) {
|
||||
|
||||
if (pageBlocks.get(0).isHeadline()) {
|
||||
pageBlocks.remove(0);
|
||||
}
|
||||
|
||||
splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(layoutParsingType,
|
||||
section,
|
||||
true,
|
||||
subSectionPageBlocks,
|
||||
emptyList(),
|
||||
context,
|
||||
document));
|
||||
} else if (!isLeaf) {
|
||||
|
||||
if (pageBlocks.get(0).isHeadline()) {
|
||||
pageBlocks.remove(0);
|
||||
}
|
||||
addSection(layoutParsingType, section, true, pageBlocks, emptyList(), context, document);
|
||||
} else {
|
||||
addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, pageBlocks, context, section, document);
|
||||
}
|
||||
@ -72,10 +94,12 @@ public class SectionNodeFactory {
|
||||
images.stream()
|
||||
.distinct()
|
||||
.forEach(image -> DocumentGraphFactory.addImage(section, image, context));
|
||||
|
||||
return Optional.of(section);
|
||||
}
|
||||
|
||||
|
||||
private List<Integer> getTreeId(GenericSemanticNode parentNode, DocumentGraphFactory.Context context, Section section) {
|
||||
private List<Integer> getTreeId(GenericSemanticNode parentNode, DocumentGraphFactory.Context context, AbstractSemanticNode section) {
|
||||
|
||||
if (parentNode == null) {
|
||||
return context.getDocumentTree().createNewMainEntryAndReturnId(section);
|
||||
@ -88,7 +112,7 @@ public class SectionNodeFactory {
|
||||
private void addFirstHeadlineDirectlyToSection(LayoutParsingType layoutParsingType,
|
||||
List<AbstractPageBlock> pageBlocks,
|
||||
DocumentGraphFactory.Context context,
|
||||
Section section,
|
||||
AbstractSemanticNode section,
|
||||
Document document) {
|
||||
|
||||
if (pageBlocks.get(0).isHeadline()) {
|
||||
@ -101,7 +125,7 @@ public class SectionNodeFactory {
|
||||
private void addTablesAndParagraphsAndHeadlinesToSection(LayoutParsingType layoutParsingType,
|
||||
List<AbstractPageBlock> pageBlocks,
|
||||
DocumentGraphFactory.Context context,
|
||||
Section section,
|
||||
AbstractSemanticNode section,
|
||||
Document document) {
|
||||
|
||||
Set<AbstractPageBlock> alreadyMerged = new HashSet<>();
|
||||
@ -226,7 +250,7 @@ public class SectionNodeFactory {
|
||||
}
|
||||
|
||||
|
||||
private void addSectionNodeToPageNode(DocumentGraphFactory.Context context, Section section, Integer pageNumber) {
|
||||
private void addSectionNodeToPageNode(DocumentGraphFactory.Context context, AbstractSemanticNode section, Integer pageNumber) {
|
||||
|
||||
Page page = context.getPage(pageNumber);
|
||||
page.getMainBody().add(section);
|
||||
|
||||
@ -154,10 +154,11 @@ public class TableNodeFactory {
|
||||
} else if (firstTextBlockIsHeadline(cell)) {
|
||||
SectionNodeFactory.addSection(layoutParsingType,
|
||||
tableCell,
|
||||
true,
|
||||
cell.getTextBlocks()
|
||||
.stream()
|
||||
.map(tb -> (AbstractPageBlock) tb)
|
||||
.toList(),
|
||||
.collect(Collectors.toList()),
|
||||
emptyList(),
|
||||
context,
|
||||
document);
|
||||
|
||||
@ -23,6 +23,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Pa
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
||||
@ -62,6 +63,7 @@ public class DocumentGraphMapper {
|
||||
|
||||
SemanticNode node = switch (entryData.getType()) {
|
||||
case SECTION -> buildSection(context);
|
||||
case SUPER_SECTION -> buildSuperSection(context);
|
||||
case PARAGRAPH -> buildParagraph(context, entryData.getProperties());
|
||||
case HEADLINE -> buildHeadline(context);
|
||||
case HEADER -> buildHeader(context);
|
||||
@ -109,7 +111,7 @@ public class DocumentGraphMapper {
|
||||
|
||||
private TableCell buildTableCell(Context context, Map<String, String> properties) {
|
||||
|
||||
TableCell.TableCellBuilder builder = TableCell.builder();
|
||||
TableCell.TableCellBuilder<?, ?> builder = TableCell.builder();
|
||||
PropertiesMapper.parseTableCellProperties(properties, builder);
|
||||
return builder.documentTree(context.documentTree).build();
|
||||
}
|
||||
@ -140,6 +142,11 @@ public class DocumentGraphMapper {
|
||||
return Section.builder().documentTree(context.documentTree).build();
|
||||
}
|
||||
|
||||
private SuperSection buildSuperSection(Context context) {
|
||||
|
||||
return SuperSection.builder().documentTree(context.documentTree).build();
|
||||
}
|
||||
|
||||
|
||||
private Paragraph buildParagraph(Context context, Map<String, String> properties) {
|
||||
|
||||
|
||||
@ -105,7 +105,7 @@ public class LayoutGridService {
|
||||
Color color = switch (semanticNode.getType()) {
|
||||
case PARAGRAPH -> PARAGRAPH_COLOR;
|
||||
case TABLE -> TABLE_COLOR;
|
||||
case SECTION -> SECTION_COLOR;
|
||||
case SECTION, SUPER_SECTION -> SECTION_COLOR;
|
||||
case HEADLINE -> HEADLINE_COLOR;
|
||||
case HEADER, FOOTER -> HEADER_COLOR;
|
||||
case IMAGE -> IMAGE_COLOR;
|
||||
@ -119,7 +119,7 @@ public class LayoutGridService {
|
||||
if (isNotSectionOrTableCellOrDocument(semanticNode)) {
|
||||
addAsRectangle(semanticNode, layoutGrid, color);
|
||||
}
|
||||
if (semanticNode.getType().equals(NodeType.SECTION)) {
|
||||
if (semanticNode.getType().equals(NodeType.SECTION) || semanticNode.getType().equals(NodeType.SUPER_SECTION)) {
|
||||
addSection(semanticNode, layoutGrid, color);
|
||||
}
|
||||
if (semanticNode.getType().equals(NodeType.TABLE)) {
|
||||
@ -193,10 +193,11 @@ public class LayoutGridService {
|
||||
List<SemanticNode> subSections = semanticNode.streamAllSubNodesOfType(NodeType.SECTION)
|
||||
.toList();
|
||||
Page firstPage = semanticNode.getFirstPage();
|
||||
String treeIdString = buildTreeIdString(semanticNode);
|
||||
if (!subSections.isEmpty()) {
|
||||
addPlacedText(firstPage, bBoxMap.get(firstPage), buildTreeIdString(semanticNode), layoutGrid);
|
||||
addPlacedText(firstPage, bBoxMap.get(firstPage), treeIdString, layoutGrid);
|
||||
} else {
|
||||
bBoxMap.forEach(((page, textBBox) -> addPlacedText(page, textBBox, buildTreeIdString(semanticNode), layoutGrid)));
|
||||
bBoxMap.forEach(((page, textBBox) -> addPlacedText(page, textBBox, treeIdString, layoutGrid)));
|
||||
}
|
||||
if (bBoxMap.values().size() == 1) {
|
||||
Rectangle2D r = RectangleTransformations.pad(bBoxMap.get(firstPage), LINE_WIDTH, LINE_WIDTH);
|
||||
|
||||
@ -112,8 +112,8 @@ public class PdfVisualisationUtility {
|
||||
case DOCUMENT -> Color.LIGHT_GRAY;
|
||||
case HEADER, FOOTER -> Color.GREEN;
|
||||
case PARAGRAPH -> Color.BLUE;
|
||||
case SUPER_SECTION, SECTION -> Color.BLACK;
|
||||
case HEADLINE -> Color.RED;
|
||||
case SECTION -> Color.BLACK;
|
||||
case TABLE -> Color.ORANGE;
|
||||
case TABLE_CELL -> Color.GRAY;
|
||||
case IMAGE -> Color.MAGENTA;
|
||||
|
||||
@ -32,6 +32,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
public void testViewerDocument() {
|
||||
|
||||
String fileName = "files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
||||
|
||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||
|
||||
var documentFile = new ClassPathResource(fileName).getFile();
|
||||
|
||||
@ -37,8 +37,6 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.ima
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
|
||||
|
||||
@ -62,6 +60,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
tableServiceResponse,
|
||||
new VisualLayoutParsingResponse(),
|
||||
Map.of("file", "document"));
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -134,6 +133,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testTableAndCellRotations() {
|
||||
|
||||
String fileName = "files/Minimal Examples/simpleTablesRotated.pdf";
|
||||
ClassPathResource pdfFileResource = new ClassPathResource(fileName);
|
||||
|
||||
@ -141,7 +141,6 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Disabled
|
||||
@Test
|
||||
public void testScanRotationBorderIsIgnored() throws IOException {
|
||||
@ -151,15 +150,19 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class);
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile(), tableServiceResponse);
|
||||
assertThat(document.getSections()
|
||||
assertThat(document.getTableOfContents().getAllTableOfContentItems()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
var tables = document.getSections()
|
||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList()).isNotEmpty();
|
||||
var tables = document.getTableOfContents().getAllTableOfContentItems()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList();
|
||||
|
||||
// Quality of the table parsing is not good, because the file is rotated at scanning.
|
||||
@ -199,15 +202,19 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/syngenta/CustomerFiles/SinglePages/Spanning Cells - Page131_S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
assertThat(document.getSections()
|
||||
assertThat(document.getTableOfContents().getAllTableOfContentItems()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock table = document.getSections()
|
||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList()).isNotEmpty();
|
||||
TablePageBlock table = document.getTableOfContents().getAllTableOfContentItems()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList()
|
||||
.get(0);
|
||||
assertThat(table.getColCount()).isEqualTo(6);
|
||||
@ -225,23 +232,29 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
"files/syngenta/CustomerFiles/SinglePages/Merge Table - Page5_26 A8637C - EU AIR3 - LCP Section 10 - Ecotoxicological studies on the plant protection product - Reference list.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
assertThat(document.getSections()
|
||||
assertThat(document.getTableOfContents().getAllTableOfContentItems()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getSections()
|
||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList()).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getTableOfContents().getAllTableOfContentItems()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList()
|
||||
.get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(8);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(1);
|
||||
TablePageBlock secondTable = document.getSections()
|
||||
TablePageBlock secondTable = document.getTableOfContents().getAllTableOfContentItems()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList()
|
||||
.get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(8);
|
||||
@ -266,23 +279,29 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
"files/syngenta/CustomerFiles/SinglePages/Merge Multi Page Table - Page4_Page5_51 Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
assertThat(document.getSections()
|
||||
assertThat(document.getTableOfContents().getAllTableOfContentItems()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getSections()
|
||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList()).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getTableOfContents().getAllTableOfContentItems()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList()
|
||||
.get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(9);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(5);
|
||||
TablePageBlock secondTable = document.getSections()
|
||||
TablePageBlock secondTable = document.getTableOfContents().getAllTableOfContentItems()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList()
|
||||
.get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(9);
|
||||
@ -307,23 +326,29 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
"files/syngenta/CustomerFiles/SinglePages/Rotated Table Headers - Page4_65 Mesotrione - EU AIR3 - LCA Section 1 Supplement Reference List.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
assertThat(document.getSections()
|
||||
assertThat(document.getTableOfContents().getAllTableOfContentItems()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getSections()
|
||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList()).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getTableOfContents().getAllTableOfContentItems()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList()
|
||||
.get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(8);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(1);
|
||||
TablePageBlock secondTable = document.getSections()
|
||||
TablePageBlock secondTable = document.getTableOfContents().getAllTableOfContentItems()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList()
|
||||
.get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(8);
|
||||
@ -818,10 +843,12 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
@SneakyThrows
|
||||
private void toHtml(ClassificationDocument document, String filename) {
|
||||
|
||||
var tables = document.getSections()
|
||||
var tables = document.getTableOfContents().getAllTableOfContentItems()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList();
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
@ -843,12 +870,15 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) {
|
||||
|
||||
TablePageBlock table = document.getSections()
|
||||
TablePageBlock table = document.getTableOfContents().getAllTableOfContentItems()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList()
|
||||
.get(tableIndex);
|
||||
|
||||
List<List<Cell>> rows = table.getRows();
|
||||
int emptyCellsFoundFound = rows.stream()
|
||||
.flatMap(List::stream)
|
||||
@ -870,10 +900,12 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
private void validateTable(ClassificationDocument document, int tableIndex, List<List<String>> values) {
|
||||
|
||||
TablePageBlock table = document.getSections()
|
||||
TablePageBlock table = document.getTableOfContents().getAllTableOfContentItems()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList()
|
||||
.get(tableIndex);
|
||||
List<List<Cell>> rows = table.getRows();
|
||||
@ -896,10 +928,11 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
private void validateTableSize(ClassificationDocument document, int tableSize) {
|
||||
|
||||
assertThat(document.getSections()
|
||||
assertThat(document.getTableOfContents().getAllTableOfContentItems()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.toList().size()).isEqualTo(tableSize);
|
||||
|
||||
}
|
||||
|
||||
@ -93,6 +93,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
.toList();
|
||||
|
||||
for (String pdfFileName : pdfFileNames) {
|
||||
|
||||
writeJsons(Path.of(pdfFileName));
|
||||
}
|
||||
}
|
||||
@ -102,15 +103,15 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
@SneakyThrows
|
||||
private void writeJsons(Path filename) {
|
||||
|
||||
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD,
|
||||
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
|
||||
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER,
|
||||
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
filename.toFile(),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
Map.of("file",filename.toFile().toString())));
|
||||
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD,
|
||||
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
|
||||
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER,
|
||||
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
filename.toFile(),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse(),
|
||||
|
||||
@ -229,7 +229,7 @@ public class PdfDraw {
|
||||
case HEADER, FOOTER -> Color.GREEN;
|
||||
case PARAGRAPH -> Color.BLUE;
|
||||
case HEADLINE -> Color.RED;
|
||||
case SECTION -> Color.BLACK;
|
||||
case SECTION, SUPER_SECTION -> Color.BLACK;
|
||||
case TABLE -> Color.ORANGE;
|
||||
case TABLE_CELL -> Color.GRAY;
|
||||
case IMAGE -> Color.MAGENTA;
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user