Merge branch 'main' into 'RED-3813'

# Conflicts:
#   layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java
This commit is contained in:
Yannik Hampe 2024-06-26 09:10:59 +02:00
commit 39f527a57c
64 changed files with 2696 additions and 590 deletions

View File

@ -2,5 +2,6 @@ package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
public enum LayoutEngine {
ALGORITHM,
AI
AI,
OUTLINE
}

View File

@ -6,6 +6,7 @@ import java.util.Locale;
public enum NodeType implements Serializable {
DOCUMENT,
SECTION,
SUPER_SECTION,
HEADLINE,
PARAGRAPH,
TABLE,

View File

@ -14,7 +14,7 @@ import lombok.NoArgsConstructor;
public class SimplifiedSectionText {
@Schema(description = "The number of this Section. This is used to map the simplified section text back to the original Section.")
private int sectionNumber;
private String sectionNumber;
@Schema(description = "The text in this Section.")
private String text;

View File

@ -4,6 +4,7 @@ public enum LayoutParsingType {
REDACT_MANAGER,
REDACT_MANAGER_OLD,
REDACT_MANAGER_PARAGRAPH_DEBUG,
REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
DOCUMINE,
DOCUMINE_OLD,
CLARIFYND,

View File

@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor;
import static java.lang.String.format;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.io.File;
import java.io.IOException;
@ -29,6 +30,11 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineValidationService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TOCEnrichmentService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
@ -46,6 +52,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBui
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService;
import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
@ -90,12 +97,16 @@ public class LayoutParsingPipeline {
TableExtractionService tableExtractionService;
DocuMineBlockificationService docuMineBlockificationService;
RedactManagerBlockificationService redactManagerBlockificationService;
BlockificationPostprocessingService blockificationPostprocessingService;
DocstrumBlockificationService docstrumBlockificationService;
LayoutGridService layoutGridService;
ObservationRegistry observationRegistry;
VisualLayoutParsingAdapter visualLayoutParsingAdapter;
ClarifyndClassificationService clarifyndClassificationService;
GraphicExtractorService graphicExtractorService;
OutlineExtractorService outlineExtractorService;
OutlineValidationService outlineValidationService;
TOCEnrichmentService tocEnrichmentService;
LayoutparserSettings settings;
@ -105,8 +116,8 @@ public class LayoutParsingPipeline {
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId())
.orElse(originFile);
// File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile);
File viewerDocumentFile = originFile;
VisualLayoutParsingResponse visualLayoutParsingResponse = new VisualLayoutParsingResponse();
if (layoutParsingRequest.visualLayoutParsingFileId()
@ -209,15 +220,15 @@ public class LayoutParsingPipeline {
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {
return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
numberOfPages,
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
return format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
numberOfPages,
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
}
@ -232,6 +243,7 @@ public class LayoutParsingPipeline {
PDDocument originDocument = openDocument(originFile);
addNumberOfPagesToTrace(originDocument.getNumberOfPages(), Files.size(originFile.toPath()));
Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);
@ -242,6 +254,12 @@ public class LayoutParsingPipeline {
}
List<ClassificationPage> classificationPages = new ArrayList<>();
OutlineObject lastProcessedOutlineObject = null;
// parsing the structure elements could be useful as well
if(layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) {
classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));
}
long pageCount = originDocument.getNumberOfPages();
@ -287,7 +305,13 @@ public class LayoutParsingPipeline {
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings);
List<Box> graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getTextPositionSequences(), false);
List<Box> graphics = graphicExtractorService.extractPathElementGraphics(originDocument,
pdPage,
pageNumber,
cleanRulings,
stripper.getTextPositionSequences(),
false);
pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>())
.addAll(graphics.stream()
@ -302,10 +326,16 @@ public class LayoutParsingPipeline {
case REDACT_MANAGER_OLD ->
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings, classificationDocument.getVisualizations());
case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG ->
docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getVisualizations(), layoutParsingType);
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG ->
docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getVisualizations(), layoutParsingType);
case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> docstrumBlockificationService.blockify(words,
cleanRulings,
true,
classificationDocument.getVisualizations(),
layoutParsingType);
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(words,
cleanRulings,
false,
classificationDocument.getVisualizations(),
layoutParsingType);
};
classificationPage.setCleanRulings(cleanRulings);
@ -315,6 +345,20 @@ public class LayoutParsingPipeline {
classificationPage.setPageWidth(cropbox.getWidth());
classificationPage.setPageHeight(cropbox.getHeight());
if(layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) {
List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage().getOrDefault(pageNumber - 1, new ArrayList<>());
OutlineObject notFoundOutlineObject = null;
if (lastProcessedOutlineObject != null && !lastProcessedOutlineObject.isFound()) {
lastProcessedOutlineObject.setPoint(new Point2D.Float(0, cropbox.getHeight()));
notFoundOutlineObject = lastProcessedOutlineObject;
}
if (!outlineObjects.isEmpty()) {
classificationPage.setOutlineObjects(outlineObjects);
lastProcessedOutlineObject = blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage, notFoundOutlineObject);
}
}
classificationDocument.getVisualizations().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber);
// MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox.
classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents()));
@ -350,20 +394,27 @@ public class LayoutParsingPipeline {
}
log.info("Classify TextBlocks for {}", identifier);
switch (layoutParsingType) {
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG ->
redactManagerClassificationService.classifyDocument(classificationDocument);
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> redactManagerClassificationService.classifyDocument(
classificationDocument);
case DOCUMINE_OLD, DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument);
}
List<TextPageBlock> headlines = classificationDocument.getPages()
.stream()
.flatMap(classificationPage -> classificationPage.getTextBlocks()
.stream()
.filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline())
.map(tb -> (TextPageBlock) tb))
.toList();
TableOfContents tableOfContents = outlineValidationService.createToC(headlines);
classificationDocument.setTableOfContents(tableOfContents);
log.info("Building Sections for {}", identifier);
switch (layoutParsingType) {
case CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_PARAGRAPH_DEBUG -> sectionsBuilderService.buildParagraphDebugSections(classificationDocument);
default -> {
sectionsBuilderService.buildSections(classificationDocument);
sectionsBuilderService.addImagesToSections(classificationDocument);
}
default -> tocEnrichmentService.assignSectionBlocksAndImages(classificationDocument);
}
return classificationDocument;

View File

@ -1,9 +1,10 @@
package com.knecon.fforesight.service.layoutparser.processor.model;
import java.awt.geom.Rectangle2D;
import java.util.HashSet;
import java.util.Set;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
@ -20,6 +21,9 @@ public abstract class AbstractPageBlock extends BoundingBox {
@JsonIgnore
protected PageBlockType classification;
Set<LayoutEngine> engines = new HashSet<>();
@JsonIgnore
protected int page;

View File

@ -3,6 +3,8 @@ package com.knecon.fforesight.service.layoutparser.processor.model;
import java.util.ArrayList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
@ -28,4 +30,7 @@ public class ClassificationDocument {
private long rulesVersion;
private OutlineObjectTree outlineObjectTree;
private TableOfContents tableOfContents;
}

View File

@ -8,13 +8,13 @@ import java.util.Map;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import lombok.Data;
import lombok.NonNull;
import lombok.RequiredArgsConstructor;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
@Data
@RequiredArgsConstructor
@ -23,6 +23,10 @@ public class ClassificationPage {
@NonNull
private List<AbstractPageBlock> textBlocks;
private List<OutlineObject> outlineObjects = new ArrayList<>();
private List<AbstractPageBlock> headlines = new ArrayList<>();
private List<ClassifiedImage> images = new ArrayList<>();
private Rectangle bodyTextFrame;

View File

@ -12,6 +12,7 @@ import lombok.NoArgsConstructor;
@Data
@NoArgsConstructor
@Deprecated
public class ClassificationSection {
private List<AbstractPageBlock> pageBlocks = new ArrayList<>();

View File

@ -31,6 +31,19 @@ public enum PageBlockType {
}
public static int getHeadlineNumber(PageBlockType pageBlockType) {
return switch (pageBlockType) {
case H1 -> 1;
case H2 -> 2;
case H3 -> 3;
case H4 -> 4;
case H5 -> 5;
default -> 6;
};
}
public boolean isHeadline() {
return this.equals(H1) || this.equals(H2) || this.equals(H3) || this.equals(H4) || this.equals(H5) || this.equals(H6);

View File

@ -8,6 +8,7 @@ import java.util.regex.Pattern;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
@AllArgsConstructor
@ -16,13 +17,15 @@ public class SectionIdentifier {
static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?");
private enum Format {
public enum Format {
EMPTY,
NUMERICAL,
DOCUMENT
}
@Getter
Format format;
@Getter
String identifierString;
List<Integer> identifiers;
boolean asChild;

View File

@ -140,8 +140,8 @@ public class DocumentTree {
if (treeId.isEmpty()) {
return root;
}
Entry entry = root.children.get(treeId.get(0));
for (int id : treeId.subList(1, treeId.size())) {
Entry entry = root;
for (int id : treeId) {
entry = entry.children.get(id);
}
return entry;

View File

@ -0,0 +1,74 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Data
@SuperBuilder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public abstract class AbstractSemanticNode implements GenericSemanticNode {
@Builder.Default
Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
List<Integer> treeId;
TextBlock textBlock;
@EqualsAndHashCode.Exclude
DocumentTree documentTree;
@Builder.Default
@EqualsAndHashCode.Exclude
Set<RedactionEntity> entities = new HashSet<>();
@EqualsAndHashCode.Exclude
Map<Page, Rectangle2D> bBoxCache;
@Override
public TextBlock getTextBlock() {
if (textBlock == null) {
textBlock = GenericSemanticNode.super.getTextBlock();
}
return textBlock;
}
@Override
public String toString() {
return treeId.toString() + ": " + getType() + ": " + this.getTextBlock().buildSummary();
}
@Override
public Map<Page, Rectangle2D> getBBox() {
if (bBoxCache == null) {
bBoxCache = GenericSemanticNode.super.getBBox();
}
return bBoxCache;
}
}

View File

@ -3,43 +3,35 @@ package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
@Data
@Builder
@SuperBuilder
@AllArgsConstructor
@NoArgsConstructor
@EqualsAndHashCode(callSuper = true)
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Document implements GenericSemanticNode {
@Builder.Default
Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
public class Document extends AbstractSemanticNode {
Set<Page> pages;
DocumentTree documentTree;
Integer numberOfPages;
TextBlock textBlock;
@Builder.Default
Set<RedactionEntity> entities = new HashSet<>();
LayoutparsingVisualizations visualizations;
@ -51,15 +43,27 @@ public class Document implements GenericSemanticNode {
}
public TextBlock getTextBlock() {
/**
* Gets the sections of the document as a list.
*
* @return A list of all sections within the document.
*/
public List<Section> getAllSections() {
if (textBlock == null) {
textBlock = GenericSemanticNode.super.getTextBlock();
}
return textBlock;
return streamAllSubNodesOfType(NodeType.SECTION).map(node -> (Section) node)
.collect(Collectors.toList());
}
/**
* Gets the main sections of the document as a list.
*
* @return A list of main sections within the document
* @deprecated This method is marked for removal.
* Use {@link #streamChildrenOfType(NodeType)} instead,
* or {@link #getChildrenOfTypeSectionOrSuperSection()} which returns children of type SECTION as well as SUPER_SECTION.
*/
@Deprecated(forRemoval = true)
public List<Section> getMainSections() {
return streamChildrenOfType(NodeType.SECTION).map(node -> (Section) node)
@ -67,6 +71,18 @@ public class Document implements GenericSemanticNode {
}
/**
* Gets the direct children of type SECTION or SUPER_SECTION of the document as a list of SemanticNode objects.
*
* @return A list of all children of type SECTION or SUPER_SECTION.
*/
public List<SemanticNode> getChildrenOfTypeSectionOrSuperSection() {
return streamChildren().filter(semanticNode -> semanticNode.getType().equals(NodeType.SECTION) || semanticNode.getType().equals(NodeType.SUPER_SECTION))
.toList();
}
public List<Header> getHeaders() {
return streamChildrenOfType(NodeType.HEADER).map(node -> (Header) node)
@ -81,6 +97,15 @@ public class Document implements GenericSemanticNode {
}
@Override
public Headline getHeadline() {
return streamAllSubNodesOfType(NodeType.HEADLINE).map(node -> (Headline) node)
.findFirst()
.orElse(Headline.builder().build());
}
public Stream<TextBlock> streamTerminalTextBlocksInOrder() {
return streamAllNodes().filter(SemanticNode::isLeaf)
@ -102,18 +127,9 @@ public class Document implements GenericSemanticNode {
}
@Override
public Headline getHeadline() {
return streamAllSubNodesOfType(NodeType.HEADLINE).map(node -> (Headline) node)
.findFirst()
.orElse(Headline.builder().build());
}
private Stream<SemanticNode> streamAllNodes() {
return documentTree.allEntriesInOrder()
return getDocumentTree().allEntriesInOrder()
.map(DocumentTree.Entry::getNode);
}

View File

@ -20,7 +20,8 @@ public class DuplicatedParagraph extends Paragraph {
@Override
public TextBlock getTextBlock() {
return Stream.of(leafTextBlock, unsortedLeafTextBlock).collect(new TextBlockCollector());
return Stream.of(super.getLeafTextBlock(), unsortedLeafTextBlock)
.collect(new TextBlockCollector());
}

View File

@ -1,48 +1,24 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
@Data
@Builder
@SuperBuilder
@AllArgsConstructor
@NoArgsConstructor
@EqualsAndHashCode(callSuper = true)
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Footer implements GenericSemanticNode {
public class Footer extends AbstractSemanticNode {
@Builder.Default
Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
List<Integer> treeId;
TextBlock leafTextBlock;
@EqualsAndHashCode.Exclude
DocumentTree documentTree;
@Builder.Default
@EqualsAndHashCode.Exclude
Set<RedactionEntity> entities = new HashSet<>();
@EqualsAndHashCode.Exclude
Map<Page, Rectangle2D> bBoxCache;
@Override
public NodeType getType() {
@ -68,17 +44,7 @@ public class Footer implements GenericSemanticNode {
@Override
public String toString() {
return treeId + ": " + NodeType.FOOTER + ": " + leafTextBlock.buildSummary();
}
@Override
public Map<Page, Rectangle2D> getBBox() {
if (bBoxCache == null) {
bBoxCache = GenericSemanticNode.super.getBBox();
}
return bBoxCache;
return getTreeId() + ": " + NodeType.FOOTER + ": " + leafTextBlock.buildSummary();
}
}

View File

@ -1,47 +1,24 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
@Data
@Builder
@SuperBuilder
@AllArgsConstructor
@NoArgsConstructor
@EqualsAndHashCode(callSuper = true)
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Header implements GenericSemanticNode {
public class Header extends AbstractSemanticNode {
@Builder.Default
Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
List<Integer> treeId;
TextBlock leafTextBlock;
@EqualsAndHashCode.Exclude
DocumentTree documentTree;
@Builder.Default
@EqualsAndHashCode.Exclude
Set<RedactionEntity> entities = new HashSet<>();
@EqualsAndHashCode.Exclude
Map<Page, Rectangle2D> bBoxCache;
@Override
public boolean isLeaf() {
@ -67,17 +44,7 @@ public class Header implements GenericSemanticNode {
@Override
public String toString() {
return treeId + ": " + NodeType.HEADER + ": " + leafTextBlock.buildSummary();
}
@Override
public Map<Page, Rectangle2D> getBBox() {
if (bBoxCache == null) {
bBoxCache = GenericSemanticNode.super.getBBox();
}
return bBoxCache;
return getTreeId() + ": " + NodeType.HEADER + ": " + leafTextBlock.buildSummary();
}
}

View File

@ -1,47 +1,24 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
@Data
@Builder
@SuperBuilder
@AllArgsConstructor
@NoArgsConstructor
@EqualsAndHashCode(callSuper = true)
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Headline implements GenericSemanticNode {
public class Headline extends AbstractSemanticNode {
@Builder.Default
Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
List<Integer> treeId;
TextBlock leafTextBlock;
@EqualsAndHashCode.Exclude
DocumentTree documentTree;
@Builder.Default
@EqualsAndHashCode.Exclude
Set<RedactionEntity> entities = new HashSet<>();
@EqualsAndHashCode.Exclude
Map<Page, Rectangle2D> bBoxCache;
@Override
public NodeType getType() {
@ -67,7 +44,7 @@ public class Headline implements GenericSemanticNode {
@Override
public String toString() {
return treeId + ": " + NodeType.HEADLINE + ": " + leafTextBlock.buildSummary();
return getTreeId() + ": " + NodeType.HEADLINE + ": " + leafTextBlock.buildSummary();
}
@ -77,14 +54,4 @@ public class Headline implements GenericSemanticNode {
return this;
}
@Override
public Map<Page, Rectangle2D> getBBox() {
if (bBoxCache == null) {
bBoxCache = GenericSemanticNode.super.getBBox();
}
return bBoxCache;
}
}

View File

@ -3,15 +3,10 @@ package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import lombok.AccessLevel;
@ -21,18 +16,16 @@ import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
@Data
@Builder
@SuperBuilder
@AllArgsConstructor
@NoArgsConstructor
@EqualsAndHashCode(callSuper = true)
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Image implements GenericSemanticNode {
public class Image extends AbstractSemanticNode {
@Builder.Default
Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
List<Integer> treeId;
String id;
String representationHash;
@ -55,13 +48,6 @@ public class Image implements GenericSemanticNode {
@EqualsAndHashCode.Exclude
Page page;
@EqualsAndHashCode.Exclude
DocumentTree documentTree;
@Builder.Default
@EqualsAndHashCode.Exclude
Set<RedactionEntity> entities = new HashSet<>();
@Override
public NodeType getType() {
@ -87,7 +73,7 @@ public class Image implements GenericSemanticNode {
@Override
public String toString() {
return treeId + ": " + NodeType.IMAGE + ": " + imageType.toString() + " " + position;
return getTreeId() + ": " + NodeType.IMAGE + ": " + imageType.toString() + " " + position;
}

View File

@ -1,20 +1,10 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
@ -23,25 +13,12 @@ import lombok.experimental.SuperBuilder;
@Data
@SuperBuilder
@AllArgsConstructor
@EqualsAndHashCode(callSuper = true)
@FieldDefaults(level = AccessLevel.PROTECTED)
public class Paragraph implements GenericSemanticNode {
public class Paragraph extends AbstractSemanticNode {
@Builder.Default
Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
List<Integer> treeId;
TextBlock leafTextBlock;
@EqualsAndHashCode.Exclude
DocumentTree documentTree;
@Builder.Default
@EqualsAndHashCode.Exclude
Set<RedactionEntity> entities = new HashSet<>();
@EqualsAndHashCode.Exclude
Map<Page, Rectangle2D> bBoxCache;
@Override
public NodeType getType() {
@ -63,21 +40,4 @@ public class Paragraph implements GenericSemanticNode {
return leafTextBlock;
}
@Override
public String toString() {
return treeId + ": " + NodeType.PARAGRAPH + ": " + leafTextBlock.buildSummary();
}
@Override
public Map<Page, Rectangle2D> getBBox() {
if (bBoxCache == null) {
bBoxCache = GenericSemanticNode.super.getBBox();
}
return bBoxCache;
}
}

View File

@ -1,47 +1,20 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
import lombok.experimental.SuperBuilder;
@Slf4j
@Data
@Builder
@SuperBuilder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Section implements GenericSemanticNode {
@Builder.Default
Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
List<Integer> treeId;
TextBlock textBlock;
@EqualsAndHashCode.Exclude
DocumentTree documentTree;
@Builder.Default
@EqualsAndHashCode.Exclude
Set<RedactionEntity> entities = new HashSet<>();
@EqualsAndHashCode.Exclude
Map<Page, Rectangle2D> bBoxCache;
@EqualsAndHashCode(callSuper = true)
public class Section extends AbstractSemanticNode {
@Override
public NodeType getType() {
@ -50,6 +23,14 @@ public class Section implements GenericSemanticNode {
}
public Headline getHeadline() {
return streamChildrenOfType(NodeType.HEADLINE).map(node -> (Headline) node)
.findFirst()
.orElseGet(() -> getParent().getHeadline());
}
public boolean hasTables() {
return streamAllSubNodesOfType(NodeType.TABLE).findAny()
@ -57,39 +38,10 @@ public class Section implements GenericSemanticNode {
}
@Override
public TextBlock getTextBlock() {
if (textBlock == null) {
textBlock = GenericSemanticNode.super.getTextBlock();
}
return textBlock;
}
@Override
public String toString() {
return treeId.toString() + ": " + NodeType.SECTION + ": " + this.getTextBlock().buildSummary();
}
public Headline getHeadline() {
return streamChildrenOfType(NodeType.HEADLINE)//
.map(node -> (Headline) node)//
.findFirst()//
.orElseGet(() -> getParent().getHeadline());
}
@Override
public Map<Page, Rectangle2D> getBBox() {
if (bBoxCache == null) {
bBoxCache = GenericSemanticNode.super.getBBox();
}
return bBoxCache;
return getTreeId() + ": " + NodeType.SECTION + ": " + this.getTextBlock().buildSummary();
}
}

View File

@ -0,0 +1,40 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
@Data
@SuperBuilder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(callSuper = true)
// Semantic tree node that groups several sections under a shared headline.
public class SuperSection extends AbstractSemanticNode {
@Override
public NodeType getType() {
return NodeType.SUPER_SECTION;
}
// Returns this super-section's own HEADLINE child if present, otherwise
// falls back to the nearest ancestor's headline.
public Headline getHeadline() {
return streamChildrenOfType(NodeType.HEADLINE).map(node -> (Headline) node)
.findFirst()
.orElseGet(() -> getParent().getHeadline());
}
@Override
public String toString() {
return getTreeId() + ": " + NodeType.SUPER_SECTION + ": " + this.getTextBlock().buildSummary();
}
}

View File

@ -2,34 +2,26 @@ package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
@Data
@Builder
@SuperBuilder
@AllArgsConstructor
@EqualsAndHashCode(callSuper = true)
@FieldDefaults(level = AccessLevel.PRIVATE)
public class TableCell implements GenericSemanticNode {
public class TableCell extends AbstractSemanticNode {
@Builder.Default
Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
List<Integer> treeId;
int row;
int col;
boolean header;
@ -40,13 +32,6 @@ public class TableCell implements GenericSemanticNode {
TextBlock textBlock;
@EqualsAndHashCode.Exclude
DocumentTree documentTree;
@Builder.Default
@EqualsAndHashCode.Exclude
Set<RedactionEntity> entities = new HashSet<>();
@Override
public Map<Page, Rectangle2D> getBBox() {
@ -96,7 +81,7 @@ public class TableCell implements GenericSemanticNode {
@Override
public String toString() {
return treeId + ": " + NodeType.TABLE_CELL + ": " + this.getTextBlock().buildSummary();
return getTreeId() + ": " + NodeType.TABLE_CELL + ": " + this.getTextBlock().buildSummary();
}
}

View File

@ -50,14 +50,16 @@ public class ConcatenatedTextBlock implements TextBlock {
public ConcatenatedTextBlock concat(TextBlock textBlock) {
int start = textBlock.getBoundary().start();
int end = textBlock.getBoundary().end();
if (this.atomicTextBlocks.isEmpty()) {
boundary.setStart(textBlock.getBoundary().start());
boundary.setEnd(textBlock.getBoundary().end());
} else if (boundary.end() != textBlock.getBoundary().start()) {
boundary.setStart(start);
boundary.setEnd(end);
} else if (boundary.end() != start) {
throw new UnsupportedOperationException(format("Can only concat consecutive TextBlocks, trying to concat %s and %s", boundary, textBlock.getBoundary()));
}
this.atomicTextBlocks.addAll(textBlock.getAtomicTextBlocks());
boundary.setEnd(textBlock.getBoundary().end());
boundary.setEnd(end);
this.searchText = null;
return this;
}

View File

@ -0,0 +1,209 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.awt.geom.Point2D;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Optional;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.pdmodel.PDDestinationNameTreeNode;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDDestination;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageFitHeightDestination;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageFitRectangleDestination;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageFitWidthDestination;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageXYZDestination;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.springframework.stereotype.Service;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Service
@Slf4j
public class OutlineExtractorService {
private static final String PDDESTINATION_TYPE_FIT = "Fit";
private static final String PDDESTINATION_TYPE_FIT_B = "FitB";
private static final String PDDESTINATION_TYPE_FIT_H = "FitH";
private static final String PDDESTINATION_TYPE_FIT_V = "FitV";
private static final String PDDESTINATION_TYPE_FIT_R = "FitR";
private static final String PDDESTINATION_TYPE_FIT_BH = "FitBH";
private static final String PDDESTINATION_TYPE_FIT_BV = "FitBV";
private static final String PDDESTINATION_TYPE_XYZ = "XYZ";
@SneakyThrows
public OutlineObjectTree getOutlineObjectTree(PDDocument document) {
PDDocumentOutline documentOutline = document.getDocumentCatalog().getDocumentOutline();
List<OutlineObjectTreeNode> rootNodes = new ArrayList<>();
if (documentOutline != null) {
for (PDOutlineItem child : documentOutline.children()) {
Optional<OutlineObjectTreeNode> outlineObjectWithChildren = createOutlineObjectWithChildren(child, document, 1);
outlineObjectWithChildren.ifPresent(rootNodes::add);
}
}
return new OutlineObjectTree(rootNodes);
}
@SneakyThrows
private Optional<OutlineObjectTreeNode> createOutlineObjectWithChildren(PDOutlineItem item, PDDocument document, int depth) {
Optional<OutlineObjectTreeNode> outlineObject = createOutlineObject(item, document, depth);
if (outlineObject.isPresent()) {
for (var child : item.children()) {
Optional<OutlineObjectTreeNode> outlineObjectWithChildren = createOutlineObjectWithChildren(child, document, depth + 1);
outlineObjectWithChildren.ifPresent(outlineObjectTreeNode -> outlineObject.get().addChild(outlineObjectTreeNode));
}
}
return outlineObject;
}
// if the structure elements are processed beforehand, another case can be handled here as well:
// outline objects can reference structure elements (see pdf documentation)
@SneakyThrows
private Optional<OutlineObjectTreeNode> createOutlineObject(PDOutlineItem item, PDDocument document, int depth) {
String title = item.getTitle();
PDPage page = item.findDestinationPage(document);
if (page == null) {
return Optional.empty();
}
int pageNumber = document.getPages().indexOf(page);
Optional<Point2D> outlinePosition = Optional.empty();
try {
PDDocumentNameDictionary names = document.getDocumentCatalog().getNames();
PDDestinationNameTreeNode destinations = null;
if (names != null) {
destinations = names.getDests();
}
PDDestination destination = item.getDestination();
if (destination != null) {
outlinePosition = getLocationFromCOSBase(destinations, destination.getCOSObject());
}
if (outlinePosition.isEmpty()) {
PDAction action = item.getAction();
if (action != null) {
outlinePosition = extractOutlineLocationGoTo(destinations, action.getCOSObject());
}
}
} catch (Exception e) {
log.info(String.format("Error occurred during position resolution for outline item on page %s with title %s: " + e, pageNumber, title));
}
return Optional.of(new OutlineObjectTreeNode(new OutlineObject(title, pageNumber, outlinePosition.orElse(new Point2D.Float(0, 0)), depth)));
}
@SneakyThrows
private static Optional<Point2D> extractOutlineLocationGoTo(PDDestinationNameTreeNode destinations, COSDictionary cosDictionary) {
if (isGoToAction(cosDictionary)) {
COSBase cosBase = cosDictionary.getItem(COSName.D);
return getLocationFromCOSBase(destinations, cosBase);
}
return Optional.empty();
}
private static Optional<Point2D> getLocationFromCOSBase(PDDestinationNameTreeNode destinations, COSBase cosBase) throws IOException {
if (cosBase != null) {
if (cosBase instanceof COSArray cosArray) {
return getLocationFromCosArray(cosArray);
}
if (cosBase instanceof COSString cosString) {
String destinationName = cosString.getString();
COSArray cosArray = destinations.getValue(destinationName).getCOSObject();
return getLocationFromCosArray(cosArray);
}
}
return Optional.empty();
}
private static Optional<Point2D> getLocationFromCosArray(COSArray cosArray) {
boolean located = false;
float x = 0;
float y = 0;
try {
PDDestination destination = PDDestination.create(cosArray);
COSName type = (COSName) cosArray.getObject(1);
String typeString = type.getName();
switch (typeString) {
case PDDESTINATION_TYPE_FIT_V:
case PDDESTINATION_TYPE_FIT_BV:
PDPageFitHeightDestination fitHeightDestination = (PDPageFitHeightDestination) destination;
x = fitHeightDestination.getLeft();
located = true;
break;
case PDDESTINATION_TYPE_FIT_R:
PDPageFitRectangleDestination fitRectangleDestination = (PDPageFitRectangleDestination) destination;
x = fitRectangleDestination.getLeft();
y = fitRectangleDestination.getTop();
located = true;
break;
case PDDESTINATION_TYPE_FIT_H:
case PDDESTINATION_TYPE_FIT_BH:
PDPageFitWidthDestination fitWidthDestination = (PDPageFitWidthDestination) destination;
y = fitWidthDestination.getTop();
located = true;
break;
case PDDESTINATION_TYPE_XYZ:
PDPageXYZDestination xyzDestination = (PDPageXYZDestination) destination;
x = xyzDestination.getLeft();
y = xyzDestination.getTop();
located = true;
break;
case PDDESTINATION_TYPE_FIT:
case PDDESTINATION_TYPE_FIT_B:
default:
}
} catch (IOException e) {
throw new RuntimeException(e);
}
return located ? Optional.of(new Point2D.Float(x, y)) : Optional.empty();
}
private static boolean isGoToAction(COSDictionary cosDictionary) {
return cosDictionary.getNameAsString("S").toLowerCase(Locale.ROOT).equals("goto");
}
}

View File

@ -0,0 +1,35 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.awt.geom.Point2D;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.RequiredArgsConstructor;
@Data
@RequiredArgsConstructor
@AllArgsConstructor
// Flat value object for one PDF outline (bookmark) entry.
public class OutlineObject {
// Bookmark title as shown in the PDF outline.
private final String title;
// 0-based page index the outline entry points to.
private final int pageNumber;
// Target position on the page; (0, 0) when the destination carries no coordinates.
private Point2D point;
// Nesting depth within the outline tree; root entries have depth 1.
private final int treeDepth;
// NOTE(review): not set anywhere in this file - presumably flipped when the
// entry is matched against a text block. Confirm against callers.
private boolean found;
public OutlineObject(String title, int pageNumber, Point2D point2D, int depth) {
this(title, pageNumber, depth);
this.point = point2D;
}
@Override
public String toString() {
return "OutlineObject{" + "title='" + title + '\'' + '}';
}
}

View File

@ -0,0 +1,42 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import lombok.Data;
import lombok.RequiredArgsConstructor;
@Data
@RequiredArgsConstructor
// Tree of PDF outline entries plus a flattened per-page index over all of them.
public class OutlineObjectTree {

// Top-level outline entries in document order.
private List<OutlineObjectTreeNode> rootNodes = new ArrayList<>();
// Every outline object in the tree, grouped by the page number it points to.
private Map<Integer, List<OutlineObject>> outlineObjectsPerPage = new HashMap<>();

/**
 * Creates the tree and eagerly builds the per-page index over all nodes.
 */
public OutlineObjectTree(List<OutlineObjectTreeNode> rootNodes) {
this.rootNodes = rootNodes;
flattenNodesAndGroupByPage(rootNodes);
}

// Depth-first walk that files each node's outline object under its page.
// Uses computeIfAbsent instead of the containsKey/put pattern.
private void flattenNodesAndGroupByPage(List<OutlineObjectTreeNode> outlineObjectTreeNodes) {
for (OutlineObjectTreeNode node : outlineObjectTreeNodes) {
outlineObjectsPerPage.computeIfAbsent(node.getOutlineObject().getPageNumber(), page -> new ArrayList<>())
.add(node.getOutlineObject());
if (!node.getChildren().isEmpty()) {
flattenNodesAndGroupByPage(node.getChildren());
}
}
}
}

View File

@ -0,0 +1,34 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.util.ArrayList;
import java.util.List;
import lombok.Data;
@Data
// One node of the PDF outline tree: an outline entry plus its nested entries.
public class OutlineObjectTreeNode {

// Payload of this node.
private OutlineObject outlineObject;
// Nested outline entries, kept in document order.
private List<OutlineObjectTreeNode> children = new ArrayList<>();

public OutlineObjectTreeNode(OutlineObject outlineObject) {
this.outlineObject = outlineObject;
}

// Appends a nested entry to this node.
public void addChild(OutlineObjectTreeNode child) {
this.children.add(child);
}

@Override
public String toString() {
return "OutlineObjectTreeNode{" + "outlineObject=" + outlineObject + '}';
}
}

View File

@ -0,0 +1,61 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.getHeadlineNumber;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import lombok.extern.slf4j.Slf4j;
@Service
@Slf4j
public class OutlineValidationService {
/**
 * Builds a table-of-contents tree from a flat, document-ordered list of
 * classified headline blocks, nesting items by their headline level.
 */
public TableOfContents createToC(List<TextPageBlock> headlines) {
List<TableOfContentItem> mainSections = new ArrayList<>();
// Most recent item seen at each headline depth - the parent candidates.
Map<Integer, TableOfContentItem> lastItemsPerDepth = new HashMap<>();
TableOfContentItem last = null;
// Sorted set of depths seen since the last top-level item started.
TreeSet<Integer> depths = new TreeSet<>();
for (TextPageBlock current : headlines) {
int currentDepth = getHeadlineNumber(current.getClassification());
// Closest shallower depth already seen; null means this starts a new main section.
Integer parentDepth = depths.floor(currentDepth - 1);
var tocItem = new TableOfContentItem(current);
if (parentDepth == null) {
mainSections.add(tocItem);
// Reset tracking so later items cannot attach under a previous main section.
lastItemsPerDepth = new HashMap<>();
depths = new TreeSet<>();
} else {
assert last != null;
int lastDepth = getHeadlineNumber(last.getHeadline().getClassification());
if (lastDepth < parentDepth) {
// Previous item is shallower than the computed parent depth: prefer it.
parentDepth = lastDepth;
} else if (lastDepth == currentDepth && last.getParent() != null) {
// Same level as the previous item: attach under the same parent.
parentDepth = getHeadlineNumber(last.getParent().getHeadline().getClassification());
}
TableOfContentItem parent = lastItemsPerDepth.get(parentDepth);
parent.addChild(tocItem);
}
last = tocItem;
lastItemsPerDepth.put(currentDepth, tocItem);
depths.add(currentDepth);
}
return new TableOfContents(mainSections);
}
}

View File

@ -0,0 +1,259 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
public class TOCEnrichmentService {
/**
 * Walks every page of the classified document and assigns page blocks, images,
 * headers and footers to the table-of-contents items built earlier. Blocks seen
 * before the first matched headline are collected into a synthetic "unassigned"
 * item that is prepended to the ToC.
 */
public void assignSectionBlocksAndImages(ClassificationDocument document) {
TableOfContents toc = document.getTableOfContents();
Iterator<TableOfContentItem> iterator = toc.iterator();
TableOfContentItem currentTOCItem = null;
if (iterator.hasNext()) {
currentTOCItem = iterator.next();
}
// Blocks and images encountered before any headline matched.
List<AbstractPageBlock> startBlocks = new ArrayList<>();
List<ClassifiedImage> startImages = new ArrayList<>();
TableOfContentItem currentSection = null;
boolean foundFirstHeadline = false;
List<ClassificationHeader> headers = new ArrayList<>();
List<ClassificationFooter> footers = new ArrayList<>();
TablePageBlock previousTable = null;
// ToC items matched on the most recent page that had any; used to place images.
List<TableOfContentItem> lastFoundTOCItems = new ArrayList<>();
for (ClassificationPage page : document.getPages()) {
List<TableOfContentItem> currentPageTOCItems = new ArrayList<>();
List<TextPageBlock> header = new ArrayList<>();
List<TextPageBlock> footer = new ArrayList<>();
for (AbstractPageBlock current : page.getTextBlocks()) {
if (current.getClassification() == null) {
continue;
}
current.setPage(page.getPageNumber());
if (current.getClassification().equals(PageBlockType.HEADER)) {
header.add((TextPageBlock) current);
continue;
}
if (current.getClassification().equals(PageBlockType.FOOTER)) {
footer.add((TextPageBlock) current);
continue;
}
if (current instanceof TablePageBlock table) {
// Tables continued from a previous block: carry header metadata over.
if (previousTable != null) {
mergeTableMetadata(table, previousTable);
}
previousTable = table;
}
// A text block whose text equals the next expected ToC headline starts that section.
if (current instanceof TextPageBlock && currentTOCItem != null && currentTOCItem.getHeadline().getText().equals(current.getText())) {
if (!foundFirstHeadline) {
foundFirstHeadline = true;
}
currentSection = currentTOCItem;
currentTOCItem.getSectionBlocks().add(current);
currentPageTOCItems.add(currentTOCItem);
if (iterator.hasNext()) {
currentTOCItem = iterator.next();
}
} else if (!foundFirstHeadline) {
startBlocks.add(current);
} else {
currentSection.getSectionBlocks().add(current);
}
}
if (!currentPageTOCItems.isEmpty()) {
lastFoundTOCItems = currentPageTOCItems;
}
for (ClassifiedImage image : page.getImages()) {
// Bounding box accumulated from the candidate items' headlines on this page.
Double xMin = null;
Double yMin = null;
Double xMax = null;
Double yMax = null;
for (TableOfContentItem tocItem : lastFoundTOCItems) {
var headline = tocItem.getHeadline();
if (headline.getPage() != page.getPageNumber()) {
continue;
}
// Headline min/max coordinates may be stored in either order; normalize.
if (headline.getMinX() < headline.getMaxX()) {
if (xMin == null || headline.getMinX() < xMin) {
xMin = headline.getMinX();
}
if (xMax == null || headline.getMaxX() > xMax) {
xMax = headline.getMaxX();
}
} else {
if (xMin == null || headline.getMaxX() < xMin) {
xMin = headline.getMaxX();
}
if (xMax == null || headline.getMinX() > xMax) {
xMax = headline.getMinX();
}
}
if (headline.getMinY() < headline.getMaxY()) {
if (yMin == null || headline.getMinY() < yMin) {
yMin = headline.getMinY();
}
if (yMax == null || headline.getMaxY() > yMax) {
yMax = headline.getMaxY();
}
} else {
if (yMin == null || headline.getMaxY() < yMin) {
yMin = headline.getMaxY();
}
if (yMax == null || headline.getMinY() > yMax) {
yMax = headline.getMinY();
}
}
log.debug("Image position x: {}, y: {}", image.getPosition().getX(), image.getPosition().getY());
log.debug("Headline position xMin: {}, xMax: {}, yMin: {}, yMax: {}", xMin, xMax, yMin, yMax);
// Image inside the headline bounding box is assigned to that item.
if (image.getPosition().getX() >= xMin && image.getPosition().getX() <= xMax && image.getPosition().getY() >= yMin && image.getPosition().getY() <= yMax) {
tocItem.getImages().add(image);
image.setAppendedToSection(true);
break;
}
}
if (!image.isAppendedToSection()) {
log.debug("Image uses last found section");
// Fall back to the most recently matched section, or the pre-headline pool.
if (!lastFoundTOCItems.isEmpty()) {
lastFoundTOCItems.get(lastFoundTOCItems.size() - 1).getImages().add(image);
} else {
startImages.add(image);
}
image.setAppendedToSection(true);
}
}
if (!header.isEmpty()) {
headers.add(new ClassificationHeader(header));
}
if (!footer.isEmpty()) {
footers.add(new ClassificationFooter(footer));
}
}
// Everything seen before the first headline becomes a synthetic leading ToC item
// with a null headline.
if (!startBlocks.isEmpty()) {
TableOfContentItem unassigned = new TableOfContentItem(null);
unassigned.setSectionBlocks(startBlocks);
unassigned.setImages(startImages);
document.getTableOfContents().getMainSections().add(0, unassigned);
}
document.setHeaders(headers);
document.setFooters(footers);
}
private void mergeTableMetadata(TablePageBlock currentTable, TablePageBlock previousTable) {
// Distribute header information for subsequent tables
if (previousTable != null && hasInvalidHeaderInformation(currentTable) && hasValidHeaderInformation(previousTable)) {
List<Cell> previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable);
List<Cell> tableNonHeaderRow = getRowWithNonHeaderCells(currentTable);
// Allow merging of tables if header row is separated from first logical non-header row
if (previousTableNonHeaderRow.isEmpty()
&& previousTable.getRowCount() == 1
&& previousTable.getRows()
.get(0).size() == tableNonHeaderRow.size()) {
previousTableNonHeaderRow = previousTable.getRows()
.get(0)
.stream()
.map(cell -> {
Cell fakeCell = Cell.copy(cell);
fakeCell.setHeaderCells(Collections.singletonList(cell));
return fakeCell;
})
.toList();
}
// Copy the header-cell references column-by-column onto matching rows.
if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) {
for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
List<Cell> row = currentTable.getRows()
.get(i);
if (row.size() == tableNonHeaderRow.size() && row.stream()
.allMatch(cell -> cell.getHeaderCells().isEmpty())) {
for (int j = 0; j < row.size(); j++) {
row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells());
}
}
}
}
}
}
private boolean hasValidHeaderInformation(TablePageBlock table) {
return !hasInvalidHeaderInformation(table);
}
// A table is "invalid" when no cell anywhere references a header cell.
private boolean hasInvalidHeaderInformation(TablePageBlock table) {
return table.getRows()
.stream()
.flatMap(row -> row.stream()
.filter(cell -> !cell.getHeaderCells().isEmpty()))
.findAny().isEmpty();
}
// Returns the last multi-cell row made up entirely of non-header cells,
// or an empty list when no such row exists.
private List<Cell> getRowWithNonHeaderCells(TablePageBlock table) {
for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
List<Cell> row = table.getRows()
.get(i);
if (row.size() == 1) {
continue;
}
boolean allNonHeader = true;
for (Cell cell : row) {
if (cell.isHeaderCell()) {
allNonHeader = false;
break;
}
}
if (allNonHeader) {
return row;
}
}
return Collections.emptyList();
}
}

View File

@ -0,0 +1,109 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.AbstractSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import lombok.Data;
import lombok.EqualsAndHashCode;
@Data
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
// One entry of the table of contents: a headline plus the blocks, images and
// child entries that belong to its section.
public class TableOfContentItem {

// Headline block this entry represents. May be null for the synthetic
// "unassigned" item that collects content appearing before the first headline.
@EqualsAndHashCode.Include
private TextPageBlock headline;
private List<TableOfContentItem> children = new ArrayList<>();
private TableOfContentItem parent;
// Page blocks belonging directly to this entry's section (not to children).
private List<AbstractPageBlock> sectionBlocks = new ArrayList<>();
private List<ClassifiedImage> images = new ArrayList<>();
// Semantic tree node this entry was mapped to, if any.
private AbstractSemanticNode section;

public TableOfContentItem(TextPageBlock headline) {
this.headline = headline;
}

public void addChild(TableOfContentItem tableOfContentItem) {
children.add(tableOfContentItem);
tableOfContentItem.setParent(this);
}

/** Returns the previous sibling, or null if this is the first child or a root. */
public TableOfContentItem getSiblingBefore() {
if (parent != null) {
int index = parent.getChildren().indexOf(this);
if (index > 0) {
return parent.getChildren()
.get(index - 1);
}
}
return null;
}

/** Returns the next sibling, or null if this is the last child or a root. */
public TableOfContentItem getSiblingAfter() {
if (parent != null) {
int index = parent.getChildren().indexOf(this);
if (index >= 0 && index < parent.getChildren().size() - 1) {
return parent.getChildren()
.get(index + 1);
}
}
return null;
}

/** True if this entry or any descendant has the given block as its headline. */
public boolean contains(TextPageBlock block) {
// headline may be null for the synthetic start item - compare null-safely.
if (headline != null && headline.equals(block)) {
return true;
}
for (TableOfContentItem child : children) {
if (child.contains(block)) {
return true;
}
}
return false;
}

/** True if the given item is this entry or one of its descendants. */
public boolean contains(TableOfContentItem tocItem) {
if (this.equals(tocItem)) {
return true;
}
for (TableOfContentItem child : children) {
if (child.contains(tocItem)) {
return true;
}
}
return false;
}

public List<AbstractPageBlock> getNonEmptySectionBlocks() {
return sectionBlocks.stream().filter(pageBlock -> !pageBlock.isEmpty()).collect(Collectors.toList());
}

@Override
public String toString() {
// Fixed copy-paste label: this class is TableOfContentItem, not OutlineObjectTreeNode.
return "TableOfContentItem{" + "headline=" + headline + '}';
}
}

View File

@ -0,0 +1,136 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Stack;
import org.springframework.lang.NonNull;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import lombok.Data;
import lombok.RequiredArgsConstructor;
@Data
@RequiredArgsConstructor
// Tree of ToC entries, iterable depth-first in document (pre-)order.
public class TableOfContents implements Iterable<TableOfContentItem> {

// Top-level entries in document order.
private List<TableOfContentItem> mainSections = new ArrayList<>();

public TableOfContents(List<TableOfContentItem> mainSections) {
this.mainSections = mainSections;
}

/** Returns the headline blocks of all entries, depth-first in document order. */
public List<TextPageBlock> getAllTextPageBlocks() {
List<TextPageBlock> allTextPageBlocks = new ArrayList<>();
for (TableOfContentItem item : mainSections) {
collectTextPageBlocks(item, allTextPageBlocks);
}
return allTextPageBlocks;
}

private void collectTextPageBlocks(TableOfContentItem item, List<TextPageBlock> textPageBlocks) {
textPageBlocks.add(item.getHeadline());
for (TableOfContentItem child : item.getChildren()) {
collectTextPageBlocks(child, textPageBlocks);
}
}

/** Returns every entry of the tree, depth-first in document order. */
public List<TableOfContentItem> getAllTableOfContentItems() {
List<TableOfContentItem> allItems = new ArrayList<>();
for (TableOfContentItem item : mainSections) {
collectTableOfContentItems(item, allItems);
}
return allItems;
}

private void collectTableOfContentItems(TableOfContentItem item, List<TableOfContentItem> allItems) {
allItems.add(item);
for (TableOfContentItem child : item.getChildren()) {
collectTableOfContentItems(child, allItems);
}
}

private boolean containsBlock(TextPageBlock block) {
for (TableOfContentItem existingItem : this.getMainSections()) {
// The headline may be null for the synthetic start item; guard before equals.
if ((existingItem.getHeadline() != null && existingItem.getHeadline().equals(block)) || existingItem.contains(block)) {
return true;
}
}
return false;
}

private boolean containsItem(TableOfContentItem tocItem) {
for (TableOfContentItem existingItem : this.getMainSections()) {
if (existingItem.equals(tocItem) || existingItem.contains(tocItem)) {
return true;
}
}
return false;
}

@Override
public @NonNull Iterator<TableOfContentItem> iterator() {
return new TableOfContentItemIterator(mainSections);
}

// Depth-first, pre-order iterator over the ToC tree.
private static class TableOfContentItemIterator implements Iterator<TableOfContentItem> {

private final Stack<Iterator<TableOfContentItem>> stack = new Stack<>();

TableOfContentItemIterator(List<TableOfContentItem> mainSections) {
stack.push(mainSections.iterator());
}

@Override
public boolean hasNext() {
ensureStackTopIsCurrent();
return !stack.isEmpty() && stack.peek().hasNext();
}

@Override
public TableOfContentItem next() {
ensureStackTopIsCurrent();
if (stack.isEmpty() || !stack.peek().hasNext()) {
// Iterator contract: signal exhaustion explicitly instead of leaking
// Stack's EmptyStackException from peek().
throw new java.util.NoSuchElementException("Table of contents is exhausted");
}
TableOfContentItem currentItem = stack.peek().next();
if (currentItem.getChildren() != null && !currentItem.getChildren().isEmpty()) {
stack.push(currentItem.getChildren()
.iterator());
}
return currentItem;
}

// Drop exhausted iterators so peek() always sees a level with remaining elements.
private void ensureStackTopIsCurrent() {
while (!stack.isEmpty() && !stack.peek().hasNext()) {
stack.pop();
}
}
}
}

View File

@ -68,7 +68,7 @@ public class RedTextPosition extends BoundingBox {
// I guess if we start with the initial user space positions and transform them the same way we do the rulings it would work.
pos.setBBox(new Rectangle2D.Float(textPosition.getX(), textPosition.getY(), textPosition.getWidthDirAdj(), textPosition.getHeight()));
float textHeight = textPosition.getHeight() + HEIGHT_PADDING;
float textHeight = textPosition.getHeight() + 2 * HEIGHT_PADDING;
Rectangle2D.Float dirAdjPosition = new Rectangle2D.Float(textPosition.getXDirAdj(),
textPosition.getYDirAdj() - textHeight,
textPosition.getWidthDirAdj(),

View File

@ -58,6 +58,20 @@ public class TextPageBlock extends AbstractPageBlock {
}
@JsonIgnore
public float getPageHeight() {
return sequences.get(0).getPageHeight();
}
@JsonIgnore
public float getPageWidth() {
return sequences.get(0).getPageWidth();
}
private void calculateBBox() {
if (sequences == null) {
@ -69,6 +83,12 @@ public class TextPageBlock extends AbstractPageBlock {
}
public void recalculateBBox() {
calculateBBox();
}
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
if (textBlocksToMerge.isEmpty()) {

View File

@ -27,8 +27,10 @@ import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@Deprecated
public class SectionsBuilderService {
public void buildSections(ClassificationDocument document) {
List<AbstractPageBlock> chunkWords = new ArrayList<>();

View File

@ -9,7 +9,6 @@ import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedSectionText;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedText;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import lombok.extern.slf4j.Slf4j;
@ -20,7 +19,7 @@ public class SimplifiedSectionTextService {
public SimplifiedText toSimplifiedText(Document document) {
List<SimplifiedSectionText> simplifiedMainSectionsList = document.getMainSections()
List<SimplifiedSectionText> simplifiedMainSectionsList = document.getAllSections()
.stream()
.map(this::toSimplifiedSectionText)
.toList();
@ -43,7 +42,9 @@ public class SimplifiedSectionTextService {
return SimplifiedSectionText.builder()
.sectionNumber(section.getTreeId()
.get(0))
.stream()
.map(String::valueOf)
.collect(Collectors.joining(".")))
.text(section.getTextBlock().getSearchText())
.build();
}

View File

@ -0,0 +1,533 @@
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
import static com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService.buildTextBlock;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.List;
import java.util.ListIterator;
import java.util.Locale;
import java.util.function.Function;
import org.apache.commons.lang3.StringUtils;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentifier;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.Data;
/**
 * Post-processes docstrum blockification results using the PDF outline (bookmarks):
 * for each outline entry on a page it tries to locate the text block(s) carrying the
 * outline title, classifying the match as a headline (and tagging it with
 * {@link LayoutEngine#OUTLINE}), splitting or merging blocks where necessary.
 */
@Service
public class BlockificationPostprocessingService {

    // Max distance (page units) between a block's top edge and the outline destination
    // point before the block is even considered as a candidate (see processTextBlocks).
    private static final float BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD = 5.0f;

    // Computes a block's bounding box in initial user space by transforming every
    // text position of every sequence and folding them into one rectangle.
    // NOTE(review): appears unused inside this class — confirm external need before removing.
    private static final Function<TextPageBlock, Rectangle2D> blockToBoundingBox = (abstractPageBlock) -> abstractPageBlock.getSequences()
            .stream()
            .map(textPositionSequence -> textPositionSequence.getTextPositions()
                    .stream()
                    .map(tp -> SearchTextWithTextPositionFactory.mapRedTextPositionToInitialUserSpace(tp, textPositionSequence))
                    .collect(RectangleTransformations.collectBBox()))
            .collect(RectangleTransformations.collectBBox());

    /**
     * Matches every outline object of the page against the page's text blocks and marks
     * each as found/not found. {@code notFoundOutlineObject} is an entry carried over from
     * a previous page that was not located there; it is retried on this page first, unless
     * its candidate blocks overlap those of this page's first outline entry (in which case
     * the first entry wins).
     *
     * @param classificationPage    page whose blocks and outline objects are processed
     * @param notFoundOutlineObject carry-over outline entry from an earlier page, may be null
     * @return the last outline object of this page (the next page's carry-over candidate),
     *         the unchanged carry-over if this page has no outline objects, or null if the
     *         page has no text blocks or no outline objects at all
     */
    public OutlineObject sanitizeOutlineBlocks(ClassificationPage classificationPage, OutlineObject notFoundOutlineObject) {
        List<OutlineObject> outlineObjects = classificationPage.getOutlineObjects();
        if (getTextPageBlocks(classificationPage).isEmpty() || outlineObjects.isEmpty()) {
            return null;
        }
        float pageHeight = classificationPage.getPageHeight();
        ListIterator<OutlineObject> outlineObjectListIterator = outlineObjects.listIterator();
        if (notFoundOutlineObject != null) {
            OutlineProcessionContext notFoundOutlineObjectProcessionContext = new OutlineProcessionContext(notFoundOutlineObject);
            processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, notFoundOutlineObjectProcessionContext);
            OutlineObject firstOutlineObject = null;
            OutlineProcessionContext firstOutlineObjectProcessionContext = null;
            if (outlineObjectListIterator.hasNext()) {
                firstOutlineObject = outlineObjectListIterator.next();
                firstOutlineObjectProcessionContext = new OutlineProcessionContext(firstOutlineObject);
                processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, firstOutlineObjectProcessionContext);
            }
            // Only resolve the carry-over entry if it does not compete for the same blocks
            // as this page's first outline entry.
            if (!contextsOverlap(notFoundOutlineObjectProcessionContext, firstOutlineObjectProcessionContext)) {
                notFoundOutlineObject.setFound(selectMatch(classificationPage, notFoundOutlineObjectProcessionContext));
            }
            if (firstOutlineObject != null) {
                // re-create the context for the updated blocks
                firstOutlineObjectProcessionContext = new OutlineProcessionContext(firstOutlineObject);
                processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, firstOutlineObjectProcessionContext);
                firstOutlineObject.setFound(selectMatch(classificationPage, firstOutlineObjectProcessionContext));
            }
        }
        // Remaining outline entries (all of them when there was no carry-over).
        outlineObjectListIterator.forEachRemaining(outlineObject -> {
            OutlineProcessionContext outlineObjectProcessionContext = new OutlineProcessionContext(outlineObject);
            processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, outlineObjectProcessionContext);
            outlineObject.setFound(selectMatch(classificationPage, outlineObjectProcessionContext));
        });
        if (!outlineObjects.isEmpty()) {
            return outlineObjects.get(outlineObjects.size() - 1);
        } else {
            return notFoundOutlineObject;
        }
    }

    /** Returns only the TextPageBlock instances of the page, in page order. */
    private static List<TextPageBlock> getTextPageBlocks(ClassificationPage classificationPage) {
        return classificationPage.getTextBlocks()
                .stream()
                .filter(block -> block instanceof TextPageBlock)
                .map(block -> (TextPageBlock) block)
                .toList();
    }

    /**
     * True when the carry-over outline entry and this page's first outline entry would
     * claim overlapping block regions: the first title extends the carry-over title
     * (prefix relation) and some carry-over candidate block lies at or below the lowest
     * candidate block of the first entry (compared via pdfMaxY).
     */
    private boolean contextsOverlap(OutlineProcessionContext notFoundOutlineObjectProcessionContext, OutlineProcessionContext firstOutlineObjectProcessionContext) {
        if (firstOutlineObjectProcessionContext == null) {
            return false;
        }
        String notFoundTitle = notFoundOutlineObjectProcessionContext.getOutlineObject().getTitle();
        String firstTitle = firstOutlineObjectProcessionContext.getOutlineObject().getTitle();
        if (!firstTitle.startsWith(notFoundTitle)) {
            return false;
        }
        var blocksOfNotFoundOutline = getAllMatchingBlocks(notFoundOutlineObjectProcessionContext);
        var blocksOfFirstOutline = getAllMatchingBlocks(firstOutlineObjectProcessionContext);
        double maxYFirst = blocksOfFirstOutline.stream()
                .mapToDouble(TextPageBlock::getPdfMaxY)
                .max()
                .orElse(Double.NEGATIVE_INFINITY);
        return blocksOfNotFoundOutline.stream()
                .mapToDouble(TextPageBlock::getPdfMaxY)
                .anyMatch(y -> y >= maxYFirst);
    }

    /** Collects every candidate block a context gathered: direct match, split candidate and merge candidates. */
    private List<TextPageBlock> getAllMatchingBlocks(OutlineProcessionContext context) {
        List<TextPageBlock> blocks = new ArrayList<>();
        if (context.getDirectMatch() != null) {
            blocks.add(context.getDirectMatch());
        }
        if (context.getSplitCandidate() != null) {
            blocks.add(context.getSplitCandidate());
        }
        blocks.addAll(context.getMergeCandidates());
        return blocks;
    }

    /**
     * Scans the page's blocks for candidates of the context's outline object.
     * First skips all blocks that end above the outline's destination point
     * (the outline point appears to be in bottom-up PDF coordinates while block
     * coordinates are top-down — TODO confirm), then feeds the remaining blocks
     * to {@link #processOutlineForTextBlock} until one of them signals an early stop.
     */
    private void processTextBlocks(List<TextPageBlock> textBlocks, float pageHeight, OutlineProcessionContext context) {
        OutlineObject outlineObject = context.getOutlineObject();
        ListIterator<TextPageBlock> iterator = textBlocks.listIterator();
        while (iterator.hasNext()) {
            TextPageBlock pageBlock = iterator.next();
            if (pageHeight - outlineObject.getPoint().getY() - BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD <= pageBlock.getMaxY()) {
                break;
            }
        }
        // Step back one so the block that triggered the break is processed too.
        if (iterator.hasPrevious()) {
            iterator.previous();
        }
        boolean earlyStop = false;
        while (iterator.hasNext() && !earlyStop) {
            TextPageBlock pageBlock = iterator.next();
            earlyStop = processOutlineForTextBlock(pageBlock, context);
        }
    }

    /**
     * Picks the best candidate (direct match, split candidate or best merge combination)
     * by smallest distance to the outline point, applies the corresponding block
     * modification and classifies the result as a headline of the outline's tree depth.
     *
     * @return true when a match was selected and applied, false when no candidate exists
     */
    private boolean selectMatch(ClassificationPage classificationPage, OutlineProcessionContext context) {
        OutlineObject outlineObject = context.outlineObject;
        TextPageBlock directMatch = context.directMatch;
        List<TextPageBlock> mergeCandidates = context.mergeCandidates;
        TextPageBlock splitCandidate = context.splitCandidate;
        PageBlockType headlineType = PageBlockType.getHeadlineType(outlineObject.getTreeDepth());
        double distanceToDirectMatch = directMatch != null ? calculateDistance(outlineObject, directMatch) : Double.MAX_VALUE;
        double distanceToSplitCandidate = splitCandidate != null ? calculateDistance(outlineObject, splitCandidate) : Double.MAX_VALUE;
        double distanceToBestMergeCandidates = Double.MAX_VALUE;
        List<TextPageBlock> bestMergeCandidateCombination = new ArrayList<>();
        if (!mergeCandidates.isEmpty()) {
            // with this code adjacent blocks to the first and last merge candidate get added, this could be useful for some edge cases:
            //List<TextPageBlock> allMergeCandidates = new ArrayList<>(mergeCandidates);
            //addNeighborsOfCandidate(kdTree, mergeCandidates.get(0), allMergeCandidates);
            //if (mergeCandidates.size() > 1) {
            //    addNeighborsOfCandidate(kdTree, mergeCandidates.get(mergeCandidates.size() - 1), allMergeCandidates);
            //}
            //allMergeCandidates = allMergeCandidates.stream()
            //        .distinct()
            //        .toList();
            // Among all combinations of candidates whose concatenated text equals the title,
            // choose the one with the smallest average distance to the outline point.
            List<List<TextPageBlock>> combinations = findCombinations(outlineObject.getTitle(), mergeCandidates);
            for (List<TextPageBlock> combination : combinations) {
                double averageDistance = combination.stream()
                        .map(block -> calculateDistance(outlineObject, block))
                        .mapToDouble(Double::doubleValue).average()
                        .orElse(Double.MAX_VALUE);
                if (distanceToBestMergeCandidates > averageDistance) {
                    distanceToBestMergeCandidates = averageDistance;
                    bestMergeCandidateCombination = combination;
                }
            }
        }
        double minDistance = Math.min(distanceToDirectMatch, Math.min(distanceToSplitCandidate, distanceToBestMergeCandidates));
        if (minDistance == Double.MAX_VALUE) {
            // No candidate of any kind — outline entry stays "not found".
            return false;
        }
        if (minDistance == distanceToDirectMatch) {
            setClassificationAndAddOutlineEngine(directMatch, headlineType);
        } else if (minDistance == distanceToSplitCandidate) {
            SplitBlockResult splitBlockResult = splitBlock(classificationPage, splitCandidate, context.sectionIdentifier, outlineObject.getTitle());
            if (splitBlockResult.modifiedBlockToSplit) {
                setClassificationAndAddOutlineEngine(splitCandidate, headlineType);
            }
            // Blocks split off around the headline lose any previous classification.
            splitBlockResult.otherBlocks.forEach(other -> other.setClassification(null));
        } else {
            var merged = mergeBlocks(classificationPage, bestMergeCandidateCombination);
            setClassificationAndAddOutlineEngine(merged, headlineType);
        }
        return true;
    }

    /** Marks a block as a headline of the given level and records that the outline engine classified it. */
    private static void setClassificationAndAddOutlineEngine(TextPageBlock block, PageBlockType headlineType) {
        block.setClassification(headlineType);
        block.getEngines().add(LayoutEngine.OUTLINE);
    }

    /**
     * Splits {@code blockToSplit} so that it contains only the headline text; text before
     * and after the headline is moved into new blocks inserted around it on the page.
     * The headline searched for is the outline title, optionally prefixed with the block's
     * section identifier when the title itself lacks one.
     */
    private SplitBlockResult splitBlock(ClassificationPage classificationPage, TextPageBlock blockToSplit, SectionIdentifier sectionIdentifier, String title) {
        List<TextPageBlock> otherBlocks = new ArrayList<>();
        int blockToSplitIdx = classificationPage.getTextBlocks().indexOf(blockToSplit);
        String headline = title;
        if (!sectionIdentifier.getFormat().equals(SectionIdentifier.Format.EMPTY) && !title.startsWith(sectionIdentifier.getIdentifierString())) {
            // NOTE(review): relies on SectionIdentifier.toString(); presumably equals
            // getIdentifierString() — confirm, otherwise the search string is wrong.
            headline = sectionIdentifier + headline;
        }
        WordSequenceResult wordSequenceResult = findWordSequence(blockToSplit.getSequences(), headline);
        if (wordSequenceResult.inSequence.isEmpty() && !headline.equals(title)) {
            // Fall back to the bare title when the identifier-prefixed headline was not found.
            wordSequenceResult = findWordSequence(blockToSplit.getSequences(), title);
        }
        boolean modifiedBlockToSplit = false;
        if (!wordSequenceResult.inSequence.isEmpty()) {
            blockToSplit.setSequences(wordSequenceResult.inSequence);
            blockToSplit.recalculateBBox();
            modifiedBlockToSplit = true;
        }
        if (!wordSequenceResult.preSequence.isEmpty()) {
            TextPageBlock block = buildTextBlock(wordSequenceResult.preSequence, 0);
            classificationPage.getTextBlocks().add(blockToSplitIdx, block);
            otherBlocks.add(block);
            blockToSplitIdx++;
        }
        if (!wordSequenceResult.postSequence.isEmpty()) {
            TextPageBlock block = buildTextBlock(wordSequenceResult.postSequence, 0);
            classificationPage.getTextBlocks().add(blockToSplitIdx + 1, block);
            otherBlocks.add(block);
        }
        return new SplitBlockResult(modifiedBlockToSplit, otherBlocks);
    }

    /**
     * Locates {@code text} inside the given sequences (comparison is whitespace-free and
     * lower-cased, see {@link #sanitizeString}) and partitions the sequences into
     * pre/in/post parts, splitting individual sequences at the boundaries when the match
     * does not align with sequence borders. Returns an empty result when no match is found.
     */
    private static WordSequenceResult findWordSequence(List<TextPositionSequence> textPositionSequences, String text) {
        String target = sanitizeString(text);
        List<TextPositionSequence> inSequence = new ArrayList<>();
        List<TextPositionSequence> preSequence = new ArrayList<>();
        List<TextPositionSequence> postSequence = new ArrayList<>();
        StringBuilder currentSequence = new StringBuilder();
        for (TextPositionSequence sequence : textPositionSequences) {
            currentSequence.append(sanitizeString(sequence.toString()));
            inSequence.add(sequence);
            if (currentSequence.length() >= target.length()) {
                if (currentSequence.toString().endsWith(target)) {
                    // Match ends here: peel leading sequences (and possibly a partial one)
                    // off the front into preSequence until only the target remains.
                    int index = 0;
                    String toRemove = currentSequence.substring(0, currentSequence.length() - target.length());
                    TextPositionSequence next = inSequence.get(index);
                    while (currentSequence.length() - next.length() >= target.length()) {
                        TextPositionSequence removed = inSequence.remove(index);
                        currentSequence.delete(0, removed.toString().length());
                        preSequence.add(removed);
                        next = inSequence.get(index);
                        // NOTE(review): assumes sequence.length() equals its sanitized text length — TODO confirm.
                        toRemove = toRemove.substring(removed.length());
                    }
                    if (!toRemove.isEmpty()) {
                        SplitSequenceResult splitSequenceResult = splitSequence(inSequence.remove(index), toRemove);
                        currentSequence.delete(0, splitSequenceResult.out.length());
                        preSequence.add(splitSequenceResult.out);
                        inSequence.add(index, splitSequenceResult.in);
                    }
                } else if (currentSequence.toString().startsWith(target)) {
                    // Match ends inside the last appended sequence: split its tail off into postSequence.
                    int index = inSequence.size() - 1;
                    String toRemove = currentSequence.substring(target.length());
                    SplitSequenceResult splitSequenceResult = splitSequence(inSequence.remove(index), toRemove);
                    currentSequence.delete(currentSequence.length() - splitSequenceResult.out.length(), currentSequence.length());
                    inSequence.add(index, splitSequenceResult.in);
                    postSequence.add(splitSequenceResult.out);
                }
                if (currentSequence.toString().equals(target)) {
                    // Exact match: everything after the current sequence goes to postSequence.
                    postSequence.addAll(textPositionSequences.subList(textPositionSequences.indexOf(sequence) + 1, textPositionSequences.size()));
                    return new WordSequenceResult(inSequence, preSequence, postSequence);
                }
            }
        }
        return new WordSequenceResult();
    }

    /**
     * Splits a sequence into the part matching {@code toRemove} ("out") and the rest ("in").
     * NOTE(review): maps string indices directly onto text-position indices, i.e. assumes
     * one character per TextPosition and that toRemove occurs in the raw (unsanitized)
     * sequence text — indexOf returning -1 would throw — TODO confirm.
     */
    private static SplitSequenceResult splitSequence(TextPositionSequence sequence, String toRemove) {
        TextPositionSequence in = null;
        TextPositionSequence out;
        String currentSequence = sequence.toString();
        int index = currentSequence.indexOf(toRemove);
        int endIndex = index + toRemove.length();
        out = createSubSequence(sequence, index, endIndex);
        if (index > 0) {
            in = createSubSequence(sequence, 0, index);
        } else if (endIndex < sequence.getTextPositions().size()) {
            in = createSubSequence(sequence, endIndex, sequence.getTextPositions().size());
        }
        return new SplitSequenceResult(in, out);
    }

    /** Builds a new sequence from a copied slice of text positions, preserving the paragraph-start flag. */
    private static TextPositionSequence createSubSequence(TextPositionSequence sequence, int start, int end) {
        TextPositionSequence newSeq = new TextPositionSequence(new ArrayList<>(sequence.getTextPositions().subList(start, end)), sequence.getPage());
        newSeq.setParagraphStart(sequence.isParagraphStart());
        return newSeq;
    }

    /**
     * Merges the given blocks into the first one (only blocks with the same text direction
     * are absorbed), removes the absorbed blocks from the page and returns the merged block.
     */
    private TextPageBlock mergeBlocks(ClassificationPage classificationPage, List<TextPageBlock> blocksToMerge) {
        TextPageBlock firstBlock = blocksToMerge.get(0);
        if (blocksToMerge.size() > 1) {
            List<TextPageBlock> mergedBlocks = new ArrayList<>();
            for (TextPageBlock textPageBlock : blocksToMerge.subList(1, blocksToMerge.size())) {
                if (firstBlock != null && !firstBlock.getSequences().isEmpty()) {
                    if (textPageBlock.getDir() == firstBlock.getDir()) {
                        firstBlock.getSequences().addAll(textPageBlock.getSequences());
                        mergedBlocks.add(textPageBlock);
                    }
                }
            }
            assert firstBlock != null;
            firstBlock.setToDuplicate(false);
            firstBlock.recalculateBBox();
            classificationPage.getTextBlocks().removeAll(mergedBlocks);
        }
        return firstBlock;
    }

    /** Finds all ordered combinations of blocks whose concatenated (sanitized) text equals the title. */
    private static List<List<TextPageBlock>> findCombinations(String title, List<TextPageBlock> blocks) {
        List<List<TextPageBlock>> combinations = new ArrayList<>();
        findCombinations(title, blocks, new ArrayList<>(), combinations);
        return combinations;
    }

    /**
     * Recursive backtracking helper: peels blocks whose text is a prefix of the remaining
     * title off the front; a combination is complete when the remaining title is empty.
     * Only blocks after the chosen one (page order) are considered for the remainder.
     */
    private static void findCombinations(String title, List<TextPageBlock> blocks, List<TextPageBlock> current, List<List<TextPageBlock>> combinations) {
        String target = sanitizeString(title);
        if (target.isEmpty()) {
            combinations.add(new ArrayList<>(current));
            return;
        }
        List<TextPageBlock> remaining = blocks.stream()
                .filter(block -> !current.contains(block))
                .toList();
        for (TextPageBlock block : remaining) {
            String prefix = sanitizeString(block.getText());
            if (target.startsWith(prefix)) {
                current.add(block);
                findCombinations(target.substring(prefix.length()), blocks.subList(blocks.indexOf(block) + 1, blocks.size()), current, combinations);
                current.remove(current.size() - 1);
            }
        }
    }

    /**
     * Euclidean distance between the outline destination point and the block's top-left
     * corner; the outline y is flipped via the page height to match the block's
     * coordinate system (see the coordinate note on processTextBlocks).
     */
    private double calculateDistance(OutlineObject outlineObject, TextPageBlock pageBlock) {
        double deltaX = outlineObject.getPoint().getX() - pageBlock.getMinX();
        double deltaY = pageBlock.getPageHeight() - outlineObject.getPoint().getY() - pageBlock.getMinY();
        return Math.sqrt(deltaX * deltaX + deltaY * deltaY);
    }

    // currently only three cases are handled here:
    // 1. equality
    // 2. outline title contains block text
    // 3. block text contains outline title
    // another possible case is an intersection, meaning a title is split up between two different blocks
    // this should not happen with how docstrum creates the blocks
    // if it is indeed necessary, a splitting has to be done with a follow-up merge
    /**
     * Classifies a single block relative to the context's outline title and records it as
     * direct match, merge candidate or split candidate.
     *
     * @return true to stop scanning further blocks (a direct match was found)
     */
    private boolean processOutlineForTextBlock(TextPageBlock pageBlock, OutlineProcessionContext context) {
        OutlineObject outlineObject = context.getOutlineObject();
        String blockText = sanitizeString(pageBlock.getText());
        String outlineTitle = sanitizeString(outlineObject.getTitle());
        boolean blockTextContainsOutlineTitle = blockText.contains(outlineTitle);
        boolean outlineTitleContainsBlockText = outlineTitle.contains(blockText);
        if (!blockTextContainsOutlineTitle && !outlineTitleContainsBlockText) {
            return false;
        }
        if (blockText.equals(outlineTitle) && context.directMatch == null) {
            context.directMatch = pageBlock;
            return true;
        }
        if (outlineTitleContainsBlockText) {
            context.mergeCandidates.add(pageBlock);
        }
        if (blockTextContainsOutlineTitle) {
            SectionIdentifier sectionIdentifier = SectionIdentifier.fromSearchText(blockText);
            if (sectionIdentifier.getFormat() != SectionIdentifier.Format.EMPTY && !outlineTitle.startsWith(sectionIdentifier.getIdentifierString())) {
                // Block carries a section identifier the title lacks, e.g. "1.2 <title>":
                // identifier + title is still considered a direct match.
                if (blockText.startsWith(sectionIdentifier.getIdentifierString()) && blockText.endsWith(outlineTitle) && context.directMatch == null) {
                    context.directMatch = pageBlock;
                    return true;
                } else if (context.splitCandidate == null) {
                    context.sectionIdentifier = sectionIdentifier;
                }
            }
            if (context.splitCandidate == null) {
                context.splitCandidate = pageBlock;
            }
        }
        return false;
    }

    /** Normalizes text for matching: strips all whitespace and lower-cases (ROOT locale). */
    private static String sanitizeString(String text) {
        return StringUtils.deleteWhitespace(text).toLowerCase(Locale.ROOT);
    }

    /**
     * Mutable per-outline-object state collected while scanning a page's blocks:
     * the best direct match, partial-text merge candidates, a split candidate and
     * the section identifier detected on the split candidate.
     */
    @Data
    private static class OutlineProcessionContext {

        private TextPageBlock directMatch;
        private OutlineObject outlineObject;
        private List<TextPageBlock> mergeCandidates;
        private TextPageBlock splitCandidate;
        private SectionIdentifier sectionIdentifier;

        OutlineProcessionContext(OutlineObject outlineObject) {
            this.outlineObject = outlineObject;
            this.directMatch = null;
            this.mergeCandidates = new ArrayList<>();
            this.splitCandidate = null;
            this.sectionIdentifier = SectionIdentifier.empty();
        }
    }

    /**
     * Result of {@link #findWordSequence}: sequences before the match, the match itself,
     * and sequences after it. All lists are empty when nothing was found.
     */
    public static class WordSequenceResult {

        public List<TextPositionSequence> inSequence;
        public List<TextPositionSequence> preSequence;
        public List<TextPositionSequence> postSequence;

        public WordSequenceResult(List<TextPositionSequence> inSequence, List<TextPositionSequence> preSequence, List<TextPositionSequence> postSequence) {
            this.inSequence = inSequence;
            this.preSequence = preSequence;
            this.postSequence = postSequence;
        }

        public WordSequenceResult() {
            this.inSequence = new ArrayList<>();
            this.preSequence = new ArrayList<>();
            this.postSequence = new ArrayList<>();
        }
    }

    /** Whether the split modified the original block, plus the blocks split off around it. */
    public record SplitBlockResult(boolean modifiedBlockToSplit, List<TextPageBlock> otherBlocks) {
    }

    /** "in" keeps the remainder of a split sequence (may be null), "out" is the removed part. */
    public record SplitSequenceResult(TextPositionSequence in, TextPositionSequence out) {
    }
}

View File

@ -26,6 +26,7 @@ import lombok.RequiredArgsConstructor;
@RequiredArgsConstructor
public class DocstrumBlockificationService {
public static final float Y_THRESHOLD = 5f;
private final DocstrumSegmentationService docstrumSegmentationService;
static final float THRESHOLD = 1f;
@ -58,8 +59,10 @@ public class DocstrumBlockificationService {
mergeIntersectingBlocks(classificationPage, usedRulings, 0, 0);
if (layoutParsingType == LayoutParsingType.DOCUMINE || layoutParsingType == LayoutParsingType.REDACT_MANAGER) {
combineBlocks(classificationPage);
if (layoutParsingType == LayoutParsingType.DOCUMINE
|| layoutParsingType == LayoutParsingType.REDACT_MANAGER
|| layoutParsingType == LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH) {
combineBlocks(classificationPage, layoutParsingType);
}
if (layoutParsingType == LayoutParsingType.CLARIFYND) {
@ -105,7 +108,7 @@ public class DocstrumBlockificationService {
}
public void combineBlocks(ClassificationPage page) {
public void combineBlocks(ClassificationPage page, LayoutParsingType layoutParsingType) {
TextPageBlock previous = new TextPageBlock();
ListIterator<AbstractPageBlock> itty = page.getTextBlocks().listIterator();
@ -126,8 +129,19 @@ public class DocstrumBlockificationService {
continue;
}
if (current.isHeadline() || previous.isHeadline()) {
if (intersectsYWithPreviousHavingMaxOneLine(previous, current, page)) {
previous = combineBlocksAndResetIterator(previous, current, itty, false);
} else {
previous = current;
}
continue;
}
if (areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(previous, current, page)) {
previous = combineBlocksAndResetIterator(previous, current, itty, true);
// previous = combineBlocksAndResetIterator(previous, current, itty, true);
previous = combineBlocksAndResetIterator(previous, current, itty, layoutParsingType != LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH);
continue;
}
@ -150,7 +164,7 @@ public class DocstrumBlockificationService {
previous = current;
}
mergeIntersectingBlocks(page, usedRulings, 0, 6.5f);
mergeIntersectingBlocks(page, usedRulings, 0, Y_THRESHOLD);
}
@ -172,6 +186,12 @@ public class DocstrumBlockificationService {
}
private boolean intersectsYWithPreviousHavingMaxOneLine(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {
return previous.intersectsY(current) && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1);
}
private boolean areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {
return previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 //
@ -185,6 +205,9 @@ public class DocstrumBlockificationService {
previous.getSequences().addAll(current.getSequences());
previous = buildTextBlock(previous.getSequences(), 0);
previous.setToDuplicate(toDuplicate);
if (current.getClassification() != null && previous.getClassification() == null) {
previous.setClassification(current.getClassification());
}
itty.remove();
itty.previous();
itty.set(previous);
@ -244,21 +267,30 @@ public class DocstrumBlockificationService {
continue;
}
if (block.getClassification() != null && block.getClassification().isHeadline()) {
continue;
}
TextPageBlock current = (TextPageBlock) block;
for (int i = 0; i < blocks.size(); i++) {
if (blocks.get(i) == null) {
AbstractPageBlock abstractPageBlock = blocks.get(i);
if (abstractPageBlock == null) {
continue;
}
if (blocks.get(i) == current) {
if (abstractPageBlock == current) {
continue;
}
if (blocks.get(i) instanceof TablePageBlock) {
if (abstractPageBlock instanceof TablePageBlock) {
continue;
}
TextPageBlock inner = (TextPageBlock) blocks.get(i);
if (abstractPageBlock.getClassification() != null && abstractPageBlock.getClassification().isHeadline()) {
continue;
}
TextPageBlock inner = (TextPageBlock) abstractPageBlock;
if (usedRulings.lineBetween(current, blocks.get(i))) {
continue;
@ -285,7 +317,7 @@ public class DocstrumBlockificationService {
}
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
public static TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
return new TextPageBlock(wordBlockList);
}

View File

@ -27,26 +27,32 @@ public class ClarifyndClassificationService {
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
HeadlineClassificationService headlineClassificationService = new HeadlineClassificationService();
for (ClassificationPage page : document.getPages()) {
classifyPage(page, document, headlineFontSizes);
classifyPage(headlineClassificationService, page, document, headlineFontSizes);
}
}
private void classifyPage(ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
private void classifyPage(HeadlineClassificationService headlineClassificationService, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
for (AbstractPageBlock textBlock : page.getTextBlocks()) {
if (textBlock instanceof TextPageBlock) {
classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes);
classifyBlock(headlineClassificationService, (TextPageBlock) textBlock, page, document, headlineFontSizes);
}
}
}
private void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
private void classifyBlock(HeadlineClassificationService headlineClassificationService, TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
var bodyTextFrame = page.getBodyTextFrame();
if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
return;
}
if (document.getFontSizeCounter().getMostPopular() == null) {
textBlock.setClassification(PageBlockType.PARAGRAPH);
return;
@ -79,7 +85,8 @@ public class ClarifyndClassificationService {
for (int i = 1; i <= headlineFontSizes.size(); i++) {
if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
textBlock.setClassification(PageBlockType.getHeadlineType(i));
PageBlockType headlineType = PageBlockType.getHeadlineType(i);
headlineClassificationService.classifyHeadline(textBlock, headlineType);
document.setHeadlines(true);
}
}
@ -89,7 +96,8 @@ public class ClarifyndClassificationService {
.getTextPositions()
.get(0)
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1));
PageBlockType headlineType = PageBlockType.getHeadlineType(headlineFontSizes.size() + 1);
headlineClassificationService.classifyHeadline(textBlock, headlineType);
document.setHeadlines(true);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {

View File

@ -24,9 +24,9 @@ import lombok.extern.slf4j.Slf4j;
@RequiredArgsConstructor
public class DocuMineClassificationService {
private static final Pattern pattern = Pattern.compile("^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
private static final Pattern pattern2 = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
private static final Pattern pattern3 = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
private static final Pattern HEADLINE_WITH_IDENTIFER_PATTERN = Pattern.compile("^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
private static final Pattern AT_LEAST_3_PATTERN = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
private static final Pattern HEADLINE_PATTTERN_WITH_SLASHES = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
public void classifyDocument(ClassificationDocument document) {
@ -35,44 +35,52 @@ public class DocuMineClassificationService {
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
HeadlineClassificationService headlineClassificationService = new HeadlineClassificationService();
for (ClassificationPage page : document.getPages()) {
classifyPage(page, document, headlineFontSizes);
classifyPage(headlineClassificationService, page, document, headlineFontSizes);
}
}
private void classifyPage(ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
private void classifyPage(HeadlineClassificationService headlineClassificationService,
ClassificationPage page,
ClassificationDocument document,
List<Float> headlineFontSizes) {
for (AbstractPageBlock textBlock : page.getTextBlocks()) {
if (textBlock instanceof TextPageBlock) {
classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes);
classifyBlock(headlineClassificationService, (TextPageBlock) textBlock, page, document, headlineFontSizes);
}
}
}
private void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
private void classifyBlock(HeadlineClassificationService headlineClassificationService,
TextPageBlock textBlock,
ClassificationPage page,
ClassificationDocument document,
List<Float> headlineFontSizes) {
log.debug("headlineFontSizes: {}", headlineFontSizes);
var bodyTextFrame = page.getBodyTextFrame();
Matcher matcher = pattern.matcher(textBlock.toString());
Matcher matcher2 = pattern2.matcher(textBlock.toString());
Matcher matcher3 = pattern3.matcher(textBlock.toString());
Matcher headlineWithIdentifierMatcher = HEADLINE_WITH_IDENTIFER_PATTERN.matcher(textBlock.toString());
Matcher atLeast3Matcher = AT_LEAST_3_PATTERN.matcher(textBlock.toString());
Matcher headlineWithSlashesMatcher = HEADLINE_PATTTERN_WITH_SLASHES.matcher(textBlock.toString());
if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
return;
}
if (document.getFontSizeCounter().getMostPopular() == null) {
textBlock.setClassification(PageBlockType.OTHER);
return;
}
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|| (PositionUtils.isOverBodyTextFrame(bodyTextFrame,
textBlock,
page.getRotation()) && (document.getFontSizeCounter().getMostPopular()
== null
|| textBlock.getHighestFontSize()
<= document.getFontSizeCounter()
.getMostPopular()))
|| HeaderFooterDetection.isLikelyHeader(textBlock, document, page)) {
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) //
|| (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) //
&& (document.getFontSizeCounter().getMostPopular() == null //
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular()))) {
textBlock.setClassification(PageBlockType.HEADER);
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
@ -95,19 +103,28 @@ public class DocuMineClassificationService {
&& (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter().getMostPopular()
|| textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular())
&& PositionUtils.getApproxLineCount(textBlock) < 5.9
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && matcher2.reset().find() && !textBlock.toString()
.contains(":")
|| textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && matcher2.reset().find() && !textBlock.toString().contains(":")
|| textBlock.toString().startsWith("APPENDIX")
|| textBlock.toString().startsWith("FIGURE")
&& (textBlock.getMostPopularWordStyle().contains("bold")
&& Character.isDigit(textBlock.toString().charAt(0))
&& atLeast3Matcher.reset().find()
&& !textBlock.toString().contains(":") //
|| textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && atLeast3Matcher.reset().find() && !textBlock.toString().contains(":") //
|| textBlock.toString().startsWith("APPENDIX") //
|| textBlock.toString().startsWith("FIGURE") //
|| textBlock.toString().startsWith("Continued TABLE") //
|| textBlock.toString().startsWith("TABLE"))
&& !textBlock.toString().endsWith(":")
&& matcher2.reset().find()) {
textBlock.setClassification(PageBlockType.getHeadlineType(1));
&& atLeast3Matcher.reset().find()) {
PageBlockType headlineType = PageBlockType.getHeadlineType(1);
headlineClassificationService.classifyHeadline(textBlock, headlineType);
document.setHeadlines(true);
} else if (matcher.reset().find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.reset().find() && !matcher3.reset().matches()) {
textBlock.setClassification(PageBlockType.getHeadlineType(2));
} else if (headlineWithIdentifierMatcher.reset().find()
&& PositionUtils.getApproxLineCount(textBlock) < 2.9
&& atLeast3Matcher.reset().find()
&& !headlineWithSlashesMatcher.reset().matches()) {
PageBlockType headlineType = PageBlockType.getHeadlineType(2);
headlineClassificationService.classifyHeadline(textBlock, headlineType);
document.setHeadlines(true);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()

View File

@ -0,0 +1,52 @@
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.getHeadlineNumber;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import lombok.Getter;
import lombok.Setter;
@Getter
@Setter
public class HeadlineClassificationService {

    // Most recently classified headline block (from either the outline or classification).
    TextPageBlock lastHeadline;

    // Headline level the classifier originally proposed for the previous headline,
    // before any outline-based adjustment was applied.
    PageBlockType originalClassifiedBlockType;

    // Most recent headline that was taken directly from the document outline.
    TextPageBlock lastHeadlineFromOutline;

    /**
     * Records a headline that originates from the document outline.
     * The outline headline also becomes the last seen headline overall.
     *
     * @param lastHeadlineFromOutline the outline-derived headline block
     */
    public void setLastHeadlineFromOutline(TextPageBlock lastHeadlineFromOutline) {
        this.lastHeadlineFromOutline = lastHeadlineFromOutline;
        this.setLastHeadline(lastHeadlineFromOutline);
    }

    /**
     * Assigns a headline classification to the given block, adjusting the proposed
     * level so it stays consistent with the previously seen headline.
     *
     * <p>If the previous headline came from the outline, the new headline is nested
     * one level below it. Otherwise, if the previous headline's level was shifted
     * away from what the classifier originally proposed, the same shift is applied
     * here so sibling headlines keep their relative depth.
     *
     * @param textBlock           the block to classify as a headline
     * @param initialHeadlineType the headline level proposed by the classifier
     */
    public void classifyHeadline(TextPageBlock textBlock, PageBlockType initialHeadlineType) {

        PageBlockType resolvedType = initialHeadlineType;

        if (lastHeadline != null) {
            if (lastHeadline.equals(lastHeadlineFromOutline)) {
                // Previous headline came from the outline: nest this one directly beneath it.
                resolvedType = PageBlockType.getHeadlineType(getHeadlineNumber(lastHeadline.getClassification()) + 1);
            } else if (originalClassifiedBlockType != null && lastHeadline.getClassification() != originalClassifiedBlockType) {
                // Previous headline was re-leveled relative to the classifier's proposal;
                // apply the same level shift to keep relative headline depth consistent.
                int levelShift = getHeadlineNumber(originalClassifiedBlockType) - getHeadlineNumber(lastHeadline.getClassification());
                resolvedType = PageBlockType.getHeadlineType(getHeadlineNumber(initialHeadlineType) - levelShift);
            }
        }

        // Remember the unadjusted proposal for the next headline's shift calculation.
        setOriginalClassifiedBlockType(initialHeadlineType);
        textBlock.setClassification(resolvedType);
        setLastHeadline(textBlock);
    }
}

View File

@ -22,32 +22,39 @@ import lombok.extern.slf4j.Slf4j;
@RequiredArgsConstructor
public class RedactManagerClassificationService {
public void classifyDocument(ClassificationDocument document) {
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
HeadlineClassificationService headlineClassificationService = new HeadlineClassificationService();
for (ClassificationPage page : document.getPages()) {
classifyPage(page, document, headlineFontSizes);
classifyPage(headlineClassificationService, page, document, headlineFontSizes);
}
}
private void classifyPage(ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
private void classifyPage(HeadlineClassificationService headlineClassificationService, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
for (AbstractPageBlock textBlock : page.getTextBlocks()) {
if (textBlock instanceof TextPageBlock) {
classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes);
classifyBlock(headlineClassificationService, (TextPageBlock) textBlock, page, document, headlineFontSizes);
}
}
}
private void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
private void classifyBlock(HeadlineClassificationService headlineClassificationService, TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
var bodyTextFrame = page.getBodyTextFrame();
if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
return;
}
if (document.getFontSizeCounter().getMostPopular() == null) {
textBlock.setClassification(PageBlockType.OTHER);
return;
@ -60,58 +67,64 @@ public class RedactManagerClassificationService {
textBlock.setClassification(PageBlockType.PARAGRAPH);
return;
}
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || PositionUtils.isOverBodyTextFrame(bodyTextFrame,
textBlock,
page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
.getMostPopular())) {
textBlock.setClassification(PageBlockType.HEADER);
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) || PositionUtils.isUnderBodyTextFrame(bodyTextFrame,
textBlock,
page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
.getMostPopular())) {
textBlock.setClassification(PageBlockType.FOOTER);
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
.size() == 1)) {
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
textBlock.setClassification(PageBlockType.TITLE);
}
} else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter()
.getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 && (textBlock.getMostPopularWordStyle().equals("bold") || !document.getFontStyleCounter()
.getCountPerValue()
.containsKey("bold") && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1) && textBlock.getSequences()
.get(0)
.getTextPositions()
.get(0)
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
} else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()
&& PositionUtils.getApproxLineCount(textBlock) < 4.9
&& (textBlock.getMostPopularWordStyle().equals("bold")
|| !document.getFontStyleCounter().getCountPerValue().containsKey("bold")
&& textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1)
&& textBlock.getSequences()
.get(0).getTextPositions()
.get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
for (int i = 1; i <= headlineFontSizes.size(); i++) {
if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
textBlock.setClassification(PageBlockType.getHeadlineType(i));
PageBlockType headlineType = PageBlockType.getHeadlineType(i);
headlineClassificationService.classifyHeadline(textBlock, headlineType);
document.setHeadlines(true);
}
}
} else if (!textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordStyle()
.equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 && textBlock.getSequences()
.get(0)
.getTextPositions()
.get(0)
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1));
} else if (!textBlock.getText().startsWith("Figure ")
&& PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
&& textBlock.getMostPopularWordStyle().equals("bold")
&& !document.getFontStyleCounter().getMostPopular().equals("bold")
&& PositionUtils.getApproxLineCount(textBlock) < 2.9
&& textBlock.getSequences()
.get(0).getTextPositions()
.get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
PageBlockType headlineType = PageBlockType.getHeadlineType(headlineFontSizes.size() + 1);
headlineClassificationService.classifyHeadline(textBlock, headlineType);
document.setHeadlines(true);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
&& textBlock.getMostPopularWordStyle().equals("bold")
&& !document.getFontStyleCounter().getMostPopular().equals("bold")) {
textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont()
.equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle()
.equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
&& textBlock.getMostPopularWordFont().equals(document.getFontCounter().getMostPopular())
&& textBlock.getMostPopularWordStyle().equals(document.getFontStyleCounter().getMostPopular())
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
textBlock.setClassification(PageBlockType.PARAGRAPH);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter()
.getMostPopular()
.equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
&& textBlock.getMostPopularWordStyle().equals("italic")
&& !document.getFontStyleCounter().getMostPopular().equals("italic")
&& PositionUtils.getApproxLineCount(textBlock) < 2.9) {
textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);

View File

@ -11,6 +11,7 @@ import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
@ -30,9 +31,10 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.He
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.AbstractSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContentItem;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
@ -59,11 +61,6 @@ public class DocumentGraphFactory {
document.getPages()
.forEach(context::buildAndAddPageWithCounter);
document.getSections()
.stream()
.flatMap(section -> section.getImages()
.stream())
.forEach(image -> context.getImages().add(image));
addSections(layoutParsingType, document, context, documentGraph);
addHeaderAndFooterToEachPage(document, context);
@ -77,8 +74,17 @@ public class DocumentGraphFactory {
private void addSections(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument, Context context, Document document) {
classificationDocument.getSections()
.forEach(section -> SectionNodeFactory.addSection(layoutParsingType, null, section.getNonEmptyPageBlocks(), section.getImages(), context, document));
for (TableOfContentItem tocItem : classificationDocument.getTableOfContents()) {
var parent = tocItem.getParent() == null ? null : tocItem.getParent().getSection();
Optional<AbstractSemanticNode> section = SectionNodeFactory.addSection(layoutParsingType,
parent,
tocItem.getChildren().isEmpty(),
tocItem.getNonEmptySectionBlocks(),
tocItem.getImages(),
context,
document);
tocItem.setSection(section.orElse(null));
}
}
@ -118,6 +124,7 @@ public class DocumentGraphFactory {
List<Integer> treeId = context.documentTree.createNewChildEntryAndReturnId(parentNode, node);
node.setLeafTextBlock(textBlock);
node.setTreeId(treeId);
node.getEngines().addAll(originalTextBlock.getEngines());
}
@ -184,10 +191,7 @@ public class DocumentGraphFactory {
Page page = context.getPage(textBlocks.get(0).getPage());
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks),
footer,
context,
page);
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), footer, context, page);
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
footer.setTreeId(tocId);
footer.setLeafTextBlock(textBlock);
@ -239,7 +243,7 @@ public class DocumentGraphFactory {
DocumentTree documentTree;
Map<Page, Integer> pages;
List<Section> sections;
List<AbstractSemanticNode> sections;
List<ClassifiedImage> images;
TextBlockFactory textBlockFactory;

View File

@ -9,14 +9,17 @@ import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.AbstractSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
@ -27,12 +30,13 @@ import lombok.experimental.UtilityClass;
@UtilityClass
public class SectionNodeFactory {
public void addSection(LayoutParsingType layoutParsingType,
GenericSemanticNode parentNode,
List<AbstractPageBlock> pageBlocks,
List<ClassifiedImage> images,
DocumentGraphFactory.Context context,
Document document) {
public Optional<AbstractSemanticNode> addSection(LayoutParsingType layoutParsingType,
GenericSemanticNode parentNode,
boolean isLeaf,
List<AbstractPageBlock> pageBlocks,
List<ClassifiedImage> images,
DocumentGraphFactory.Context context,
Document document) {
// This is for the case where we have images on a page without any text/footer/header.
// The pageBlocks list is empty, but we still need to add those images to the document.
@ -40,16 +44,23 @@ public class SectionNodeFactory {
images.stream()
.distinct()
.forEach(image -> DocumentGraphFactory.addImage(document, image, context));
return;
return Optional.empty();
}
if (pageBlocks.isEmpty()) {
return;
return Optional.empty();
}
Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream()
.collect(groupingBy(AbstractPageBlock::getPage));
Section section = Section.builder().documentTree(context.getDocumentTree()).build();
AbstractSemanticNode section;
boolean containsTablesAndTextBlocks = containsTablesAndTextBlocks(pageBlocks);
if (isLeaf && !containsTablesAndTextBlocks) {
section = Section.builder().documentTree(context.getDocumentTree()).build();
} else {
section = SuperSection.builder().documentTree(context.getDocumentTree()).build();
}
context.getSections().add(section);
blocksPerPage.keySet()
@ -58,13 +69,16 @@ public class SectionNodeFactory {
section.setTreeId(getTreeId(parentNode, context, section));
addFirstHeadlineDirectlyToSection(layoutParsingType, pageBlocks, context, section, document);
if (containsTablesAndTextBlocks(pageBlocks)) {
if (containsTablesAndTextBlocks) {
splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(layoutParsingType,
section,
true,
subSectionPageBlocks,
emptyList(),
context,
document));
} else if (!isLeaf) {
addSection(layoutParsingType, section, true, pageBlocks, emptyList(), context, document);
} else {
addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, pageBlocks, context, section, document);
}
@ -72,10 +86,12 @@ public class SectionNodeFactory {
images.stream()
.distinct()
.forEach(image -> DocumentGraphFactory.addImage(section, image, context));
return Optional.of(section);
}
private List<Integer> getTreeId(GenericSemanticNode parentNode, DocumentGraphFactory.Context context, Section section) {
private List<Integer> getTreeId(GenericSemanticNode parentNode, DocumentGraphFactory.Context context, AbstractSemanticNode section) {
if (parentNode == null) {
return context.getDocumentTree().createNewMainEntryAndReturnId(section);
@ -88,7 +104,7 @@ public class SectionNodeFactory {
private void addFirstHeadlineDirectlyToSection(LayoutParsingType layoutParsingType,
List<AbstractPageBlock> pageBlocks,
DocumentGraphFactory.Context context,
Section section,
AbstractSemanticNode section,
Document document) {
if (pageBlocks.get(0).isHeadline()) {
@ -101,7 +117,7 @@ public class SectionNodeFactory {
private void addTablesAndParagraphsAndHeadlinesToSection(LayoutParsingType layoutParsingType,
List<AbstractPageBlock> pageBlocks,
DocumentGraphFactory.Context context,
Section section,
AbstractSemanticNode section,
Document document) {
Set<AbstractPageBlock> alreadyMerged = new HashSet<>();
@ -117,7 +133,7 @@ public class SectionNodeFactory {
if (abstractPageBlock instanceof TextPageBlock) {
switch (layoutParsingType) {
case REDACT_MANAGER, DOCUMINE, CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> {
case REDACT_MANAGER, DOCUMINE, CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> {
alreadyMerged.add(abstractPageBlock);
remainingBlocks.remove(abstractPageBlock);
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, new ArrayList<>(), layoutParsingType);
@ -226,7 +242,7 @@ public class SectionNodeFactory {
}
private void addSectionNodeToPageNode(DocumentGraphFactory.Context context, Section section, Integer pageNumber) {
private void addSectionNodeToPageNode(DocumentGraphFactory.Context context, AbstractSemanticNode section, Integer pageNumber) {
Page page = context.getPage(pageNumber);
page.getMainBody().add(section);

View File

@ -154,10 +154,11 @@ public class TableNodeFactory {
} else if (firstTextBlockIsHeadline(cell)) {
SectionNodeFactory.addSection(layoutParsingType,
tableCell,
true,
cell.getTextBlocks()
.stream()
.map(tb -> (AbstractPageBlock) tb)
.toList(),
.collect(Collectors.toList()),
emptyList(),
context,
document);

View File

@ -23,6 +23,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Pa
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
@ -62,6 +63,7 @@ public class DocumentGraphMapper {
SemanticNode node = switch (entryData.getType()) {
case SECTION -> buildSection(context);
case SUPER_SECTION -> buildSuperSection(context);
case PARAGRAPH -> buildParagraph(context, entryData.getProperties());
case HEADLINE -> buildHeadline(context);
case HEADER -> buildHeader(context);
@ -109,7 +111,7 @@ public class DocumentGraphMapper {
private TableCell buildTableCell(Context context, Map<String, String> properties) {
TableCell.TableCellBuilder builder = TableCell.builder();
TableCell.TableCellBuilder<?, ?> builder = TableCell.builder();
PropertiesMapper.parseTableCellProperties(properties, builder);
return builder.documentTree(context.documentTree).build();
}
@ -140,6 +142,11 @@ public class DocumentGraphMapper {
return Section.builder().documentTree(context.documentTree).build();
}
private SuperSection buildSuperSection(Context context) {
return SuperSection.builder().documentTree(context.documentTree).build();
}
private Paragraph buildParagraph(Context context, Map<String, String> properties) {

View File

@ -9,6 +9,7 @@ import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
@ -23,6 +24,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Do
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
@ -62,6 +64,12 @@ public class LayoutGridService {
static Color HEADER_COLOR = new Color(171, 131, 6);
static Color IMAGE_COLOR = new Color(253, 63, 146);
private record RectangleIdentifier(List<Integer> treeId, Integer pageNumber) {
}
HashMap<RectangleIdentifier, Rectangle2D> rectangleMap = new HashMap<>();
@SneakyThrows
@Observed(name = "ViewerDocumentService", contextualName = "create-viewer-document")
@ -105,7 +113,7 @@ public class LayoutGridService {
Color color = switch (semanticNode.getType()) {
case PARAGRAPH -> PARAGRAPH_COLOR;
case TABLE -> TABLE_COLOR;
case SECTION -> SECTION_COLOR;
case SECTION, SUPER_SECTION -> SECTION_COLOR;
case HEADLINE -> HEADLINE_COLOR;
case HEADER, FOOTER -> HEADER_COLOR;
case IMAGE -> IMAGE_COLOR;
@ -119,7 +127,7 @@ public class LayoutGridService {
if (isNotSectionOrTableCellOrDocument(semanticNode)) {
addAsRectangle(semanticNode, layoutGrid, color);
}
if (semanticNode.getType().equals(NodeType.SECTION)) {
if (semanticNode.getType().equals(NodeType.SECTION) || semanticNode.getType().equals(NodeType.SUPER_SECTION)) {
addSection(semanticNode, layoutGrid, color);
}
if (semanticNode.getType().equals(NodeType.TABLE)) {
@ -165,8 +173,10 @@ public class LayoutGridService {
List<Double> ys = yStream.collect(Collectors.toList());
ys.remove(0);
Rectangle2D tableBBox = table.getBBox().get(page);
List<ColoredLine> coloredLines = layoutGrid.getVisualizationsPerPages().get(page.getNumber() - 1).getColoredLines();
Rectangle2D tableBBox = table.getBBox()
.get(page);
List<ColoredLine> coloredLines = layoutGrid.getVisualizationsPerPages()
.get(page.getNumber() - 1).getColoredLines();
xs.forEach(x -> {
Line2D line = new Line2D.Double(new Point2D.Double(x, tableBBox.getMaxY()), new Point2D.Double(x, tableBBox.getMinY()));
coloredLines.add(new ColoredLine(line, INNER_LINES_COLOR, LINE_WIDTH));
@ -190,24 +200,20 @@ public class LayoutGridService {
private void addSection(SemanticNode semanticNode, LayoutGrid layoutGrid, Color color) {
Map<Page, Rectangle2D> bBoxMap = semanticNode.getBBox();
List<SemanticNode> subSections = semanticNode.streamAllSubNodesOfType(NodeType.SECTION)
.toList();
Integer maxChildDepth = subSections.stream()
.map(node -> node.getTreeId().size())
.max(Integer::compareTo)
.orElse(semanticNode.getTreeId().size());
int ownDepth = semanticNode.getTreeId().size();
Page firstPage = semanticNode.getFirstPage();
if (!subSections.isEmpty()) {
addPlacedText(firstPage, bBoxMap.get(firstPage), buildTreeIdString(semanticNode), layoutGrid);
} else {
bBoxMap.forEach(((page, textBBox) -> addPlacedText(page, textBBox, buildTreeIdString(semanticNode), layoutGrid)));
}
String treeIdString = buildTreeIdString(semanticNode);
if (bBoxMap.values().size() == 1) {
Rectangle2D r = RectangleTransformations.pad(bBoxMap.get(firstPage), LINE_WIDTH, LINE_WIDTH);
List<ColoredLine> coloredLines = layoutGrid.getVisualizationsPerPages().get(firstPage.getNumber() - 1).getColoredLines();
List<Line2D> lines = createLinesFromRectangle(r, firstPage.getRotation());
// add string to top line
var firstLine = lines.remove(0);
coloredLines.add(new ColoredLine(firstLine, color, LINE_WIDTH));
for (Line2D line : lines) {
coloredLines.add(new ColoredLine(line, color, LINE_WIDTH));
}
handleSinglePage(semanticNode, layoutGrid, color, firstPage, bBoxMap.get(firstPage), treeIdString, maxChildDepth, ownDepth);
return;
}
List<Page> pagesInOrder = bBoxMap.keySet()
@ -215,99 +221,203 @@ public class LayoutGridService {
.sorted(Comparator.comparingInt(Page::getNumber))
.collect(Collectors.toList());
pagesInOrder.remove(0);
addLinesForFirstPageOfSection(semanticNode, color, firstPage, layoutGrid);
var lastPage = pagesInOrder.remove(pagesInOrder.size() - 1);
addLinesForLastPageOfSection(semanticNode, color, lastPage, layoutGrid);
for (Page middlePage : pagesInOrder) {
addLinesForMiddlePageOfSection(semanticNode, color, middlePage, layoutGrid);
handleFirstPageOfSection(semanticNode, color, firstPage, bBoxMap.get(firstPage), treeIdString, layoutGrid, maxChildDepth, ownDepth);
if (semanticNode instanceof SuperSection) {
return;
}
for (Page middlePage : pagesInOrder.subList(0, pagesInOrder.size() - 1)) {
handleForMiddlePageOfSection(semanticNode, color, middlePage, bBoxMap.get(middlePage), treeIdString, layoutGrid, maxChildDepth, ownDepth);
}
var lastPage = pagesInOrder.remove(pagesInOrder.size() - 1);
handleLastPageOfSection(semanticNode, color, lastPage, bBoxMap.get(lastPage), treeIdString, layoutGrid, maxChildDepth, ownDepth);
}
@SneakyThrows
private void addPlacedText(Page page, Rectangle2D textBBox, String s, LayoutGrid layoutGrid) {
private void addPlacedText(Page page, Rectangle2D textBBox, Rectangle2D highestParentRect, String s, LayoutGrid layoutGrid, Integer maxChildDepth) {
// translates text, such that its right edge is a bit to the left of the drawn box
float translationAmount = ((FONT.getStringWidth(s) / 1000) * FONT_SIZE + (2 * LINE_WIDTH) + 4);
float translationAmount = ((FONT.getStringWidth(s) / 1000) * FONT_SIZE + LINE_WIDTH + 2 * maxChildDepth);
Point2D upperLeftCorner;
Point2D translationVector;
switch (page.getRotation()) {
case 90 -> {
upperLeftCorner = new Point2D.Double(textBBox.getMinX(), textBBox.getMinY());
if (highestParentRect != null) {
upperLeftCorner = new Point2D.Double(highestParentRect.getMinX(), textBBox.getMinY());
} else {
upperLeftCorner = new Point2D.Double(textBBox.getMinX(), textBBox.getMinY());
}
translationVector = new Point2D.Double(FONT_SIZE, -translationAmount);
}
case 180 -> {
upperLeftCorner = new Point2D.Double(textBBox.getMaxX(), textBBox.getMinY());
if (highestParentRect != null) {
upperLeftCorner = new Point2D.Double(highestParentRect.getMaxX(), textBBox.getMinY());
} else {
upperLeftCorner = new Point2D.Double(textBBox.getMaxX(), textBBox.getMinY());
}
translationVector = new Point2D.Double(translationAmount, FONT_SIZE);
}
case 270 -> {
upperLeftCorner = new Point2D.Double(textBBox.getMaxX(), textBBox.getMaxY());
if (highestParentRect != null) {
upperLeftCorner = new Point2D.Double(highestParentRect.getMaxX(), textBBox.getMaxY());
} else {
upperLeftCorner = new Point2D.Double(textBBox.getMaxX(), textBBox.getMaxY());
}
translationVector = new Point2D.Double(-FONT_SIZE, translationAmount);
}
default -> {
upperLeftCorner = new Point2D.Double(textBBox.getMinX(), textBBox.getMaxY());
if (highestParentRect != null) {
upperLeftCorner = new Point2D.Double(highestParentRect.getMinX(), textBBox.getMaxY());
} else {
upperLeftCorner = new Point2D.Double(textBBox.getMinX(), textBBox.getMaxY());
}
translationVector = new Point2D.Double(-translationAmount, -FONT_SIZE);
}
}
upperLeftCorner = add(upperLeftCorner, translationVector);
var placedTexts = layoutGrid.getVisualizationsPerPages().get(page.getNumber() - 1).getPlacedTexts();
placedTexts.add(PlacedText.textFacingUp(s, upperLeftCorner, FONT_SIZE, Color.BLACK, FONT));
}
List<PlacedText> placedTexts = layoutGrid.getVisualizationsPerPages()
.get(page.getNumber() - 1).getPlacedTexts();
PlacedText newText = PlacedText.textFacingUp(s, upperLeftCorner, FONT_SIZE, Color.BLACK, FONT);
private void addLinesForMiddlePageOfSection(SemanticNode semanticNode, Color color, Page middlePage, LayoutGrid layoutGrid) {
Optional<PlacedText> conflictingText = placedTexts.stream()
.filter(pt -> Math.abs(pt.lineStart().getY() - newText.lineStart().getY()) <= FONT_SIZE)
.findFirst();
List<ColoredLine> coloredLines = layoutGrid.getVisualizationsPerPages().get(middlePage.getNumber() - 1).getColoredLines();
Rectangle2D r = RectangleTransformations.pad(semanticNode.getBBox().get(middlePage), LINE_WIDTH, LINE_WIDTH);
var midPageLines = createLinesFromRectangle(r, middlePage.getRotation());
// remove top line
midPageLines.remove(0);
// remove top line
midPageLines.remove(1);
// add string to left line
var leftLine = midPageLines.remove(1);
coloredLines.add(new ColoredLine(leftLine, color, LINE_WIDTH));
for (Line2D line : midPageLines) {
coloredLines.add(new ColoredLine(line, color, LINE_WIDTH));
if (conflictingText.isPresent()) {
PlacedText existingText = conflictingText.get();
if (newText.text().length() > existingText.text().length()) {
placedTexts.remove(existingText);
placedTexts.add(newText);
}
} else {
placedTexts.add(newText);
}
}
private void addLinesForLastPageOfSection(SemanticNode semanticNode, Color color, Page lastPage, LayoutGrid layoutGrid) {
private void handleSinglePage(SemanticNode semanticNode,
LayoutGrid layoutGrid,
Color color,
Page page,
Rectangle2D rectangle2D,
String treeIdString,
Integer maxChildDepth,
Integer ownDepth) {
List<ColoredLine> coloredLines = layoutGrid.getVisualizationsPerPages().get(lastPage.getNumber() - 1).getColoredLines();
Rectangle2D r = RectangleTransformations.pad(semanticNode.getBBox().get(lastPage), LINE_WIDTH, LINE_WIDTH);
var lastPageLines = createLinesFromRectangle(r, lastPage.getRotation());
// remove top line
lastPageLines.remove(0);
// add string to left line
var leftLine = lastPageLines.remove(2);
coloredLines.add(new ColoredLine(leftLine, color, LINE_WIDTH));
for (Line2D line : lastPageLines) {
coloredLines.add(new ColoredLine(line, color, LINE_WIDTH));
}
}
private void addLinesForFirstPageOfSection(SemanticNode semanticNode, Color color, Page firstPage, LayoutGrid layoutGrid) {
List<ColoredLine> coloredLines = layoutGrid.getVisualizationsPerPages().get(firstPage.getNumber() - 1).getColoredLines();
Rectangle2D r = RectangleTransformations.pad(semanticNode.getBBox().get(firstPage), LINE_WIDTH, LINE_WIDTH);
var firstPageLines = createLinesFromRectangle(r, firstPage.getRotation());
// remove bottom line
firstPageLines.remove(2);
RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, page, rectangle2D, treeIdString, layoutGrid, maxChildDepth, ownDepth);
// add string to top line
var firstLine = firstPageLines.remove(0);
coloredLines.add(new ColoredLine(firstLine, color, LINE_WIDTH));
for (Line2D line : firstPageLines) {
coloredLines.add(new ColoredLine(line, color, LINE_WIDTH));
var firstLine = result.pageLines().remove(0);
result.coloredLines().add(new ColoredLine(firstLine, color, LINE_WIDTH));
for (Line2D line : result.pageLines()) {
result.coloredLines().add(new ColoredLine(line, color, LINE_WIDTH));
}
}
private void handleFirstPageOfSection(SemanticNode semanticNode,
                                      Color color,
                                      Page firstPage,
                                      Rectangle2D rectangle2D,
                                      String treeIdString,
                                      LayoutGrid layoutGrid,
                                      Integer maxChildDepth,
                                      Integer ownDepth) {
    // First page of a multi-page section: draw the frame without its bottom edge,
    // because the section continues on the following page(s).
    RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, firstPage, rectangle2D, treeIdString, layoutGrid, maxChildDepth, ownDepth);
    List<Line2D> edges = result.pageLines();
    List<ColoredLine> sink = result.coloredLines();
    // drop the bottom edge
    edges.remove(2);
    // the top edge is drawn first; the tree-id label sits next to it
    Line2D topEdge = edges.remove(0);
    sink.add(new ColoredLine(topEdge, color, LINE_WIDTH));
    edges.forEach(edge -> sink.add(new ColoredLine(edge, color, LINE_WIDTH)));
}
private void handleForMiddlePageOfSection(SemanticNode semanticNode,
                                          Color color,
                                          Page middlePage,
                                          Rectangle2D rectangle2D,
                                          String treeIdString,
                                          LayoutGrid layoutGrid,
                                          Integer maxChildDepth,
                                          Integer ownDepth) {
    // A middle page shows only the vertical edges of a multi-page section frame:
    // the section neither starts nor ends here, so top and bottom edges are dropped.
    RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, middlePage, rectangle2D, treeIdString, layoutGrid, maxChildDepth, ownDepth);
    List<Line2D> edges = result.pageLines();
    List<ColoredLine> sink = result.coloredLines();
    // drop the top edge
    edges.remove(0);
    // drop the bottom edge (index already shifted by the removal above)
    edges.remove(1);
    // the left edge is drawn first; the tree-id label is attached beside it
    Line2D leftEdge = edges.remove(1);
    sink.add(new ColoredLine(leftEdge, color, LINE_WIDTH));
    edges.forEach(edge -> sink.add(new ColoredLine(edge, color, LINE_WIDTH)));
}
private void handleLastPageOfSection(SemanticNode semanticNode,
                                     Color color,
                                     Page lastPage,
                                     Rectangle2D rectangle2D,
                                     String treeIdString,
                                     LayoutGrid layoutGrid,
                                     Integer maxChildDepth,
                                     Integer ownDepth) {
    // Last page of a multi-page section: draw the frame without its top edge,
    // because the section started on an earlier page.
    RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, lastPage, rectangle2D, treeIdString, layoutGrid, maxChildDepth, ownDepth);
    List<Line2D> edges = result.pageLines();
    List<ColoredLine> sink = result.coloredLines();
    // drop the top edge
    edges.remove(0);
    // the left edge is drawn first; the tree-id label is attached beside it
    Line2D leftEdge = edges.remove(2);
    sink.add(new ColoredLine(leftEdge, color, LINE_WIDTH));
    edges.forEach(edge -> sink.add(new ColoredLine(edge, color, LINE_WIDTH)));
}
private RectangleAndLinesResult createLinesAndPlaceText(SemanticNode semanticNode,
                                                        Page page,
                                                        Rectangle2D rectangle2D,
                                                        String treeIdString,
                                                        LayoutGrid layoutGrid,
                                                        Integer maxChildDepth,
                                                        Integer ownDepth) {
    // Pads the node's bounding box in proportion to how much deeper its children nest
    // (so nested frames do not overlap), builds the rectangle's edge lines, and places
    // the tree-id label relative to the highest ancestor's rectangle on this page.
    int depthDelta = maxChildDepth - ownDepth;
    Rectangle2D paddedBox = RectangleTransformations.pad(semanticNode.getBBox().get(page),
            LINE_WIDTH * (1 + depthDelta), LINE_WIDTH * (1 + depthDelta));
    var edgeLines = createLinesFromRectangle(paddedBox, page.getRotation());
    List<ColoredLine> pageColoredLines = layoutGrid.getVisualizationsPerPages()
            .get(page.getNumber() - 1)
            .getColoredLines();
    // look up the ancestor rectangle BEFORE registering this node's own rectangle
    SemanticNode highestParent = semanticNode.getHighestParent();
    Rectangle2D highestParentRect = rectangleMap.get(new RectangleIdentifier(highestParent.getTreeId(), page.getNumber()));
    addPlacedText(page, rectangle2D, highestParentRect, treeIdString, layoutGrid, maxChildDepth);
    if (semanticNode instanceof SuperSection) {
        // remember super-section rectangles so descendant nodes can anchor their labels
        rectangleMap.put(new RectangleIdentifier(semanticNode.getTreeId(), page.getNumber()), paddedBox);
    }
    return new RectangleAndLinesResult(pageColoredLines, paddedBox, edgeLines);
}
// Bundles the per-page drawing state produced by createLinesAndPlaceText: the page's
// colored-line sink, the padded bounding rectangle, and that rectangle's edge lines.
private record RectangleAndLinesResult(List<ColoredLine> coloredLines, Rectangle2D rectangle, List<Line2D> pageLines) {
}
private String buildTreeIdString(SemanticNode semanticNode) {
return semanticNode.getTreeId()
@ -365,7 +475,10 @@ public class LayoutGridService {
private static boolean isNotSectionOrTableCellOrDocument(SemanticNode semanticNode) {
    // Structural container nodes (document, sections, super-sections, table cells)
    // are filtered out; everything else is drawn individually.
    // NOTE: the merge left both the pre-merge single-line return and the merged
    // multi-line return in place; only the merged version (incl. SUPER_SECTION) is kept.
    return !(semanticNode.getType().equals(NodeType.DOCUMENT)
            || semanticNode.getType().equals(NodeType.SECTION)
            || semanticNode.getType().equals(NodeType.SUPER_SECTION)
            || semanticNode.getType().equals(NodeType.TABLE_CELL));
}
@ -373,9 +486,7 @@ public class LayoutGridService {
semanticNode.getBBox()
.forEach((page, textBBox) -> layoutGrid.getVisualizationsPerPages()
.get(page.getNumber() - 1)
.getColoredRectangles()
.add(new ColoredRectangle(textBBox, color, LINE_WIDTH)));
.get(page.getNumber() - 1).getColoredRectangles().add(new ColoredRectangle(textBBox, color, LINE_WIDTH)));
}

View File

@ -112,8 +112,8 @@ public class PdfVisualisationUtility {
case DOCUMENT -> Color.LIGHT_GRAY;
case HEADER, FOOTER -> Color.GREEN;
case PARAGRAPH -> Color.BLUE;
case SUPER_SECTION, SECTION -> Color.BLACK;
case HEADLINE -> Color.RED;
case SECTION -> Color.BLACK;
case TABLE -> Color.ORANGE;
case TABLE_CELL -> Color.GRAY;
case IMAGE -> Color.MAGENTA;

View File

@ -70,7 +70,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
file = new File(filePath);
}
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER, true);
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, true);
prepareStorage(layoutParsingRequest, file);
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);

View File

@ -0,0 +1,231 @@
package com.knecon.fforesight.service.layoutparser.server;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.File;
import java.nio.file.Path;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.function.Predicate;
import org.apache.commons.lang3.StringUtils;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.ClassPathResource;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService;
import lombok.SneakyThrows;
/**
 * End-to-end test for PDF-outline (bookmark) based section detection: parses a crafted
 * test document, verifies the extracted {@code OutlineObjectTree}, the derived
 * {@code TableOfContents} and the resulting document graph, and finally renders the
 * layout grid onto a copy of the PDF for manual inspection.
 */
public class OutlineDetectionTest extends AbstractTest {

    ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
    LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);

    @Autowired
    protected LayoutParsingPipeline layoutParsingPipeline;

    @Test
    @SneakyThrows
    public void testOutlinesToSections() {

        String fileName = "files/new/crafted_outline_test_doc.pdf";
        String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
        var documentFile = new ClassPathResource(fileName).getFile();
        long start = System.currentTimeMillis();

        ClassificationDocument classificationDocument = parseLayout(fileName, LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH);
        OutlineObjectTree outlineObjectTree = classificationDocument.getOutlineObjectTree();

        // JUnit's assertEquals signature is (expected, actual): the expected literal goes
        // first so that failure messages are reported the right way around.
        assertEquals(8, outlineObjectTree.getRootNodes().size());
        assertEquals(1, outlineObjectTree.getOutlineObjectsPerPage().get(1).size());
        assertEquals(1, outlineObjectTree.getOutlineObjectsPerPage().get(3).size());
        assertEquals(1, outlineObjectTree.getOutlineObjectsPerPage().get(4).size());
        assertEquals(1, outlineObjectTree.getOutlineObjectsPerPage().get(5).size());
        assertEquals(2, outlineObjectTree.getOutlineObjectsPerPage().get(6).size());
        assertEquals(3, outlineObjectTree.getOutlineObjectsPerPage().get(7).size());
        assertEquals(2, outlineObjectTree.getOutlineObjectsPerPage().get(8).size());
        assertEquals(1, outlineObjectTree.getOutlineObjectsPerPage().get(10).size());
        assertEquals(4, outlineObjectTree.getOutlineObjectsPerPage().get(11).size());
        assertEquals(1, outlineObjectTree.getOutlineObjectsPerPage().get(12).size());
        assertEquals(2, outlineObjectTree.getOutlineObjectsPerPage().get(13).size());
        // every outline entry must have been located in the document text
        assertTrue(outlineObjectTree.getOutlineObjectsPerPage().values()
                .stream()
                .flatMap(Collection::stream)
                .allMatch(OutlineObject::isFound));

        TableOfContents tableOfContents = classificationDocument.getTableOfContents();
        assertEquals(9, tableOfContents.getMainSections().size());
        // main sections 1..8 carry the same (whitespace-/case-insensitive) titles as the
        // outline root nodes; section 0 is the content before the first outline entry
        assertEquals(tableOfContents.getMainSections().subList(1, 9)
                        .stream()
                        .map(tableOfContentItem -> sanitizeString(tableOfContentItem.getHeadline().toString()))
                        .toList(),
                outlineObjectTree.getRootNodes()
                        .stream()
                        .map(outlineObjectTreeNode -> sanitizeString(outlineObjectTreeNode.getOutlineObject().getTitle()))
                        .toList());
        assertEquals(6, tableOfContents.getMainSections().get(5).getChildren().size());
        assertEquals(3, tableOfContents.getMainSections().get(7).getChildren().size());
        assertEquals(3, tableOfContents.getMainSections().get(8).getChildren().size());
        assertEquals(1, tableOfContents.getMainSections()
                .get(8).getChildren()
                .get(2).getChildren().size());
        assertEquals(3, tableOfContents.getMainSections()
                .get(8).getChildren()
                .get(2).getChildren()
                .get(0).getChildren().size());
        assertEquals(1, tableOfContents.getMainSections().get(0).getImages().size());
        assertEquals(1, tableOfContents.getMainSections().get(6).getImages().size());
        assertEquals(1, tableOfContents.getMainSections()
                .get(8).getChildren()
                .get(2).getChildren()
                .get(0).getChildren()
                .get(2).getImages().size());

        Document document = buildGraph(fileName, classificationDocument);

        // every ToC item must be linked to a graph node; leaf items map to Section,
        // items with children map to SuperSection
        assertTrue(tableOfContents.getAllTableOfContentItems()
                .stream()
                .allMatch(tableOfContentItem -> tableOfContentItem.getSection() != null));
        assertTrue(tableOfContents.getAllTableOfContentItems()
                .stream()
                .filter(tableOfContentItem -> tableOfContentItem.getChildren().isEmpty())
                .allMatch(tableOfContentItem -> tableOfContentItem.getSection() instanceof Section));
        assertTrue(tableOfContents.getAllTableOfContentItems()
                .stream()
                .filter(tableOfContentItem -> !tableOfContentItem.getChildren().isEmpty())
                .allMatch(tableOfContentItem -> tableOfContentItem.getSection() instanceof SuperSection));

        List<SemanticNode> childrenOfTypeSectionOrSuperSection = document.getChildrenOfTypeSectionOrSuperSection();
        assertEquals(9, childrenOfTypeSectionOrSuperSection.size());
        assertEquals(childrenOfTypeSectionOrSuperSection.subList(1, 9)
                        .stream()
                        .map(section -> sanitizeString(section.getHeadline().getLeafTextBlock().toString()))
                        .toList(),
                outlineObjectTree.getRootNodes()
                        .stream()
                        .map(outlineObjectTreeNode -> sanitizeString(outlineObjectTreeNode.getOutlineObject().getTitle()))
                        .toList());

        Predicate<SemanticNode> isSectionOrSuperSection = semanticNode -> semanticNode instanceof Section || semanticNode instanceof SuperSection;
        // "+ 1" accounts for the additional sub-section holding the parent section's own main text
        assertEquals(6 + 1, childrenOfTypeSectionOrSuperSection.get(5).streamChildren()
                .filter(isSectionOrSuperSection)
                .count());
        assertEquals(3 + 1, childrenOfTypeSectionOrSuperSection.get(7).streamChildren()
                .filter(isSectionOrSuperSection)
                .count());
        assertEquals(3 + 1, childrenOfTypeSectionOrSuperSection.get(8).streamChildren()
                .filter(isSectionOrSuperSection)
                .count());
        assertEquals(1 + 1, childrenOfTypeSectionOrSuperSection.get(8).streamChildren()
                .filter(isSectionOrSuperSection)
                .toList()
                .get(3).streamChildren()
                .filter(isSectionOrSuperSection)
                .count());
        assertEquals(3 + 1, childrenOfTypeSectionOrSuperSection.get(8).streamChildren()
                .filter(isSectionOrSuperSection)
                .toList()
                .get(3).streamChildren()
                .filter(isSectionOrSuperSection)
                .toList()
                .get(1).streamChildren()
                .filter(isSectionOrSuperSection)
                .count());

        // each image's parent tree-id pins it to the expected section in the graph
        List<List<Integer>> imageTreeIdList = document.streamAllImages()
                .map(image -> image.getParent().getTreeId())
                .toList();
        assertEquals(List.of(0), imageTreeIdList.get(0));
        assertEquals(List.of(6), imageTreeIdList.get(1));
        assertEquals(List.of(8, 4, 2, 4), imageTreeIdList.get(2));

        // render the parsed layout onto a copy of the PDF for manual inspection
        layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true, false);
        System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
    }

    /** Normalises a headline/title for comparison: strips all whitespace, lower-cases via {@link Locale#ROOT}. */
    private static String sanitizeString(String text) {
        return StringUtils.deleteWhitespace(text).toLowerCase(Locale.ROOT);
    }

    /**
     * Runs the layout-parsing pipeline against a classpath PDF, using an empty CV table
     * response and a pre-recorded image-service response.
     */
    @SneakyThrows
    protected ClassificationDocument parseLayout(String filename, LayoutParsingType layoutParsingType) {
        File fileResource = new ClassPathResource(filename).getFile();
        prepareStorage(filename, "cv_table_parsing_response/empty.json", "image_service_response/crafted_outline_test_doc.IMAGE_INFO.json");
        return layoutParsingPipeline.parseLayout(layoutParsingType,
                fileResource,
                layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID),
                new TableServiceResponse(),
                new VisualLayoutParsingResponse(),
                Map.of("file", filename, "debug", "true"));
    }

    /**
     * Builds the document graph. Absolute paths outside the test resources are parsed
     * from scratch; classpath resources reuse the already-parsed {@code classificationDocument}.
     */
    @SneakyThrows
    protected Document buildGraph(String filename, ClassificationDocument classificationDocument) {
        if (!filename.startsWith("files") && filename.startsWith("/")) {
            LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(Path.of(filename).getFileName().toString(), LayoutParsingType.REDACT_MANAGER, true);
            prepareStorage(layoutParsingRequest, new File(filename));
            return DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER,
                    layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
                            new File(filename),
                            layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId()
                                    .get()),
                            new TableServiceResponse(),
                            new VisualLayoutParsingResponse(),
                            layoutParsingRequest.identifier()));
        } else {
            prepareStorage(filename);
            return DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, classificationDocument);
        }
    }
}

View File

@ -64,8 +64,8 @@ public class SimplifiedTextServiceTest
@SneakyThrows
protected Document buildGraph(File file) {
return DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
return DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
file,
new ImageServiceResponse(),
new TableServiceResponse(),

View File

@ -56,8 +56,8 @@ public class DocumentGraphJsonWritingTest extends BuildDocumentTest {
@SneakyThrows
private void writeJsons(Path filename) {
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
filename.toFile(),
new ImageServiceResponse(),
new TableServiceResponse(),

View File

@ -32,12 +32,13 @@ public class ViewerDocumentTest extends BuildDocumentTest {
public void testViewerDocument() {
String fileName = "files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
var documentFile = new ClassPathResource(fileName).getFile();
long start = System.currentTimeMillis();
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH);
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true, false);
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
}

View File

@ -37,8 +37,6 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.ima
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
@ -56,12 +54,13 @@ public class PdfSegmentationServiceTest extends AbstractTest {
@SneakyThrows
public ClassificationDocument buildClassificationDocument(File originDocument, TableServiceResponse tableServiceResponse) {
return layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
return layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
originDocument,
new ImageServiceResponse(),
tableServiceResponse,
new VisualLayoutParsingResponse(),
Map.of("file", "document"));
}
@ -123,7 +122,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
.get(0).getTextBlocks()
.get(0).toString()).contains(textToSearch);
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, classificationDocument);
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, classificationDocument);
TextBlock leafTextBlock = document.getFirstPage().getHeader().getLeafTextBlock();
assertThat(leafTextBlock.getSearchText().contains(textToSearch)).isTrue();
@ -134,6 +133,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
@Test
@SneakyThrows
public void testTableAndCellRotations() {
String fileName = "files/Minimal Examples/simpleTablesRotated.pdf";
ClassPathResource pdfFileResource = new ClassPathResource(fileName);
@ -141,7 +141,6 @@ public class PdfSegmentationServiceTest extends AbstractTest {
}
@Disabled
@Test
public void testScanRotationBorderIsIgnored() throws IOException {
@ -151,15 +150,19 @@ public class PdfSegmentationServiceTest extends AbstractTest {
var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class);
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile(), tableServiceResponse);
assertThat(document.getSections()
assertThat(document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.collect(Collectors.toList())).isNotEmpty();
var tables = document.getSections()
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()).isNotEmpty();
var tables = document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList();
// Quality of the table parsing is not good, because the file is rotated at scanning.
@ -199,15 +202,19 @@ public class PdfSegmentationServiceTest extends AbstractTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/syngenta/CustomerFiles/SinglePages/Spanning Cells - Page131_S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
assertThat(document.getSections()
assertThat(document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.collect(Collectors.toList())).isNotEmpty();
TablePageBlock table = document.getSections()
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()).isNotEmpty();
TablePageBlock table = document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(0);
assertThat(table.getColCount()).isEqualTo(6);
@ -225,23 +232,29 @@ public class PdfSegmentationServiceTest extends AbstractTest {
"files/syngenta/CustomerFiles/SinglePages/Merge Table - Page5_26 A8637C - EU AIR3 - LCP Section 10 - Ecotoxicological studies on the plant protection product - Reference list.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
assertThat(document.getSections()
assertThat(document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.collect(Collectors.toList())).isNotEmpty();
TablePageBlock firstTable = document.getSections()
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()).isNotEmpty();
TablePageBlock firstTable = document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(0);
assertThat(firstTable.getColCount()).isEqualTo(8);
assertThat(firstTable.getRowCount()).isEqualTo(1);
TablePageBlock secondTable = document.getSections()
TablePageBlock secondTable = document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(1);
assertThat(secondTable.getColCount()).isEqualTo(8);
@ -266,23 +279,29 @@ public class PdfSegmentationServiceTest extends AbstractTest {
"files/syngenta/CustomerFiles/SinglePages/Merge Multi Page Table - Page4_Page5_51 Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
assertThat(document.getSections()
assertThat(document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.collect(Collectors.toList())).isNotEmpty();
TablePageBlock firstTable = document.getSections()
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()).isNotEmpty();
TablePageBlock firstTable = document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(0);
assertThat(firstTable.getColCount()).isEqualTo(9);
assertThat(firstTable.getRowCount()).isEqualTo(5);
TablePageBlock secondTable = document.getSections()
TablePageBlock secondTable = document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(1);
assertThat(secondTable.getColCount()).isEqualTo(9);
@ -307,23 +326,29 @@ public class PdfSegmentationServiceTest extends AbstractTest {
"files/syngenta/CustomerFiles/SinglePages/Rotated Table Headers - Page4_65 Mesotrione - EU AIR3 - LCA Section 1 Supplement Reference List.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
assertThat(document.getSections()
assertThat(document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.collect(Collectors.toList())).isNotEmpty();
TablePageBlock firstTable = document.getSections()
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()).isNotEmpty();
TablePageBlock firstTable = document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(0);
assertThat(firstTable.getColCount()).isEqualTo(8);
assertThat(firstTable.getRowCount()).isEqualTo(1);
TablePageBlock secondTable = document.getSections()
TablePageBlock secondTable = document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(1);
assertThat(secondTable.getColCount()).isEqualTo(8);
@ -818,10 +843,12 @@ public class PdfSegmentationServiceTest extends AbstractTest {
@SneakyThrows
private void toHtml(ClassificationDocument document, String filename) {
var tables = document.getSections()
var tables = document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList();
StringBuilder sb = new StringBuilder();
@ -843,12 +870,15 @@ public class PdfSegmentationServiceTest extends AbstractTest {
private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) {
TablePageBlock table = document.getSections()
TablePageBlock table = document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(tableIndex);
List<List<Cell>> rows = table.getRows();
int emptyCellsFoundFound = rows.stream()
.flatMap(List::stream)
@ -870,10 +900,12 @@ public class PdfSegmentationServiceTest extends AbstractTest {
private void validateTable(ClassificationDocument document, int tableIndex, List<List<String>> values) {
TablePageBlock table = document.getSections()
TablePageBlock table = document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(tableIndex);
List<List<Cell>> rows = table.getRows();
@ -896,10 +928,11 @@ public class PdfSegmentationServiceTest extends AbstractTest {
private void validateTableSize(ClassificationDocument document, int tableSize) {
assertThat(document.getSections()
assertThat(document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.toList().size()).isEqualTo(tableSize);
}

View File

@ -93,6 +93,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
.toList();
for (String pdfFileName : pdfFileNames) {
writeJsons(Path.of(pdfFileName));
}
}
@ -102,15 +103,15 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
@SneakyThrows
private void writeJsons(Path filename) {
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
filename.toFile(),
new ImageServiceResponse(),
new TableServiceResponse(),
new VisualLayoutParsingResponse(),
Map.of("file",filename.toFile().toString())));
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
filename.toFile(),
new ImageServiceResponse(),
new TableServiceResponse(),

View File

@ -48,8 +48,9 @@ public abstract class BuildDocumentTest extends AbstractTest {
@SneakyThrows
protected Document buildGraph(String filename, LayoutParsingType layoutParsingType) {
if (!filename.startsWith("files") && filename.startsWith("/")) {
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(Path.of(filename).getFileName().toString(), LayoutParsingType.REDACT_MANAGER, true);
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(Path.of(filename).getFileName().toString(), LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, true);
prepareStorage(layoutParsingRequest, new File(filename));
return DocumentGraphFactory.buildDocumentGraph(layoutParsingType,
layoutParsingPipeline.parseLayout(layoutParsingType,

View File

@ -229,7 +229,7 @@ public class PdfDraw {
case HEADER, FOOTER -> Color.GREEN;
case PARAGRAPH -> Color.BLUE;
case HEADLINE -> Color.RED;
case SECTION -> Color.BLACK;
case SECTION, SUPER_SECTION -> Color.BLACK;
case TABLE -> Color.ORANGE;
case TABLE_CELL -> Color.GRAY;
case IMAGE -> Color.MAGENTA;

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8afb731a307e1a3f827c59e902164b10bdabef96e14193b949fe081cd3aa859f
size 168878

View File

@ -0,0 +1,135 @@
{
"dossierId": "a91f19ff-11ba-4735-9f60-c650243f64a9",
"fileId": "6e8c5f114e2b71e103a32a20c5273188",
"targetFileExtension": "ORIGIN.pdf.gz",
"responseFileExtension": "IMAGE_INFO.json.gz",
"X-TENANT-ID": "redaction",
"data": [
{
"classification": {
"label": "other",
"probabilities": {
"other": 0.9126,
"formula": 0.0588,
"signature": 0.0261,
"logo": 0.0024
}
},
"representation": "70E1070C1030E081B7EF7FFFF",
"position": {
"x1": 61,
"x2": 394,
"y1": 155,
"y2": 470,
"pageNumber": 1
},
"geometry": {
"width": 333,
"height": 315
},
"alpha": false,
"filters": {
"geometry": {
"imageSize": {
"quotient": 0.5976,
"tooLarge": false,
"tooSmall": false
},
"imageFormat": {
"quotient": 1.0571,
"tooTall": false,
"tooWide": false
}
},
"probability": {
"unconfident": false
},
"allPassed": true
}
},
{
"classification": {
"label": "other",
"probabilities": {
"other": 0.9126,
"formula": 0.0588,
"signature": 0.0261,
"logo": 0.0024
}
},
"representation": "70E1070C1030E081B7EF7FFFF",
"position": {
"x1": 61,
"x2": 394,
"y1": 202,
"y2": 517,
"pageNumber": 11
},
"geometry": {
"width": 333,
"height": 315
},
"alpha": false,
"filters": {
"geometry": {
"imageSize": {
"quotient": 0.5976,
"tooLarge": false,
"tooSmall": false
},
"imageFormat": {
"quotient": 1.0571,
"tooTall": false,
"tooWide": false
}
},
"probability": {
"unconfident": false
},
"allPassed": true
}
},
{
"classification": {
"label": "other",
"probabilities": {
"other": 0.9126,
"formula": 0.0588,
"signature": 0.0261,
"logo": 0.0024
}
},
"representation": "70E1070C1030E081B7EF7FFFF",
"position": {
"x1": 47,
"x2": 379,
"y1": 289,
"y2": 604,
"pageNumber": 16
},
"geometry": {
"width": 332,
"height": 315
},
"alpha": false,
"filters": {
"geometry": {
"imageSize": {
"quotient": 0.5967,
"tooLarge": false,
"tooSmall": false
},
"imageFormat": {
"quotient": 1.054,
"tooTall": false,
"tooWide": false
}
},
"probability": {
"unconfident": false
},
"allPassed": true
}
}
]
}

View File

@ -52,17 +52,16 @@ public class ContentStreamUtility {
public static Optional<COSName> findLayoutGridOCGName(PDPage page) {
Optional<COSName> layoutGridOCGName = Optional.empty();
var resourceIterator = page.getResources().getPropertiesNames();
for (COSName cosName : resourceIterator) {
COSBase cosBase = page.getResources().getProperties(cosName).getCOSObject().getDictionaryObject(COSName.NAME);
if (cosBase instanceof COSString string) {
if (ContentStreams.KNECON_LAYOUT.name().equals(string.getString())) {
layoutGridOCGName = Optional.of(cosName);
return Optional.of(cosName);
}
}
}
return layoutGridOCGName;
return Optional.empty();
}