Compare commits

...

30 Commits

Author SHA1 Message Date
maverickstuder
0c8b2e6d44 RED-7074: Design Subsection section tree structure algorithm
* added abstract class SectionNode
* both Section and SuperSection extend the SectionNode class, so that there is no inheritance between Section and SuperSection as well as no field duplication
2024-05-22 13:02:16 +02:00
maverickstuder
b08ed2037e RED-7074: Design Subsection section tree structure algorithm
* fix pmd and checkstyle
2024-05-15 16:46:15 +02:00
maverickstuder
b50bfed69d RED-7074: Design Subsection section tree structure algorithm
* fix all failing tests
2024-05-15 16:40:57 +02:00
maverickstuder
49f13d1f03 RED-7074: Design Subsection section tree structure algorithm
* post rebase fixup
2024-05-15 15:09:31 +02:00
maverickstuder
61c90fc30d Merge branch 'main' into RED-7074
# Conflicts:
#	layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java
#	layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java
#	layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java
#	layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java
#	layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java
#	layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java
#	layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java
#	layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java
#	layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java
2024-05-15 14:17:59 +02:00
maverickstuder
6a0661cf09 RED-7074: Design Subsection section tree structure algorithm
* bugfix
2024-05-15 13:51:49 +02:00
maverickstuder
2d33615b94 RED-7074: Design Subsection section tree structure algorithm
* added redactmanager logic for headline classification to documine and clarifynd
* refactored headline classification
* added supersection for non-leaf sections (containing other sections instead of only paragraphs, images, ...)
* bugfix for certain edge cases in some files running into error state
2024-05-15 10:29:39 +02:00
maverickstuder
1856fed640 RED-7074: Design Subsection section tree structure algorithm
* improved merging of headlines as well as splitting logic so that more headlines are detected correctly
2024-05-14 17:41:44 +02:00
maverickstuder
2fcaeb3d8c RED-7074: Design Subsection section tree structure algorithm
* added supersection and changed logic so that each normal section only contains leaf nodes
* added SectionIdentifier logic for headline splitting and merging
* fixed many edge cases which resulted in error state files
2024-05-14 10:51:05 +02:00
maverickstuder
4e07ba4ff1 RED-7074: Design Subsection section tree structure algorithm
* import optimized
2024-05-08 14:16:29 +02:00
maverickstuder
cfb6f0acfa RED-7074: Design Subsection section tree structure algorithm
* lots of refactoring of the splitting logic for text blocks, which had resulted in some empty blocks being created that could then not be localized (i.e. by containsBlock)
2024-05-08 14:15:27 +02:00
maverickstuder
a9338262c5 RED-7074: Design Subsection section tree structure algorithm
* fix for boundary error
2024-05-07 15:51:54 +02:00
maverickstuder
d2dc369df3 RED-7074: Design Subsection section tree structure algorithm
* temp
2024-05-07 14:25:54 +02:00
maverickstuder
f7aeb9a406 RED-7074: Design Subsection section tree structure algorithm
* refactoring
2024-05-02 10:36:36 +02:00
maverickstuder
9bf2f5c56c Merge remote-tracking branch 'origin/RED-7074' into RED-7074
# Conflicts:
#	layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java
#	layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java
#	layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineValidationService.java
#	layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TableOfContentItem.java
#	layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TableOfContents.java
#	layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java
#	layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java
#	layoutparser-service/layoutparser-service-server/src/test/resources/files/new/UTT-Books-53.pdf
2024-04-30 14:44:26 +02:00
maverickstuder
c071a133e6 RED-7074: Design Subsection section tree structure algorithm
* added toc enrichment logic and changed section computation to build upon created toc
2024-04-30 14:41:17 +02:00
maverickstuder
9f9ea68706 RED-7074: Design Subsection section tree structure algorithm
* first draft: further implementations
2024-04-29 15:00:49 +02:00
maverickstuder
85e3cf0ecc RED-7074: Design Subsection section tree structure algorithm
* first draft: further implementations
2024-04-29 15:00:49 +02:00
maverickstuder
17756f5977 RED-7074: Design Subsection section tree structure algorithm
* first draft: further implementations
2024-04-29 15:00:48 +02:00
maverickstuder
59d9d6c3e6 RED-7074: Design Subsection section tree structure algorithm
* first draft: further implementations
2024-04-29 15:00:34 +02:00
maverickstuder
c888746761 RED-7074: Design Subsection section tree structure algorithm
* first draft: further implementations
2024-04-29 15:00:34 +02:00
maverickstuder
7279d0a870 RED-7074: Design Subsection section tree structure algorithm
* first draft
2024-04-29 15:00:34 +02:00
maverickstuder
c84a199f9d RED-7074: Design Subsection section tree structure algorithm
* first draft
2024-04-29 15:00:32 +02:00
maverickstuder
09148960cf RED-7074: Design Subsection section tree structure algorithm
* first draft: further implementations
2024-04-19 11:31:34 +02:00
maverickstuder
77ee8dd5bd RED-7074: Design Subsection section tree structure algorithm
* first draft: further implementations
2024-04-18 17:52:33 +02:00
maverickstuder
e9d1bdc94f RED-7074: Design Subsection section tree structure algorithm
* first draft: further implementations
2024-04-17 14:31:48 +02:00
maverickstuder
894355c7cd RED-7074: Design Subsection section tree structure algorithm
* first draft: further implementations
2024-04-16 12:35:26 +02:00
maverickstuder
ca35feeb63 RED-7074: Design Subsection section tree structure algorithm
* first draft: further implementations
2024-04-15 16:43:40 +02:00
maverickstuder
a32a43fc62 RED-7074: Design Subsection section tree structure algorithm
* first draft
2024-04-10 12:28:42 +02:00
maverickstuder
7f675b41cf RED-7074: Design Subsection section tree structure algorithm
* first draft
2024-04-09 16:53:57 +02:00
42 changed files with 2020 additions and 228 deletions

View File

@ -6,6 +6,7 @@ import java.util.Locale;
public enum NodeType implements Serializable {
DOCUMENT,
SECTION,
SUPER_SECTION,
HEADLINE,
PARAGRAPH,
TABLE,

View File

@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor;
import static java.lang.String.format;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.io.File;
import java.io.IOException;
@ -29,6 +30,11 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineValidationService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TOCEnrichmentService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
@ -45,6 +51,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.RulingClean
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService;
import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
@ -90,12 +97,16 @@ public class LayoutParsingPipeline {
TableExtractionService tableExtractionService;
DocuMineBlockificationService docuMineBlockificationService;
RedactManagerBlockificationService redactManagerBlockificationService;
BlockificationPostprocessingService blockificationPostprocessingService;
DocstrumBlockificationService docstrumBlockificationService;
LayoutGridService layoutGridService;
ObservationRegistry observationRegistry;
VisualLayoutParsingAdapter visualLayoutParsingAdapter;
ClarifyndClassificationService clarifyndClassificationService;
GraphicExtractorService graphicExtractorService;
OutlineExtractorService outlineExtractorService;
OutlineValidationService outlineValidationService;
TOCEnrichmentService tocEnrichmentService;
LayoutparserSettings settings;
@ -105,21 +116,28 @@ public class LayoutParsingPipeline {
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile);
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId())
.orElse(originFile);
VisualLayoutParsingResponse visualLayoutParsingResponse = new VisualLayoutParsingResponse();
if (layoutParsingRequest.visualLayoutParsingFileId().isPresent()) {
visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId().get());
if (layoutParsingRequest.visualLayoutParsingFileId()
.isPresent()) {
visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId()
.get());
}
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
if (layoutParsingRequest.imagesFileStorageId().isPresent()) {
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get());
if (layoutParsingRequest.imagesFileStorageId()
.isPresent()) {
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId()
.get());
}
TableServiceResponse tableServiceResponse = new TableServiceResponse();
if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
if (layoutParsingRequest.tablesFileStorageId()
.isPresent()) {
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId()
.get());
}
ClassificationDocument classificationDocument = parseLayout(settings.getLayoutParsingTypeOverride() == null //
@ -199,15 +217,15 @@ public class LayoutParsingPipeline {
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {
return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
numberOfPages,
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
return format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
numberOfPages,
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
}
@ -222,6 +240,7 @@ public class LayoutParsingPipeline {
PDDocument originDocument = openDocument(originFile);
addNumberOfPagesToTrace(originDocument.getNumberOfPages(), Files.size(originFile.toPath()));
Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);
@ -232,6 +251,12 @@ public class LayoutParsingPipeline {
}
List<ClassificationPage> classificationPages = new ArrayList<>();
OutlineObject lastProcessedOutlineObject = null;
// parsing the structure elements could be useful as well
if(layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) {
classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));
}
long pageCount = originDocument.getNumberOfPages();
@ -277,7 +302,13 @@ public class LayoutParsingPipeline {
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings);
List<Box> graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getTextPositionSequences(), false);
List<Box> graphics = graphicExtractorService.extractPathElementGraphics(originDocument,
pdPage,
pageNumber,
cleanRulings,
stripper.getTextPositionSequences(),
false);
pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>())
.addAll(graphics.stream()
@ -301,6 +332,20 @@ public class LayoutParsingPipeline {
classificationPage.setPageWidth(cropbox.getWidth());
classificationPage.setPageHeight(cropbox.getHeight());
if(layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) {
List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage().getOrDefault(pageNumber - 1, new ArrayList<>());
OutlineObject notFoundOutlineObject = null;
if (lastProcessedOutlineObject != null && !lastProcessedOutlineObject.isFound()) {
lastProcessedOutlineObject.setPoint(new Point2D.Float(0, cropbox.getHeight()));
notFoundOutlineObject = lastProcessedOutlineObject;
}
if (!outlineObjects.isEmpty()) {
classificationPage.setOutlineObjects(outlineObjects);
lastProcessedOutlineObject = blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage, notFoundOutlineObject);
}
}
classificationDocument.getVisualizations().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber);
// MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox.
classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents()));
@ -342,13 +387,22 @@ public class LayoutParsingPipeline {
case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument);
}
List<TextPageBlock> headlines = classificationDocument.getPages()
.stream()
.flatMap(classificationPage -> classificationPage.getTextBlocks()
.stream()
.filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline())
.map(tb -> (TextPageBlock) tb))
.toList();
TableOfContents tableOfContents = outlineValidationService.createToC(headlines);
classificationDocument.setTableOfContents(tableOfContents);
log.info("Building Sections for {}", identifier);
switch (layoutParsingType) {
case CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_PARAGRAPH_DEBUG -> sectionsBuilderService.buildParagraphDebugSections(classificationDocument);
default -> {
sectionsBuilderService.buildSections(classificationDocument);
sectionsBuilderService.addImagesToSections(classificationDocument);
tocEnrichmentService.assignSectionBlocksAndImages(classificationDocument);
}
}

View File

@ -3,6 +3,8 @@ package com.knecon.fforesight.service.layoutparser.processor.model;
import java.util.ArrayList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
@ -28,4 +30,7 @@ public class ClassificationDocument {
private long rulesVersion;
private OutlineObjectTree outlineObjectTree;
private TableOfContents tableOfContents;
}

View File

@ -8,13 +8,13 @@ import java.util.Map;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import lombok.Data;
import lombok.NonNull;
import lombok.RequiredArgsConstructor;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
@Data
@RequiredArgsConstructor
@ -23,6 +23,10 @@ public class ClassificationPage {
@NonNull
private List<AbstractPageBlock> textBlocks;
private List<OutlineObject> outlineObjects = new ArrayList<>();
private List<AbstractPageBlock> headlines = new ArrayList<>();
private List<ClassifiedImage> images = new ArrayList<>();
private Rectangle bodyTextFrame;

View File

@ -12,6 +12,7 @@ import lombok.NoArgsConstructor;
@Data
@NoArgsConstructor
@Deprecated
public class ClassificationSection {
private List<AbstractPageBlock> pageBlocks = new ArrayList<>();

View File

@ -31,6 +31,19 @@ public enum PageBlockType {
}
public static int getHeadlineNumber(PageBlockType pageBlockType) {
return switch (pageBlockType) {
case H1 -> 1;
case H2 -> 2;
case H3 -> 3;
case H4 -> 4;
case H5 -> 5;
default -> 6;
};
}
public boolean isHeadline() {
return this.equals(H1) || this.equals(H2) || this.equals(H3) || this.equals(H4) || this.equals(H5) || this.equals(H6);

View File

@ -8,6 +8,7 @@ import java.util.regex.Pattern;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
@AllArgsConstructor
@ -16,13 +17,15 @@ public class SectionIdentifier {
static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?");
private enum Format {
public enum Format {
EMPTY,
NUMERICAL,
DOCUMENT
}
@Getter
Format format;
@Getter
String identifierString;
List<Integer> identifiers;
boolean asChild;
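
The numericalIdentifierPattern above is what the headline splitting and merging mentioned in the commit messages keys on. A minimal, self-contained sketch of how a numerical headline identifier decomposes with that pattern (hypothetical demo class, not part of the change set):

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Hypothetical demo: decompose a numerical headline identifier with the same
// pattern that SectionIdentifier declares above.
class SectionIdentifierDemo {

    static final Pattern NUMERICAL = Pattern.compile("^[\\s]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?");

    public static void main(String[] args) {
        Matcher matcher = NUMERICAL.matcher("2.3.1 Results");
        List<Integer> identifiers = new ArrayList<>();
        if (matcher.find()) {
            for (int group = 1; group <= matcher.groupCount(); group++) {
                if (matcher.group(group) != null) {
                    identifiers.add(Integer.parseInt(matcher.group(group)));
                }
            }
        }
        System.out.println(identifiers); // [2, 3, 1]: depth 3, nested under section 2.3
    }
}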

View File

@ -140,8 +140,8 @@ public class DocumentTree {
if (treeId.isEmpty()) {
return root;
}
Entry entry = root.children.get(treeId.get(0));
for (int id : treeId.subList(1, treeId.size())) {
Entry entry = root;
for (int id : treeId) {
entry = entry.children.get(id);
}
return entry;
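
The simplified lookup above starts at the root and follows one child index per entry of treeId, so an empty treeId resolves to the root itself. A self-contained illustration with a hypothetical Node class (not the production Entry type):

import java.util.ArrayList;
import java.util.List;

// Hypothetical demo of the treeId lookup: each integer selects one child,
// starting from the root.
class TreeIdLookupDemo {

    static class Node {
        final String name;
        final List<Node> children = new ArrayList<>();
        Node(String name) { this.name = name; }
    }

    static Node resolve(Node root, List<Integer> treeId) {
        Node entry = root;
        for (int id : treeId) {
            entry = entry.children.get(id);
        }
        return entry;
    }

    public static void main(String[] args) {
        Node root = new Node("document");
        Node s0 = new Node("section 0");
        root.children.add(s0);
        s0.children.add(new Node("section 0.0"));
        s0.children.add(new Node("section 0.1"));
        System.out.println(resolve(root, List.of()).name);     // document
        System.out.println(resolve(root, List.of(0, 1)).name); // section 0.1
    }
}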

View File

@ -18,78 +18,20 @@ import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Data
@Builder
@SuperBuilder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Section implements GenericSemanticNode {
@Builder.Default
Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
List<Integer> treeId;
TextBlock textBlock;
@EqualsAndHashCode.Exclude
DocumentTree documentTree;
@Builder.Default
@EqualsAndHashCode.Exclude
Set<RedactionEntity> entities = new HashSet<>();
@EqualsAndHashCode.Exclude
Map<Page, Rectangle2D> bBoxCache;
@Override
public NodeType getType() {
return NodeType.SECTION;
}
public boolean hasTables() {
return streamAllSubNodesOfType(NodeType.TABLE).findAny()
.isPresent();
}
@Override
public TextBlock getTextBlock() {
if (textBlock == null) {
textBlock = GenericSemanticNode.super.getTextBlock();
}
return textBlock;
}
@EqualsAndHashCode(callSuper = true)
public class Section extends SectionNode {
@Override
public String toString() {
return treeId.toString() + ": " + NodeType.SECTION + ": " + this.getTextBlock().buildSummary();
}
public Headline getHeadline() {
return streamChildrenOfType(NodeType.HEADLINE)//
.map(node -> (Headline) node)//
.findFirst()//
.orElseGet(() -> getParent().getHeadline());
}
@Override
public Map<Page, Rectangle2D> getBBox() {
if (bBoxCache == null) {
bBoxCache = GenericSemanticNode.super.getBBox();
}
return bBoxCache;
return super.toString();
}
}

View File

@ -0,0 +1,103 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Data
@SuperBuilder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public abstract class SectionNode implements GenericSemanticNode {
@Builder.Default
Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
List<Integer> treeId;
TextBlock textBlock;
@EqualsAndHashCode.Exclude
DocumentTree documentTree;
@Builder.Default
@EqualsAndHashCode.Exclude
Set<RedactionEntity> entities = new HashSet<>();
@EqualsAndHashCode.Exclude
Map<Page, Rectangle2D> bBoxCache;
@Override
public NodeType getType() {
return NodeType.SECTION;
}
public boolean hasTables() {
return streamAllSubNodesOfType(NodeType.TABLE).findAny()
.isPresent();
}
public boolean isLeafSection() {
return streamAllSubNodesOfType(NodeType.SECTION).findAny()
.isEmpty();
}
@Override
public TextBlock getTextBlock() {
if (textBlock == null) {
textBlock = GenericSemanticNode.super.getTextBlock();
}
return textBlock;
}
@Override
public String toString() {
return treeId.toString() + ": " + NodeType.SECTION + ": " + this.getTextBlock().buildSummary();
}
public Headline getHeadline() {
return streamChildrenOfType(NodeType.HEADLINE)//
.map(node -> (Headline) node)//
.findFirst()//
.orElseGet(() -> getParent().getHeadline());
}
@Override
public Map<Page, Rectangle2D> getBBox() {
if (bBoxCache == null) {
bBoxCache = GenericSemanticNode.super.getBBox();
}
return bBoxCache;
}
}

View File

@ -0,0 +1,40 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.util.List;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.ToString;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
@Data
@SuperBuilder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(callSuper = true)
public class SuperSection extends SectionNode {
@Override
public NodeType getType() {
return NodeType.SUPER_SECTION;
}
@Override
public String toString() {
return super.toString();
}
}

View File

@ -50,14 +50,16 @@ public class ConcatenatedTextBlock implements TextBlock {
public ConcatenatedTextBlock concat(TextBlock textBlock) {
int start = textBlock.getBoundary().start();
int end = textBlock.getBoundary().end();
if (this.atomicTextBlocks.isEmpty()) {
boundary.setStart(textBlock.getBoundary().start());
boundary.setEnd(textBlock.getBoundary().end());
} else if (boundary.end() != textBlock.getBoundary().start()) {
boundary.setStart(start);
boundary.setEnd(end);
} else if (boundary.end() != start) {
throw new UnsupportedOperationException(format("Can only concat consecutive TextBlocks, trying to concat %s and %s", boundary, textBlock.getBoundary()));
}
this.atomicTextBlocks.addAll(textBlock.getAtomicTextBlocks());
boundary.setEnd(textBlock.getBoundary().end());
boundary.setEnd(end);
this.searchText = null;
return this;
}

View File

@ -0,0 +1,209 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.awt.geom.Point2D;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Optional;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.pdmodel.PDDestinationNameTreeNode;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDDestination;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageFitHeightDestination;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageFitRectangleDestination;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageFitWidthDestination;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageXYZDestination;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.springframework.stereotype.Service;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Service
@Slf4j
public class OutlineExtractorService {
private static final String PDDESTINATION_TYPE_FIT = "Fit";
private static final String PDDESTINATION_TYPE_FIT_B = "FitB";
private static final String PDDESTINATION_TYPE_FIT_H = "FitH";
private static final String PDDESTINATION_TYPE_FIT_V = "FitV";
private static final String PDDESTINATION_TYPE_FIT_R = "FitR";
private static final String PDDESTINATION_TYPE_FIT_BH = "FitBH";
private static final String PDDESTINATION_TYPE_FIT_BV = "FitBV";
private static final String PDDESTINATION_TYPE_XYZ = "XYZ";
@SneakyThrows
public OutlineObjectTree getOutlineObjectTree(PDDocument document) {
PDDocumentOutline documentOutline = document.getDocumentCatalog().getDocumentOutline();
List<OutlineObjectTreeNode> rootNodes = new ArrayList<>();
if (documentOutline != null) {
for (PDOutlineItem child : documentOutline.children()) {
Optional<OutlineObjectTreeNode> outlineObjectWithChildren = createOutlineObjectWithChildren(child, document, 1);
outlineObjectWithChildren.ifPresent(rootNodes::add);
}
}
return new OutlineObjectTree(rootNodes);
}
@SneakyThrows
private Optional<OutlineObjectTreeNode> createOutlineObjectWithChildren(PDOutlineItem item, PDDocument document, int depth) {
Optional<OutlineObjectTreeNode> outlineObject = createOutlineObject(item, document, depth);
if (outlineObject.isPresent()) {
for (var child : item.children()) {
Optional<OutlineObjectTreeNode> outlineObjectWithChildren = createOutlineObjectWithChildren(child, document, depth + 1);
outlineObjectWithChildren.ifPresent(outlineObjectTreeNode -> outlineObject.get().addChild(outlineObjectTreeNode));
}
}
return outlineObject;
}
// if the structure elements are processed beforehand, another case can be handled here as well:
// outline objects can reference structure elements (see pdf documentation)
@SneakyThrows
private Optional<OutlineObjectTreeNode> createOutlineObject(PDOutlineItem item, PDDocument document, int depth) {
String title = item.getTitle();
PDPage page = item.findDestinationPage(document);
if (page == null) {
return Optional.empty();
}
int pageNumber = document.getPages().indexOf(page);
Optional<Point2D> outlinePosition = Optional.empty();
try {
PDDocumentNameDictionary names = document.getDocumentCatalog().getNames();
PDDestinationNameTreeNode destinations = null;
if (names != null) {
destinations = names.getDests();
}
PDDestination destination = item.getDestination();
if (destination != null) {
outlinePosition = getLocationFromCOSBase(destinations, destination.getCOSObject());
}
if (outlinePosition.isEmpty()) {
PDAction action = item.getAction();
if (action != null) {
outlinePosition = extractOutlineLocationGoTo(destinations, action.getCOSObject());
}
}
} catch (Exception e) {
log.info(String.format("Error occurred during position resolution for outline item on page %s with title %s: " + e, pageNumber, title));
}
return Optional.of(new OutlineObjectTreeNode(new OutlineObject(title, pageNumber, outlinePosition.orElse(new Point2D.Float(0, 0)), depth)));
}
@SneakyThrows
private static Optional<Point2D> extractOutlineLocationGoTo(PDDestinationNameTreeNode destinations, COSDictionary cosDictionary) {
if (isGoToAction(cosDictionary)) {
COSBase cosBase = cosDictionary.getItem(COSName.D);
return getLocationFromCOSBase(destinations, cosBase);
}
return Optional.empty();
}
private static Optional<Point2D> getLocationFromCOSBase(PDDestinationNameTreeNode destinations, COSBase cosBase) throws IOException {
if (cosBase != null) {
if (cosBase instanceof COSArray cosArray) {
return getLocationFromCosArray(cosArray);
}
if (cosBase instanceof COSString cosString) {
String destinationName = cosString.getString();
COSArray cosArray = destinations.getValue(destinationName).getCOSObject();
return getLocationFromCosArray(cosArray);
}
}
return Optional.empty();
}
private static Optional<Point2D> getLocationFromCosArray(COSArray cosArray) {
boolean located = false;
float x = 0;
float y = 0;
try {
PDDestination destination = PDDestination.create(cosArray);
COSName type = (COSName) cosArray.getObject(1);
String typeString = type.getName();
switch (typeString) {
case PDDESTINATION_TYPE_FIT_V:
case PDDESTINATION_TYPE_FIT_BV:
PDPageFitHeightDestination fitHeightDestination = (PDPageFitHeightDestination) destination;
x = fitHeightDestination.getLeft();
located = true;
break;
case PDDESTINATION_TYPE_FIT_R:
PDPageFitRectangleDestination fitRectangleDestination = (PDPageFitRectangleDestination) destination;
x = fitRectangleDestination.getLeft();
y = fitRectangleDestination.getTop();
located = true;
break;
case PDDESTINATION_TYPE_FIT_H:
case PDDESTINATION_TYPE_FIT_BH:
PDPageFitWidthDestination fitWidthDestination = (PDPageFitWidthDestination) destination;
y = fitWidthDestination.getTop();
located = true;
break;
case PDDESTINATION_TYPE_XYZ:
PDPageXYZDestination xyzDestination = (PDPageXYZDestination) destination;
x = xyzDestination.getLeft();
y = xyzDestination.getTop();
located = true;
break;
case PDDESTINATION_TYPE_FIT:
case PDDESTINATION_TYPE_FIT_B:
default:
}
} catch (IOException e) {
throw new RuntimeException(e);
}
return located ? Optional.of(new Point2D.Float(x, y)) : Optional.empty();
}
private static boolean isGoToAction(COSDictionary cosDictionary) {
return cosDictionary.getNameAsString("S").toLowerCase(Locale.ROOT).equals("goto");
}
}
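
A hedged usage sketch for the service above. It assumes a PDFBox 2.x style PDDocument.load(File) entry point (on PDFBox 3.x the document would be opened via Loader.loadPDF instead), a hypothetical sample.pdf, and that the demo lives next to the outline classes:

import java.io.File;
import org.apache.pdfbox.pdmodel.PDDocument;

// Hypothetical usage: extract the PDF outline and print the flattened
// entries grouped by page, as built by OutlineExtractorService above.
class OutlineExtractionDemo {

    public static void main(String[] args) throws Exception {
        OutlineExtractorService extractor = new OutlineExtractorService();
        try (PDDocument document = PDDocument.load(new File("sample.pdf"))) {
            OutlineObjectTree tree = extractor.getOutlineObjectTree(document);
            tree.getOutlineObjectsPerPage()
                .forEach((page, objects) -> System.out.println("page " + page + ": " + objects));
        }
    }
}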

View File

@ -0,0 +1,35 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.awt.geom.Point2D;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.RequiredArgsConstructor;
@Data
@RequiredArgsConstructor
@AllArgsConstructor
public class OutlineObject {
private final String title;
private final int pageNumber;
private Point2D point;
private final int treeDepth;
private boolean found;
public OutlineObject(String title, int pageNumber, Point2D point2D, int depth) {
this(title, pageNumber, depth);
this.point = point2D;
}
@Override
public String toString() {
return "OutlineObject{" + "title='" + title + '\'' + '}';
}
}

View File

@ -0,0 +1,42 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import lombok.Data;
import lombok.RequiredArgsConstructor;
@Data
@RequiredArgsConstructor
public class OutlineObjectTree {
private List<OutlineObjectTreeNode> rootNodes = new ArrayList<>();
private Map<Integer, List<OutlineObject>> outlineObjectsPerPage = new HashMap<>();
public OutlineObjectTree(List<OutlineObjectTreeNode> rootNodes) {
this.rootNodes = rootNodes;
flattenNodesAndGroupByPage(rootNodes);
}
private void flattenNodesAndGroupByPage(List<OutlineObjectTreeNode> outlineObjectTreeNodes) {
for (OutlineObjectTreeNode node : outlineObjectTreeNodes) {
int pageNumber = node.getOutlineObject().getPageNumber();
if (!this.outlineObjectsPerPage.containsKey(pageNumber)) {
outlineObjectsPerPage.put(pageNumber, new ArrayList<>());
}
outlineObjectsPerPage.get(pageNumber).add(node.getOutlineObject());
if (!node.getChildren().isEmpty()) {
flattenNodesAndGroupByPage(node.getChildren());
}
}
}
}

View File

@ -0,0 +1,34 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.util.ArrayList;
import java.util.List;
import lombok.Data;
@Data
public class OutlineObjectTreeNode {
private OutlineObject outlineObject;
private List<OutlineObjectTreeNode> children = new ArrayList<>();
public OutlineObjectTreeNode(OutlineObject outlineObject) {
this.outlineObject = outlineObject;
}
public void addChild(OutlineObjectTreeNode outlineObject) {
children.add(outlineObject);
}
@Override
public String toString() {
return "OutlineObjectTreeNode{" + "outlineObject=" + outlineObject + '}';
}
}

View File

@ -0,0 +1,59 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.getHeadlineNumber;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import lombok.extern.slf4j.Slf4j;
@Service
@Slf4j
public class OutlineValidationService {
public TableOfContents createToC(List<TextPageBlock> headlines) {
List<TableOfContentItem> mainSections = new ArrayList<>();
Map<Integer, TableOfContentItem> lastItemsPerDepth = new HashMap<>();
TableOfContentItem last = null;
TreeSet<Integer> depths = new TreeSet<>();
for (TextPageBlock current : headlines) {
int currentDepth = getHeadlineNumber(current.getClassification());
Integer parentDepth = depths.floor(currentDepth - 1);
var tocItem = new TableOfContentItem(current);
if (parentDepth == null) {
mainSections.add(tocItem);
} else {
assert last != null;
int lastDepth = getHeadlineNumber(last.getHeadline().getClassification());
if (lastDepth < parentDepth) {
parentDepth = lastDepth;
} else if (lastDepth == currentDepth && last.getParent() != null) {
parentDepth = getHeadlineNumber(last.getParent().getHeadline().getClassification());
}
TableOfContentItem parent = lastItemsPerDepth.get(parentDepth);
parent.addChild(tocItem);
}
last = tocItem;
lastItemsPerDepth.put(currentDepth, tocItem);
depths.add(currentDepth);
}
return new TableOfContents(mainSections);
}
}
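
To make the parent selection above concrete: each headline's parent is the most recent item seen at the closest smaller depth (depths.floor(currentDepth - 1)). A self-contained, simplified re-implementation with plain integers and strings, omitting the adjustment for out-of-order depth sequences (hypothetical demo, not the production classes):

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;

// Simplified illustration of the nesting idea in OutlineValidationService.createToC:
// the parent of a headline is the last item seen at the closest smaller depth.
class TocNestingDemo {

    record Item(int depth, String title, List<Item> children) { }

    static List<Item> nest(List<Integer> headlineDepths, List<String> titles) {
        List<Item> mainSections = new ArrayList<>();
        Map<Integer, Item> lastItemsPerDepth = new HashMap<>();
        TreeSet<Integer> depths = new TreeSet<>();
        for (int i = 0; i < titles.size(); i++) {
            int currentDepth = headlineDepths.get(i);
            Item item = new Item(currentDepth, titles.get(i), new ArrayList<>());
            Integer parentDepth = depths.floor(currentDepth - 1);
            if (parentDepth == null) {
                mainSections.add(item); // no shallower headline seen yet, so top level
            } else {
                lastItemsPerDepth.get(parentDepth).children().add(item);
            }
            lastItemsPerDepth.put(currentDepth, item);
            depths.add(currentDepth);
        }
        return mainSections;
    }

    public static void main(String[] args) {
        // H1 "Intro", H2 "Scope", H3 "Terms", H2 "Overview":
        // Intro gets children Scope and Overview, Scope gets child Terms.
        List<Item> toc = nest(List.of(1, 2, 3, 2), List.of("Intro", "Scope", "Terms", "Overview"));
        toc.forEach(System.out::println);
    }
}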

View File

@ -0,0 +1,261 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
public class TOCEnrichmentService {
public void assignSectionBlocksAndImages(ClassificationDocument document) {
TableOfContents toc = document.getTableOfContents();
Iterator<TableOfContentItem> iterator = toc.iterator();
TableOfContentItem currentTOCItem = null;
if(iterator.hasNext()) {
currentTOCItem = iterator.next();
}
List<AbstractPageBlock> startBlocks = new ArrayList<>();
List<ClassifiedImage> startImages = new ArrayList<>();
TableOfContentItem currentSection = null;
boolean foundFirstHeadline = false;
List<ClassificationHeader> headers = new ArrayList<>();
List<ClassificationFooter> footers = new ArrayList<>();
TablePageBlock previousTable = null;
List<TableOfContentItem> lastFoundTOCItems = new ArrayList<>();
for (ClassificationPage page : document.getPages()) {
List<TableOfContentItem> currentPageTOCItems = new ArrayList<>();
List<TextPageBlock> header = new ArrayList<>();
List<TextPageBlock> footer = new ArrayList<>();
for (AbstractPageBlock current : page.getTextBlocks()) {
if (current.getClassification() == null) {
continue;
}
current.setPage(page.getPageNumber());
if (current.getClassification().equals(PageBlockType.HEADER)) {
header.add((TextPageBlock) current);
continue;
}
if (current.getClassification().equals(PageBlockType.FOOTER)) {
footer.add((TextPageBlock) current);
continue;
}
if (current instanceof TablePageBlock table) {
if (previousTable != null) {
mergeTableMetadata(table, previousTable);
}
previousTable = table;
}
if (current instanceof TextPageBlock && currentTOCItem != null && currentTOCItem.getHeadline().getText().equals(current.getText())) {
if (!foundFirstHeadline) {
foundFirstHeadline = true;
}
currentSection = currentTOCItem;
currentTOCItem.getSectionBlocks().add(current);
currentPageTOCItems.add(currentTOCItem);
if(iterator.hasNext()) {
currentTOCItem = iterator.next();
}
}
if (!foundFirstHeadline) {
startBlocks.add(current);
} else {
currentSection.getSectionBlocks().add(current);
}
}
if (!currentPageTOCItems.isEmpty()) {
lastFoundTOCItems = currentPageTOCItems;
}
for (ClassifiedImage image : page.getImages()) {
Double xMin = null;
Double yMin = null;
Double xMax = null;
Double yMax = null;
for (TableOfContentItem tocItem : lastFoundTOCItems) {
var headline = tocItem.getHeadline();
if (headline.getPage() != page.getPageNumber()) {
continue;
}
if (headline.getMinX() < headline.getMaxX()) {
if (xMin == null || headline.getMinX() < xMin) {
xMin = headline.getMinX();
}
if (xMax == null || headline.getMaxX() > xMax) {
xMax = headline.getMaxX();
}
} else {
if (xMin == null || headline.getMaxX() < xMin) {
xMin = headline.getMaxX();
}
if (xMax == null || headline.getMinX() > xMax) {
xMax = headline.getMinX();
}
}
if (headline.getMinY() < headline.getMaxY()) {
if (yMin == null || headline.getMinY() < yMin) {
yMin = headline.getMinY();
}
if (yMax == null || headline.getMaxY() > yMax) {
yMax = headline.getMaxY();
}
} else {
if (yMin == null || headline.getMaxY() < yMin) {
yMin = headline.getMaxY();
}
if (yMax == null || headline.getMinY() > yMax) {
yMax = headline.getMinY();
}
}
log.debug("Image position x: {}, y: {}", image.getPosition().getX(), image.getPosition().getY());
log.debug("Headline position xMin: {}, xMax: {}, yMin: {}, yMax: {}", xMin, xMax, yMin, yMax);
if (image.getPosition().getX() >= xMin && image.getPosition().getX() <= xMax && image.getPosition().getY() >= yMin && image.getPosition().getY() <= yMax) {
tocItem.getImages().add(image);
image.setAppendedToSection(true);
break;
}
}
if (!image.isAppendedToSection()) {
log.debug("Image uses first paragraph");
if (!lastFoundTOCItems.isEmpty()) {
lastFoundTOCItems.get(0).getImages().add(image);
} else {
startImages.add(image);
}
image.setAppendedToSection(true);
}
}
if (!header.isEmpty()) {
headers.add(new ClassificationHeader(header));
}
if (!footer.isEmpty()) {
footers.add(new ClassificationFooter(footer));
}
}
if (!startBlocks.isEmpty()) {
TableOfContentItem unassigned = new TableOfContentItem(null);
unassigned.setSectionBlocks(startBlocks);
unassigned.setImages(startImages);
document.getTableOfContents().getMainSections().add(0, unassigned);
}
document.setHeaders(headers);
document.setFooters(footers);
}
private void mergeTableMetadata(TablePageBlock currentTable, TablePageBlock previousTable) {
// Distribute header information for subsequent tables
if (previousTable != null && hasInvalidHeaderInformation(currentTable) && hasValidHeaderInformation(previousTable)) {
List<Cell> previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable);
List<Cell> tableNonHeaderRow = getRowWithNonHeaderCells(currentTable);
// Allow merging of tables if header row is separated from first logical non-header row
if (previousTableNonHeaderRow.isEmpty()
&& previousTable.getRowCount() == 1
&& previousTable.getRows()
.get(0).size() == tableNonHeaderRow.size()) {
previousTableNonHeaderRow = previousTable.getRows()
.get(0)
.stream()
.map(cell -> {
Cell fakeCell = Cell.copy(cell);
fakeCell.setHeaderCells(Collections.singletonList(cell));
return fakeCell;
})
.toList();
}
if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) {
for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
List<Cell> row = currentTable.getRows()
.get(i);
if (row.size() == tableNonHeaderRow.size() && row.stream()
.allMatch(cell -> cell.getHeaderCells().isEmpty())) {
for (int j = 0; j < row.size(); j++) {
row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells());
}
}
}
}
}
}
private boolean hasValidHeaderInformation(TablePageBlock table) {
return !hasInvalidHeaderInformation(table);
}
private boolean hasInvalidHeaderInformation(TablePageBlock table) {
return table.getRows()
.stream()
.flatMap(row -> row.stream()
.filter(cell -> !cell.getHeaderCells().isEmpty()))
.findAny().isEmpty();
}
private List<Cell> getRowWithNonHeaderCells(TablePageBlock table) {
for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
List<Cell> row = table.getRows()
.get(i);
if (row.size() == 1) {
continue;
}
boolean allNonHeader = true;
for (Cell cell : row) {
if (cell.isHeaderCell()) {
allNonHeader = false;
break;
}
}
if (allNonHeader) {
return row;
}
}
return Collections.emptyList();
}
}
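
The core of assignSectionBlocksAndImages above is a single pass over the page blocks: a text block whose text equals the current ToC headline advances the ToC cursor and becomes the current section, every other block is attached to the current section, and anything before the first matched headline later ends up in an "unassigned" item at index 0. A simplified, self-contained sketch of that matching loop with plain strings standing in for page blocks (image assignment by headline bounding box, table metadata merging, and header/footer handling are omitted; hypothetical demo only):

import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

// Simplified illustration of the block-to-section assignment in TOCEnrichmentService.
class TocAssignmentDemo {

    public static void main(String[] args) {
        List<String> tocHeadlines = List.of("1 Introduction", "2 Methods");
        List<String> blocks = List.of("Title page", "1 Introduction", "Some intro text.",
                "2 Methods", "Methods text.");

        Map<String, List<String>> sections = new LinkedHashMap<>();
        List<String> startBlocks = new ArrayList<>();              // blocks before the first headline
        Iterator<String> toc = tocHeadlines.iterator();
        String currentHeadline = toc.hasNext() ? toc.next() : null;
        String currentSection = null;

        for (String block : blocks) {
            if (currentHeadline != null && currentHeadline.equals(block)) {
                currentSection = currentHeadline;                  // headline found: new current section
                sections.put(currentSection, new ArrayList<>(List.of(block)));
                currentHeadline = toc.hasNext() ? toc.next() : null;
                continue;
            }
            if (currentSection == null) {
                startBlocks.add(block);                            // later becomes the "unassigned" ToC item
            } else {
                sections.get(currentSection).add(block);
            }
        }
        System.out.println("unassigned: " + startBlocks);
        System.out.println(sections);
    }
}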

View File

@ -0,0 +1,110 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SectionNode;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import lombok.Data;
import lombok.EqualsAndHashCode;
@Data
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class TableOfContentItem {
@EqualsAndHashCode.Include
private TextPageBlock headline;
private List<TableOfContentItem> children = new ArrayList<>();
private TableOfContentItem parent;
private List<AbstractPageBlock> sectionBlocks = new ArrayList<>();
private List<ClassifiedImage> images = new ArrayList<>();
private SectionNode section;
public TableOfContentItem(TextPageBlock headline) {
this.headline = headline;
}
public void addChild(TableOfContentItem tableOfContentItem) {
children.add(tableOfContentItem);
tableOfContentItem.setParent(this);
}
public TableOfContentItem getSiblingBefore() {
if (parent != null) {
int index = parent.getChildren().indexOf(this);
if (index > 0) {
return parent.getChildren()
.get(index - 1);
}
}
return null;
}
public TableOfContentItem getSiblingAfter() {
if (parent != null) {
int index = parent.getChildren().indexOf(this);
if (index >= 0 && index < parent.getChildren().size() - 1) {
return parent.getChildren()
.get(index + 1);
}
}
return null;
}
public boolean contains(TextPageBlock block) {
if (headline.equals(block)) {
return true;
}
for (TableOfContentItem child : children) {
if (child.contains(block)) {
return true;
}
}
return false;
}
public boolean contains(TableOfContentItem tocItem) {
if (this.equals(tocItem)) {
return true;
}
for (TableOfContentItem child : children) {
if (child.contains(tocItem)) {
return true;
}
}
return false;
}
public List<AbstractPageBlock> getNonEmptySectionBlocks() {
return sectionBlocks.stream().filter(pageBlock -> !pageBlock.isEmpty()).collect(Collectors.toList());
}
@Override
public String toString() {
return "OutlineObjectTreeNode{" + "textPageBlock=" + headline + '}';
}
}

View File

@ -0,0 +1,136 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Stack;
import org.springframework.lang.NonNull;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import lombok.Data;
import lombok.RequiredArgsConstructor;
@Data
@RequiredArgsConstructor
public class TableOfContents implements Iterable<TableOfContentItem> {
private List<TableOfContentItem> mainSections = new ArrayList<>();
public TableOfContents(List<TableOfContentItem> mainSections) {
this.mainSections = mainSections;
}
public List<TextPageBlock> getAllTextPageBlocks() {
List<TextPageBlock> allTextPageBlocks = new ArrayList<>();
for (TableOfContentItem item : mainSections) {
collectTextPageBlocks(item, allTextPageBlocks);
}
return allTextPageBlocks;
}
private void collectTextPageBlocks(TableOfContentItem item, List<TextPageBlock> textPageBlocks) {
textPageBlocks.add(item.getHeadline());
for (TableOfContentItem child : item.getChildren()) {
collectTextPageBlocks(child, textPageBlocks);
}
}
public List<TableOfContentItem> getAllTableOfContentItems() {
List<TableOfContentItem> allItems = new ArrayList<>();
for (TableOfContentItem item : mainSections) {
collectTableOfContentItems(item, allItems);
}
return allItems;
}
private void collectTableOfContentItems(TableOfContentItem item, List<TableOfContentItem> allItems) {
allItems.add(item);
for (TableOfContentItem child : item.getChildren()) {
collectTableOfContentItems(child, allItems);
}
}
private boolean containsBlock(TextPageBlock block) {
for (TableOfContentItem existingItem : this.getMainSections()) {
if (existingItem.getHeadline().equals(block) || existingItem.contains(block)) {
return true;
}
}
return false;
}
private boolean containsItem(TableOfContentItem tocItem) {
for (TableOfContentItem existingItem : this.getMainSections()) {
if (existingItem.equals(tocItem) || existingItem.contains(tocItem)) {
return true;
}
}
return false;
}
@Override
public @NonNull Iterator<TableOfContentItem> iterator() {
return new TableOfContentItemIterator(mainSections);
}
private static class TableOfContentItemIterator implements Iterator<TableOfContentItem> {
private final Stack<Iterator<TableOfContentItem>> stack = new Stack<>();
TableOfContentItemIterator(List<TableOfContentItem> mainSections) {
stack.push(mainSections.iterator());
}
@Override
public boolean hasNext() {
ensureStackTopIsCurrent();
return !stack.isEmpty() && stack.peek().hasNext();
}
@Override
public TableOfContentItem next() {
ensureStackTopIsCurrent();
TableOfContentItem currentItem = stack.peek().next();
if (currentItem.getChildren() != null && !currentItem.getChildren().isEmpty()) {
stack.push(currentItem.getChildren()
.iterator());
}
return currentItem;
}
private void ensureStackTopIsCurrent() {
while (!stack.isEmpty() && !stack.peek().hasNext()) {
stack.pop();
}
}
}
}
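
The iterator above walks the table of contents in pre-order, a parent before its children, by keeping a stack of child iterators. A self-contained illustration of the same traversal pattern with hypothetical String-labelled nodes:

import java.util.ArrayDeque;
import java.util.Deque;
import java.util.Iterator;
import java.util.List;

// Self-contained sketch of the stack-of-iterators pre-order traversal used by
// TableOfContentItemIterator above (hypothetical Node type).
class PreOrderIteratorDemo {

    record Node(String label, List<Node> children) { }

    static Iterator<Node> preOrder(List<Node> roots) {
        Deque<Iterator<Node>> stack = new ArrayDeque<>();
        stack.push(roots.iterator());
        return new Iterator<>() {
            private void ensureTop() {
                while (!stack.isEmpty() && !stack.peek().hasNext()) {
                    stack.pop();                                  // drop exhausted levels
                }
            }
            @Override public boolean hasNext() { ensureTop(); return !stack.isEmpty(); }
            @Override public Node next() {
                ensureTop();
                Node current = stack.peek().next();
                if (!current.children().isEmpty()) {
                    stack.push(current.children().iterator());    // descend before siblings
                }
                return current;
            }
        };
    }

    public static void main(String[] args) {
        Node first = new Node("1", List.of(new Node("1.1", List.of()), new Node("1.2", List.of())));
        preOrder(List.of(first, new Node("2", List.of())))
                .forEachRemaining(node -> System.out.println(node.label())); // 1, 1.1, 1.2, 2
    }
}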

View File

@ -58,6 +58,20 @@ public class TextPageBlock extends AbstractPageBlock {
}
@JsonIgnore
public float getPageHeight() {
return sequences.get(0).getPageHeight();
}
@JsonIgnore
public float getPageWidth() {
return sequences.get(0).getPageWidth();
}
private void calculateBBox() {
if (sequences == null) {
@ -69,6 +83,12 @@ public class TextPageBlock extends AbstractPageBlock {
}
public void recalculateBBox() {
calculateBBox();
}
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
if (textBlocksToMerge.isEmpty()) {

View File

@ -27,8 +27,10 @@ import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@Deprecated
public class SectionsBuilderService {
public void buildSections(ClassificationDocument document) {
List<AbstractPageBlock> chunkWords = new ArrayList<>();

View File

@ -0,0 +1,525 @@
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
import static com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService.buildTextBlock;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.List;
import java.util.ListIterator;
import java.util.Locale;
import java.util.function.Function;
import org.apache.commons.lang3.StringUtils;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentifier;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.Data;
@Service
public class BlockificationPostprocessingService {
private static final float BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD = 5.0f;
private static final Function<TextPageBlock, Rectangle2D> blockToBoundingBox = (abstractPageBlock) -> abstractPageBlock.getSequences()
.stream()
.map(textPositionSequence -> textPositionSequence.getTextPositions()
.stream()
.map(tp -> SearchTextWithTextPositionFactory.mapRedTextPositionToInitialUserSpace(tp, textPositionSequence))
.collect(RectangleTransformations.collectBBox()))
.collect(RectangleTransformations.collectBBox());
public OutlineObject sanitizeOutlineBlocks(ClassificationPage classificationPage, OutlineObject notFoundOutlineObject) {
List<OutlineObject> outlineObjects = classificationPage.getOutlineObjects();
if (getTextPageBlocks(classificationPage).isEmpty() || outlineObjects.isEmpty()) {
return null;
}
float pageHeight = classificationPage.getPageHeight();
ListIterator<OutlineObject> outlineObjectListIterator = outlineObjects.listIterator();
if (notFoundOutlineObject != null) {
OutlineProcessionContext notFoundOutlineObjectProcessionContext = new OutlineProcessionContext(notFoundOutlineObject);
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, notFoundOutlineObjectProcessionContext);
OutlineObject firstOutlineObject = null;
OutlineProcessionContext firstOutlineObjectProcessionContext = null;
if (outlineObjectListIterator.hasNext()) {
firstOutlineObject = outlineObjectListIterator.next();
firstOutlineObjectProcessionContext = new OutlineProcessionContext(firstOutlineObject);
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, firstOutlineObjectProcessionContext);
}
if (!contextsOverlap(notFoundOutlineObjectProcessionContext, firstOutlineObjectProcessionContext)) {
notFoundOutlineObject.setFound(selectMatch(classificationPage, notFoundOutlineObjectProcessionContext));
}
if (firstOutlineObject != null) {
// re-create the context for the updated blocks
firstOutlineObjectProcessionContext = new OutlineProcessionContext(firstOutlineObject);
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, firstOutlineObjectProcessionContext);
firstOutlineObject.setFound(selectMatch(classificationPage, firstOutlineObjectProcessionContext));
}
}
outlineObjectListIterator.forEachRemaining(outlineObject -> {
OutlineProcessionContext outlineObjectProcessionContext = new OutlineProcessionContext(outlineObject);
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, outlineObjectProcessionContext);
outlineObject.setFound(selectMatch(classificationPage, outlineObjectProcessionContext));
});
if (!outlineObjects.isEmpty()) {
return outlineObjects.get(outlineObjects.size() - 1);
} else {
return notFoundOutlineObject;
}
}
private static List<TextPageBlock> getTextPageBlocks(ClassificationPage classificationPage) {
return classificationPage.getTextBlocks()
.stream()
.filter(block -> block instanceof TextPageBlock)
.map(block -> (TextPageBlock) block)
.toList();
}
private boolean contextsOverlap(OutlineProcessionContext notFoundOutlineObjectProcessionContext, OutlineProcessionContext firstOutlineObjectProcessionContext) {
if (firstOutlineObjectProcessionContext == null) {
return false;
}
String notFoundTitle = notFoundOutlineObjectProcessionContext.getOutlineObject().getTitle();
String firstTitle = firstOutlineObjectProcessionContext.getOutlineObject().getTitle();
if (!firstTitle.startsWith(notFoundTitle)) {
return false;
}
var blocksOfNotFoundOutline = getAllMatchingBlocks(notFoundOutlineObjectProcessionContext);
var blocksOfFirstOutline = getAllMatchingBlocks(firstOutlineObjectProcessionContext);
double maxYFirst = blocksOfFirstOutline.stream()
.mapToDouble(TextPageBlock::getPdfMaxY)
.max()
.orElse(Double.NEGATIVE_INFINITY);
return blocksOfNotFoundOutline.stream()
.mapToDouble(TextPageBlock::getPdfMaxY)
.anyMatch(y -> y >= maxYFirst);
}
private List<TextPageBlock> getAllMatchingBlocks(OutlineProcessionContext context) {
List<TextPageBlock> blocks = new ArrayList<>();
if (context.getDirectMatch() != null) {
blocks.add(context.getDirectMatch());
}
if (context.getSplitCandidate() != null) {
blocks.add(context.getSplitCandidate());
}
blocks.addAll(context.getMergeCandidates());
return blocks;
}
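/**
 * Skips text blocks lying entirely above the outline's anchor point (allowing a small threshold), then
 * scans the remaining blocks for direct, merge and split candidates, stopping early once a direct match
 * is found.
 */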
private void processTextBlocks(List<TextPageBlock> textBlocks, float pageHeight, OutlineProcessionContext context) {
OutlineObject outlineObject = context.getOutlineObject();
ListIterator<TextPageBlock> iterator = textBlocks.listIterator();
while (iterator.hasNext()) {
TextPageBlock pageBlock = iterator.next();
if (pageHeight - outlineObject.getPoint().getY() - BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD <= pageBlock.getMaxY()) {
break;
}
}
if (iterator.hasPrevious()) {
iterator.previous();
}
boolean earlyStop = false;
while (iterator.hasNext() && !earlyStop) {
TextPageBlock pageBlock = iterator.next();
earlyStop = processOutlineForTextBlock(pageBlock, context);
}
}
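/**
 * Picks the candidate closest to the outline anchor: a direct match is classified as a headline, a split
 * candidate is reduced to the headline text via splitBlock, and the best merge combination is merged into
 * a single headline block. Returns {@code true} if any match was applied.
 */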
private boolean selectMatch(ClassificationPage classificationPage, OutlineProcessionContext context) {
OutlineObject outlineObject = context.outlineObject;
TextPageBlock directMatch = context.directMatch;
List<TextPageBlock> mergeCandidates = context.mergeCandidates;
TextPageBlock splitCandidate = context.splitCandidate;
PageBlockType headlineType = PageBlockType.getHeadlineType(outlineObject.getTreeDepth());
double distanceToDirectMatch = directMatch != null ? calculateDistance(outlineObject, directMatch) : Double.MAX_VALUE;
double distanceToSplitCandidate = splitCandidate != null ? calculateDistance(outlineObject, splitCandidate) : Double.MAX_VALUE;
double distanceToBestMergeCandidates = Double.MAX_VALUE;
List<TextPageBlock> bestMergeCandidateCombination = new ArrayList<>();
if (!mergeCandidates.isEmpty()) {
// With this code, the blocks adjacent to the first and last merge candidate would also be added; this could be useful for some edge cases:
//List<TextPageBlock> allMergeCandidates = new ArrayList<>(mergeCandidates);
//addNeighborsOfCandidate(kdTree, mergeCandidates.get(0), allMergeCandidates);
//if (mergeCandidates.size() > 1) {
// addNeighborsOfCandidate(kdTree, mergeCandidates.get(mergeCandidates.size() - 1), allMergeCandidates);
//}
//allMergeCandidates = allMergeCandidates.stream()
// .distinct()
// .toList();
List<List<TextPageBlock>> combinations = findCombinations(outlineObject.getTitle(), mergeCandidates);
for (List<TextPageBlock> combination : combinations) {
double averageDistance = combination.stream()
.map(block -> calculateDistance(outlineObject, block))
.mapToDouble(Double::doubleValue).average()
.orElse(Double.MAX_VALUE);
if (distanceToBestMergeCandidates > averageDistance) {
distanceToBestMergeCandidates = averageDistance;
bestMergeCandidateCombination = combination;
}
}
}
double minDistance = Math.min(distanceToDirectMatch, Math.min(distanceToSplitCandidate, distanceToBestMergeCandidates));
if (minDistance == Double.MAX_VALUE) {
return false;
}
if (minDistance == distanceToDirectMatch) {
directMatch.setClassification(headlineType);
} else if (minDistance == distanceToSplitCandidate) {
SplitBlockResult splitBlockResult = splitBlock(classificationPage, splitCandidate, context.sectionIdentifier, outlineObject.getTitle());
if (splitBlockResult.modifiedBlockToSplit) {
splitCandidate.setClassification(headlineType);
}
splitBlockResult.otherBlocks.forEach(other -> other.setClassification(null));
} else {
var merged = mergeBlocks(classificationPage, bestMergeCandidateCombination);
merged.setClassification(headlineType);
}
return true;
}
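/**
 * Splits the candidate block so that it keeps only the headline text (prefixed with the section identifier
 * when the title does not already start with it). Text before and after the headline is moved into new
 * blocks that are inserted into the page's block list around the candidate.
 */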
private SplitBlockResult splitBlock(ClassificationPage classificationPage, TextPageBlock blockToSplit, SectionIdentifier sectionIdentifier, String title) {
List<TextPageBlock> otherBlocks = new ArrayList<>();
int blockToSplitIdx = classificationPage.getTextBlocks().indexOf(blockToSplit);
String headline = title;
if (!sectionIdentifier.getFormat().equals(SectionIdentifier.Format.EMPTY) && !title.startsWith(sectionIdentifier.getIdentifierString())) {
headline = sectionIdentifier + headline;
}
WordSequenceResult wordSequenceResult = findWordSequence(blockToSplit.getSequences(), headline);
if (wordSequenceResult.inSequence.isEmpty() && !headline.equals(title)) {
wordSequenceResult = findWordSequence(blockToSplit.getSequences(), title);
}
boolean modifiedBlockToSplit = false;
if (!wordSequenceResult.inSequence.isEmpty()) {
blockToSplit.setSequences(wordSequenceResult.inSequence);
blockToSplit.recalculateBBox();
modifiedBlockToSplit = true;
}
if (!wordSequenceResult.preSequence.isEmpty()) {
TextPageBlock block = buildTextBlock(wordSequenceResult.preSequence, 0);
classificationPage.getTextBlocks().add(blockToSplitIdx, block);
otherBlocks.add(block);
blockToSplitIdx++;
}
if (!wordSequenceResult.postSequence.isEmpty()) {
TextPageBlock block = buildTextBlock(wordSequenceResult.postSequence, 0);
classificationPage.getTextBlocks().add(blockToSplitIdx + 1, block);
otherBlocks.add(block);
}
return new SplitBlockResult(modifiedBlockToSplit, otherBlocks);
}
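/**
 * Walks the block's text position sequences, accumulating their sanitized text until the target text is
 * found, and returns the sequences forming the target (inSequence) plus everything before (preSequence)
 * and after (postSequence) it; an empty result is returned when the target does not occur.
 * Illustrative (hypothetical) example: sequences ["Chapter", "1", "Overview", "of", "results"] searched
 * for the text "1 Overview" yield inSequence = ["1", "Overview"], preSequence = ["Chapter"],
 * postSequence = ["of", "results"].
 */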
private static WordSequenceResult findWordSequence(List<TextPositionSequence> textPositionSequences, String text) {
String target = sanitizeString(text);
List<TextPositionSequence> inSequence = new ArrayList<>();
List<TextPositionSequence> preSequence = new ArrayList<>();
List<TextPositionSequence> postSequence = new ArrayList<>();
StringBuilder currentSequence = new StringBuilder();
for (TextPositionSequence sequence : textPositionSequences) {
currentSequence.append(sanitizeString(sequence.toString()));
inSequence.add(sequence);
if (currentSequence.length() >= target.length()) {
if (currentSequence.toString().endsWith(target)) {
int index = 0;
String toRemove = currentSequence.substring(0, currentSequence.length() - target.length());
TextPositionSequence next = inSequence.get(index);
while (currentSequence.length() - next.length() >= target.length()) {
TextPositionSequence removed = inSequence.remove(index);
currentSequence.delete(0, removed.toString().length());
preSequence.add(removed);
next = inSequence.get(index);
toRemove = toRemove.substring(removed.length());
}
if (!toRemove.isEmpty()) {
SplitSequenceResult splitSequenceResult = splitSequence(inSequence.remove(index), toRemove);
currentSequence.delete(0, splitSequenceResult.out.length());
preSequence.add(splitSequenceResult.out);
inSequence.add(index, splitSequenceResult.in);
}
} else if (currentSequence.toString().startsWith(target)) {
int index = inSequence.size() - 1;
String toRemove = currentSequence.substring(target.length());
SplitSequenceResult splitSequenceResult = splitSequence(inSequence.remove(index), toRemove);
currentSequence.delete(currentSequence.length() - splitSequenceResult.out.length(), currentSequence.length());
inSequence.add(index, splitSequenceResult.in);
postSequence.add(splitSequenceResult.out);
}
if (currentSequence.toString().equals(target)) {
postSequence.addAll(textPositionSequences.subList(textPositionSequences.indexOf(sequence) + 1, textPositionSequences.size()));
return new WordSequenceResult(inSequence, preSequence, postSequence);
}
}
}
return new WordSequenceResult();
}
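/**
 * Splits a single sequence into the part matching {@code toRemove} ({@code out}) and the remaining part
 * ({@code in}); character indices into the sequence text are used directly as indices into its list of
 * text positions.
 */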
private static SplitSequenceResult splitSequence(TextPositionSequence sequence, String toRemove) {
TextPositionSequence in = null;
TextPositionSequence out;
String currentSequence = sequence.toString();
int index = currentSequence.indexOf(toRemove);
int endIndex = index + toRemove.length();
out = createSubSequence(sequence, index, endIndex);
if (index > 0) {
in = createSubSequence(sequence, 0, index);
} else if (endIndex < sequence.getTextPositions().size()) {
in = createSubSequence(sequence, endIndex, sequence.getTextPositions().size());
}
return new SplitSequenceResult(in, out);
}
private static TextPositionSequence createSubSequence(TextPositionSequence sequence, int start, int end) {
TextPositionSequence newSeq = new TextPositionSequence(new ArrayList<>(sequence.getTextPositions().subList(start, end)), sequence.getPage());
newSeq.setParagraphStart(sequence.isParagraphStart());
return newSeq;
}
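/**
 * Appends the sequences of all blocks sharing the first block's text direction to the first block,
 * recalculates its bounding box and removes the merged blocks from the page. Returns the first block.
 */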
private TextPageBlock mergeBlocks(ClassificationPage classificationPage, List<TextPageBlock> blocksToMerge) {
TextPageBlock firstBlock = blocksToMerge.get(0);
if (blocksToMerge.size() > 1) {
List<TextPageBlock> mergedBlocks = new ArrayList<>();
for (TextPageBlock textPageBlock : blocksToMerge.subList(1, blocksToMerge.size())) {
if (firstBlock != null && !firstBlock.getSequences().isEmpty()) {
if (textPageBlock.getDir() == firstBlock.getDir()) {
firstBlock.getSequences().addAll(textPageBlock.getSequences());
mergedBlocks.add(textPageBlock);
}
}
}
assert firstBlock != null;
firstBlock.setToDuplicate(false);
firstBlock.recalculateBBox();
classificationPage.getTextBlocks().removeAll(mergedBlocks);
}
return firstBlock;
}
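/**
 * Enumerates every ordered combination of candidate blocks whose concatenated, sanitized texts spell out
 * the given title exactly; each combination is a potential set of blocks to merge into one headline.
 */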
private static List<List<TextPageBlock>> findCombinations(String title, List<TextPageBlock> blocks) {
List<List<TextPageBlock>> combinations = new ArrayList<>();
findCombinations(title, blocks, new ArrayList<>(), combinations);
return combinations;
}
private static void findCombinations(String title, List<TextPageBlock> blocks, List<TextPageBlock> current, List<List<TextPageBlock>> combinations) {
String target = sanitizeString(title);
if (target.isEmpty()) {
combinations.add(new ArrayList<>(current));
return;
}
List<TextPageBlock> remaining = blocks.stream()
.filter(block -> !current.contains(block))
.toList();
for (TextPageBlock block : remaining) {
String prefix = sanitizeString(block.getText());
if (target.startsWith(prefix)) {
current.add(block);
findCombinations(target.substring(prefix.length()), blocks.subList(blocks.indexOf(block) + 1, blocks.size()), current, combinations);
current.remove(current.size() - 1);
}
}
}
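/**
 * Euclidean distance between the outline anchor point and the block's top-left corner, with the outline
 * y-coordinate converted from PDF (bottom-origin) to top-origin page coordinates.
 */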
private double calculateDistance(OutlineObject outlineObject, TextPageBlock pageBlock) {
double deltaX = outlineObject.getPoint().getX() - pageBlock.getMinX();
double deltaY = pageBlock.getPageHeight() - outlineObject.getPoint().getY() - pageBlock.getMinY();
return Math.sqrt(deltaX * deltaX + deltaY * deltaY);
}
// Currently only three cases are handled here:
// 1. equality
// 2. the outline title contains the block text
// 3. the block text contains the outline title
// Another possible case is an intersection, i.e. a title split across two different blocks.
// This should not happen with how Docstrum creates the blocks;
// if it ever becomes necessary, a split with a follow-up merge would have to be performed.
private boolean processOutlineForTextBlock(TextPageBlock pageBlock, OutlineProcessionContext context) {
OutlineObject outlineObject = context.getOutlineObject();
String blockText = sanitizeString(pageBlock.getText());
String outlineTitle = sanitizeString(outlineObject.getTitle());
boolean blockTextContainsOutlineTitle = blockText.contains(outlineTitle);
boolean outlineTitleContainsBlockText = outlineTitle.contains(blockText);
if (!blockTextContainsOutlineTitle && !outlineTitleContainsBlockText) {
return false;
}
if (blockText.equals(outlineTitle) && context.directMatch == null) {
context.directMatch = pageBlock;
return true;
}
if (outlineTitleContainsBlockText) {
context.mergeCandidates.add(pageBlock);
}
if (blockTextContainsOutlineTitle) {
SectionIdentifier sectionIdentifier = SectionIdentifier.fromSearchText(blockText);
if (sectionIdentifier.getFormat() != SectionIdentifier.Format.EMPTY && !outlineTitle.startsWith(sectionIdentifier.getIdentifierString())) {
if (blockText.startsWith(sectionIdentifier.getIdentifierString()) && blockText.endsWith(outlineTitle) && context.directMatch == null) {
context.directMatch = pageBlock;
return true;
} else if (context.splitCandidate == null) {
context.sectionIdentifier = sectionIdentifier;
}
}
if (context.splitCandidate == null) {
context.splitCandidate = pageBlock;
}
}
return false;
}
private static String sanitizeString(String text) {
return StringUtils.deleteWhitespace(text).toLowerCase(Locale.ROOT);
}
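/**
 * Per outline entry, collects the candidate text blocks found on the page: an exact match (directMatch),
 * blocks whose text is contained in the title (mergeCandidates) and a block containing the title
 * (splitCandidate).
 */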
@Data
private static class OutlineProcessionContext {
private TextPageBlock directMatch;
private OutlineObject outlineObject;
private List<TextPageBlock> mergeCandidates;
private TextPageBlock splitCandidate;
private SectionIdentifier sectionIdentifier;
OutlineProcessionContext(OutlineObject outlineObject) {
this.outlineObject = outlineObject;
this.directMatch = null;
this.mergeCandidates = new ArrayList<>();
this.splitCandidate = null;
this.sectionIdentifier = SectionIdentifier.empty();
}
}
public static class WordSequenceResult {
public List<TextPositionSequence> inSequence;
public List<TextPositionSequence> preSequence;
public List<TextPositionSequence> postSequence;
public WordSequenceResult(List<TextPositionSequence> inSequence, List<TextPositionSequence> preSequence, List<TextPositionSequence> postSequence) {
this.inSequence = inSequence;
this.preSequence = preSequence;
this.postSequence = postSequence;
}
public WordSequenceResult() {
this.inSequence = new ArrayList<>();
this.preSequence = new ArrayList<>();
this.postSequence = new ArrayList<>();
}
}
public record SplitBlockResult(boolean modifiedBlockToSplit, List<TextPageBlock> otherBlocks) {
}
public record SplitSequenceResult(TextPositionSequence in, TextPositionSequence out) {
}
}

View File

@ -126,6 +126,16 @@ public class DocstrumBlockificationService {
continue;
}
if (current.isHeadline() || previous.isHeadline()) {
if (intersectsYWithPreviousHavingMaxOneLine(previous, current, page)) {
previous = combineBlocksAndResetIterator(previous, current, itty, false);
} else {
previous = current;
}
continue;
}
if (areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(previous, current, page)) {
previous = combineBlocksAndResetIterator(previous, current, itty, true);
continue;
@ -172,6 +182,12 @@ public class DocstrumBlockificationService {
}
private boolean intersectsYWithPreviousHavingMaxOneLine(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {
return previous.intersectsY(current) && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1);
}
private boolean areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {
return previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 //
@ -185,6 +201,9 @@ public class DocstrumBlockificationService {
previous.getSequences().addAll(current.getSequences());
previous = buildTextBlock(previous.getSequences(), 0);
previous.setToDuplicate(toDuplicate);
if (current.getClassification() != null && previous.getClassification() == null) {
previous.setClassification(current.getClassification());
}
itty.remove();
itty.previous();
itty.set(previous);
@ -244,21 +263,30 @@ public class DocstrumBlockificationService {
continue;
}
if (block.getClassification() != null && block.getClassification().isHeadline()) {
continue;
}
TextPageBlock current = (TextPageBlock) block;
for (int i = 0; i < blocks.size(); i++) {
if (blocks.get(i) == null) {
AbstractPageBlock abstractPageBlock = blocks.get(i);
if (abstractPageBlock == null) {
continue;
}
if (blocks.get(i) == current) {
if (abstractPageBlock == current) {
continue;
}
if (blocks.get(i) instanceof TablePageBlock) {
if (abstractPageBlock instanceof TablePageBlock) {
continue;
}
TextPageBlock inner = (TextPageBlock) blocks.get(i);
if (abstractPageBlock.getClassification() != null && abstractPageBlock.getClassification().isHeadline()) {
continue;
}
TextPageBlock inner = (TextPageBlock) abstractPageBlock;
if (usedRulings.lineBetween(current, blocks.get(i))) {
continue;
@ -285,7 +313,7 @@ public class DocstrumBlockificationService {
}
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
public static TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
return new TextPageBlock(wordBlockList);
}

View File

@ -21,12 +21,16 @@ import lombok.extern.slf4j.Slf4j;
@RequiredArgsConstructor
public class ClarifyndClassificationService {
private final HeadlineClassificationService headlineClassificationService;
public void classifyDocument(ClassificationDocument document) {
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
headlineClassificationService.resetContext();
for (ClassificationPage page : document.getPages()) {
classifyPage(page, document, headlineFontSizes);
}
@ -47,6 +51,10 @@ public class ClarifyndClassificationService {
var bodyTextFrame = page.getBodyTextFrame();
if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
return;
}
if (document.getFontSizeCounter().getMostPopular() == null) {
textBlock.setClassification(PageBlockType.PARAGRAPH);
return;
@ -79,7 +87,8 @@ public class ClarifyndClassificationService {
for (int i = 1; i <= headlineFontSizes.size(); i++) {
if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
textBlock.setClassification(PageBlockType.getHeadlineType(i));
PageBlockType headlineType = PageBlockType.getHeadlineType(i);
headlineClassificationService.classifyHeadline(textBlock, headlineType);
document.setHeadlines(true);
}
}
@ -89,7 +98,8 @@ public class ClarifyndClassificationService {
.getTextPositions()
.get(0)
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1));
PageBlockType headlineType = PageBlockType.getHeadlineType(headlineFontSizes.size() + 1);
headlineClassificationService.classifyHeadline(textBlock, headlineType);
document.setHeadlines(true);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {

View File

@ -24,6 +24,7 @@ import lombok.extern.slf4j.Slf4j;
@RequiredArgsConstructor
public class DocuMineClassificationService {
private final HeadlineClassificationService headlineClassificationService;
private static final Pattern pattern = Pattern.compile("^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
private static final Pattern pattern2 = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
private static final Pattern pattern3 = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
@ -35,6 +36,8 @@ public class DocuMineClassificationService {
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
headlineClassificationService.resetContext();
for (ClassificationPage page : document.getPages()) {
classifyPage(page, document, headlineFontSizes);
}
@ -60,6 +63,10 @@ public class DocuMineClassificationService {
Matcher matcher2 = pattern2.matcher(textBlock.toString());
Matcher matcher3 = pattern3.matcher(textBlock.toString());
if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
return;
}
if (document.getFontSizeCounter().getMostPopular() == null) {
textBlock.setClassification(PageBlockType.OTHER);
return;
@ -95,6 +102,7 @@ public class DocuMineClassificationService {
&& (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter().getMostPopular()
|| textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular())
&& PositionUtils.getApproxLineCount(textBlock) < 5.9
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && matcher2.reset().find() && !textBlock.toString()
.contains(":")
|| textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && matcher2.reset().find() && !textBlock.toString().contains(":")
@ -103,11 +111,13 @@ public class DocuMineClassificationService {
|| textBlock.toString().startsWith("TABLE"))
&& !textBlock.toString().endsWith(":")
&& matcher2.reset().find()) {
textBlock.setClassification(PageBlockType.getHeadlineType(1));
PageBlockType headlineType = PageBlockType.getHeadlineType(1);
headlineClassificationService.classifyHeadline(textBlock, headlineType);
document.setHeadlines(true);
} else if (matcher.reset().find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.reset().find() && !matcher3.reset().matches()) {
textBlock.setClassification(PageBlockType.getHeadlineType(2));
PageBlockType headlineType = PageBlockType.getHeadlineType(2);
headlineClassificationService.classifyHeadline(textBlock, headlineType);
document.setHeadlines(true);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()

View File

@ -0,0 +1,62 @@
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.getHeadlineNumber;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import lombok.Getter;
import lombok.Setter;
@Service
@Getter
@Setter
public class HeadlineClassificationService {
TextPageBlock lastHeadline;
PageBlockType originalClassifiedBlockType;
TextPageBlock lastHeadlineFromOutline;
public void resetContext() {
setLastHeadline(null);
setOriginalClassifiedBlockType(null);
setLastHeadlineFromOutline(null);
}
public void setLastHeadlineFromOutline(TextPageBlock lastHeadlineFromOutline) {
this.lastHeadlineFromOutline = lastHeadlineFromOutline;
this.setLastHeadline(lastHeadlineFromOutline);
}
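// If the previous headline came straight from the PDF outline, this block is classified one level below it;
// if the previous headline's level was shifted relative to its original font-based classification, the same
// shift is applied here.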
public void classifyHeadline(TextPageBlock textBlock, PageBlockType initialHeadlineType) {
TextPageBlock lastHeadline = getLastHeadline();
TextPageBlock lastHeadlineFromOutline = getLastHeadlineFromOutline();
PageBlockType originalClassifiedBlockType = getOriginalClassifiedBlockType();
PageBlockType finalHeadlineType = initialHeadlineType;
if (lastHeadline != null) {
if (lastHeadline.equals(lastHeadlineFromOutline)) {
finalHeadlineType = PageBlockType.getHeadlineType(getHeadlineNumber(lastHeadline.getClassification()) + 1);
} else if (originalClassifiedBlockType != null && lastHeadline.getClassification() != originalClassifiedBlockType) {
PageBlockType lastHeadlineType = lastHeadline.getClassification();
int difference = getHeadlineNumber(originalClassifiedBlockType) - getHeadlineNumber(lastHeadlineType);
finalHeadlineType = PageBlockType.getHeadlineType(getHeadlineNumber(initialHeadlineType) - difference);
}
}
setOriginalClassifiedBlockType(initialHeadlineType);
textBlock.setClassification(finalHeadlineType);
setLastHeadline(textBlock);
}
}

View File

@ -22,12 +22,17 @@ import lombok.extern.slf4j.Slf4j;
@RequiredArgsConstructor
public class RedactManagerClassificationService {
private final HeadlineClassificationService headlineClassificationService;
public void classifyDocument(ClassificationDocument document) {
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
headlineClassificationService.resetContext();
for (ClassificationPage page : document.getPages()) {
classifyPage(page, document, headlineFontSizes);
}
@ -48,6 +53,10 @@ public class RedactManagerClassificationService {
var bodyTextFrame = page.getBodyTextFrame();
if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
return;
}
if (document.getFontSizeCounter().getMostPopular() == null) {
textBlock.setClassification(PageBlockType.OTHER);
return;
@ -60,58 +69,64 @@ public class RedactManagerClassificationService {
textBlock.setClassification(PageBlockType.PARAGRAPH);
return;
}
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || PositionUtils.isOverBodyTextFrame(bodyTextFrame,
textBlock,
page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
.getMostPopular())) {
textBlock.setClassification(PageBlockType.HEADER);
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) || PositionUtils.isUnderBodyTextFrame(bodyTextFrame,
textBlock,
page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
.getMostPopular())) {
textBlock.setClassification(PageBlockType.FOOTER);
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
.size() == 1)) {
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
textBlock.setClassification(PageBlockType.TITLE);
}
} else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter()
.getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 && (textBlock.getMostPopularWordStyle().equals("bold") || !document.getFontStyleCounter()
.getCountPerValue()
.containsKey("bold") && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1) && textBlock.getSequences()
.get(0)
.getTextPositions()
.get(0)
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
} else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()
&& PositionUtils.getApproxLineCount(textBlock) < 4.9
&& (textBlock.getMostPopularWordStyle().equals("bold")
|| !document.getFontStyleCounter().getCountPerValue().containsKey("bold")
&& textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1)
&& textBlock.getSequences()
.get(0).getTextPositions()
.get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
for (int i = 1; i <= headlineFontSizes.size(); i++) {
if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
textBlock.setClassification(PageBlockType.getHeadlineType(i));
PageBlockType headlineType = PageBlockType.getHeadlineType(i);
headlineClassificationService.classifyHeadline(textBlock, headlineType);
document.setHeadlines(true);
}
}
} else if (!textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordStyle()
.equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 && textBlock.getSequences()
.get(0)
.getTextPositions()
.get(0)
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1));
} else if (!textBlock.getText().startsWith("Figure ")
&& PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
&& textBlock.getMostPopularWordStyle().equals("bold")
&& !document.getFontStyleCounter().getMostPopular().equals("bold")
&& PositionUtils.getApproxLineCount(textBlock) < 2.9
&& textBlock.getSequences()
.get(0).getTextPositions()
.get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
PageBlockType headlineType = PageBlockType.getHeadlineType(headlineFontSizes.size() + 1);
headlineClassificationService.classifyHeadline(textBlock, headlineType);
document.setHeadlines(true);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
&& textBlock.getMostPopularWordStyle().equals("bold")
&& !document.getFontStyleCounter().getMostPopular().equals("bold")) {
textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont()
.equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle()
.equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
&& textBlock.getMostPopularWordFont().equals(document.getFontCounter().getMostPopular())
&& textBlock.getMostPopularWordStyle().equals(document.getFontStyleCounter().getMostPopular())
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
textBlock.setClassification(PageBlockType.PARAGRAPH);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter()
.getMostPopular()
.equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
&& textBlock.getMostPopularWordStyle().equals("italic")
&& !document.getFontStyleCounter().getMostPopular().equals("italic")
&& PositionUtils.getApproxLineCount(textBlock) < 2.9) {
textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);

View File

@ -11,6 +11,7 @@ import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
@ -31,8 +32,10 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Im
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SectionNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContentItem;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
@ -57,11 +60,6 @@ public class DocumentGraphFactory {
document.getPages()
.forEach(context::buildAndAddPageWithCounter);
document.getSections()
.stream()
.flatMap(section -> section.getImages()
.stream())
.forEach(image -> context.getImages().add(image));
addSections(layoutParsingType, document, context, documentGraph);
addHeaderAndFooterToEachPage(document, context);
@ -75,8 +73,17 @@ public class DocumentGraphFactory {
private void addSections(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument, Context context, Document document) {
classificationDocument.getSections()
.forEach(section -> SectionNodeFactory.addSection(layoutParsingType, null, section.getNonEmptyPageBlocks(), section.getImages(), context, document));
for (TableOfContentItem tocItem : classificationDocument.getTableOfContents()) {
var parent = tocItem.getParent() == null ? null : tocItem.getParent().getSection();
Optional<SectionNode> section = SectionNodeFactory.addSection(layoutParsingType,
parent,
tocItem.getChildren().isEmpty(),
tocItem.getNonEmptySectionBlocks(),
tocItem.getImages(),
context,
document);
tocItem.setSection(section.orElse(null));
}
}
@ -181,10 +188,7 @@ public class DocumentGraphFactory {
Page page = context.getPage(textBlocks.get(0).getPage());
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks),
footer,
context,
page);
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), footer, context, page);
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
footer.setTreeId(tocId);
footer.setLeafTextBlock(textBlock);
@ -236,7 +240,7 @@ public class DocumentGraphFactory {
DocumentTree documentTree;
Map<Page, Integer> pages;
List<Section> sections;
List<SectionNode> sections;
List<ClassifiedImage> images;
TextBlockFactory textBlockFactory;

View File

@ -9,6 +9,7 @@ import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
@ -17,6 +18,8 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Do
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SectionNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
@ -27,12 +30,13 @@ import lombok.experimental.UtilityClass;
@UtilityClass
public class SectionNodeFactory {
public void addSection(LayoutParsingType layoutParsingType,
GenericSemanticNode parentNode,
List<AbstractPageBlock> pageBlocks,
List<ClassifiedImage> images,
DocumentGraphFactory.Context context,
Document document) {
public Optional<SectionNode> addSection(LayoutParsingType layoutParsingType,
GenericSemanticNode parentNode,
boolean isLeaf,
List<AbstractPageBlock> pageBlocks,
List<ClassifiedImage> images,
DocumentGraphFactory.Context context,
Document document) {
// This is for the case where we have images on a page without any text/footer/header.
// The pageBlocks list is empty, but we still need to add those images to the document.
@ -40,16 +44,22 @@ public class SectionNodeFactory {
images.stream()
.distinct()
.forEach(image -> DocumentGraphFactory.addImage(document, image, context));
return;
return Optional.empty();
}
if (pageBlocks.isEmpty()) {
return;
return Optional.empty();
}
Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream()
.collect(groupingBy(AbstractPageBlock::getPage));
Section section = Section.builder().documentTree(context.getDocumentTree()).build();
SectionNode section;
if (isLeaf) {
section = Section.builder().documentTree(context.getDocumentTree()).build();
} else {
section = SuperSection.builder().documentTree(context.getDocumentTree()).build();
}
context.getSections().add(section);
blocksPerPage.keySet()
@ -59,12 +69,24 @@ public class SectionNodeFactory {
addFirstHeadlineDirectlyToSection(layoutParsingType, pageBlocks, context, section, document);
if (containsTablesAndTextBlocks(pageBlocks)) {
if (pageBlocks.get(0).isHeadline()) {
pageBlocks.remove(0);
}
splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(layoutParsingType,
section,
true,
subSectionPageBlocks,
emptyList(),
context,
document));
} else if (!isLeaf) {
if (pageBlocks.get(0).isHeadline()) {
pageBlocks.remove(0);
}
addSection(layoutParsingType, section, true, pageBlocks, emptyList(), context, document);
} else {
addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, pageBlocks, context, section, document);
}
@ -72,10 +94,12 @@ public class SectionNodeFactory {
images.stream()
.distinct()
.forEach(image -> DocumentGraphFactory.addImage(section, image, context));
return Optional.of(section);
}
private List<Integer> getTreeId(GenericSemanticNode parentNode, DocumentGraphFactory.Context context, Section section) {
private List<Integer> getTreeId(GenericSemanticNode parentNode, DocumentGraphFactory.Context context, SectionNode section) {
if (parentNode == null) {
return context.getDocumentTree().createNewMainEntryAndReturnId(section);
@ -88,7 +112,7 @@ public class SectionNodeFactory {
private void addFirstHeadlineDirectlyToSection(LayoutParsingType layoutParsingType,
List<AbstractPageBlock> pageBlocks,
DocumentGraphFactory.Context context,
Section section,
SectionNode section,
Document document) {
if (pageBlocks.get(0).isHeadline()) {
@ -101,7 +125,7 @@ public class SectionNodeFactory {
private void addTablesAndParagraphsAndHeadlinesToSection(LayoutParsingType layoutParsingType,
List<AbstractPageBlock> pageBlocks,
DocumentGraphFactory.Context context,
Section section,
SectionNode section,
Document document) {
Set<AbstractPageBlock> alreadyMerged = new HashSet<>();
@ -226,7 +250,7 @@ public class SectionNodeFactory {
}
private void addSectionNodeToPageNode(DocumentGraphFactory.Context context, Section section, Integer pageNumber) {
private void addSectionNodeToPageNode(DocumentGraphFactory.Context context, SectionNode section, Integer pageNumber) {
Page page = context.getPage(pageNumber);
page.getMainBody().add(section);

View File

@ -12,6 +12,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
@ -154,10 +155,11 @@ public class TableNodeFactory {
} else if (firstTextBlockIsHeadline(cell)) {
SectionNodeFactory.addSection(layoutParsingType,
tableCell,
true,
cell.getTextBlocks()
.stream()
.map(tb -> (AbstractPageBlock) tb)
.toList(),
.collect(Collectors.toList()),
emptyList(),
context,
document);

View File

@ -61,7 +61,7 @@ public class DocumentGraphMapper {
List<Page> pages = Arrays.stream(entryData.getPageNumbers()).map(pageNumber -> getPage(pageNumber, context)).toList();
SemanticNode node = switch (entryData.getType()) {
case SECTION -> buildSection(context);
case SECTION, SUPER_SECTION -> buildSection(context);
case PARAGRAPH -> buildParagraph(context, entryData.getProperties());
case HEADLINE -> buildHeadline(context);
case HEADER -> buildHeader(context);

View File

@ -193,10 +193,11 @@ public class LayoutGridService {
List<SemanticNode> subSections = semanticNode.streamAllSubNodesOfType(NodeType.SECTION)
.toList();
Page firstPage = semanticNode.getFirstPage();
String treeIdString = buildTreeIdString(semanticNode);
if (!subSections.isEmpty()) {
addPlacedText(firstPage, bBoxMap.get(firstPage), buildTreeIdString(semanticNode), layoutGrid);
addPlacedText(firstPage, bBoxMap.get(firstPage), treeIdString, layoutGrid);
} else {
bBoxMap.forEach(((page, textBBox) -> addPlacedText(page, textBBox, buildTreeIdString(semanticNode), layoutGrid)));
bBoxMap.forEach(((page, textBBox) -> addPlacedText(page, textBBox, treeIdString, layoutGrid)));
}
if (bBoxMap.values().size() == 1) {
Rectangle2D r = RectangleTransformations.pad(bBoxMap.get(firstPage), LINE_WIDTH, LINE_WIDTH);

View File

@ -112,8 +112,8 @@ public class PdfVisualisationUtility {
case DOCUMENT -> Color.LIGHT_GRAY;
case HEADER, FOOTER -> Color.GREEN;
case PARAGRAPH -> Color.BLUE;
case SUPER_SECTION, SECTION -> Color.BLACK;
case HEADLINE -> Color.RED;
case SECTION -> Color.BLACK;
case TABLE -> Color.ORANGE;
case TABLE_CELL -> Color.GRAY;
case IMAGE -> Color.MAGENTA;

View File

@ -32,6 +32,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
public void testViewerDocument() {
String fileName = "files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
var documentFile = new ClassPathResource(fileName).getFile();

View File

@ -37,8 +37,6 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.ima
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
@ -62,6 +60,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
tableServiceResponse,
new VisualLayoutParsingResponse(),
Map.of("file", "document"));
}
@ -134,6 +133,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
@Test
@SneakyThrows
public void testTableAndCellRotations() {
String fileName = "files/Minimal Examples/simpleTablesRotated.pdf";
ClassPathResource pdfFileResource = new ClassPathResource(fileName);
@ -141,7 +141,6 @@ public class PdfSegmentationServiceTest extends AbstractTest {
}
@Disabled
@Test
public void testScanRotationBorderIsIgnored() throws IOException {
@ -151,15 +150,19 @@ public class PdfSegmentationServiceTest extends AbstractTest {
var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class);
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile(), tableServiceResponse);
assertThat(document.getSections()
assertThat(document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.collect(Collectors.toList())).isNotEmpty();
var tables = document.getSections()
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()).isNotEmpty();
var tables = document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList();
// The quality of the table parsing is not good, because the file was rotated during scanning.
@ -199,15 +202,19 @@ public class PdfSegmentationServiceTest extends AbstractTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/syngenta/CustomerFiles/SinglePages/Spanning Cells - Page131_S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
assertThat(document.getSections()
assertThat(document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.collect(Collectors.toList())).isNotEmpty();
TablePageBlock table = document.getSections()
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()).isNotEmpty();
TablePageBlock table = document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(0);
assertThat(table.getColCount()).isEqualTo(6);
@ -225,23 +232,29 @@ public class PdfSegmentationServiceTest extends AbstractTest {
"files/syngenta/CustomerFiles/SinglePages/Merge Table - Page5_26 A8637C - EU AIR3 - LCP Section 10 - Ecotoxicological studies on the plant protection product - Reference list.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
assertThat(document.getSections()
assertThat(document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.collect(Collectors.toList())).isNotEmpty();
TablePageBlock firstTable = document.getSections()
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()).isNotEmpty();
TablePageBlock firstTable = document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(0);
assertThat(firstTable.getColCount()).isEqualTo(8);
assertThat(firstTable.getRowCount()).isEqualTo(1);
TablePageBlock secondTable = document.getSections()
TablePageBlock secondTable = document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(1);
assertThat(secondTable.getColCount()).isEqualTo(8);
@ -266,23 +279,29 @@ public class PdfSegmentationServiceTest extends AbstractTest {
"files/syngenta/CustomerFiles/SinglePages/Merge Multi Page Table - Page4_Page5_51 Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
assertThat(document.getSections()
assertThat(document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.collect(Collectors.toList())).isNotEmpty();
TablePageBlock firstTable = document.getSections()
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()).isNotEmpty();
TablePageBlock firstTable = document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(0);
assertThat(firstTable.getColCount()).isEqualTo(9);
assertThat(firstTable.getRowCount()).isEqualTo(5);
TablePageBlock secondTable = document.getSections()
TablePageBlock secondTable = document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(1);
assertThat(secondTable.getColCount()).isEqualTo(9);
@ -307,23 +326,29 @@ public class PdfSegmentationServiceTest extends AbstractTest {
"files/syngenta/CustomerFiles/SinglePages/Rotated Table Headers - Page4_65 Mesotrione - EU AIR3 - LCA Section 1 Supplement Reference List.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
assertThat(document.getSections()
assertThat(document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.collect(Collectors.toList())).isNotEmpty();
TablePageBlock firstTable = document.getSections()
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()).isNotEmpty();
TablePageBlock firstTable = document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(0);
assertThat(firstTable.getColCount()).isEqualTo(8);
assertThat(firstTable.getRowCount()).isEqualTo(1);
TablePageBlock secondTable = document.getSections()
TablePageBlock secondTable = document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(1);
assertThat(secondTable.getColCount()).isEqualTo(8);
@ -818,10 +843,12 @@ public class PdfSegmentationServiceTest extends AbstractTest {
@SneakyThrows
private void toHtml(ClassificationDocument document, String filename) {
var tables = document.getSections()
var tables = document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList();
StringBuilder sb = new StringBuilder();
@ -843,12 +870,15 @@ public class PdfSegmentationServiceTest extends AbstractTest {
private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) {
TablePageBlock table = document.getSections()
TablePageBlock table = document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(tableIndex);
List<List<Cell>> rows = table.getRows();
int emptyCellsFoundFound = rows.stream()
.flatMap(List::stream)
@ -870,10 +900,12 @@ public class PdfSegmentationServiceTest extends AbstractTest {
private void validateTable(ClassificationDocument document, int tableIndex, List<List<String>> values) {
TablePageBlock table = document.getSections()
TablePageBlock table = document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(tableIndex);
List<List<Cell>> rows = table.getRows();
@ -896,10 +928,11 @@ public class PdfSegmentationServiceTest extends AbstractTest {
private void validateTableSize(ClassificationDocument document, int tableSize) {
assertThat(document.getSections()
assertThat(document.getTableOfContents().getAllTableOfContentItems()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.toList().size()).isEqualTo(tableSize);
}

View File

@ -93,6 +93,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
.toList();
for (String pdfFileName : pdfFileNames) {
writeJsons(Path.of(pdfFileName));
}
}
@ -102,15 +103,15 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
@SneakyThrows
private void writeJsons(Path filename) {
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
filename.toFile(),
new ImageServiceResponse(),
new TableServiceResponse(),
new VisualLayoutParsingResponse(),
Map.of("file",filename.toFile().toString())));
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
filename.toFile(),
new ImageServiceResponse(),
new TableServiceResponse(),

View File

@ -229,7 +229,7 @@ public class PdfDraw {
case HEADER, FOOTER -> Color.GREEN;
case PARAGRAPH -> Color.BLUE;
case HEADLINE -> Color.RED;
case SECTION -> Color.BLACK;
case SECTION, SUPER_SECTION -> Color.BLACK;
case TABLE -> Color.ORANGE;
case TABLE_CELL -> Color.GRAY;
case IMAGE -> Color.MAGENTA;