Compare commits

..

1 Commits

Author SHA1 Message Date
Kilian Schuettler
5fd0980d9d RED-9353: implement ViewerDocService in PDFTron 2024-06-28 17:57:35 +02:00
246 changed files with 8276 additions and 8155 deletions

View File

@ -21,6 +21,5 @@ deploy:
dotenv: version.env dotenv: version.env
rules: rules:
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
- if: $CI_COMMIT_BRANCH =~ /^feature/ && $CI_COMMIT_TAG == ""
- if: $CI_COMMIT_BRANCH =~ /^release/ - if: $CI_COMMIT_BRANCH =~ /^release/
- if: $CI_COMMIT_TAG - if: $CI_COMMIT_TAG

View File

@ -8,8 +8,6 @@ plugins {
group = "com.knecon.fforesight" group = "com.knecon.fforesight"
val documentVersion by rootProject.extra { "4.433.0" }
java.sourceCompatibility = JavaVersion.VERSION_17 java.sourceCompatibility = JavaVersion.VERSION_17
java.targetCompatibility = JavaVersion.VERSION_17 java.targetCompatibility = JavaVersion.VERSION_17
@ -44,19 +42,6 @@ tasks.jacocoTestReport {
} }
allprojects { allprojects {
tasks.withType<Javadoc> {
options {
this as StandardJavadocDocletOptions
addBooleanOption("Xdoclint:none", true)
addStringOption("Xmaxwarns", "1")
}
}
pmd {
setConsoleOutput(true)
}
publishing { publishing {
publications { publications {
create<MavenPublication>(name) { create<MavenPublication>(name) {
@ -79,7 +64,6 @@ java {
withJavadocJar() withJavadocJar()
} }
repositories { repositories {
mavenLocal() mavenLocal()
mavenCentral() mavenCentral()

View File

@ -0,0 +1,28 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
import java.io.Serializable;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.experimental.FieldDefaults;
/**
 * Top-level container for the result of a layout parsing run.
 *
 * <p>Bundles four parts: the page information, the text blocks, the position blocks and the
 * semantic document structure. All fields are private and final (see {@link FieldDefaults});
 * instances are created through the generated builder or the all-args constructor, whose
 * parameter order follows the field declaration order below.
 */
@Schema(description = "Object containing the complete document layout parsing information. It is split into 4 categories, structure, text, positions and pages: "
        + "The document tree structure of SemanticNodes such as Section, Paragraph, Headline, etc. "
        + "The text, which is stored as separate blocks of data. "
        + "The text positions, which are also stored as separate blocks. The Blocks are equal to the text blocks in length and order. "
        + "The page information.")
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
@AllArgsConstructor
@Builder
@Data
public class DocumentData implements Serializable {

    /** One entry per page of the document. */
    @Schema(description = "Contains information about the document's pages.")
    DocumentPage[] documentPages;

    /** The text blocks of the document. */
    @Schema(description = "Contains information about the document's text.")
    DocumentTextData[] documentTextData;

    /** The position blocks of the document. */
    @Schema(description = "Contains information about the document's text positions.")
    DocumentPositionData[] documentPositions;

    /** The tree of semantic nodes. */
    @Schema(description = "Contains information about the document's semantic structure.")
    DocumentStructure documentStructure;

}

View File

@ -0,0 +1,30 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
import java.io.Serializable;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Serializable description of a single page of the parsed document.
 *
 * <p>Field declaration order is significant: it defines the parameter order of the generated
 * all-args constructor.
 */
@Schema(description = "Object containing information about the document's pages.")
@FieldDefaults(level = AccessLevel.PRIVATE)
@NoArgsConstructor
@AllArgsConstructor
@Builder
@Data
public class DocumentPage implements Serializable {

    /** 1-based page number. */
    @Schema(description = "The page number, starting with 1.")
    int number;

    /** Page height, expressed in PDF user units. */
    @Schema(description = "The page height in PDF user units.", example = "792")
    int height;

    /** Page width, expressed in PDF user units. */
    @Schema(description = "The page width in PDF user units.", example = "694")
    int width;

    /** Rotation of the page as given by the PDF; documented values are 0, 90, 180 and 270. */
    @Schema(description = "The page rotation as specified by the PDF.", example = "90", allowableValues = {"0", "90", "180", "270"})
    int rotation;

}

View File

@ -0,0 +1,28 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
import java.io.Serializable;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Serializable positional information for one text block.
 *
 * <p>To look up the bounding box of a character, first translate its string index through
 * {@code stringIdxToPositionIdx} and then use the result as the row index into
 * {@code positions}.
 *
 * <p>Field declaration order is significant: it defines the parameter order of the generated
 * all-args constructor.
 */
@Schema(description = "Object containing text positional information of a specific text block. A document is split into multiple text blocks, which are supposed to be read in order. Every text block can only occur on a single page.")
@FieldDefaults(level = AccessLevel.PRIVATE)
@NoArgsConstructor
@AllArgsConstructor
@Builder
@Data
public class DocumentPositionData implements Serializable {

    /** Identifier of the text block this position data belongs to. */
    @Schema(description = "Identifier of the text block.")
    Long id;

    /** Maps each string index of the block's search text to an index into {@code positions}. */
    @Schema(description = "For each string coordinate in the search text of the text block, the array contains an entry relating the string coordinate to the position coordinate. This is required due to the text and position coordinates not being equal.")
    int[] stringIdxToPositionIdx;

    /** Glyph bounding boxes; each row holds x, y, width, height with (x, y) as the lower left corner. */
    @Schema(description = "The bounding box for each glyph as a rectangle. This matrix is of size (n,4), where n is the number of glyphs in the text block. The second dimension specifies the rectangle with the value x, y, width, height, with x, y specifying the lower left corner. In order to access this information, the stringIdxToPositionIdx array must be used to transform the coordinates.")
    float[][] positions;

}

View File

@ -0,0 +1,172 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
import java.awt.geom.Rectangle2D;
import java.io.Serializable;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Stream;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Serializable tree of {@link EntryData} describing the semantic structure of a parsed document.
 *
 * <p>The tree is rooted at {@link #root}. Besides the data itself, this class offers helpers to
 * address an entry by its tree id ({@link #get(List)}), to iterate over the whole tree
 * ({@link #streamAllEntries()}) and to parse values stored as strings in an entry's
 * {@code properties} map ({@link #parseRectangle2D(String)},
 * {@link #parseRepresentationVector(String)}).
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@Schema(description = "Object containing information about the parsed tree structure of the SemanticNodes, such as Section, Paragraph, Headline etc inside of the document.")
public class DocumentStructure implements Serializable {

    @Schema(description = "The root EntryData represents the Document.")
    EntryData root;

    /** Keys of the extra entries a table stores in its {@code properties} map. */
    @Schema(description = "Object containing the extra field names, a table has in its properties field.")
    public static class TableProperties implements Serializable {

        public static final String NUMBER_OF_ROWS = "numberOfRows";
        public static final String NUMBER_OF_COLS = "numberOfCols";

    }

    /** Keys of the extra entries an image stores in its {@code properties} map. */
    @Schema(description = "Object containing the extra field names, an Image has in its properties field.")
    public static class ImageProperties implements Serializable {

        public static final String TRANSPARENT = "transparent";
        public static final String IMAGE_TYPE = "imageType";
        public static final String POSITION = "position";
        public static final String ID = "id";
        public static final String REPRESENTATION_HASH = "representationHash";

    }

    /** Keys of the extra entries a table cell stores in its {@code properties} map. */
    @Schema(description = "Object containing the extra field names, a table cell has in its properties field.")
    public static class TableCellProperties implements Serializable {

        public static final String B_BOX = "bBox";
        public static final String ROW = "row";
        public static final String COL = "col";
        public static final String HEADER = "header";

    }

    /** Keys of the extra entries a duplicate paragraph stores in its {@code properties} map. */
    @Schema(description = "Object containing the extra field names, a duplicate paragraph has in its properties field.")
    public static class DuplicateParagraphProperties implements Serializable {

        public static final String UNSORTED_TEXTBLOCK_ID = "utbid";

    }

    /** Separator between the four numbers of a rectangle string accepted by {@link #parseRectangle2D(String)}. */
    public static final String RECTANGLE_DELIMITER = ";";


    /**
     * Parses a rectangle from a string of numbers separated by {@link #RECTANGLE_DELIMITER}.
     *
     * <p>All delimited values are parsed as floats; the first four are passed, in order, as
     * x, y, width and height to {@link Rectangle2D.Float}.
     *
     * @param bBox the delimited rectangle string
     * @return the parsed rectangle
     * @throws NumberFormatException if any delimited value is not a parsable float
     * @throws IndexOutOfBoundsException if fewer than four values are present
     */
    public static Rectangle2D parseRectangle2D(String bBox) {

        List<Float> floats = Arrays.stream(bBox.split(RECTANGLE_DELIMITER))
                .map(Float::parseFloat)
                .toList();
        return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
    }


    /**
     * Parses a vector of doubles from a string whose values are separated by commas and/or
     * whitespace.
     *
     * @param representationHash the separated list of numbers
     * @return the parsed values in their original order
     * @throws NumberFormatException if any separated token is not a parsable double
     */
    public static double[] parseRepresentationVector(String representationHash) {

        return Arrays.stream(representationHash.split("[,\\s]+"))
                .mapToDouble(Double::parseDouble)
                .toArray();
    }


    /**
     * Resolves the entry addressed by the given tree id by descending from the root, using each
     * id as the child index on the respective level.
     *
     * @param tocId child indices from the root downwards; an empty list addresses the root
     * @return the addressed entry
     * @throws IndexOutOfBoundsException if an index does not exist on its level
     */
    public EntryData get(List<Integer> tocId) {

        EntryData entry = root;
        for (int childIndex : tocId) {
            entry = entry.children.get(childIndex);
        }
        return entry;
    }


    /**
     * Streams every entry of the tree exactly once, each parent before its children.
     *
     * <p>The root is flattened on its own. Flattening the root's children a second time, as a
     * previous version did, emitted every non-root entry twice.
     *
     * @return all entries of the tree, or an empty stream if there is no root
     */
    public Stream<EntryData> streamAllEntries() {

        if (root == null) {
            return Stream.empty();
        }
        return flatten(root);
    }


    /**
     * Renders one line per entry, in the order of {@link #streamAllEntries()}.
     */
    @Override
    public String toString() {

        return String.join("\n",
                           streamAllEntries().map(EntryData::toString)
                                   .toList());
    }


    /** Returns the given entry followed by all of its descendants; an entry without a children list is a leaf. */
    private static Stream<EntryData> flatten(EntryData entry) {

        if (entry.children == null) {
            return Stream.of(entry);
        }
        return Stream.concat(Stream.of(entry),
                             entry.children.stream()
                                     .flatMap(DocumentStructure::flatten));
    }


    /**
     * A single node of the document tree together with its child nodes.
     */
    @Data
    @Builder
    @NoArgsConstructor
    @AllArgsConstructor
    @FieldDefaults(level = AccessLevel.PRIVATE)
    @Schema(description = "Object containing information of a SemanticNode and also structuring the layout with children.")
    public static class EntryData implements Serializable {

        @Schema(description = "Type of the semantic node.", allowableValues = {"DOCUMENT", "SECTION", "SUPER_SECTION", "PARAGRAPH", "HEADLINE", "TABLE", "TABLE_CELL", "HEADER", "FOOTER", "IMAGE"})
        NodeType type;

        @Schema(description = "Specifies the position in the parsed tree structure.", example = "[1, 0, 2]")
        int[] treeId;

        @Schema(description = "Specifies the text block IDs associated with this semantic node. The value should be joined with the DocumentTextData/DocumentPositionData. Is empty, if no text block is directly associated with this semantic node. Only Paragraph, Headline, Header or Footer is directly associated with a text block.", example = "[1]")
        Long[] atomicBlockIds;

        @Schema(description = "Specifies the pages this semantic node appears on. The value should be joined with the PageData.", example = "[1, 2, 3]")
        Long[] pageNumbers;

        @Schema(description = "Some semantic nodes have additional information, this information is stored in this Map. The extra fields are specified by the Properties subclasses.", example = "For a Table: {\"numberOfRows\": 3, \"numberOfCols\": 4}")
        Map<String, String> properties;

        @Schema(description = "All child Entries of this Entry.", example = "[1, 2, 3]")
        List<EntryData> children;

        @Schema(description = "Describes the origin of the semantic node", example = "[ALGORITHM]")
        Set<LayoutEngine> engines;


        /**
         * Renders the entry as {@code [<treeId, comma separated>]: <type> atbs = <number of atomic block ids>}.
         *
         * <p>An empty or missing tree id renders as {@code []}; a missing atomic block array
         * counts as zero blocks.
         */
        @Override
        public String toString() {

            // Joining avoids the trailing-comma removal that used to eat the opening bracket for an empty tree id.
            String treePath = treeId == null //
                    ? "" //
                    : String.join(",",
                                  Arrays.stream(treeId)
                                          .mapToObj(String::valueOf)
                                          .toList());
            int atomicBlockCount = atomicBlockIds == null ? 0 : atomicBlockIds.length;
            return "[" + treePath + "]: " + type + " atbs = " + atomicBlockCount;
        }

    }

}

View File

@ -0,0 +1,36 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
import java.io.Serializable;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Serializable text information for one text block.
 *
 * <p>Field declaration order is significant: it defines the parameter order of the generated
 * all-args constructor.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@Schema(description = "Object containing text information of a specific text block. A document is split into multiple text blocks, which are supposed to be read in order. Every text block can only occur on a single page.")
public class DocumentTextData implements Serializable {

    /** Identifier of the text block. */
    @Schema(description = "Identifier of the text block.")
    Long id;

    /** The page the text block occurs on. */
    @Schema(description = "The page the text block occurs on.")
    Long page;

    /** The text of the text block. */
    @Schema(description = "The text of the text block.")
    String searchText;

    /** Number of the text block on its page, counted from 0. */
    @Schema(description = "Each text block is assigned a number on a page, starting from 0.")
    int numberOnPage;

    /** String offset at which the text block starts. */
    @Schema(description = "The text blocks are ordered, this number represents the start of the text block as a string offset.")
    int start;

    /** String offset at which the text block ends. */
    @Schema(description = "The text blocks are ordered, this number represents the end of the text block as a string offset.")
    int end;

    /** String offsets of the line breaks, given as exclusive ends; the final line break is implicit. */
    @Schema(description = "The line breaks in the text of this semantic node in string offsets. They are exclusive end. At the end of each semantic node there is an implicit linebreak.", example = "[5, 10]")
    int[] lineBreaks;

}

View File

@ -0,0 +1,7 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
/**
 * Names the engine a semantic node originates from; used as the element type of
 * {@code DocumentStructure.EntryData#engines}.
 *
 * <p>The declaration order of the constants is kept unchanged so that ordinals stay stable.
 */
public enum LayoutEngine {

    ALGORITHM,

    AI,

    OUTLINE

}

View File

@ -0,0 +1,23 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
import java.io.Serializable;
import java.util.Locale;
/**
 * The kinds of semantic nodes a document tree can contain.
 *
 * <p>The declaration order of the constants is kept unchanged so that ordinals stay stable.
 */
public enum NodeType implements Serializable {

    DOCUMENT,
    SECTION,
    SUPER_SECTION,
    HEADLINE,
    PARAGRAPH,
    TABLE,
    TABLE_CELL,
    IMAGE,
    HEADER,
    FOOTER;


    /**
     * Returns the constant name with its first character kept as is and the remainder
     * lower-cased using {@link Locale#ROOT}, e.g. {@code TABLE_CELL} becomes {@code Table_cell}.
     * Underscores are not altered.
     */
    @Override
    public String toString() {

        String constantName = name();
        String leadingCharacter = constantName.substring(0, 1);
        String lowerCasedRemainder = constantName.substring(1).toLowerCase(Locale.ROOT);
        return leadingCharacter + lowerCasedRemainder;
    }

}

View File

@ -19,16 +19,6 @@ public class SimplifiedText {
@Schema(description = "Number of pages in the entire document.") @Schema(description = "Number of pages in the entire document.")
private int numberOfPages; private int numberOfPages;
@Schema(description = "A List of simplified Sections, which contains almost exclusively text.") @Schema(description = "A List of simplified Sections, which contains almost exclusively text.")
@Builder.Default
private List<SimplifiedSectionText> sectionTexts = new ArrayList<>(); private List<SimplifiedSectionText> sectionTexts = new ArrayList<>();
@Schema(description = "A list of the main section numbers ")
@Builder.Default
private List<String> mainSectionNumbers = new ArrayList<>();
@Schema(description = "A list of the header section numbers ")
@Builder.Default
private List<String> headerSectionNumbers = new ArrayList<>();
@Schema(description = "A list of the footer section numbers ")
@Builder.Default
private List<String> footerSectionNumbers = new ArrayList<>();
} }

View File

@ -1,7 +1,5 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.taas; package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
import java.util.List;
import io.swagger.v3.oas.annotations.media.Schema; import io.swagger.v3.oas.annotations.media.Schema;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import lombok.Builder; import lombok.Builder;
@ -15,8 +13,6 @@ public class StructureObject {
@Schema(description = "The ID of this StructureObject.") @Schema(description = "The ID of this StructureObject.")
Integer structureObjectNumber; Integer structureObjectNumber;
@Schema(description = "The Tree ID of this StructureObject.")
List<Integer> treeId;
@Schema(description = "This value indicates the start of the string offsets in this Object, with respect to the reading order.") @Schema(description = "This value indicates the start of the string offsets in this Object, with respect to the reading order.")
int page; int page;
@Schema(description = "This stringOffset indicates the start of the string offsets in this Object, with respect to the reading order of the entire document. It is equal to the previous' StructureObject stringOffset + its length.") @Schema(description = "This stringOffset indicates the start of the string offsets in this Object, with respect to the reading order of the entire document. It is equal to the previous' StructureObject stringOffset + its length.")

View File

@ -8,20 +8,13 @@ import lombok.Builder;
@Builder @Builder
@Schema(description = "Object containing information about the layout parsing.") @Schema(description = "Object containing information about the layout parsing.")
public record LayoutParsingFinishedEvent( public record LayoutParsingFinishedEvent(
@Schema(description = "General purpose identifier. It is returned exactly the same way it is inserted with the LayoutParsingRequest.") // @Schema(description = "General purpose identifier. It is returned exactly the same way it is inserted with the LayoutParsingRequest.")
Map<String, String> identifier, Map<String, String> identifier,//
@Schema(description = "The duration of a single layout parsing in ms.")
@Schema(description = "The duration of a single layout parsing in ms.") // long duration,//
long duration, @Schema(description = "The number of pages of the parsed document.")
int numberOfPages,//
@Schema(description = "The number of pages of the parsed document.") // @Schema(description = "A general message. It contains some information useful for a developer, like the paths where the files are stored. Not meant to be machine readable.")
int numberOfPages, String message) {
@Schema(description = "A general message. It contains some information useful for a developer, like the paths where the files are stored. Not meant to be machine readable.") //
String message,
@Schema(description = "The app version of the layout parser.") //
String layoutParserVersion
) {
} }

View File

@ -2,9 +2,6 @@ package com.knecon.fforesight.service.layoutparser.internal.api.queue;
public class LayoutParsingQueueNames { public class LayoutParsingQueueNames {
public static final String LAYOUT_PARSING_REQUEST_QUEUE_PREFIX = "layout_parsing_request"; public static final String LAYOUT_PARSING_REQUEST_QUEUE = "layout_parsing_request_queue";
public static final String LAYOUT_PARSING_REQUEST_EXCHANGE = "layout_parsing_request_exchange"; public static final String LAYOUT_PARSING_FINISHED_EVENT_QUEUE = "layout_parsing_response_queue";
public static final String LAYOUT_PARSING_RESPONSE_QUEUE_PREFIX = "layout_parsing_response";
public static final String LAYOUT_PARSING_RESPONSE_EXCHANGE = "layout_parsing_response_exchange";
public static final String LAYOUT_PARSING_DLQ = "layout_parsing_error";
} }

View File

@ -19,6 +19,7 @@ public record LayoutParsingRequest(
@Schema(description = "Path to the original PDF file.")// @Schema(description = "Path to the original PDF file.")//
@NonNull String originFileStorageId,// @NonNull String originFileStorageId,//
@Schema(description = "Optional Path to the table extraction file.")// @Schema(description = "Optional Path to the table extraction file.")//
Optional<String> tablesFileStorageId,// Optional<String> tablesFileStorageId,//
@Schema(description = "Optional Path to the image classification file.")// @Schema(description = "Optional Path to the image classification file.")//
@ -36,12 +37,9 @@ public record LayoutParsingRequest(
@NonNull String positionBlockFileStorageId,// @NonNull String positionBlockFileStorageId,//
@Schema(description = "Path where the Document Pages File will be stored.")// @Schema(description = "Path where the Document Pages File will be stored.")//
@NonNull String pageFileStorageId,// @NonNull String pageFileStorageId,//
@Schema(description = "Path where the Document Markdown File will be stored.")//
Optional<String> documentMarkdownFileStorageId,//
@Schema(description = "Path where the Simplified Text File will be stored.")// @Schema(description = "Path where the Simplified Text File will be stored.")//
@NonNull String simplifiedTextStorageId,// @NonNull String simplifiedTextStorageId,//
@Schema(description = "Path where the Viewer Document PDF will be stored.")// @Schema(description = "Path where the Viewer Document PDF will be stored.")//
@NonNull String viewerDocumentStorageId @NonNull String viewerDocumentStorageId) {
) {
} }

View File

@ -8,20 +8,16 @@ description = "layoutparser-service-processor"
val jacksonVersion = "2.15.2" val jacksonVersion = "2.15.2"
val pdfBoxVersion = "3.0.0" val pdfBoxVersion = "3.0.0"
dependencies { dependencies {
implementation(project(":layoutparser-service-internal-api")) implementation(project(":layoutparser-service-internal-api"))
implementation(project(":viewer-doc-processor")) implementation(project(":viewer-doc-processor"))
implementation("com.knecon.fforesight:document:${rootProject.extra.get("documentVersion")}") implementation("com.iqser.red.service:persistence-service-shared-api-v1:2.144.0") {
implementation("com.iqser.red.service:persistence-service-shared-api-v1:2.564.0-RED9010.0") {
exclude("org.springframework.boot", "spring-boot-starter-security") exclude("org.springframework.boot", "spring-boot-starter-security")
exclude("org.springframework.boot", "spring-boot-starter-validation") exclude("org.springframework.boot", "spring-boot-starter-validation")
} }
implementation("com.knecon.fforesight:tenant-commons:0.30.0") { implementation("com.knecon.fforesight:tenant-commons:0.21.0")
exclude("com.iqser.red.commons", "storage-commons") implementation("com.iqser.red.commons:storage-commons:2.45.0")
}
implementation("com.iqser.red.commons:storage-commons:2.50.0")
implementation("org.apache.pdfbox:pdfbox:${pdfBoxVersion}") implementation("org.apache.pdfbox:pdfbox:${pdfBoxVersion}")
implementation("org.apache.pdfbox:pdfbox-tools:${pdfBoxVersion}") implementation("org.apache.pdfbox:pdfbox-tools:${pdfBoxVersion}")
@ -29,12 +25,5 @@ dependencies {
implementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:${jacksonVersion}") implementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:${jacksonVersion}")
implementation("org.springframework.boot:spring-boot-starter-web:3.1.3") implementation("org.springframework.boot:spring-boot-starter-web:3.1.3")
implementation("org.jgrapht:jgrapht-core:1.5.2") implementation("org.jgrapht:jgrapht-core:1.5.2")
implementation("org.apache.pdfbox:jbig2-imageio:3.0.4")
implementation("com.github.jai-imageio:jai-imageio-core:1.4.0")
implementation("com.github.jai-imageio:jai-imageio-jpeg2000:1.4.0")
implementation("org.tinspin:tinspin-indexes:2.1.3") implementation("org.tinspin:tinspin-indexes:2.1.3")
implementation("org.commonmark:commonmark:0.22.0")
implementation("org.commonmark:commonmark-ext-gfm-tables:0.22.0")
implementation("com.pdftron:PDFNet:10.11.0")
implementation("org.apache.commons:commons-text:1.12.0")
} }

View File

@ -2,15 +2,13 @@ package com.knecon.fforesight.service.layoutparser.processor;
import static java.lang.String.format; import static java.lang.String.format;
import java.awt.geom.AffineTransform; import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D; import java.awt.geom.Rectangle2D;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Files; import java.nio.file.Files;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.concurrent.atomic.AtomicReference; import java.util.concurrent.atomic.AtomicReference;
@ -20,35 +18,27 @@ import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent; import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.mapper.DocumentDataMapper; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.ImageType;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.model.DocumentWithVisualization;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.MarkdownMapper;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService; import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeBuilderService; import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeEnhancementService; import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineValidationService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTree; import com.knecon.fforesight.service.layoutparser.processor.model.outline.TOCEnrichmentService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter; import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter;
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter; import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter;
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.VisualLayoutParsingAdapter; import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.VisualLayoutParsingAdapter;
@ -56,6 +46,7 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.ima
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService; import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService; import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService; import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
@ -65,15 +56,18 @@ import com.knecon.fforesight.service.layoutparser.processor.services.blockificat
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService; import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService; import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService; import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.ClarifyndClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.Box; import com.knecon.fforesight.service.layoutparser.processor.services.graphics.Box;
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.GraphicExtractorService; import com.knecon.fforesight.service.layoutparser.processor.services.graphics.GraphicExtractorService;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper; import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService; import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation; import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
import io.micrometer.observation.Observation; import io.micrometer.observation.Observation;
import io.micrometer.observation.ObservationRegistry; import io.micrometer.observation.ObservationRegistry;
@ -88,32 +82,32 @@ import lombok.extern.slf4j.Slf4j;
@Slf4j @Slf4j
@Service @Service
@RequiredArgsConstructor @RequiredArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE) @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class LayoutParsingPipeline { public class LayoutParsingPipeline {
final ImageServiceResponseAdapter imageServiceResponseAdapter; ImageServiceResponseAdapter imageServiceResponseAdapter;
final CvTableParsingAdapter cvTableParsingAdapter; CvTableParsingAdapter cvTableParsingAdapter;
final LayoutParsingStorageService layoutParsingStorageService; LayoutParsingStorageService layoutParsingStorageService;
final SectionsBuilderService sectionsBuilderService; SectionsBuilderService sectionsBuilderService;
final SimplifiedSectionTextService simplifiedSectionTextService; RedactManagerClassificationService redactManagerClassificationService;
final RulingCleaningService rulingCleaningService; DocuMineClassificationService docuMineClassificationService;
final TableExtractionService tableExtractionService; SimplifiedSectionTextService simplifiedSectionTextService;
final DocuMineBlockificationService docuMineBlockificationService; BodyTextFrameService bodyTextFrameService;
final RedactManagerBlockificationService redactManagerBlockificationService; RulingCleaningService rulingCleaningService;
final BlockificationPostprocessingService blockificationPostprocessingService; TableExtractionService tableExtractionService;
final DocstrumBlockificationService docstrumBlockificationService; DocuMineBlockificationService docuMineBlockificationService;
final LayoutGridService layoutGridService; RedactManagerBlockificationService redactManagerBlockificationService;
final ObservationRegistry observationRegistry; BlockificationPostprocessingService blockificationPostprocessingService;
final VisualLayoutParsingAdapter visualLayoutParsingAdapter; DocstrumBlockificationService docstrumBlockificationService;
final GraphicExtractorService graphicExtractorService; LayoutGridService layoutGridService;
final OutlineExtractorService outlineExtractorService; ObservationRegistry observationRegistry;
final SectionTreeBuilderService sectionTreeBuilderService; VisualLayoutParsingAdapter visualLayoutParsingAdapter;
final SectionTreeEnhancementService sectionTreeEnhancementService; ClarifyndClassificationService clarifyndClassificationService;
final LayoutParserSettings settings; GraphicExtractorService graphicExtractorService;
final ClassificationService classificationService; OutlineExtractorService outlineExtractorService;
OutlineValidationService outlineValidationService;
@Value("${LAYOUT_PARSER_VERSION:}") TOCEnrichmentService tocEnrichmentService;
private String layoutParserVersion; LayoutparserSettings settings;
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException { public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
@ -122,23 +116,32 @@ public class LayoutParsingPipeline {
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier()); log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId()); File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()) // File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile);
.orElse(originFile); File viewerDocumentFile = originFile;
VisualLayoutParsingResponse visualLayoutParsingResponse = layoutParsingRequest.visualLayoutParsingFileId() VisualLayoutParsingResponse visualLayoutParsingResponse = new VisualLayoutParsingResponse();
.map(layoutParsingStorageService::getVisualLayoutParsingFile) if (layoutParsingRequest.visualLayoutParsingFileId()
.orElse(new VisualLayoutParsingResponse()); .isPresent()) {
ImageServiceResponse imageServiceResponse = layoutParsingRequest.imagesFileStorageId() visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId()
.map(layoutParsingStorageService::getImagesFile) .get());
.orElse(new ImageServiceResponse()); }
TableServiceResponse tableServiceResponse = layoutParsingRequest.tablesFileStorageId()
.map(layoutParsingStorageService::getTablesFile)
.orElse(new TableServiceResponse());
LayoutParsingType layoutParsingType = settings.getLayoutParsingTypeOverride() == null // ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(); if (layoutParsingRequest.imagesFileStorageId()
.isPresent()) {
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId()
.get());
}
ClassificationDocument classificationDocument = parseLayout(layoutParsingType, TableServiceResponse tableServiceResponse = new TableServiceResponse();
if (layoutParsingRequest.tablesFileStorageId()
.isPresent()) {
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId()
.get());
}
ClassificationDocument classificationDocument = parseLayout(settings.getLayoutParsingTypeOverride() == null //
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(),
originFile, originFile,
imageServiceResponse, imageServiceResponse,
tableServiceResponse, tableServiceResponse,
@ -147,37 +150,38 @@ public class LayoutParsingPipeline {
log.info("Building document graph for {}", layoutParsingRequest.identifier()); log.info("Building document graph for {}", layoutParsingRequest.identifier());
DocumentWithVisualization documentWithVisualization = observeBuildDocumentGraph(layoutParsingType, classificationDocument); Document documentGraph = observeBuildDocumentGraph(settings.getLayoutParsingTypeOverride() == null //
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(), classificationDocument);
log.info("Creating viewer document for {}", layoutParsingRequest.identifier()); log.info("Creating viewer document for {}", layoutParsingRequest.identifier());
layoutGridService.addLayoutGrid(viewerDocumentFile, documentWithVisualization, viewerDocumentFile, layoutParsingType, layoutParserVersion, false); layoutGridService.addLayoutGrid(viewerDocumentFile,
documentGraph,
viewerDocumentFile,
false,
layoutParsingRequest.visualLayoutParsingFileId()
.isPresent());
log.info("Storing resulting files for {}", layoutParsingRequest.identifier()); log.info("Storing resulting files for {}", layoutParsingRequest.identifier());
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentWithVisualization.document())); layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph));
if (layoutParsingRequest.documentMarkdownFileStorageId() layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph));
.isPresent()) {
layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId()
.get(), new MarkdownMapper().toMarkdownContent(documentWithVisualization.document()));
}
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentWithVisualization.document()));
layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile); layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile);
if (layoutParsingRequest.researchDocumentStorageId() != null) { if (layoutParsingRequest.researchDocumentStorageId() != null) {
log.info("Building research document data for {}", layoutParsingRequest.identifier()); log.info("Building research document data for {}", layoutParsingRequest.identifier());
var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentWithVisualization.document()); var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentGraph);
layoutParsingStorageService.storeResearchDocumentData(layoutParsingRequest, researchDocumentData); layoutParsingStorageService.storeResearchDocumentData(layoutParsingRequest, researchDocumentData);
} }
if (!viewerDocumentFile.equals(originFile)) { if (!viewerDocumentFile.equals(originFile)) {
assert !viewerDocumentFile.exists() || viewerDocumentFile.delete(); viewerDocumentFile.delete();
} }
assert !originFile.exists() || originFile.delete(); originFile.delete();
return LayoutParsingFinishedEvent.builder() return LayoutParsingFinishedEvent.builder()
.identifier(layoutParsingRequest.identifier()) .identifier(layoutParsingRequest.identifier())
.numberOfPages(documentWithVisualization.document().getNumberOfPages()) .numberOfPages(documentGraph.getNumberOfPages())
.duration(System.currentTimeMillis() - start) .duration(System.currentTimeMillis() - start)
.message(format(""" .message(format("""
Layout parsing has finished in %.02f s. Layout parsing has finished in %.02f s.
@ -192,22 +196,21 @@ public class LayoutParsingPipeline {
Viewer Doc: %s""", Viewer Doc: %s""",
((float) (System.currentTimeMillis() - start)) / 1000, ((float) (System.currentTimeMillis() - start)) / 1000,
layoutParsingRequest.identifier(), layoutParsingRequest.identifier(),
buildSemanticNodeCountMessage(documentWithVisualization.document().getNumberOfPages(), documentWithVisualization.buildSemanticNodeCounts()), buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()),
layoutParsingRequest.structureFileStorageId(), layoutParsingRequest.structureFileStorageId(),
layoutParsingRequest.textBlockFileStorageId(), layoutParsingRequest.textBlockFileStorageId(),
layoutParsingRequest.positionBlockFileStorageId(), layoutParsingRequest.positionBlockFileStorageId(),
layoutParsingRequest.pageFileStorageId(), layoutParsingRequest.pageFileStorageId(),
layoutParsingRequest.simplifiedTextStorageId(), layoutParsingRequest.simplifiedTextStorageId(),
layoutParsingRequest.viewerDocumentStorageId())) layoutParsingRequest.viewerDocumentStorageId()))
.layoutParserVersion(layoutParserVersion)
.build(); .build();
} }
private DocumentWithVisualization observeBuildDocumentGraph(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument) { private Document observeBuildDocumentGraph(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument) {
AtomicReference<DocumentWithVisualization> documentReference = new AtomicReference<>(); AtomicReference<Document> documentReference = new AtomicReference<>();
Observation.createNotStarted("LayoutParsingPipeline", observationRegistry) Observation.createNotStarted("LayoutParsingPipeline", observationRegistry)
.contextualName("build-document-graph") .contextualName("build-document-graph")
@ -246,16 +249,19 @@ public class LayoutParsingPipeline {
Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse); Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse); Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse); Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);
ClassificationDocument classificationDocument = new ClassificationDocument(); ClassificationDocument classificationDocument = new ClassificationDocument();
if (settings.isDebug() || identifier.containsKey("debug")) { if (settings.isDebug() || identifier.containsKey("debug")) {
classificationDocument.getLayoutDebugLayer().setActive(true); classificationDocument.getVisualizations().setActive(true);
} }
List<ClassificationPage> classificationPages = new ArrayList<>(); List<ClassificationPage> classificationPages = new ArrayList<>();
OutlineObject lastProcessedOutlineObject = null;
classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument)); // parsing the structure elements could be useful as well
if (layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) {
classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));
}
long pageCount = originDocument.getNumberOfPages(); long pageCount = originDocument.getNumberOfPages();
@ -279,55 +285,69 @@ public class LayoutParsingPipeline {
stripper.setStartPage(pageNumber); stripper.setStartPage(pageNumber);
stripper.setEndPage(pageNumber); stripper.setEndPage(pageNumber);
stripper.setPdpage(pdPage); stripper.setPdpage(pdPage);
stripper.getText(originDocument);
List<Word> words = stripper.getWords();
// rotateDirAdjExactly(words, pdPage); // works really well for many highly rotated documents (e.g. VV-331340.pdf), but it decreases the headline performance by 1.3%, so I am leaving it out for now
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) { if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) {
var lines = TextPositionOperations.groupByLine(new HashSet<>(words)); stripper.setSortByPosition(true);
classificationDocument.getLayoutDebugLayer().addLineVisualizationsFromNestedTextPosition(lines, pageNumber);
words = TextPositionOperations.sortWords(lines);
} }
classificationDocument.getLayoutDebugLayer().addTextVisualizations(words, pageNumber); stripper.getText(originDocument);
List<TextPositionSequence> words = stripper.getTextPositionSequences();
classificationDocument.getVisualizations().addTextVisualizations(words, pageNumber);
PDRectangle pdr = pdPage.getMediaBox(); PDRectangle pdr = pdPage.getMediaBox();
List<Ruling> rulings = stripper.getRulings(); int rotation = pdPage.getRotation();
classificationDocument.getLayoutDebugLayer().addRulingVisualization(rulings, pageNumber); boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270);
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), rulings);
PDRectangle cropbox = pdPage.getCropBox();
classificationDocument.getVisualizations().addRulingVisualization(stripper.getRulings(), pageNumber);
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
PageInformation pageInformation = PageInformation.fromPDPage(pageNumber, pdPage); PageInformation pageInformation = PageInformation.fromPDPage(pageNumber, pdPage);
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals(), pageInformation); List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals(), pageInformation);
classificationDocument.getLayoutDebugLayer().addCellVisualizations(emptyTableCells, pageNumber); classificationDocument.getVisualizations().addCellVisualizations(emptyTableCells, pageNumber);
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings); TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings);
List<Box> graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getWords(), false); List<Box> graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getTextPositionSequences(),
false);
pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>()) pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>())
.addAll(graphics.stream() .addAll(graphics.stream()
.map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), .map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), ImageType.GRAPHIC, false, stripper.getPageNumber()))
ImageType.GRAPHIC,
false,
stripper.getPageNumber(),
""))
.toList()); .toList());
ClassificationPage classificationPage = switch (layoutParsingType) { ClassificationPage classificationPage = switch (layoutParsingType) {
case REDACT_MANAGER_OLD -> redactManagerBlockificationService.blockify(stripper.getWords(), cleanRulings, classificationDocument.getLayoutDebugLayer()); case REDACT_MANAGER_OLD ->
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings, classificationDocument.getVisualizations());
case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings); case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH ->
docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getLayoutDebugLayer(), layoutParsingType); docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getVisualizations(), layoutParsingType);
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG ->
docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getLayoutDebugLayer(), layoutParsingType); docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getVisualizations(), layoutParsingType);
}; };
updateClassificationPage(pdPage, pdr, classificationPage, cleanRulings, pageNumber, pageInformation); classificationPage.setCleanRulings(cleanRulings);
classificationPage.setRotation(rotation);
classificationPage.setLandscape(isLandscape);
classificationPage.setPageNumber(pageNumber);
classificationPage.setPageWidth(cropbox.getWidth());
classificationPage.setPageHeight(cropbox.getHeight());
blockificationPostprocessingService.findHeadlinesFromOutline(classificationDocument, pageNumber, classificationPage, pageInformation); if (layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) {
List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage().getOrDefault(pageNumber - 1, new ArrayList<>());
classificationDocument.getLayoutDebugLayer().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber); OutlineObject notFoundOutlineObject = null;
if (lastProcessedOutlineObject != null && !lastProcessedOutlineObject.isFound()) {
lastProcessedOutlineObject.setPoint(new Point2D.Float(0, cropbox.getHeight()));
notFoundOutlineObject = lastProcessedOutlineObject;
}
if (!outlineObjects.isEmpty()) {
classificationPage.setOutlineObjects(outlineObjects);
lastProcessedOutlineObject = blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage, notFoundOutlineObject);
}
}
classificationDocument.getVisualizations().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber);
// MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox. // MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox.
classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents())); classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents()));
@ -355,67 +375,40 @@ public class LayoutParsingPipeline {
originDocument.close(); originDocument.close();
classificationService.classify(classificationDocument, layoutParsingType, identifier); log.info("Calculating BodyTextFrame for {}", identifier);
bodyTextFrameService.setBodyTextFrames(classificationDocument, layoutParsingType);
for (ClassificationPage page : classificationDocument.getPages()) {
classificationDocument.getVisualizations().addCleanRulingVisualization(page.getCleanRulings(), page.getPageNumber());
}
log.info("Classify TextBlocks for {}", identifier);
switch (layoutParsingType) {
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH ->
redactManagerClassificationService.classifyDocument(classificationDocument);
case DOCUMINE_OLD, DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument);
}
SectionTree sectionTree = sectionTreeBuilderService.createSectionTree(classificationDocument); List<TextPageBlock> headlines = classificationDocument.getPages()
classificationDocument.setSectionTree(sectionTree); .stream()
.flatMap(classificationPage -> classificationPage.getTextBlocks()
.stream()
.filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline())
.map(tb -> (TextPageBlock) tb))
.toList();
TableOfContents tableOfContents = outlineValidationService.createToC(headlines);
classificationDocument.setTableOfContents(tableOfContents);
log.info("Building Sections for {}", identifier); log.info("Building Sections for {}", identifier);
switch (layoutParsingType) { switch (layoutParsingType) {
case CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_PARAGRAPH_DEBUG -> sectionsBuilderService.buildParagraphDebugSections(classificationDocument); case CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_PARAGRAPH_DEBUG -> sectionsBuilderService.buildParagraphDebugSections(classificationDocument);
default -> sectionTreeEnhancementService.assignSectionBlocksAndImages(classificationDocument); default -> tocEnrichmentService.assignSectionBlocksAndImages(classificationDocument);
} }
return classificationDocument; return classificationDocument;
} }
/**
 * Transfers per-page layout attributes onto the given classification page.
 *
 * <p>Sets the cleaned rulings, the rotation read from {@code pdPage}, the derived landscape flag,
 * the page number and the page width/height taken from {@code pageInformation}.
 *
 * @param pdPage             page whose rotation value is read
 * @param pdr                rectangle whose width and height are compared to derive the landscape flag
 * @param classificationPage page object that receives all values
 * @param cleanRulings       rulings to attach to the page
 * @param pageNumber         page number to store on the page
 * @param pageInformation    source of the stored page width and height (narrowed to {@code float})
 */
private static void updateClassificationPage(PDPage pdPage,
                                             PDRectangle pdr,
                                             ClassificationPage classificationPage,
                                             CleanRulings cleanRulings,
                                             int pageNumber,
                                             PageInformation pageInformation) {

    int rotation = pdPage.getRotation();

    boolean widerThanTall = pdr.getWidth() > pdr.getHeight();
    boolean tallerThanWide = pdr.getHeight() > pdr.getWidth();
    boolean rotatedByHalfTurns = rotation == 0 || rotation == 180;
    boolean rotatedByQuarterTurns = rotation == 90 || rotation == 270;

    // Landscape: a wide rectangle shown at 0/180 degrees, or a tall rectangle shown at 90/270 degrees.
    boolean isLandscape = (widerThanTall && rotatedByHalfTurns) || (tallerThanWide && rotatedByQuarterTurns);

    classificationPage.setCleanRulings(cleanRulings);
    classificationPage.setRotation(rotation);
    classificationPage.setLandscape(isLandscape);
    classificationPage.setPageNumber(pageNumber);

    float width = (float) pageInformation.width();
    float height = (float) pageInformation.height();
    classificationPage.setPageWidth(width);
    classificationPage.setPageHeight(height);
}
/**
 * For every {@link TextDirection}, averages the exact direction of all characters carrying that
 * direction and, when the average is non-zero, applies a rotation by that average around the
 * centre of the page's media box to every word with the same direction.
 *
 * <p>Directions with no matching characters yield an average of {@code 0} and are left untouched.
 *
 * @param words  words whose characters are inspected and which are transformed in place
 * @param pdPage page whose media box supplies the rotation centre
 */
private static void rotateDirAdjExactly(List<Word> words, PDPage pdPage) {

    for (TextDirection dir : TextDirection.values()) {

        double averageRotation = words.stream()
                .flatMap(word -> word.getCharacters().stream())
                .map(character -> character.getTextPosition())
                .filter(position -> position.getDir().equals(dir))
                .mapToDouble(position -> position.getExactDir())
                .average()
                .orElse(0);

        if (averageRotation != 0) {
            // Anchor of the rotation is the middle of the media box.
            AffineTransform rotateInstance = AffineTransform.getRotateInstance(averageRotation,
                                                                               pdPage.getMediaBox().getWidth() / 2,
                                                                               pdPage.getMediaBox().getHeight() / 2);

            // Only words sharing the direction currently being processed are rotated.
            words.stream()
                    .filter(word -> dir.equals(word.getDir()))
                    .forEach(word -> word.transform(rotateInstance));
        }
    }
}
private void addNumberOfPagesToTrace(int numberOfPages, long size) { private void addNumberOfPagesToTrace(int numberOfPages, long size) {
if (observationRegistry.getCurrentObservation() != null) { if (observationRegistry.getCurrentObservation() != null) {
@ -457,10 +450,10 @@ public class LayoutParsingPipeline {
// Collect all statistics for the classificationPage, except from blocks inside tables, as tables will always be added to BodyTextFrame. // Collect all statistics for the classificationPage, except from blocks inside tables, as tables will always be added to BodyTextFrame.
for (AbstractPageBlock textBlock : classificationPage.getTextBlocks()) { for (AbstractPageBlock textBlock : classificationPage.getTextBlocks()) {
if (textBlock instanceof TextPageBlock) { if (textBlock instanceof TextPageBlock) {
if (((TextPageBlock) textBlock).getWords() == null) { if (((TextPageBlock) textBlock).getSequences() == null) {
continue; continue;
} }
for (Word word : ((TextPageBlock) textBlock).getWords()) { for (TextPositionSequence word : ((TextPageBlock) textBlock).getSequences()) {
classificationPage.getTextHeightCounter().add(word.getTextHeight()); classificationPage.getTextHeightCounter().add(word.getTextHeight());
classificationPage.getFontCounter().add(word.getFont()); classificationPage.getFontCounter().add(word.getFont());
classificationPage.getFontSizeCounter().add(word.getFontSize()); classificationPage.getFontSizeCounter().add(word.getFontSize());

View File

@ -5,7 +5,10 @@ import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.ComponentScan; import org.springframework.context.annotation.ComponentScan;
import org.springframework.context.annotation.Configuration; import org.springframework.context.annotation.Configuration;
import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService; import com.google.common.base.Strings;
import com.knecon.fforesight.service.viewerdoc.service.IViewerDocumentService;
import com.knecon.fforesight.service.viewerdoc.service.pdftron.PDFTronViewerDocumentService;
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService;
import io.micrometer.observation.ObservationRegistry; import io.micrometer.observation.ObservationRegistry;
@ -15,9 +18,14 @@ public class LayoutParsingServiceProcessorConfiguration {
@Bean @Bean
@Autowired @Autowired
public PDFTronViewerDocumentService viewerDocumentService(ObservationRegistry registry) { public IViewerDocumentService viewerDocumentService(ObservationRegistry registry, LayoutparserSettings settings) {
if (!Strings.isNullOrEmpty(settings.getPdftronLicense())) {
return new PDFTronViewerDocumentService(registry, settings.getPdftronLicense());
} else {
return new ViewerDocumentService(registry);
}
return new PDFTronViewerDocumentService(registry);
} }
} }

View File

@ -1,31 +1,26 @@
package com.knecon.fforesight.service.layoutparser.processor; package com.knecon.fforesight.service.layoutparser.processor;
import java.io.ByteArrayInputStream;
import java.io.File; import java.io.File;
import java.io.FileInputStream; import java.io.FileInputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.nio.file.Paths; import java.nio.file.Paths;
import java.nio.file.StandardOpenOption; import java.nio.file.StandardOpenOption;
import java.util.Optional; import java.util.Optional;
import java.util.concurrent.CompletableFuture;
import org.springframework.core.task.TaskExecutor;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.redaction.v1.server.data.DocumentData;
import com.iqser.red.storage.commons.service.StorageService; import com.iqser.red.storage.commons.service.StorageService;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedText; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedText;
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData; import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocVersioningUtility;
import com.knecon.fforesight.tenantcommons.TenantContext; import com.knecon.fforesight.tenantcommons.TenantContext;
import io.micrometer.observation.annotation.Observed; import io.micrometer.observation.annotation.Observed;
@ -41,9 +36,6 @@ public class LayoutParsingStorageService {
private final StorageService storageService; private final StorageService storageService;
private final ObjectMapper objectMapper; private final ObjectMapper objectMapper;
private final TaskExecutor taskExecutor;
@Observed(name = "LayoutParsingStorageService", contextualName = "get-origin-file") @Observed(name = "LayoutParsingStorageService", contextualName = "get-origin-file")
public File getOriginFile(String storageId) throws IOException { public File getOriginFile(String storageId) throws IOException {
@ -61,18 +53,11 @@ public class LayoutParsingStorageService {
} }
File tempFile = createTempFile("viewerDocument", ".pdf"); File tempFile = createTempFile("viewerDocument", ".pdf");
storageService.downloadTo(TenantContext.getTenantId(), storageId, tempFile); storageService.downloadTo(TenantContext.getTenantId(), storageId, tempFile);
if (!ViewerDocVersioningUtility.isCurrentVersion(tempFile)) {
assert tempFile.delete();
return Optional.empty();
}
return Optional.of(tempFile); return Optional.of(tempFile);
} }
@SneakyThrows public ImageServiceResponse getImagesFile(String storageId) throws IOException {
public ImageServiceResponse getImagesFile(String storageId) {
try (InputStream inputStream = getObject(storageId)) { try (InputStream inputStream = getObject(storageId)) {
@ -83,8 +68,7 @@ public class LayoutParsingStorageService {
} }
@SneakyThrows public TableServiceResponse getTablesFile(String storageId) throws IOException {
public TableServiceResponse getTablesFile(String storageId) {
try (var tableClassificationStream = getObject(storageId)) { try (var tableClassificationStream = getObject(storageId)) {
@ -94,45 +78,22 @@ public class LayoutParsingStorageService {
} }
} }
public VisualLayoutParsingResponse getVisualLayoutParsingFile(String storageId) throws IOException {
@SneakyThrows
public VisualLayoutParsingResponse getVisualLayoutParsingFile(String storageId) {
try (InputStream inputStream = getObject(storageId)) { try (InputStream inputStream = getObject(storageId)) {
return objectMapper.readValue(inputStream, VisualLayoutParsingResponse.class); VisualLayoutParsingResponse visualLayoutParsingResponse = objectMapper.readValue(inputStream, VisualLayoutParsingResponse.class);
return visualLayoutParsingResponse;
} }
} }
@SneakyThrows
@Observed(name = "LayoutParsingStorageService", contextualName = "store-document-data") @Observed(name = "LayoutParsingStorageService", contextualName = "store-document-data")
public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, DocumentData documentData) { public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, DocumentData documentData) {
Runnable storeDocumentStructureRunnable = () -> storageService.storeProtoObject(TenantContext.getTenantId(), storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.structureFileStorageId(), documentData.getDocumentStructure());
layoutParsingRequest.structureFileStorageId(), storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.textBlockFileStorageId(), documentData.getDocumentTextData());
documentData.getDocumentStructure()); storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.positionBlockFileStorageId(), documentData.getDocumentPositions());
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.pageFileStorageId(), documentData.getDocumentPages());
CompletableFuture<Void> storeDocumentStructureFuture = CompletableFuture.runAsync(storeDocumentStructureRunnable, taskExecutor);
Runnable storeDocumentTextDataRunnable = () -> storageService.storeProtoObject(TenantContext.getTenantId(),
layoutParsingRequest.textBlockFileStorageId(),
documentData.getDocumentTextData());
CompletableFuture<Void> storeDocumentTextDataFuture = CompletableFuture.runAsync(storeDocumentTextDataRunnable, taskExecutor);
Runnable storeDocumentPositionsRunnable = () -> storageService.storeProtoObject(TenantContext.getTenantId(),
layoutParsingRequest.positionBlockFileStorageId(),
documentData.getDocumentPositionData());
CompletableFuture<Void> storeDocumentPositionsFuture = CompletableFuture.runAsync(storeDocumentPositionsRunnable, taskExecutor);
Runnable storeDocumentPagesRunnable = () -> storageService.storeProtoObject(TenantContext.getTenantId(),
layoutParsingRequest.pageFileStorageId(),
documentData.getDocumentPages());
CompletableFuture<Void> storeDocumentPagesFuture = CompletableFuture.runAsync(storeDocumentPagesRunnable, taskExecutor);
CompletableFuture.allOf(storeDocumentStructureFuture, storeDocumentTextDataFuture, storeDocumentPositionsFuture, storeDocumentPagesFuture).join();
} }
@ -193,16 +154,4 @@ public class LayoutParsingStorageService {
} }
} }
/**
 * Stores the given markdown text under {@code markdownFileStorageId} for the current tenant.
 *
 * <p>The content is encoded as UTF-8 and handed to the storage service as a stream. Checked
 * exceptions raised while storing or closing the stream are rethrown via {@code @SneakyThrows}.
 *
 * @param markdownFileStorageId storage id under which the markdown is written
 * @param markdownContent       markdown text to store
 */
@SneakyThrows
@Observed(name = "LayoutParsingStorageService", contextualName = "store-markdown-file")
public void storeMarkdownFile(String markdownFileStorageId, String markdownContent) {

    byte[] markdownBytes = markdownContent.getBytes(StandardCharsets.UTF_8);
    try (InputStream markdownStream = new ByteArrayInputStream(markdownBytes)) {
        storageService.storeObject(TenantContext.getTenantId(), markdownFileStorageId, markdownStream);
    }
}
} }

View File

@ -13,8 +13,9 @@ import lombok.experimental.FieldDefaults;
@Configuration @Configuration
@ConfigurationProperties("layoutparser") @ConfigurationProperties("layoutparser")
@FieldDefaults(level = AccessLevel.PRIVATE) @FieldDefaults(level = AccessLevel.PRIVATE)
public class LayoutParserSettings { public class LayoutparserSettings {
boolean debug; boolean debug;
LayoutParsingType layoutParsingTypeOverride; LayoutParsingType layoutParsingTypeOverride;
String pdftronLicense;
} }

View File

@ -1,7 +1,6 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum; package com.knecon.fforesight.service.layoutparser.processor.docstrum;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.EnumMap;
import java.util.List; import java.util.List;
import java.util.stream.Collectors; import java.util.stream.Collectors;
@ -18,8 +17,8 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.Zon
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer; import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
@ -27,7 +26,6 @@ import lombok.RequiredArgsConstructor;
@RequiredArgsConstructor @RequiredArgsConstructor
public class DocstrumSegmentationService { public class DocstrumSegmentationService {
public static final double SAME_DIRECTION_THRESHOLD = 0.9;
private final NearestNeighbourService nearestNeighbourService; private final NearestNeighbourService nearestNeighbourService;
private final SpacingService spacingService; private final SpacingService spacingService;
private final LineBuilderService lineBuilderService; private final LineBuilderService lineBuilderService;
@ -35,57 +33,30 @@ public class DocstrumSegmentationService {
private final ReadingOrderService readingOrderService; private final ReadingOrderService readingOrderService;
public List<Zone> segmentPage(List<Word> textPositions, boolean xyOrder, CleanRulings usedRulings, LayoutDebugLayer visualizations) { public List<Zone> segmentPage(List<TextPositionSequence> textPositions, boolean xyOrder, CleanRulings usedRulings, LayoutparsingVisualizations visualizations) {
EnumMap<TextDirection, Integer> directionCounts = new EnumMap<>(TextDirection.class); List<Zone> zones = new ArrayList<>();
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.ZERO));
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.QUARTER_CIRCLE));
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.HALF_CIRCLE));
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.THREE_QUARTER_CIRCLE));
List<Zone> newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.ZERO); return readingOrderService.resolve(zones, xyOrder);
directionCounts.put(TextDirection.ZERO, newZones.size());
List<Zone> zones = new ArrayList<>(newZones);
newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.QUARTER_CIRCLE);
directionCounts.put(TextDirection.QUARTER_CIRCLE, newZones.size());
zones.addAll(newZones);
newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.HALF_CIRCLE);
directionCounts.put(TextDirection.HALF_CIRCLE, newZones.size());
zones.addAll(newZones);
newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.THREE_QUARTER_CIRCLE);
directionCounts.put(TextDirection.THREE_QUARTER_CIRCLE, newZones.size());
zones.addAll(newZones);
return readingOrderService.resolve(zones, xyOrder, mostSameDirection(directionCounts));
} }
private boolean mostSameDirection(EnumMap<TextDirection, Integer> directionCounts) { private List<Zone> computeZones(List<TextPositionSequence> textPositions, CleanRulings rulings, LayoutparsingVisualizations visualizations, TextDirection direction) {
int total = directionCounts.values() List<RedTextPosition> positions = textPositions.stream()
.stream()
.mapToInt(i -> i).sum();
if ((double) directionCounts.get(TextDirection.ZERO) / total > SAME_DIRECTION_THRESHOLD) {
return true;
} else if ((double) directionCounts.get(TextDirection.QUARTER_CIRCLE) / total > SAME_DIRECTION_THRESHOLD) {
return true;
} else if ((double) directionCounts.get(TextDirection.HALF_CIRCLE) / total > SAME_DIRECTION_THRESHOLD) {
return true;
} else if ((double) directionCounts.get(TextDirection.THREE_QUARTER_CIRCLE) / total > SAME_DIRECTION_THRESHOLD) {
return true;
}
return false;
}
private List<Zone> computeZones(List<Word> textPositions, CleanRulings rulings, LayoutDebugLayer visualizations, TextDirection direction) {
List<Character> characters = textPositions.stream()
.filter(t -> t.getDir() == direction) .filter(t -> t.getDir() == direction)
.map(Word::getCharacters) .map(TextPositionSequence::getTextPositions)
.flatMap(List::stream) .flatMap(List::stream)
.toList(); .toList();
List<Character> characters = positions.stream()
.map(Character::new)
.collect(Collectors.toList());
nearestNeighbourService.findNearestNeighbors(characters); nearestNeighbourService.findNearestNeighbors(characters);
double characterSpacing = spacingService.computeCharacterSpacing(characters); double characterSpacing = spacingService.computeCharacterSpacing(characters);

View File

@ -15,16 +15,10 @@ public class AngleFilter {
public boolean matches(Neighbor neighbor) { public boolean matches(Neighbor neighbor) {
return matches(neighbor.getAngle());
}
public boolean matches(double angle) {
if (lowerAngle <= upperAngle) { if (lowerAngle <= upperAngle) {
return lowerAngle <= angle && angle < upperAngle; return lowerAngle <= neighbor.getAngle() && neighbor.getAngle() < upperAngle;
} else { } else {
return lowerAngle <= angle || angle < upperAngle; return lowerAngle <= neighbor.getAngle() || neighbor.getAngle() < upperAngle;
} }
} }

View File

@ -7,12 +7,8 @@ import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.Data; import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.SuperBuilder;
@Data @Data
@SuperBuilder
@NoArgsConstructor
public abstract class BoundingBox { public abstract class BoundingBox {
// Java coordinate system: (0, 0) is always upper left, x is increasing left to right and y is increasing from top to bottom. // Java coordinate system: (0, 0) is always upper left, x is increasing left to right and y is increasing from top to bottom.
@ -23,7 +19,7 @@ public abstract class BoundingBox {
// This rotates completely in 90 degree steps with page rotation. // This rotates completely in 90 degree steps with page rotation.
// Needs to be used when writing to a PDF. // Needs to be used when writing to a PDF.
// Also, these are definitely correct and should be used whenever possible. // Also, these are definitely correct and should be used whenever possible.
protected Rectangle2D bBoxPdf; protected Rectangle2D bBoxInitialUserSpace;
protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f; protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;
@ -54,25 +50,25 @@ public abstract class BoundingBox {
public double getPdfMinX() { public double getPdfMinX() {
return bBoxPdf.getMinX(); return bBoxInitialUserSpace.getMinX();
} }
public double getPdfMaxX() { public double getPdfMaxX() {
return bBoxPdf.getMaxX(); return bBoxInitialUserSpace.getMaxX();
} }
public double getPdfMinY() { public double getPdfMinY() {
return bBoxPdf.getMinY(); return bBoxInitialUserSpace.getMinY();
} }
public double getPdfMaxY() { public double getPdfMaxY() {
return bBoxPdf.getMaxY(); return bBoxInitialUserSpace.getMaxY();
} }
@ -133,31 +129,13 @@ public abstract class BoundingBox {
} }
public boolean intersectsX(BoundingBox other, float threshold) { public boolean intersectsY(BoundingBox other) {
return this.getX() - threshold <= other.getMaxX() && this.getMaxX() + threshold >= other.getX();
}
public boolean intersectsPdf(BoundingBox other) {
return this.intersectsXPdf(other) && this.intersectsYPdf(other);
}
public boolean intersectsPdf(BoundingBox other, float yThreshold, float xThreshold) {
return this.intersectsXPdf(other, xThreshold) && this.intersectsYPdf(other, yThreshold);
}
public boolean intersectsYPdf(BoundingBox other) {
return this.getPdfMinY() <= other.getPdfMaxY() && this.getPdfMaxY() >= other.getPdfMinY(); return this.getPdfMinY() <= other.getPdfMaxY() && this.getPdfMaxY() >= other.getPdfMinY();
} }
public boolean intersectsY(BoundingBox other) { public boolean intersectsYJava(BoundingBox other) {
return this.getY() <= other.getMaxY() && this.getMaxY() >= other.getY(); return this.getY() <= other.getMaxY() && this.getMaxY() >= other.getY();
} }
@ -165,31 +143,25 @@ public abstract class BoundingBox {
public boolean intersectsY(BoundingBox other, float threshold) { public boolean intersectsY(BoundingBox other, float threshold) {
return this.getY() - threshold <= other.getMaxY() && this.getMaxY() + threshold >= other.getY();
}
public boolean intersectsYPdf(BoundingBox other, float threshold) {
return this.getPdfMinY() - threshold <= other.getPdfMaxY() && this.getPdfMaxY() + threshold >= other.getPdfMinY(); return this.getPdfMinY() - threshold <= other.getPdfMaxY() && this.getPdfMaxY() + threshold >= other.getPdfMinY();
} }
public boolean intersectsXPdf(BoundingBox other) {
return this.getPdfMinX() <= other.getPdfMaxX() && this.getPdfMaxX() >= other.getPdfMinX();
}
public boolean intersectsX(BoundingBox other) { public boolean intersectsX(BoundingBox other) {
return this.getPdfMinX() <= other.getPdfMaxX() && this.getPdfMaxX() >= other.getPdfMinX();
}
public boolean intersectsXJava(BoundingBox other) {
return this.getX() <= other.getMaxX() && this.getMaxX() >= other.getMinX(); return this.getX() <= other.getMaxX() && this.getMaxX() >= other.getMinX();
} }
public boolean intersectsXPdf(BoundingBox other, float threshold) { public boolean intersectsX(BoundingBox other, float threshold) {
return this.getPdfMinX() - threshold <= other.getPdfMaxX() && this.getMaxX() + threshold >= other.getPdfMinX(); return this.getPdfMinX() - threshold <= other.getMaxX() && this.getMaxX() + threshold >= other.getPdfMinX();
} }
@ -198,8 +170,8 @@ public abstract class BoundingBox {
this.bBox = components.stream() this.bBox = components.stream()
.map(BoundingBox::getBBox) .map(BoundingBox::getBBox)
.collect(RectangleTransformations.collectBBox()); .collect(RectangleTransformations.collectBBox());
this.bBoxPdf = components.stream() this.bBoxInitialUserSpace = components.stream()
.map(BoundingBox::getBBoxPdf) .map(BoundingBox::getBBoxInitialUserSpace)
.collect(RectangleTransformations.collectBBox()); .collect(RectangleTransformations.collectBBox());
} }
@ -225,55 +197,57 @@ public abstract class BoundingBox {
public double horizontalDistance(BoundingBox other) { public double horizontalDistance(BoundingBox other) {
double rect1Right = getMaxX(); Rectangle2D left;
double rect1Left = getMinX(); Rectangle2D right;
double rect2Right = other.getMaxX(); if (this.leftOf(other)) {
double rect2Left = other.getMinX(); left = this.getBBox();
right = other.getBBox();
if (rect1Left > rect2Right || rect2Left > rect1Right) {
return Math.max(rect2Left - rect1Right, rect1Left - rect2Right);
} else { } else {
return 0; left = other.getBBox();
right = this.getBBox();
} }
return Math.max(0, right.getMinX() - left.getMaxX());
} }
public double verticalDistance(BoundingBox other) { public double verticalDistance(BoundingBox other) {
double rect1Top = getMaxY(); Rectangle2D bottom;
double rect1Bottom = getMinY(); Rectangle2D top;
double rect2Top = other.getMaxY(); if (this.isAbove(other)) {
double rect2Bottom = other.getMinY(); top = this.getBBox();
bottom = other.getBBox();
if (rect1Bottom > rect2Top || rect2Bottom > rect1Top) {
return Math.max(rect2Bottom - rect1Top, rect1Bottom - rect2Top);
} else { } else {
return 0; bottom = this.getBBox();
top = other.getBBox();
} }
return Math.max(0, bottom.getMinY() - top.getMaxY());
} }
public boolean rightOf(BoundingBox other) { public boolean rightOf(BoundingBox other) {
return this.intersectsY(other) && other.getMaxX() <= this.getMinX(); return this.intersectsYJava(other) && other.getMaxX() <= this.getMinX();
} }
public boolean leftOf(BoundingBox other) { public boolean leftOf(BoundingBox other) {
return this.intersectsY(other) && other.getMinX() >= this.getMaxX(); return this.intersectsYJava(other) && other.getMinX() >= this.getMaxX();
} }
public boolean isAbove(BoundingBox other) { public boolean isAbove(BoundingBox other) {
return this.intersectsX(other) && other.getMinY() >= this.getMaxY(); return this.intersectsXJava(other) && other.getMinY() >= this.getMaxY();
} }
public boolean isBelow(BoundingBox other) { public boolean isBelow(BoundingBox other) {
return this.intersectsX(other) && this.getMinY() >= other.getMaxY(); return this.intersectsXJava(other) && this.getMinY() >= other.getMaxY();
} }
} }

View File

@ -35,7 +35,7 @@ public class Character {
public double getHeight() { public double getHeight() {
return textPosition.getHeightDirAdj(); return textPosition.getHeightDir();
} }
@ -65,9 +65,9 @@ public class Character {
double s = Math.sin(-0); double s = Math.sin(-0);
double c = Math.cos(-0); double c = Math.cos(-0);
xs[0] = c * x - s * y; xs[0] = c * x - s * y;
xs[1] = c * (x + textPosition.getWidthDirAdj()) - s * (y + textPosition.getHeightDirAdj()); xs[1] = c * (x + textPosition.getWidthDirAdj()) - s * (y + textPosition.getHeightDir());
xs[2] = c * other.x - s * other.y; xs[2] = c * other.x - s * other.y;
xs[3] = c * (other.x + other.textPosition.getWidthDirAdj()) - s * (other.y + other.textPosition.getHeightDirAdj()); xs[3] = c * (other.x + other.textPosition.getWidthDirAdj()) - s * (other.y + other.textPosition.getHeightDir());
boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0]; boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0];
Arrays.sort(xs); Arrays.sort(xs);
return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1); return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1);

View File

@ -1,30 +1,20 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model; package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
import static com.knecon.fforesight.service.layoutparser.processor.model.text.Word.BOLD;
import static com.knecon.fforesight.service.layoutparser.processor.model.text.Word.BOLD_ITALIC;
import static com.knecon.fforesight.service.layoutparser.processor.model.text.Word.ITALIC;
import static com.knecon.fforesight.service.layoutparser.processor.model.text.Word.STANDARD;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Comparator;
import java.util.EnumMap;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2; import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.model.text.FontStyle;
import lombok.Data; import lombok.Data;
import lombok.EqualsAndHashCode; import lombok.EqualsAndHashCode;
@Data @Data
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = false) @EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class Line extends TextBoundingBox { public class Line extends BoundingBox {
private static final double WORD_DISTANCE_MULTIPLIER = 0.17; private static final double WORD_DISTANCE_MULTIPLIER = 0.18;
@EqualsAndHashCode.Include @EqualsAndHashCode.Include
private final double x0; private final double x0;
@ -36,13 +26,16 @@ public class Line extends TextBoundingBox {
@EqualsAndHashCode.Include @EqualsAndHashCode.Include
private final double y1; private final double y1;
private FontStyle fontStyle; private final double height;
private final List<Word> words; private final List<Character> characters;
private final List<TextPositionSequence> words = new ArrayList<>();
public Line(List<Character> characters, double wordSpacing) { public Line(List<Character> characters, double wordSpacing) {
this.characters = characters;
if (characters.size() >= 2) { if (characters.size() >= 2) {
// linear regression // linear regression
double sx = 0.0; double sx = 0.0;
@ -71,43 +64,9 @@ public class Line extends TextBoundingBox {
this.y0 = character.getY() - dy; this.y0 = character.getY() - dy;
this.y1 = character.getY() + dy; this.y1 = character.getY() + dy;
} }
this.words = new ArrayList<>(); height = computeHeight();
computeWords(characters, wordSpacing * WORD_DISTANCE_MULTIPLIER); computeWords(wordSpacing * WORD_DISTANCE_MULTIPLIER);
buildBBox(); buildBBox();
computeFontStyle();
}
public Line(List<Word> words) {
this.words = words;
buildBBox();
x0 = getMinX();
y0 = getMinY();
x1 = getMaxX();
y1 = getMaxY();
computeFontStyle();
}
private void computeFontStyle() {
EnumMap<FontStyle, AtomicInteger> fontStyleCounter = new EnumMap<>(FontStyle.class);
for (FontStyle fontStyle : FontStyle.values()) {
fontStyleCounter.put(fontStyle, new AtomicInteger(0));
}
for (Word word : words) {
switch (word.getFontStyle()) {
case STANDARD -> fontStyleCounter.get(FontStyle.REGULAR).getAndIncrement();
case BOLD -> fontStyleCounter.get(FontStyle.BOLD).getAndIncrement();
case ITALIC -> fontStyleCounter.get(FontStyle.ITALIC).getAndIncrement();
case BOLD_ITALIC -> fontStyleCounter.get(FontStyle.BOLD_ITALIC).getAndIncrement();
}
}
fontStyle = fontStyleCounter.entrySet()
.stream()
.max(Comparator.comparing(entry -> entry.getValue().get()))
.map(Map.Entry::getKey).orElse(FontStyle.REGULAR);
} }
@ -123,6 +82,14 @@ public class Line extends TextBoundingBox {
} }
private double computeHeight() {
return characters.stream()
.map(Character::getHeight)
.reduce(0d, Double::sum) / characters.size();
}
public double angularDifference(Line j) { public double angularDifference(Line j) {
double diff = Math.abs(getAngle() - j.getAngle()); double diff = Math.abs(getAngle() - j.getAngle());
@ -155,22 +122,19 @@ public class Line extends TextBoundingBox {
} }
private void computeWords(List<Character> characters, double wordSpacing) { private void computeWords(double wordSpacing) {
// Imo, the width of space should be scaled with the font size, but it only depends on the median distance between horizontal neighbours. TextPositionSequence word = new TextPositionSequence();
// If there are large differences in fontsize on a page, this might lead to missing spaces for the smaller fonts and too many for larger fonts.
// I've just now changed the scaling factor. If you come across this comment with missing whitespaces again, try scaling the fontsize instead of simply changing the factor again.
Word word = new Word();
Character previous = null; Character previous = null;
for (Character current : characters) { for (Character current : characters) {
if (previous != null) { if (previous != null) {
double dist = current.getTextPosition().getXDirAdj() - previous.getTextPosition().getXDirAdj() - previous.getTextPosition().getWidthDirAdj(); double dist = current.getTextPosition().getXDirAdj() - previous.getTextPosition().getXDirAdj() - previous.getTextPosition().getWidthDirAdj();
if (dist > wordSpacing) { if (dist > wordSpacing) {
words.add(word); words.add(word);
word = new Word(); word = new TextPositionSequence();
} }
} }
word.add(current); word.getTextPositions().add(current.getTextPosition());
previous = current; previous = current;
} }
words.add(word); words.add(word);
@ -179,7 +143,9 @@ public class Line extends TextBoundingBox {
private void buildBBox() { private void buildBBox() {
this.setToBBoxOfComponents(words); this.setToBBoxOfComponents(characters.stream()
.map(Character::getTextPosition)
.toList());
} }

View File

@ -1,180 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
import java.awt.geom.Rectangle2D;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;
import lombok.experimental.SuperBuilder;
/**
 * A {@link BoundingBox} that additionally carries a second rectangle, {@code bBoxDirAdj}, and the
 * {@link TextDirection} shared by its text components.
 *
 * <p>Every {@code *DirAdj} accessor and predicate in this class reads {@code bBoxDirAdj} only.
 * NOTE(review): {@code bBoxDirAdj} is presumably the box expressed in direction-adjusted
 * coordinates (normalized for text rotation) - confirm against the code that populates it.
 */
@Getter
@Setter
@SuperBuilder
@NoArgsConstructor
@EqualsAndHashCode(callSuper = false)
public abstract class TextBoundingBox extends BoundingBox {

    protected Rectangle2D bBoxDirAdj;

    protected TextDirection dir;


    /**
     * Recomputes the inherited boxes via the superclass, then derives {@code bBoxDirAdj} and
     * {@code dir} from those components that are themselves {@link TextBoundingBox} instances.
     *
     * <p>If no component is a {@link TextBoundingBox}, {@code dir} falls back to
     * {@link TextDirection#ZERO}.
     *
     * @param components the components to enclose
     * @throws IllegalArgumentException if the text components report more than one direction
     */
    @Override
    public void setToBBoxOfComponents(List<? extends BoundingBox> components) {

        super.setToBBoxOfComponents(components);

        // Select and cast the text components once; both aggregations below reuse the result.
        List<TextBoundingBox> textComponents = components.stream()
                .filter(TextBoundingBox.class::isInstance)
                .map(TextBoundingBox.class::cast)
                .toList();

        this.bBoxDirAdj = textComponents.stream()
                .map(TextBoundingBox::getBBoxDirAdj)
                .collect(RectangleTransformations.collectBBox());

        Set<TextDirection> textDirections = textComponents.stream()
                .map(TextBoundingBox::getDir)
                .collect(Collectors.toSet());

        if (textDirections.isEmpty()) {
            dir = TextDirection.ZERO;
        } else if (textDirections.size() > 1) {
            throw new IllegalArgumentException("More than one text direction found");
        } else {
            dir = textDirections.iterator().next();
        }
    }


    /** @return the x coordinate of {@code bBoxDirAdj} */
    public double getXDirAdj() {

        return this.bBoxDirAdj.getX();
    }


    /** @return the y coordinate of {@code bBoxDirAdj} */
    public double getYDirAdj() {

        return this.bBoxDirAdj.getY();
    }


    /** @return the width of {@code bBoxDirAdj} */
    public double getWidthDirAdj() {

        return this.bBoxDirAdj.getWidth();
    }


    /** @return the height of {@code bBoxDirAdj} */
    public double getHeightDirAdj() {

        return this.bBoxDirAdj.getHeight();
    }


    /** @return the largest x coordinate of {@code bBoxDirAdj} */
    public double getMaxXDirAdj() {

        return this.bBoxDirAdj.getMaxX();
    }


    /** @return the largest y coordinate of {@code bBoxDirAdj} */
    public double getMaxYDirAdj() {

        return this.bBoxDirAdj.getMaxY();
    }


    /** @return the y coordinate of the center of {@code bBoxDirAdj} */
    public double getCenterYDirAdj() {

        return this.bBoxDirAdj.getCenterY();
    }


    /** @return the x coordinate of the center of {@code bBoxDirAdj} */
    public double getCenterXDirAdj() {

        return this.bBoxDirAdj.getCenterX();
    }


    /**
     * Horizontal gap between the two {@code bBoxDirAdj} rectangles.
     *
     * @param other the box to measure against
     * @return the gap along x when the x ranges are disjoint, otherwise 0
     */
    public double horizontalDistanceDirAdj(TextBoundingBox other) {

        double rect1Right = getMaxXDirAdj();
        double rect1Left = getXDirAdj();
        double rect2Right = other.getMaxXDirAdj();
        double rect2Left = other.getXDirAdj();

        if (rect1Left > rect2Right || rect2Left > rect1Right) {
            return Math.max(rect2Left - rect1Right, rect1Left - rect2Right);
        } else {
            return 0;
        }
    }


    /**
     * Vertical gap between the two {@code bBoxDirAdj} rectangles.
     *
     * @param other the box to measure against
     * @return the gap along y when the y ranges are disjoint, otherwise 0
     */
    public double verticalDistanceDirAdj(TextBoundingBox other) {

        double rect1Top = getMaxYDirAdj();
        double rect1Bottom = getYDirAdj();
        double rect2Top = other.getMaxYDirAdj();
        double rect2Bottom = other.getYDirAdj();

        if (rect1Bottom > rect2Top || rect2Bottom > rect1Top) {
            return Math.max(rect2Bottom - rect1Top, rect1Bottom - rect2Top);
        } else {
            return 0;
        }
    }


    /** @return true if the {@code bBoxDirAdj} ranges overlap (inclusive) on both axes */
    public boolean intersectsDirAdj(TextBoundingBox other) {

        return this.intersectsXDirAdj(other) && this.intersectsYDirAdj(other);
    }


    /**
     * Like {@link #intersectsDirAdj(TextBoundingBox)}, but this box is grown by the given
     * thresholds before testing.
     *
     * @param other      the box to test against
     * @param yThreshold tolerance applied on the y axis
     * @param xThreshold tolerance applied on the x axis
     */
    public boolean intersectsDirAdj(TextBoundingBox other, float yThreshold, float xThreshold) {

        return this.intersectsXDirAdj(other, xThreshold) && this.intersectsYDirAdj(other, yThreshold);
    }


    /** @return true if the x ranges of the {@code bBoxDirAdj} rectangles overlap within {@code threshold} */
    public boolean intersectsXDirAdj(TextBoundingBox other, float threshold) {

        return this.getXDirAdj() - threshold <= other.getMaxXDirAdj() && this.getMaxXDirAdj() + threshold >= other.getXDirAdj();
    }


    /** @return true if the x ranges of the {@code bBoxDirAdj} rectangles overlap (inclusive) */
    public boolean intersectsXDirAdj(TextBoundingBox other) {

        return this.getXDirAdj() <= other.getMaxXDirAdj() && this.getMaxXDirAdj() >= other.getXDirAdj();
    }


    /** @return true if the y ranges of the {@code bBoxDirAdj} rectangles overlap (inclusive) */
    public boolean intersectsYDirAdj(TextBoundingBox other) {

        return this.getYDirAdj() <= other.getMaxYDirAdj() && this.getMaxYDirAdj() >= other.getYDirAdj();
    }


    /** @return true if the y ranges of the {@code bBoxDirAdj} rectangles overlap within {@code threshold} */
    public boolean intersectsYDirAdj(TextBoundingBox other, float threshold) {

        return this.getYDirAdj() - threshold <= other.getMaxYDirAdj() && this.getMaxYDirAdj() + threshold >= other.getYDirAdj();
    }


    /**
     * Mirror of {@link #isBelowDirAdj(TextBoundingBox)}: this box is above {@code other} exactly
     * when {@code other} is below this box, both evaluated on {@code bBoxDirAdj}.
     */
    public boolean isAboveDirAdj(TextBoundingBox other) {

        // Must delegate to the DirAdj variant; the inherited isBelow does not read bBoxDirAdj.
        return other.isBelowDirAdj(this);
    }


    /**
     * @return true if the x ranges of the {@code bBoxDirAdj} rectangles overlap and this box's
     *         y is greater than or equal to the other box's maximum y
     */
    public boolean isBelowDirAdj(TextBoundingBox other) {

        return this.intersectsXDirAdj(other) && this.getYDirAdj() >= other.getMaxYDirAdj();
    }

}

View File

@ -28,10 +28,4 @@ public class UnionFind<T> extends org.jgrapht.alg.util.UnionFind<T> {
return setRep.values(); return setRep.values();
} }
/**
 * Returns every element known to this union-find structure, i.e. the keys of the parent map
 * exposed by {@code getParentMap()}.
 *
 * @return the key collection of the parent map
 */
public Collection<T> getElements() {

    final Collection<T> elements = getParentMap().keySet();
    return elements;
}
} }

View File

@ -6,11 +6,9 @@ import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.Data; import lombok.Data;
import lombok.EqualsAndHashCode;
@Data @Data
@EqualsAndHashCode(callSuper = false) public class Zone extends BoundingBox {
public class Zone extends TextBoundingBox {
private List<Line> lines; private List<Line> lines;
@ -18,6 +16,7 @@ public class Zone extends TextBoundingBox {
@SuppressWarnings("PMD.ConstructorCallsOverridableMethod") @SuppressWarnings("PMD.ConstructorCallsOverridableMethod")
public Zone(List<Line> lines) { public Zone(List<Line> lines) {
lines.sort(Comparator.comparingDouble(Line::getY0));
this.lines = lines; this.lines = lines;
setToBBoxOfComponents(lines); setToBBoxOfComponents(lines);
} }

View File

@ -17,7 +17,7 @@ public class LineBuilderService {
private static final double CHARACTER_SPACING_DISTANCE_MULTIPLIER = 3.5; private static final double CHARACTER_SPACING_DISTANCE_MULTIPLIER = 3.5;
private static final double LINE_SPACING_THRESHOLD_MULTIPLIER = 0.67; private static final double LINE_SPACING_THRESHOLD_MULTIPLIER = 0.67;
private static final double ANGLE_TOLERANCE = Math.toRadians(5); private static final double ANGLE_TOLERANCE = Math.PI / 6;
public List<Line> buildLines(List<Character> characters, double characterSpacing, double lineSpacing, CleanRulings rulings) { public List<Line> buildLines(List<Character> characters, double characterSpacing, double lineSpacing, CleanRulings rulings) {

View File

@ -1,17 +1,15 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service; package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Comparator; import java.util.Comparator;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.ListIterator; import java.util.ListIterator;
import java.util.Map; import java.util.Map;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils; import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
@ -21,30 +19,21 @@ public class ReadingOrderService {
private static final double THRESHOLD = 5; private static final double THRESHOLD = 5;
public static final double MULTI_COLUMN_DETECTION_THRESHOLD = 1.5; public static final double MULTI_COLUMN_DETECTION_THRESHOLD = 1.5;
private static final Comparator<TextBoundingBox> COMPARATOR = //
Comparator.comparing(TextBoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(TextBoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD));
private static final Comparator<TextBoundingBox> COMPARATOR_DIR_ADJ = // public List<Zone> resolve(List<Zone> zones, boolean xyReadingOrder) {
Comparator.comparing(TextBoundingBox::getYDirAdj, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(TextBoundingBox::getXDirAdj, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD));
public List<Zone> resolve(List<Zone> zones, boolean xyReadingOrder, boolean useDirAdjCoords) {
if (zones.isEmpty() || zones.size() == 1) { if (zones.isEmpty() || zones.size() == 1) {
return zones; return zones;
} }
if (xyReadingOrder) { if (xyReadingOrder) {
return resolveSingleColumnReadingOrder(zones, useDirAdjCoords); return resolveSingleColumnReadingOrder(zones);
} }
Map<Long, Integer> histogram = new HashMap<>(); Map<Long, Integer> histogram = new HashMap<>();
for (Zone zone : zones) { for (Zone zone : zones) {
Rectangle2D bbox = useDirAdjCoords ? zone.getBBoxDirAdj() : zone.getBBox(); long minY = Math.round(zone.getBBox().getMinY());
long minY = Math.round(bbox.getMinY()); long maxY = Math.round(zone.getBBox().getMaxY());
long maxY = Math.round(bbox.getMaxY());
for (long i = minY; i <= maxY; i++) { for (long i = minY; i <= maxY; i++) {
histogram.put(i, histogram.getOrDefault(i, 0) + 1); histogram.put(i, histogram.getOrDefault(i, 0) + 1);
} }
@ -54,32 +43,24 @@ public class ReadingOrderService {
.stream() .stream()
.mapToInt(Integer::intValue).average() .mapToInt(Integer::intValue).average()
.orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) { .orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) {
return resolveSingleColumnReadingOrder(zones, useDirAdjCoords); return resolveSingleColumnReadingOrder(zones);
} else { } else {
return resolveMultiColumnReadingOder(zones, useDirAdjCoords); return resolveMultiColumnReadingOder(zones);
} }
} }
private static List<Zone> resolveSingleColumnReadingOrder(List<Zone> zones, boolean useDirAdjCoords) { private static List<Zone> resolveSingleColumnReadingOrder(List<Zone> zones) {
if (useDirAdjCoords) { zones.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
return zones.stream() .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
.collect(Collectors.groupingBy(TextBoundingBox::getDir)).values()
.stream()
.flatMap(words -> words.stream()
.sorted(COMPARATOR_DIR_ADJ))
.toList();
}
zones.sort(COMPARATOR);
return zones; return zones;
} }
private List<Zone> resolveMultiColumnReadingOder(List<Zone> zones, boolean useDirAdjCoords) { private List<Zone> resolveMultiColumnReadingOder(List<Zone> zones) {
// Simple reading order resolver for multi column page layout as described here : https://pub.towardsai.net/advanced-rag-02-unveiling-pdf-parsing-b84ae866344e // Simple reading order resolver for multi column page layout as described here : https://pub.towardsai.net/advanced-rag-02-unveiling-pdf-parsing-b84ae866344e
// TODO implement a more fancy reading order resolver see https://github.com/BobLd/DocumentLayoutAnalysis/blob/master/README.md#reading-order // TODO implement a more fancy reading order resolver see https://github.com/BobLd/DocumentLayoutAnalysis/blob/master/README.md#reading-order
@ -88,12 +69,11 @@ public class ReadingOrderService {
double maxX = Double.NEGATIVE_INFINITY; double maxX = Double.NEGATIVE_INFINITY;
for (Zone zone : zones) { for (Zone zone : zones) {
Rectangle2D bbox = useDirAdjCoords ? zone.getBBoxDirAdj() : zone.getBBox(); if (zone.getX() < minX) {
if (bbox.getX() < minX) { minX = zone.getX();
minX = zone.getXDirAdj();
} }
if (bbox.getMaxX() > maxX) { if (zone.getX() + zone.getWidth() > maxX) {
maxX = zone.getMaxXDirAdj(); maxX = zone.getX() + zone.getWidth();
} }
} }
@ -102,27 +82,24 @@ public class ReadingOrderService {
List<Zone> leftOf = new ArrayList<>(); List<Zone> leftOf = new ArrayList<>();
List<Zone> rightOf = new ArrayList<>(); List<Zone> rightOf = new ArrayList<>();
List<Zone> middle = new ArrayList<>(); List<Zone> middle = new ArrayList<>();
for (Zone zone : zones) { for (Zone zone : zones) {
Rectangle2D bbox = useDirAdjCoords ? zone.getBBoxDirAdj() : zone.getBBox(); if (zone.getX() < midLineXCoordinate && zone.getX() + zone.getWidth() < midLineXCoordinate) {
if (bbox.getX() < midLineXCoordinate && bbox.getX() + bbox.getWidth() < midLineXCoordinate) {
leftOf.add(zone); leftOf.add(zone);
} else if (bbox.getX() > midLineXCoordinate && bbox.getX() + bbox.getWidth() > midLineXCoordinate) { } else if (zone.getX() > midLineXCoordinate && zone.getX() + zone.getWidth() > midLineXCoordinate) {
rightOf.add(zone); rightOf.add(zone);
} else { } else {
middle.add(zone); middle.add(zone);
} }
} }
if (useDirAdjCoords) { leftOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
leftOf.sort(COMPARATOR_DIR_ADJ); .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
rightOf.sort(COMPARATOR_DIR_ADJ);
middle.sort(COMPARATOR_DIR_ADJ); rightOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
} else { .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
leftOf.sort(COMPARATOR);
rightOf.sort(COMPARATOR); middle.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
middle.sort(COMPARATOR); .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
}
/* /*
List<Zone> leftNotIntersecting = new ArrayList<>(); List<Zone> leftNotIntersecting = new ArrayList<>();
for (Zone leftZone : leftOf) { for (Zone leftZone : leftOf) {
@ -174,9 +151,8 @@ public class ReadingOrderService {
while (itty.hasNext()) { while (itty.hasNext()) {
Zone current = itty.next(); Zone current = itty.next();
Rectangle2D bbox = useDirAdjCoords ? current.getBBoxDirAdj() : current.getBBox();
for (int i = 0; i < sortedZones.size(); i++) { for (int i = 0; i < sortedZones.size(); i++) {
if (bbox.getY() < sortedZones.get(i).getY()) { if (current.getY() < sortedZones.get(i).getY()) {
sortedZones.add(i, current); sortedZones.add(i, current);
itty.remove(); itty.remove();
break; break;

View File

@ -1,7 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service; package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collection; import java.util.Comparator;
import java.util.HashSet; import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Set; import java.util.Set;
@ -9,12 +9,11 @@ import java.util.stream.Collectors;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
@Service @Service
public class ZoneBuilderService { public class ZoneBuilderService {
@ -22,7 +21,7 @@ public class ZoneBuilderService {
private static final double MIN_HORIZONTAL_DISTANCE_MULTIPLIER = -0.5; private static final double MIN_HORIZONTAL_DISTANCE_MULTIPLIER = -0.5;
private static final double MAX_VERTICAL_DISTANCE_MULTIPLIER = 1.2; private static final double MAX_VERTICAL_DISTANCE_MULTIPLIER = 1.2;
private static final double MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER = -7; private static final double MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER = -3.0;
private static final double MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER = 0.5; private static final double MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER = 0.5;
@ -30,7 +29,7 @@ public class ZoneBuilderService {
private static final double MAX_LINE_SIZE_SCALE = 2.5; private static final double MAX_LINE_SIZE_SCALE = 2.5;
private static final double ANGLE_TOLERANCE = Math.toRadians(5); private static final double ANGLE_TOLERANCE = Math.PI / 6;
private static final double MAX_VERTICAL_MERGE_DISTANCE = 0.5; private static final double MAX_VERTICAL_MERGE_DISTANCE = 0.5;
@ -39,7 +38,7 @@ public class ZoneBuilderService {
double minHorizontalDistance = characterSpacing * MIN_HORIZONTAL_DISTANCE_MULTIPLIER; double minHorizontalDistance = characterSpacing * MIN_HORIZONTAL_DISTANCE_MULTIPLIER;
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_DISTANCE_MULTIPLIER; double maxVerticalDistance = lineSpacing * MAX_VERTICAL_DISTANCE_MULTIPLIER;
double minHorizontalMergeDistance = lineSpacing * MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER; double minHorizontalMergeDistance = characterSpacing * MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER;
double maxVerticalMergeDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER; double maxVerticalMergeDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER;
UnionFind<Line> unionFind = new UnionFind<>(new HashSet<>(lines)); UnionFind<Line> unionFind = new UnionFind<>(new HashSet<>(lines));
@ -55,26 +54,11 @@ public class ZoneBuilderService {
return; return;
} }
// if (!innerLine.getFontStyle().equals(outerLine.getFontStyle()) // double scale = Math.min(outerLine.getHeight(), innerLine.getHeight()) / meanHeight;
// && !outerLine.intersectsY(innerLine, -2f)) { scale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(scale, MAX_LINE_SIZE_SCALE));
// return;
// }
double horizontalScale = Math.min(outerLine.getHeightDirAdj(), innerLine.getHeightDirAdj()) / meanHeight; double horizontalDistance = outerLine.horizontalDistance(innerLine) / scale;
horizontalScale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(horizontalScale, MAX_LINE_SIZE_SCALE)); double verticalDistance = outerLine.verticalDistance(innerLine) / scale;
double verticalScale = horizontalScale;
// if (innerLine.toString().endsWith(":")
// || outerLine.toString().endsWith(":")
// || numericalIdentifierPattern.matcher(innerLine.toString()).matches()
// || numericalIdentifierPattern.matcher(outerLine.toString()).matches()) {
//
// horizontalScale *= 5;
// verticalScale /= 10;
// }
double horizontalDistance = outerLine.horizontalDistance(innerLine) / horizontalScale;
double verticalDistance = outerLine.verticalDistance(innerLine) / verticalScale;
if ((!(minHorizontalDistance <= horizontalDistance) || !(verticalDistance <= maxVerticalDistance)) // if ((!(minHorizontalDistance <= horizontalDistance) || !(verticalDistance <= maxVerticalDistance)) //
&& (!(minHorizontalMergeDistance <= horizontalDistance) || !(verticalDistance <= maxVerticalMergeDistance))) { && (!(minHorizontalMergeDistance <= horizontalDistance) || !(verticalDistance <= maxVerticalMergeDistance))) {
@ -103,7 +87,7 @@ public class ZoneBuilderService {
double weights = 0.0; double weights = 0.0;
for (Line line : lines) { for (Line line : lines) {
double weight = line.getLength(); double weight = line.getLength();
meanHeight += line.getHeightDirAdj() * weight; meanHeight += line.getHeight() * weight;
weights += weight; weights += weight;
} }
meanHeight /= weights; meanHeight /= weights;
@ -113,14 +97,64 @@ public class ZoneBuilderService {
private Zone mergeLinesInZone(List<Line> lines, double characterSpacing, double lineSpacing) { private Zone mergeLinesInZone(List<Line> lines, double characterSpacing, double lineSpacing) {
Set<Word> words = lines.stream() double maxHorizontalDistance = 0;
.map(Line::getWords) double minVerticalDistance = 0;
.flatMap(Collection::stream) double maxVerticalDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE;
.collect(Collectors.toSet());
Collection<Set<Word>> groupedLines = TextPositionOperations.groupByLine(words);
List<Line> sortedLines = TextPositionOperations.sortLines(groupedLines); UnionFind<Line> unionFind = new UnionFind<>(new HashSet<>(lines));
return new Zone(sortedLines);
lines.forEach(outer -> {
lines.forEach(inner -> {
if (inner == outer) {
return;
}
double horizontalDistance = outer.horizontalDistance(inner);
double verticalDistance = outer.verticalDistance(inner);
if (horizontalDistance <= maxHorizontalDistance && minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance) {
unionFind.union(outer, inner);
} else if (minVerticalDistance <= verticalDistance
&& verticalDistance <= maxVerticalDistance
&& Math.abs(horizontalDistance - Math.min(outer.getLength(), inner.getLength())) < 0.1) {
boolean characterOverlap = false;
int overlappingCount = 0;
for (Character outerCharacter : outer.getCharacters()) {
for (Character innerCharacter : inner.getCharacters()) {
double characterOverlapDistance = outerCharacter.overlappingDistance(innerCharacter);
if (characterOverlapDistance > 2) {
characterOverlap = true;
}
if (characterOverlapDistance > 0) {
overlappingCount++;
}
}
}
if (!characterOverlap && overlappingCount <= 2) {
unionFind.union(outer, inner);
}
}
});
});
List<Line> outputZone = new ArrayList<>();
for (Set<Line> group : unionFind.getGroups()) {
List<Character> characters = new ArrayList<>();
for (Line line : group) {
characters.addAll(line.getCharacters());
}
characters.sort(Comparator.comparingDouble(Character::getX));
outputZone.add(new Line(characters, characterSpacing));
}
return new Zone(outputZone.stream()
.sorted(Comparator.comparing(Line::getY0))
.collect(Collectors.toList()));
} }
} }

View File

@ -7,12 +7,9 @@ public class DoubleUtils {
if (Double.isNaN(d1) || Double.isNaN(d2)) { if (Double.isNaN(d1) || Double.isNaN(d2)) {
return Double.compare(d1, d2); return Double.compare(d1, d2);
} }
long i1 = Math.round(d1 / (precision == 0 ? 1 : precision));
if (Math.abs(d1 - d2) < precision) { long i2 = Math.round(d2 / (precision == 0 ? 1 : precision));
return 0; return Long.compare(i1, i2);
}
return Double.compare(d1, d2);
} }
} }

View File

@ -4,7 +4,7 @@ import java.util.HashSet;
import java.util.Set; import java.util.Set;
import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.LayoutEngine; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;

View File

@ -4,10 +4,10 @@ import java.util.ArrayList;
import java.util.List; import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree; import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTree; import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText; import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer; import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
import lombok.Data; import lombok.Data;
import lombok.NoArgsConstructor; import lombok.NoArgsConstructor;
@ -25,12 +25,12 @@ public class ClassificationDocument {
private FloatFrequencyCounter fontSizeCounter = new FloatFrequencyCounter(); private FloatFrequencyCounter fontSizeCounter = new FloatFrequencyCounter();
private StringFrequencyCounter fontCounter = new StringFrequencyCounter(); private StringFrequencyCounter fontCounter = new StringFrequencyCounter();
private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter(); private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter();
private LayoutDebugLayer layoutDebugLayer = new LayoutDebugLayer(); private LayoutparsingVisualizations visualizations = new LayoutparsingVisualizations();
private boolean headlines; private boolean headlines;
private long rulesVersion; private long rulesVersion;
private OutlineObjectTree outlineObjectTree; private OutlineObjectTree outlineObjectTree;
private SectionTree sectionTree; private TableOfContents tableOfContents;
} }

View File

@ -18,7 +18,6 @@ import lombok.RequiredArgsConstructor;
@Data @Data
@RequiredArgsConstructor @RequiredArgsConstructor
public class ClassificationPage { public class ClassificationPage {
@NonNull @NonNull
@ -26,7 +25,7 @@ public class ClassificationPage {
private List<OutlineObject> outlineObjects = new ArrayList<>(); private List<OutlineObject> outlineObjects = new ArrayList<>();
private List<AbstractPageBlock> headlines = new ArrayList<>(); private List<AbstractPageBlock> headlines = new ArrayList<>();
private List<ClassifiedImage> images = new ArrayList<>(); private List<ClassifiedImage> images = new ArrayList<>();
@ -45,7 +44,7 @@ public class ClassificationPage {
private float pageWidth; private float pageWidth;
private float pageHeight; private float pageHeight;
private CleanRulings cleanRulings; CleanRulings cleanRulings;
private Map<String, List<Rectangle2D>> markedContentBboxPerType = new HashMap<>(); private Map<String, List<Rectangle2D>> markedContentBboxPerType = new HashMap<>();

View File

@ -1,19 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model;
import java.util.Map;
import java.util.stream.Collectors;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.NodeType;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
/**
 * Pairs a parsed {@link Document} with the {@link LayoutDebugLayer} that belongs to it.
 *
 * @param document         the document whose sub-nodes are counted by {@link #buildSemanticNodeCounts()}
 * @param layoutDebugLayer the debug layer carried alongside the document
 */
public record DocumentWithVisualization(Document document, LayoutDebugLayer layoutDebugLayer) {

    /**
     * Counts the nodes streamed by {@code document.streamAllSubNodes()}, grouped by their node type.
     *
     * @return a map from node type to the number of streamed nodes of that type
     */
    public Map<NodeType, Long> buildSemanticNodeCounts() {

        return document().streamAllSubNodes()
                .collect(Collectors.groupingBy(node -> node.getType(), Collectors.counting()));
    }

}

View File

@ -1,5 +1,6 @@
package com.knecon.fforesight.service.layoutparser.processor.model; package com.knecon.fforesight.service.layoutparser.processor.model;
import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
@ -11,14 +12,10 @@ import lombok.Getter;
@Getter @Getter
public class FloatFrequencyCounter { public class FloatFrequencyCounter {
Map<Double, Integer> countPerValue = new HashMap<>(); Map<Float, Integer> countPerValue = new HashMap<>();
boolean changed;
Double mostPopularCache;
public void add(double value) { public void add(float value) {
changed = true;
if (!countPerValue.containsKey(value)) { if (!countPerValue.containsKey(value)) {
countPerValue.put(value, 1); countPerValue.put(value, 1);
@ -28,11 +25,9 @@ public class FloatFrequencyCounter {
} }
public void addAll(Map<Double, Integer> otherCounter) { public void addAll(Map<Float, Integer> otherCounter) {
changed = true; for (Map.Entry<Float, Integer> entry : otherCounter.entrySet()) {
for (Map.Entry<Double, Integer> entry : otherCounter.entrySet()) {
if (countPerValue.containsKey(entry.getKey())) { if (countPerValue.containsKey(entry.getKey())) {
countPerValue.put(entry.getKey(), countPerValue.get(entry.getKey()) + entry.getValue()); countPerValue.put(entry.getKey(), countPerValue.get(entry.getKey()) + entry.getValue());
} else { } else {
@ -42,36 +37,36 @@ public class FloatFrequencyCounter {
} }
public Double getMostPopular() { public Float getMostPopular() {
if (changed || mostPopularCache == null) { Map.Entry<Float, Integer> mostPopular = null;
Map.Entry<Double, Integer> mostPopular = null; for (Map.Entry<Float, Integer> entry : countPerValue.entrySet()) {
for (Map.Entry<Double, Integer> entry : countPerValue.entrySet()) { if (mostPopular == null || entry.getValue() >= mostPopular.getValue()) {
if (mostPopular == null || entry.getValue() >= mostPopular.getValue()) { mostPopular = entry;
mostPopular = entry; }
} }
return mostPopular != null ? mostPopular.getKey() : null;
}
public List<Float> getHighterThanMostPopular() {
Float mostPopular = getMostPopular();
List<Float> higher = new ArrayList<>();
for (Float value : countPerValue.keySet()) {
if (value > mostPopular) {
higher.add(value);
} }
mostPopularCache = mostPopular != null ? mostPopular.getKey() : 0;
changed = false;
} }
return mostPopularCache; return higher.stream().sorted(Collections.reverseOrder()).collect(Collectors.toList());
} }
public List<Double> getValuesInReverseOrder() { public Float getHighest() {
return countPerValue.keySet() Float highest = null;
.stream() for (Float value : countPerValue.keySet()) {
.sorted(Collections.reverseOrder())
.collect(Collectors.toList());
}
public Double getHighest() {
Double highest = null;
for (Double value : countPerValue.keySet()) {
if (highest == null || value > highest) { if (highest == null || value > highest) {
highest = value; highest = value;
} }

View File

@ -3,7 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model;
import java.awt.geom.Rectangle2D; import java.awt.geom.Rectangle2D;
import java.util.List; import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import lombok.AccessLevel; import lombok.AccessLevel;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
@ -16,8 +16,8 @@ import lombok.experimental.FieldDefaults;
public class LineInformation { public class LineInformation {
List<Rectangle2D> lineBBox; List<Rectangle2D> lineBBox;
List<List<Word>> sequencesByLines; List<List<TextPositionSequence>> sequencesByLines;
List<List<Rectangle2D>> bBoxWithGapsByLines; List<List<Rectangle2D>> bBoxWithGapsByLines;
List<List<List<Word>>> sequencesWithGapsByLines; List<List<List<TextPositionSequence>>> sequencesWithGapsByLines;
} }

View File

@ -9,14 +9,12 @@ public enum PageBlockType {
H6, H6,
HEADER, HEADER,
FOOTER, FOOTER,
TITLE,
PARAGRAPH, PARAGRAPH,
PARAGRAPH_BOLD, PARAGRAPH_BOLD,
PARAGRAPH_ITALIC, PARAGRAPH_ITALIC,
PARAGRAPH_UNKNOWN, PARAGRAPH_UNKNOWN,
OTHER, OTHER,
TABLE_OF_CONTENTS_HEADLINE,
TABLE_OF_CONTENTS_ITEM,
LIST_ITEM,
TABLE; TABLE;
@ -36,7 +34,7 @@ public enum PageBlockType {
public static int getHeadlineNumber(PageBlockType pageBlockType) { public static int getHeadlineNumber(PageBlockType pageBlockType) {
return switch (pageBlockType) { return switch (pageBlockType) {
case H1, TABLE_OF_CONTENTS_HEADLINE -> 1; case H1 -> 1;
case H2 -> 2; case H2 -> 2;
case H3 -> 3; case H3 -> 3;
case H4 -> 4; case H4 -> 4;
@ -48,6 +46,6 @@ public enum PageBlockType {
public boolean isHeadline() { public boolean isHeadline() {
return this.equals(H1) || this.equals(H2) || this.equals(H3) || this.equals(H4) || this.equals(H5) || this.equals(H6) || this.equals(TABLE_OF_CONTENTS_HEADLINE); return this.equals(H1) || this.equals(H2) || this.equals(H3) || this.equals(H4) || this.equals(H5) || this.equals(H6);
} }
} }

View File

@ -4,7 +4,7 @@ import java.awt.geom.Rectangle2D;
import java.util.List; import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import lombok.Builder; import lombok.Builder;
@ -15,7 +15,7 @@ import lombok.Getter;
@AllArgsConstructor @AllArgsConstructor
public class PageContents { public class PageContents {
List<Word> sortedWords; List<TextPositionSequence> sortedTextPositionSequences;
Rectangle2D cropBox; Rectangle2D cropBox;
Rectangle2D mediaBox; Rectangle2D mediaBox;
List<Ruling> rulings; List<Ruling> rulings;

View File

@ -3,7 +3,6 @@ package com.knecon.fforesight.service.layoutparser.processor.model;
import java.util.Collections; import java.util.Collections;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.List; import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
@ -16,13 +15,11 @@ import lombok.experimental.FieldDefaults;
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class SectionIdentifier { public class SectionIdentifier {
public static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d{1,2})(?:[\\s.,;](\\d{1,2}))?(?:[\\s.,;](\\d{1,2}))?(?:[\\s.,;](\\d{1,2}))?"); static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?");
public static Pattern alphanumericIdentifierPattern = Pattern.compile("^[\\s]?[A-Za-z][\\s.,;]?(\\d{1,2})[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?");
public enum Format { public enum Format {
EMPTY, EMPTY,
NUMERICAL, NUMERICAL,
ALPHANUMERIC,
DOCUMENT DOCUMENT
} }
@ -44,10 +41,6 @@ public class SectionIdentifier {
if (numericalIdentifierMatcher.find()) { if (numericalIdentifierMatcher.find()) {
return buildNumericalSectionIdentifier(headline, numericalIdentifierMatcher); return buildNumericalSectionIdentifier(headline, numericalIdentifierMatcher);
} }
Matcher alphanumericIdentifierMatcher = alphanumericIdentifierPattern.matcher(headline);
if (alphanumericIdentifierMatcher.find()) {
return buildAlphanumericSectionIdentifier(headline, alphanumericIdentifierMatcher);
}
// more formats here // more formats here
return SectionIdentifier.empty(); return SectionIdentifier.empty();
} }
@ -82,36 +75,7 @@ public class SectionIdentifier {
} }
identifiers.add(Integer.parseInt(numericalIdentifier.trim())); identifiers.add(Integer.parseInt(numericalIdentifier.trim()));
} }
return new SectionIdentifier(Format.NUMERICAL, return new SectionIdentifier(Format.NUMERICAL, identifierString, identifiers.stream().toList(), false);
identifierString,
identifiers.stream()
.toList(),
false);
}
private static SectionIdentifier buildAlphanumericSectionIdentifier(String headline, Matcher alphanumericIdentifierMatcher) {
String identifierString = headline.substring(alphanumericIdentifierMatcher.start(), alphanumericIdentifierMatcher.end());
String alphanumericIdentifier = alphanumericIdentifierMatcher.group(0).substring(0, 1).toUpperCase(Locale.ENGLISH);
int mappedCharacterValue = alphanumericIdentifier.charAt(0) - 'A' + 1;
List<Integer> identifiers = new LinkedList<>();
identifiers.add(mappedCharacterValue);
for (int i = 1; i <= 3; i++) {
String numericalIdentifier = alphanumericIdentifierMatcher.group(i);
if (numericalIdentifier == null || numericalIdentifier.equals("0") || numericalIdentifier.isEmpty() || numericalIdentifier.isBlank()) {
break;
}
identifiers.add(Integer.parseInt(numericalIdentifier.trim()));
}
return new SectionIdentifier(Format.ALPHANUMERIC,
identifierString,
identifiers.stream()
.toList(),
false);
} }
@ -159,22 +123,4 @@ public class SectionIdentifier {
return identifierString; return identifierString;
} }
public boolean isEmpty() {
return this.format.equals(Format.EMPTY);
}
public int level() {
return identifiers.size();
}
protected List<Integer> getIdentifiers() {
return identifiers;
}
} }

View File

@ -0,0 +1,145 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph;
import static java.lang.String.format;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
import java.util.stream.IntStream;
import lombok.EqualsAndHashCode;
import lombok.Setter;
/**
 * A half-open integer interval {@code [start, end)}: {@link #contains(int)} accepts {@code start}
 * but rejects {@code end}, and {@link #length()} is {@code end - start}.
 *
 * <p>The constructor rejects {@code start > end}. NOTE(review): the class-level {@code @Setter}
 * generates setters that are not routed through that check, so callers mutating an instance are
 * responsible for keeping {@code start <= end} themselves.
 */
@Setter
@EqualsAndHashCode
@SuppressWarnings("PMD.AvoidFieldNameMatchingMethodName")
public class Boundary implements Comparable<Boundary> {

    private int start;
    private int end;


    /**
     * @param start inclusive lower bound
     * @param end   exclusive upper bound
     * @throws IllegalArgumentException if {@code start > end}
     */
    public Boundary(int start, int end) {

        requireOrdered(start, end);
        this.start = start;
        this.end = end;
    }


    /** Shared argument check: fails when the given bounds are in the wrong order. */
    private static void requireOrdered(int start, int end) {

        if (start > end) {
            throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
        }
    }


    /** @return the number of indices covered, i.e. {@code end - start} */
    public int length() {

        return end - start;
    }


    /** @return the inclusive lower bound */
    public int start() {

        return start;
    }


    /** @return the exclusive upper bound */
    public int end() {

        return end;
    }


    /** @return {@code true} if the given boundary lies completely within this one (shared bounds allowed) */
    public boolean contains(Boundary boundary) {

        return start <= boundary.start() && boundary.end() <= end;
    }


    /** @return {@code true} if this boundary lies completely within the given one */
    public boolean containedBy(Boundary boundary) {

        return boundary.contains(this);
    }


    /**
     * @return {@code true} if the range {@code [start, end)} lies completely within this boundary
     * @throws IllegalArgumentException if {@code start > end}
     */
    public boolean contains(int start, int end) {

        requireOrdered(start, end);
        return this.start <= start && end <= this.end;
    }


    /**
     * @return {@code true} if this boundary lies completely within the range {@code [start, end)}
     * @throws IllegalArgumentException if {@code start > end}
     */
    public boolean containedBy(int start, int end) {

        requireOrdered(start, end);
        return start <= this.start && this.end <= end;
    }


    /** @return {@code true} if {@code start <= index < end} */
    public boolean contains(int index) {

        return start <= index && index < end;
    }


    /** @return {@code true} if both boundaries share at least one index; merely touching boundaries do not intersect */
    public boolean intersects(Boundary boundary) {

        return boundary.start() < this.end && this.start < boundary.end();
    }


    /**
     * Cuts this boundary at the given indices and returns the resulting pieces in ascending order.
     *
     * <p>The indices may be passed in any order and may contain duplicates: they are sorted before
     * cutting, and a cut that would produce a zero-length piece (a duplicate index, or an index
     * equal to {@code start}) is skipped. The last piece always ends at {@code end}.
     *
     * @param splitIndices indices to cut at; each must satisfy {@link #contains(int)}
     * @return the consecutive pieces, which together cover exactly this boundary
     * @throws IndexOutOfBoundsException if any index is not contained in this boundary
     */
    public List<Boundary> split(List<Integer> splitIndices) {

        List<Integer> outOfRange = splitIndices.stream()
                .filter(idx -> !this.contains(idx))
                .toList();
        if (!outOfRange.isEmpty()) {
            throw new IndexOutOfBoundsException(format("%s splitting indices are out of range for %s", outOfRange, this));
        }

        List<Boundary> splitBoundaries = new LinkedList<>();
        int previousIndex = start;
        // Sorting keeps every piece well-formed; unsorted input used to fail with a misleading "start > end" error.
        for (int splitIndex : splitIndices.stream().sorted().toList()) {
            // skip split if it would produce a boundary of length 0
            if (splitIndex == previousIndex) {
                continue;
            }
            splitBoundaries.add(new Boundary(previousIndex, splitIndex));
            previousIndex = splitIndex;
        }
        splitBoundaries.add(new Boundary(previousIndex, end));
        return splitBoundaries;
    }


    /** @return the indices {@code start, start + 1, ..., end - 1} */
    public IntStream intStream() {

        return IntStream.range(start, end);
    }


    /**
     * Builds the smallest boundary that covers all given boundaries.
     *
     * @param boundaries the boundaries to cover; must not be empty
     * @return a boundary from the smallest start to the largest end
     * @throws IllegalArgumentException if {@code boundaries} is empty
     */
    public static Boundary merge(Collection<Boundary> boundaries) {

        int minStart = boundaries.stream()
                .mapToInt(Boundary::start)
                .min()
                .orElseThrow(() -> new IllegalArgumentException("Cannot merge an empty collection of boundaries"));
        int maxEnd = boundaries.stream()
                .mapToInt(Boundary::end)
                .max()
                .orElseThrow(() -> new IllegalArgumentException("Cannot merge an empty collection of boundaries"));
        return new Boundary(minStart, maxEnd);
    }


    @Override
    public String toString() {

        return format("Boundary [%d|%d)", start, end);
    }


    /**
     * Orders boundaries by start and, for equal starts, by end.
     *
     * <p>This is a total order that is consistent with the generated {@code equals}: the result is
     * 0 only when both bounds match. Whenever one boundary has both the smaller start and the
     * smaller end, the result is the same as before. Previously every other pair (containment,
     * shared start or shared end) compared as 0, which made the ordering non-transitive and is not
     * safe to hand to sorting routines or sorted collections.
     */
    @Override
    public int compareTo(Boundary boundary) {

        int byStart = Integer.compare(start, boundary.start());
        return byStart != 0 ? byStart : Integer.compare(end, boundary.end());
    }

}

View File

@ -0,0 +1,217 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph;
import static java.lang.String.format;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
/**
 * Index-addressed tree over the {@link SemanticNode}s of a single document.
 * <p>
 * Each {@link Entry} is identified by its tree id: the list of child indices that leads from the root to the entry.
 * The root entry wraps the {@link Document} and carries the empty list as its id.
 */
@Data
@EqualsAndHashCode
public class DocumentTree {

    private final Entry root;


    /**
     * Creates a tree whose root entry holds the given document and has no children yet.
     *
     * @param document the node stored in the root entry
     */
    public DocumentTree(Document document) {

        root = Entry.builder().treeId(Collections.emptyList()).children(new LinkedList<>()).node(document).build();
    }


    /**
     * Collects the leaf text blocks of all leaf nodes, in tree order, into a single TextBlock.
     *
     * @return the combined TextBlock of all leaves
     */
    public TextBlock buildTextBlock() {

        return allEntriesInOrder().map(Entry::getNode).filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock).collect(new TextBlockCollector());
    }


    /**
     * Appends the node as a direct child of the root.
     *
     * @param node the node to insert
     * @return the tree id assigned to the new entry
     */
    public List<Integer> createNewMainEntryAndReturnId(GenericSemanticNode node) {

        return createNewChildEntryAndReturnIdImpl(Collections.emptyList(), node);
    }


    /**
     * Appends the node as the last child of the given parent node.
     *
     * @param parentNode the node whose entry receives the new child
     * @param node       the node to insert
     * @return the tree id assigned to the new entry
     * @throws IllegalArgumentException if the parent's tree id does not address an existing entry
     */
    public List<Integer> createNewChildEntryAndReturnId(GenericSemanticNode parentNode, GenericSemanticNode node) {

        return createNewChildEntryAndReturnIdImpl(parentNode.getTreeId(), node);
    }


    /**
     * Appends the table as the last child of the given parent node.
     *
     * @param parentNode the node whose entry receives the new child
     * @param node       the table to insert
     * @return the tree id assigned to the new entry
     * @throws IllegalArgumentException if the parent's tree id does not address an existing entry
     */
    public List<Integer> createNewChildEntryAndReturnId(GenericSemanticNode parentNode, Table node) {

        return createNewChildEntryAndReturnIdImpl(parentNode.getTreeId(), node);
    }


    /**
     * Appends the table cell as the last child of the given table.
     *
     * @param parentTable the table whose entry receives the new child
     * @param tableCell   the cell to insert
     * @return the tree id assigned to the new entry
     * @throws IllegalArgumentException if the table's tree id does not address an existing entry
     */
    public List<Integer> createNewTableChildEntryAndReturnId(Table parentTable, TableCell tableCell) {

        return createNewChildEntryAndReturnIdImpl(parentTable.getTreeId(), tableCell);
    }


    @SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong
    private List<Integer> createNewChildEntryAndReturnIdImpl(List<Integer> parentId, SemanticNode node) {

        if (!entryExists(parentId)) {
            throw new IllegalArgumentException(format("parentId %s does not exist!", parentId));
        }
        Entry parent = getEntryById(parentId);
        // the new id is the parent's id extended by the index the child will occupy
        List<Integer> newId = new LinkedList<>(parentId);
        newId.add(parent.children.size());
        parent.children.add(Entry.builder().treeId(newId).node(node).build());
        return newId;
    }


    /**
     * Checks whether the tree id addresses an existing entry.
     * Every index of the id, including the first one, is range-checked, so an unknown id yields {@code false}
     * instead of an IndexOutOfBoundsException.
     */
    private boolean entryExists(List<Integer> treeId) {

        Entry entry = root;
        if (entry == null) {
            return false;
        }
        for (int id : treeId) {
            if (id < 0 || id >= entry.children.size()) {
                return false;
            }
            entry = entry.children.get(id);
        }
        return true;
    }


    /**
     * Returns the entry one level above the addressed entry.
     *
     * @param treeId id of the entry whose parent is requested
     * @return the parent entry
     * @throws UnsupportedOperationException if the id is empty, i.e. addresses the root
     */
    public Entry getParentEntryById(List<Integer> treeId) {

        return getEntryById(getParentId(treeId));
    }


    /**
     * @param treeId id of an entry
     * @return true for every non-empty id; only the root (empty id) has no parent
     */
    public boolean hasParentById(List<Integer> treeId) {

        return !treeId.isEmpty();
    }


    /**
     * Streams the nodes of the direct children of the addressed entry.
     *
     * @param treeId id of the parent entry
     * @return the child nodes in insertion order
     */
    public Stream<SemanticNode> childNodes(List<Integer> treeId) {

        return getEntryById(treeId).children.stream().map(Entry::getNode);
    }


    /**
     * Streams the nodes of the direct children of the addressed entry that have the given type.
     *
     * @param treeId   id of the parent entry
     * @param nodeType the type to keep
     * @return the matching child nodes in insertion order
     */
    public Stream<SemanticNode> childNodesOfType(List<Integer> treeId, NodeType nodeType) {

        return getEntryById(treeId).children.stream().filter(entry -> entry.node.getType().equals(nodeType)).map(Entry::getNode);
    }


    private static List<Integer> getParentId(List<Integer> treeId) {

        if (treeId.isEmpty()) {
            throw new UnsupportedOperationException("Root has no parent!");
        }
        if (treeId.size() < 2) {
            return Collections.emptyList();
        }
        return treeId.subList(0, treeId.size() - 1);
    }


    /**
     * Resolves a tree id to its entry by following the child indices from the root.
     *
     * @param treeId id of the entry; the empty list addresses the root
     * @return the addressed entry
     * @throws IndexOutOfBoundsException if any index of the id is out of range
     */
    public Entry getEntryById(List<Integer> treeId) {

        if (treeId.isEmpty()) {
            return root;
        }
        Entry entry = root;
        for (int id : treeId) {
            entry = entry.children.get(id);
        }
        return entry;
    }


    /**
     * @return the direct children of the root, in insertion order
     */
    public Stream<Entry> mainEntries() {

        return root.children.stream();
    }


    /**
     * @return all entries including the root, depth-first with each parent before its children
     */
    public Stream<Entry> allEntriesInOrder() {

        return Stream.of(root).flatMap(DocumentTree::flatten);
    }


    /**
     * @param parentId id of the entry whose descendants are requested
     * @return all descendants of the addressed entry (the entry itself excluded), depth-first
     */
    public Stream<Entry> allSubEntriesInOrder(List<Integer> parentId) {

        return getEntryById(parentId).children.stream().flatMap(DocumentTree::flatten);
    }


    @Override
    public String toString() {

        return String.join("\n", allEntriesInOrder().map(Entry::toString).toList());
    }


    private static Stream<Entry> flatten(Entry entry) {

        return Stream.concat(Stream.of(entry), entry.children.stream().flatMap(DocumentTree::flatten));
    }


    /**
     * Returns the node directly below the root on the path to the addressed entry.
     *
     * @param treeId id of an entry
     * @return the root's node for the empty id, otherwise the node of the root child named by the first index
     */
    public SemanticNode getHighestParentById(List<Integer> treeId) {

        if (treeId.isEmpty()) {
            return root.node;
        }
        return root.children.get(treeId.get(0)).node;
    }


    /**
     * A single tree position: the node stored there, its tree id and its child entries.
     */
    @Builder
    @Getter
    @AllArgsConstructor
    @FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true)
    public static class Entry {

        List<Integer> treeId;
        SemanticNode node;
        @Builder.Default
        List<Entry> children = new LinkedList<>();


        @Override
        public String toString() {

            return node.toString();
        }


        public NodeType getType() {

            return node.getType();
        }

    }

}

View File

@ -0,0 +1,8 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.entity;
/**
 * Kind of a {@link RedactionEntity}; stored in its {@code entityType} field and part of its equals/hashCode.
 */
// NOTE(review): the precise meaning of each constant is not visible in this file; presumably the FALSE_*
// variants mark entities/recommendations that were identified as wrong — confirm against the rule engine.
public enum EntityType {
ENTITY,
RECOMMENDATION,
FALSE_POSITIVE,
FALSE_RECOMMENDATION
}

View File

@ -0,0 +1,228 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.entity;
import java.awt.geom.Rectangle2D;
import java.util.Collection;
import java.util.Comparator;
import java.util.Deque;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Engine;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
/**
 * A typed text range ({@link Boundary}) of the document together with its redaction state and its links into the
 * document graph (pages and intersecting nodes).
 * <p>
 * Equality and hash code are based on {@code boundary}, {@code type}, {@code entityType} and {@code value} only.
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class RedactionEntity {

    // initial values
    @EqualsAndHashCode.Include
    final Boundary boundary;
    @EqualsAndHashCode.Include
    final String type;
    @EqualsAndHashCode.Include
    final EntityType entityType;

    // empty defaults
    boolean redaction;
    boolean removed;
    boolean ignored;
    boolean resized;
    boolean skipRemoveEntitiesContainedInLarger;
    boolean dictionaryEntry;
    boolean dossierDictionaryEntry;
    Set<Engine> engines;
    Set<RedactionEntity> references;
    @Builder.Default
    Deque<Integer> matchedRules = new LinkedList<>();
    String redactionReason;
    String legalBasis;

    // inferred on graph insertion
    @EqualsAndHashCode.Include
    String value;
    String textBefore;
    String textAfter;
    @Builder.Default
    Set<Page> pages = new HashSet<>();
    List<RedactionPosition> redactionPositionsPerPage;
    @Builder.Default
    List<SemanticNode> intersectingNodes = new LinkedList<>();
    SemanticNode deepestFullyContainingNode;


    /**
     * Creates an entity that carries only its initial values and empty engine/reference sets.
     *
     * @param boundary   the text range of the entity
     * @param type       the entity type name
     * @param entityType the kind of entity
     * @return a new entity that is not yet linked to any node or page
     */
    public static RedactionEntity initialEntityNode(Boundary boundary, String type, EntityType entityType) {

        return RedactionEntity.builder().type(type).entityType(entityType).boundary(boundary).engines(new HashSet<>()).references(new HashSet<>()).build();
    }


    /**
     * @param clazz the node class to look for
     * @return true if any intersecting node is an instance of the given class
     */
    public boolean occursInNodeOfType(Class<? extends SemanticNode> clazz) {

        return intersectingNodes.stream().anyMatch(clazz::isInstance);
    }


    /**
     * @param semanticNode the node to look for
     * @return true if any intersecting node equals the given node
     */
    public boolean occursInNode(SemanticNode semanticNode) {

        return intersectingNodes.stream().anyMatch(node -> node.equals(semanticNode));
    }


    public boolean isType(String type) {

        return this.type.equals(type);
    }


    public boolean isAnyType(List<String> types) {

        return types.contains(type);
    }


    public void addIntersectingNode(SemanticNode containingNode) {

        intersectingNodes.add(containingNode);
    }


    /**
     * Unlinks this entity from every intersecting node and page, clears its own graph references and flags it as
     * removed and ignored.
     */
    public void removeFromGraph() {

        intersectingNodes.forEach(node -> node.getEntities().remove(this));
        pages.forEach(page -> page.getEntities().remove(this));
        intersectingNodes = new LinkedList<>();
        deepestFullyContainingNode = null;
        pages = new HashSet<>();
        removed = true;
        ignored = true;
    }


    public void addMatchedRule(int ruleNumber) {

        matchedRules.add(ruleNumber);
    }


    /**
     * @return the most recently added rule number, or 0 if no rule has matched yet
     */
    public int getMatchedRule() {

        if (matchedRules.isEmpty()) {
            return 0;
        }
        return matchedRules.getLast();
    }


    /**
     * Returns one {@link RedactionPosition} per page the entity covers, computing and caching them on first use.
     * The position on the lowest-numbered page carries the plain id; positions on other pages carry the id
     * suffixed with {@code -<pageNumber>}.
     *
     * @return the cached or freshly computed positions
     * @throws RuntimeException if the boundary yields no positions on any page
     */
    public List<RedactionPosition> getRedactionPositionsPerPage() {

        if (redactionPositionsPerPage == null || redactionPositionsPerPage.isEmpty()) {
            Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = deepestFullyContainingNode.getTextBlock().getPositionsPerPage(boundary);

            Page firstPage = rectanglesPerLinePerPage.keySet()
                    .stream()
                    .min(Comparator.comparingInt(Page::getNumber))
                    .orElseThrow(() -> new RuntimeException("No Positions found on any page!"));
            String id = IdBuilder.buildId(pages, rectanglesPerLinePerPage.values().stream().flatMap(Collection::stream).toList());
            redactionPositionsPerPage = rectanglesPerLinePerPage.entrySet().stream().map(entry -> buildRedactionPosition(firstPage, id, entry)).toList();
        }
        return redactionPositionsPerPage;
    }


    private static RedactionPosition buildRedactionPosition(Page firstPage, String id, Map.Entry<Page, List<Rectangle2D>> entry) {

        if (entry.getKey().equals(firstPage)) {
            return new RedactionPosition(id, entry.getKey(), entry.getValue());
        } else {
            return new RedactionPosition(id + "-" + entry.getKey().getNumber(), entry.getKey(), entry.getValue());
        }
    }


    public boolean containedBy(RedactionEntity redactionEntity) {

        return this.boundary.containedBy(redactionEntity.getBoundary());
    }


    public boolean contains(RedactionEntity redactionEntity) {

        return this.boundary.contains(redactionEntity.getBoundary());
    }


    public boolean intersects(RedactionEntity redactionEntity) {

        return this.boundary.intersects(redactionEntity.getBoundary());
    }


    public void addEngine(Engine engine) {

        engines.add(engine);
    }


    public void addEngines(Set<Engine> engines) {

        this.engines.addAll(engines);
    }


    public void addReference(RedactionEntity reference) {

        references.add(reference);
    }


    public void addReferences(List<RedactionEntity> references) {

        this.references.addAll(references);
    }


    /**
     * @param manualRedactionId the annotation id to look for
     * @return true if any of this entity's redaction positions carries exactly that id
     */
    public boolean matchesAnnotationId(String manualRedactionId) {

        return getRedactionPositionsPerPage().stream().anyMatch(entityPosition -> entityPosition.getId().equals(manualRedactionId));
    }


    /**
     * Renders the entity as {@code Entity["value", boundary, pages[1, 2], type = "...", EntityType.X]}.
     * An entity without pages is rendered with an empty {@code pages[]} list.
     */
    @Override
    public String toString() {

        StringBuilder sb = new StringBuilder();
        sb.append("Entity[\"");
        sb.append(value);
        sb.append("\", ");
        sb.append(boundary);
        sb.append(", pages[");
        // write the separator only between elements, so an empty set cannot corrupt the "pages[" prefix
        boolean firstPage = true;
        for (Page page : pages) {
            if (!firstPage) {
                sb.append(", ");
            }
            sb.append(page.getNumber());
            firstPage = false;
        }
        sb.append("], type = \"");
        sb.append(type);
        sb.append("\", EntityType.");
        sb.append(entityType);
        sb.append("]");
        return sb.toString();
    }

}

View File

@ -0,0 +1,24 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.entity;
import java.awt.geom.Rectangle2D;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.experimental.FieldDefaults;
/**
 * The rectangles an entity occupies on one page, together with the id under which that part is addressed.
 */
@Data
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class RedactionPosition {
// id of this position; immutable after construction
final String id;
// the page the rectangles are located on
Page page;
// Each entry in this list corresponds to an entry in the redaction log, this means:
// An entity might be represented by multiple redaction log entries
List<Rectangle2D> rectanglePerLine;
}

View File

@ -0,0 +1,74 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
import lombok.extern.slf4j.Slf4j;
/**
 * Base class for semantic nodes: holds the tree id, the owning {@link DocumentTree}, the associated entities and
 * lazily filled caches for the text block and the bounding boxes.
 */
@Slf4j
@Data
@SuperBuilder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public abstract class AbstractSemanticNode implements GenericSemanticNode {

    @Builder.Default
    Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));

    List<Integer> treeId;

    // cache for getTextBlock(); filled on first access
    TextBlock textBlock;

    @EqualsAndHashCode.Exclude
    DocumentTree documentTree;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<RedactionEntity> entities = new HashSet<>();

    // cache for getBBox(); filled on first access
    @EqualsAndHashCode.Exclude
    Map<Page, Rectangle2D> bBoxCache;


    /**
     * Returns the text block of this node, computing it via the interface default on first access and caching it.
     */
    @Override
    public TextBlock getTextBlock() {

        if (textBlock == null) {
            textBlock = GenericSemanticNode.super.getTextBlock();
        }
        return textBlock;
    }


    /**
     * Renders the node as {@code <treeId>: <type>: <text summary>}.
     * Uses {@link #getTreeId()} and string concatenation so that a node without a tree id prints {@code null}
     * instead of failing, and so that subclasses overriding the getter are respected.
     */
    @Override
    public String toString() {

        return getTreeId() + ": " + getType() + ": " + this.getTextBlock().buildSummary();
    }


    /**
     * Returns the bounding box per page, computing it via the interface default on first access and caching it.
     */
    @Override
    public Map<Page, Rectangle2D> getBBox() {

        if (bBoxCache == null) {
            bBoxCache = GenericSemanticNode.super.getBBox();
        }
        return bBoxCache;
    }

}

View File

@ -0,0 +1,166 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
@Data
@SuperBuilder
@AllArgsConstructor
@NoArgsConstructor
@EqualsAndHashCode(callSuper = true)
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Document extends AbstractSemanticNode {
Set<Page> pages;
Integer numberOfPages;
LayoutparsingVisualizations visualizations;
@Override
public NodeType getType() {
return NodeType.DOCUMENT;
}
/**
* Gets the sections of the document as a list.
*
* @return A list of all sections within the document.
*/
public List<Section> getAllSections() {
return streamAllSubNodesOfType(NodeType.SECTION).map(node -> (Section) node)
.collect(Collectors.toList());
}
/**
* Gets the main sections of the document as a list.
*
* @return A list of main sections within the document
* @deprecated This method is marked for removal.
* Use {@link #streamChildrenOfType(NodeType)} instead,
* or {@link #getChildrenOfTypeSectionOrSuperSection()} which returns children of type SECTION as well as SUPER_SECTION.
*/
@Deprecated(forRemoval = true)
public List<Section> getMainSections() {
return streamChildrenOfType(NodeType.SECTION).map(node -> (Section) node)
.collect(Collectors.toList());
}
/**
* Gets the direct children of type SECTION or SUPER_SECTION of the document as a list of SemanticNode objects.
*
* @return A list of all children of type SECTION or SUPER_SECTION.
*/
public List<SemanticNode> getChildrenOfTypeSectionOrSuperSection() {
return streamChildren().filter(semanticNode -> semanticNode.getType().equals(NodeType.SECTION) || semanticNode.getType().equals(NodeType.SUPER_SECTION))
.toList();
}
public List<Header> getHeaders() {
return streamChildrenOfType(NodeType.HEADER).map(node -> (Header) node)
.collect(Collectors.toList());
}
public List<Footer> getFooters() {
return streamChildrenOfType(NodeType.FOOTER).map(node -> (Footer) node)
.collect(Collectors.toList());
}
@Override
public Headline getHeadline() {
return streamAllSubNodesOfType(NodeType.HEADLINE).map(node -> (Headline) node)
.findFirst()
.orElse(Headline.builder().build());
}
public Stream<TextBlock> streamTerminalTextBlocksInOrder() {
return streamAllNodes().filter(SemanticNode::isLeaf)
.map(SemanticNode::getTextBlock);
}
@Override
public List<Integer> getTreeId() {
return Collections.emptyList();
}
@Override
public void setTreeId(List<Integer> tocId) {
throw new UnsupportedOperationException("Document is always the root of the TablePageBlock of Contents");
}
private Stream<SemanticNode> streamAllNodes() {
return getDocumentTree().allEntriesInOrder()
.map(DocumentTree.Entry::getNode);
}
public Stream<Image> streamAllImages() {
return streamAllSubNodesOfType(NodeType.IMAGE).map(node -> (Image) node);
}
public Map<NodeType, Long> buildSemanticNodeCounts() {
return streamAllSubNodes().collect(Collectors.groupingBy(SemanticNode::getType, Collectors.counting()));
}
@Override
public String toString() {
return NodeType.DOCUMENT + ": " + this.getTextBlock().buildSummary();
}
@Override
public Map<Page, Rectangle2D> getBBox() {
Map<Page, Rectangle2D> bBox = new HashMap<>();
for (Page page : pages) {
bBox.put(page, new Rectangle2D.Double(0, 0, page.getWidth(), page.getHeight()));
}
return bBox;
}
}

View File

@ -0,0 +1,35 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.SuperBuilder;
/**
 * A paragraph that carries a second leaf text block in addition to the one inherited from {@link Paragraph}.
 */
@Data
@EqualsAndHashCode(callSuper = true)
@SuperBuilder
public class DuplicatedParagraph extends Paragraph {

    TextBlock unsortedLeafTextBlock;


    /**
     * @return the inherited leaf text block followed by the unsorted one, collected into a single TextBlock
     */
    @Override
    public TextBlock getTextBlock() {

        TextBlock inheritedLeafTextBlock = super.getLeafTextBlock();
        Stream<TextBlock> bothTextBlocks = Stream.of(inheritedLeafTextBlock, unsortedLeafTextBlock);
        return bothTextBlocks.collect(new TextBlockCollector());
    }


    // declared explicitly so that Lombok's @Data does not generate a field-dumping toString()
    @Override
    public String toString() {

        return super.toString();
    }

}

View File

@ -0,0 +1,50 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
/**
 * Leaf node of type FOOTER; its text is held directly in {@code leafTextBlock}.
 */
@Data
@SuperBuilder
@AllArgsConstructor
@EqualsAndHashCode(callSuper = true)
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Footer extends AbstractSemanticNode {

    TextBlock leafTextBlock;


    @Override
    public boolean isLeaf() {

        return true;
    }


    @Override
    public NodeType getType() {

        return NodeType.FOOTER;
    }


    /**
     * @return the leaf text block itself; nothing is computed or cached here
     */
    @Override
    public TextBlock getTextBlock() {

        return leafTextBlock;
    }


    /**
     * Renders the node as {@code <treeId>: FOOTER: <text summary>}.
     */
    @Override
    public String toString() {

        StringBuilder description = new StringBuilder();
        description.append(getTreeId()).append(": ");
        description.append(NodeType.FOOTER).append(": ");
        description.append(leafTextBlock.buildSummary());
        return description.toString();
    }

}

View File

@ -0,0 +1,5 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
/**
 * Sub-interface of {@link SemanticNode} that declares no members of its own.
 * It is implemented by {@code AbstractSemanticNode} and used as a parameter type where a node is inserted into
 * the {@code DocumentTree}.
 */
public interface GenericSemanticNode extends SemanticNode {
}

View File

@ -0,0 +1,50 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
/**
 * Leaf node of type HEADER; its text is held directly in {@code leafTextBlock}.
 */
@Data
@SuperBuilder
@AllArgsConstructor
@EqualsAndHashCode(callSuper = true)
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Header extends AbstractSemanticNode {

    TextBlock leafTextBlock;


    @Override
    public NodeType getType() {

        return NodeType.HEADER;
    }


    @Override
    public boolean isLeaf() {

        return true;
    }


    /**
     * @return the leaf text block itself; nothing is computed or cached here
     */
    @Override
    public TextBlock getTextBlock() {

        return leafTextBlock;
    }


    /**
     * Renders the node as {@code <treeId>: HEADER: <text summary>}.
     */
    @Override
    public String toString() {

        StringBuilder description = new StringBuilder();
        description.append(getTreeId()).append(": ");
        description.append(NodeType.HEADER).append(": ");
        description.append(leafTextBlock.buildSummary());
        return description.toString();
    }

}

View File

@ -0,0 +1,57 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
/**
 * Leaf node of type HEADLINE; its text is held directly in {@code leafTextBlock}.
 */
@Data
@SuperBuilder
@AllArgsConstructor
@EqualsAndHashCode(callSuper = true)
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Headline extends AbstractSemanticNode {

    TextBlock leafTextBlock;


    @Override
    public boolean isLeaf() {

        return true;
    }


    @Override
    public NodeType getType() {

        return NodeType.HEADLINE;
    }


    /**
     * @return this headline itself, which ends the upward search for a headline
     */
    @Override
    public Headline getHeadline() {

        return this;
    }


    /**
     * @return the leaf text block itself; nothing is computed or cached here
     */
    @Override
    public TextBlock getTextBlock() {

        return leafTextBlock;
    }


    /**
     * Renders the node as {@code <treeId>: HEADLINE: <text summary>}.
     */
    @Override
    public String toString() {

        StringBuilder description = new StringBuilder();
        description.append(getTreeId()).append(": ");
        description.append(NodeType.HEADLINE).append(": ");
        description.append(leafTextBlock.buildSummary());
        return description.toString();
    }

}

View File

@ -0,0 +1,95 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
/**
 * Leaf node of type IMAGE, located on exactly one page at {@code position}.
 */
@Data
@SuperBuilder
@AllArgsConstructor
@NoArgsConstructor
@EqualsAndHashCode(callSuper = true)
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Image extends AbstractSemanticNode {

    String id;

    String representationHash;

    ImageType imageType;
    boolean transparent;
    Rectangle2D position;

    TextBlock leafTextBlock;

    boolean redaction;
    boolean ignored;
    @Builder.Default
    String redactionReason = "";
    @Builder.Default
    String legalBasis = "";
    @Builder.Default
    int matchedRule = -1;

    @EqualsAndHashCode.Exclude
    Page page;


    @Override
    public NodeType getType() {

        return NodeType.IMAGE;
    }


    /**
     * @return the leaf text block itself; nothing is computed or cached here
     */
    @Override
    public TextBlock getTextBlock() {

        return leafTextBlock;
    }


    /**
     * @return an immutable set containing only the page this image is located on
     */
    @Override
    public Set<Page> getPages() {

        return Collections.singleton(page);
    }


    /**
     * Renders the node as {@code <treeId>: IMAGE: <imageType> <position>}.
     * Relies on string concatenation so that an image without an image type prints {@code null} instead of
     * throwing a NullPointerException.
     */
    @Override
    public String toString() {

        return getTreeId() + ": " + NodeType.IMAGE + ": " + imageType + " " + position;
    }


    /**
     * @return a new map with a single entry: the image's page mapped to its position
     */
    @Override
    public Map<Page, Rectangle2D> getBBox() {

        Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
        bBoxPerPage.put(page, position);
        return bBoxPerPage;
    }


    @Override
    public boolean isLeaf() {

        return true;
    }

}

View File

@ -0,0 +1,27 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.util.Locale;
/**
 * The kinds of images distinguished by the layout parser.
 */
public enum ImageType {
    LOGO,
    FORMULA,
    SIGNATURE,
    SIGNATURE_VISUAL,
    OTHER,
    OCR,
    GRAPHIC;


    /**
     * Maps a textual image type to its constant, ignoring case (compared via {@link Locale#ROOT}).
     * Every constant of this enum can be produced; any unrecognised name maps to {@link #OTHER}.
     *
     * @param imageType the textual image type; must not be null
     * @return the matching constant, or {@link #OTHER} if the name is not recognised
     * @throws NullPointerException if {@code imageType} is null
     */
    public static ImageType fromString(String imageType) {

        return switch (imageType.toLowerCase(Locale.ROOT)) {
            case "logo" -> ImageType.LOGO;
            case "formula" -> ImageType.FORMULA;
            case "signature" -> ImageType.SIGNATURE;
            case "signature_visual" -> ImageType.SIGNATURE_VISUAL;
            case "ocr" -> ImageType.OCR;
            case "graphic" -> ImageType.GRAPHIC;
            default -> ImageType.OTHER;
        };
    }
}

View File

@ -0,0 +1,87 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.Setter;
import lombok.experimental.FieldDefaults;
/**
 * A page of the document together with the nodes, entities and images located on it.
 * Two pages are equal if and only if their page numbers are equal.
 */
@Getter
@Setter
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Page {

    Integer number;
    Integer height;
    Integer width;
    Integer rotation;

    @EqualsAndHashCode.Exclude
    List<SemanticNode> mainBody;
    @EqualsAndHashCode.Exclude
    Header header;
    @EqualsAndHashCode.Exclude
    Footer footer;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<RedactionEntity> entities = new HashSet<>();

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<Image> images = new HashSet<>();


    /**
     * Creates a page from a classification page, copying number and rotation and truncating the page height and
     * width to whole numbers. The main body starts out empty.
     *
     * @param classificationPage the source of the page metadata
     * @return a new page without header, footer, entities or images
     */
    public static Page fromClassificationPage(ClassificationPage classificationPage) {

        return Page.builder()
                .height((int) classificationPage.getPageHeight())
                .width((int) classificationPage.getPageWidth())
                .number(classificationPage.getPageNumber())
                .rotation(classificationPage.getRotation())
                .mainBody(new LinkedList<>())
                .build();
    }


    /**
     * @return the leaf text blocks of all leaf nodes in the main body, collected into a single TextBlock
     */
    public TextBlock getMainBodyTextBlock() {

        return mainBody.stream().filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock).collect(new TextBlockCollector());
    }


    @Override
    public String toString() {

        return String.valueOf(number);
    }


    /**
     * @return the page number, or 0 if no number has been set
     */
    @Override
    public int hashCode() {

        return number == null ? 0 : number;
    }


    /**
     * Compares pages by page number only; a page without a number equals only other pages without a number.
     */
    @Override
    public boolean equals(Object o) {

        if (this == o) {
            return true;
        }
        if (!(o instanceof Page other)) {
            return false;
        }
        return number == null ? other.number == null : number.equals(other.number);
    }

}

View File

@ -0,0 +1,43 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
/**
 * Leaf node of type PARAGRAPH; its text is held directly in {@code leafTextBlock}.
 * Fields are protected so that subclasses such as {@code DuplicatedParagraph} can build on them.
 */
@Data
@SuperBuilder
@AllArgsConstructor
@EqualsAndHashCode(callSuper = true)
@FieldDefaults(level = AccessLevel.PROTECTED)
public class Paragraph extends AbstractSemanticNode {
TextBlock leafTextBlock;
@Override
public NodeType getType() {
return NodeType.PARAGRAPH;
}
@Override
public boolean isLeaf() {
return true;
}
// returns the leaf text block itself; nothing is computed or cached here
@Override
public TextBlock getTextBlock() {
return leafTextBlock;
}
}

View File

@ -0,0 +1,47 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
/**
 * Non-leaf node of type SECTION.
 */
@Data
@SuperBuilder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(callSuper = true)
public class Section extends AbstractSemanticNode {

    @Override
    public NodeType getType() {

        return NodeType.SECTION;
    }


    /**
     * Returns the first direct child of type HEADLINE; if the section has none, the headline of its parent.
     */
    @Override
    public Headline getHeadline() {

        return streamChildrenOfType(NodeType.HEADLINE).map(node -> (Headline) node)
                .findFirst()
                .orElseGet(() -> getParent().getHeadline());
    }


    /**
     * @return true if any node below this section is of type TABLE
     */
    public boolean hasTables() {

        return streamAllSubNodesOfType(NodeType.TABLE).findAny()
                .isPresent();
    }


    /**
     * Renders the node as {@code <treeId>: SECTION: <text summary>}.
     */
    @Override
    public String toString() {

        return getTreeId() + ": " + NodeType.SECTION + ": " + this.getTextBlock().buildSummary();
    }

}

View File

@ -0,0 +1,481 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import static java.lang.String.format;
import java.awt.geom.Rectangle2D;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.EntityType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
public interface SemanticNode {
/**
 * Returns the type of this node, such as SECTION, PARAGRAPH, etc.
 *
 * @return the NodeType of this node
 */
NodeType getType();
/**
 * Collects the text blocks of all leaf nodes located underneath this node in the DocumentTree into a single
 * TextBlock. For a Section this concatenates the text of everything it contains.
 * Leaf implementations override this method and return their own leaf text block instead.
 *
 * @return TextBlock combining the text blocks of all leaves under this node
 */
default TextBlock getTextBlock() {

    Stream<TextBlock> leafTextBlocks = streamAllSubNodes().filter(node -> node.isLeaf())
            .map(node -> node.getTextBlock());
    return leafTextBlocks.collect(new TextBlockCollector());
}
/**
 * Every node maintains its own Set of entities.
 * This Set is meant to contain all entities whose boundary intersects the boundary of this node.
 *
 * @return Set of all entities associated with this node
 */
Set<RedactionEntity> getEntities();
/**
 * Returns the pages this node appears on, as reported by the node's TextBlock.
 *
 * @return Set of pages this node appears on
 */
default Set<Page> getPages() {

    TextBlock ownTextBlock = getTextBlock();
    return ownTextBlock.getPages();
}
/**
 * Returns the page with the lowest page number among the pages of this node's TextBlock.
 *
 * @return the first page this node's text appears on
 * @throws IllegalStateException if the TextBlock reports no page at all
 */
default Page getFirstPage() {

    Comparator<Page> byPageNumber = Comparator.comparingInt(Page::getNumber);
    Set<Page> textBlockPages = getTextBlock().getPages();
    return textBlockPages.stream()
            .min(byPageNumber)
            .orElseThrow(() -> new IllegalStateException("SemanticNode has no Page!"));
}
/**
 * Returns the pages covered by the given boundary, as reported by this node's TextBlock.
 *
 * @param boundary the text range to query; must be contained in the boundary of this node
 * @return Set of pages the boundary appears on
 * @throws IllegalArgumentException if the boundary is not contained in the boundary of this node
 */
default Set<Page> getPages(Boundary boundary) {

    boolean boundaryLiesWithinNode = getBoundary().contains(boundary);
    if (boundaryLiesWithinNode) {
        return getTextBlock().getPages(boundary);
    }
    throw new IllegalArgumentException(format("%s which was used to query for pages is not contained in the %s of this node!", boundary, getBoundary()));
}
/**
 * Checks whether this node appears on the page with the given number.
 *
 * @param pageNumber the page number to look for
 * @return true if any page returned by {@link #getPages()} has that number
 */
default boolean isOnPage(int pageNumber) {

    for (Page candidate : getPages()) {
        if (candidate.getNumber() == pageNumber) {
            return true;
        }
    }
    return false;
}
/**
 * Returns the DocumentTree this node is registered in.
 *
 * @return the DocumentTree of the Document this node belongs to
 */
DocumentTree getDocumentTree();
/**
 * The id is a List of Integers uniquely identifying this node in the DocumentTree:
 * the child indices leading from the root to this node. The root has the empty list.
 *
 * @return the DocumentTree id of this node
 */
List<Integer> getTreeId();
/**
 * Sets the DocumentTree id of this node. This should only be used during graph construction.
 *
 * @param tocId the tree id, a List of Integers
 */
void setTreeId(List<Integer> tocId);
/**
 * Finds the Headline this node belongs to by walking up the tree.
 * This default implementation only delegates to the parent; node types which can own a Headline (e.g. SuperSection) override it and return the first Headline among their children.
 * As documented for getParent, a NotFoundException is expected when the walk reaches a node without a parent.
 *
 * @return First Headline found
 */
default Headline getHeadline() {
    SemanticNode parentNode = getParent();
    return parentNode.getHeadline();
}
/**
 * Asks the DocumentTree whether a parent exists for the tree id of this node.
 *
 * @return boolean indicating whether this Node has a Parent in the DocumentTree
 */
default boolean hasParent() {
    DocumentTree tree = getDocumentTree();
    return tree.hasParentById(getTreeId());
}
/**
 * Looks up the parent entry of this node in the DocumentTree and returns its node.
 *
 * @return The SemanticNode representing the Parent in the DocumentTree
 * throws NotFoundException, when no parent is present
 */
default SemanticNode getParent() {
    DocumentTree tree = getDocumentTree();
    var parentEntry = tree.getParentEntryById(getTreeId());
    return parentEntry.getNode();
}
/**
 * Delegates to the DocumentTree to find the highest parent for the tree id of this node.
 *
 * @return The SemanticNode which is directly underneath the document and also under which this node is.
 * if this is the highest child node or the document itself, it returns itself.
 */
default SemanticNode getHighestParent() {
    DocumentTree tree = getDocumentTree();
    return tree.getHighestParentById(getTreeId());
}
/**
 * Indicates whether this node is a leaf, i.e. whether it has direct access to a TextBlock.
 * This default implementation always returns false, so every leaf node type must override it.
 * According to the existing documentation, Sections, Images, and Tables are not leaves, and a TableCell may or may not be one.
 *
 * @return boolean, indicating if a Node has direct access to a TextBlock
 */
default boolean isLeaf() {
return false;
}
/**
 * Returns the TextBlock a leaf node has direct access to.
 * This default implementation always throws, so node types which are leaves must override it.
 *
 * @return the LeafTextBlock of this node
 * @throws UnsupportedOperationException always, unless overridden
 */
default TextBlock getLeafTextBlock() {
throw new UnsupportedOperationException("Only leaf Nodes have access to LeafTextBlocks!");
}
/**
 * Sets the LeafTextBlock of this SemanticNode. Should only be used during construction of the Graph.
 * This default implementation always throws, so node types which are leaves must override it.
 *
 * @param textBlock the TextBlock to set as the LeafTextBlock of this SemanticNode
 * @throws UnsupportedOperationException always, unless overridden
 */
default void setLeafTextBlock(TextBlock textBlock) {
throw new UnsupportedOperationException();
}
/**
 * Checks whether any Entity of this SemanticNode has the EntityType ENTITY and the provided type.
 *
 * @param type string representing the type of entity to check for
 * @return true, if this SemanticNode has at least one Entity with EntityType.ENTITY of the provided type
 */
default boolean hasEntitiesOfType(String type) {
    for (RedactionEntity candidate : getEntities()) {
        if (candidate.getEntityType().equals(EntityType.ENTITY) && candidate.getType().equals(type)) {
            return true;
        }
    }
    return false;
}
/**
 * Collects all Entities of this SemanticNode which have the provided type such as "CBI_author".
 * Unlike hasEntitiesOfType, the EntityType of the Entities is not taken into account.
 *
 * @param type string representing the type of entities to return
 * @return List of RedactionEntities of the provided type
 */
default List<RedactionEntity> getEntitiesOfType(String type) {
    Stream<RedactionEntity> entitiesOfType = getEntities().stream()
            .filter(candidate -> candidate.getType().equals(type));
    return entitiesOfType.toList();
}
/**
 * Collects all Entities of this SemanticNode which have any of the provided types such as "CBI_author".
 *
 * @param types A list of strings representing the types of entities to return
 * @return List of RedactionEntities of any provided type
 */
default List<RedactionEntity> getEntitiesOfType(List<String> types) {
    Stream<RedactionEntity> entitiesOfAnyType = getEntities().stream()
            .filter(candidate -> candidate.isAnyType(types));
    return entitiesOfAnyType.toList();
}
/**
 * Each AtomicTextBlock has an index on its page, this returns the number of the first AtomicTextBlock underneath this node.
 * If this node does not have any AtomicTextBlocks underneath it, e.g. an empty TableCell, it returns -1.
 *
 * @return Integer representing the number on the page, or -1 if this node has no AtomicTextBlocks
 */
default Integer getNumberOnPage() {
    // Resolve the TextBlock only once: the default getTextBlock() rebuilds it from all sub nodes on every call.
    List<AtomicTextBlock> atomicTextBlocks = getTextBlock().getAtomicTextBlocks();
    if (atomicTextBlocks.isEmpty()) {
        return -1;
    }
    return atomicTextBlocks.get(0).getNumberOnPage();
}
/**
 * Checks if the TextBlock of this SemanticNode is non-empty.
 *
 * @return true, if this node's TextBlock is not empty
 */
default boolean hasText() {
    TextBlock ownTextBlock = getTextBlock();
    boolean withoutText = ownTextBlock.isEmpty();
    return !withoutText;
}
/**
 * Checks whether the search text of this node's TextBlock contains the provided String (case-sensitive).
 *
 * @param string A String which the TextBlock might contain
 * @return true, if this node's TextBlock contains the string
 */
default boolean containsString(String string) {
    String searchText = getTextBlock().getSearchText();
    return searchText.contains(string);
}
/**
 * Checks whether this SemanticNode contains every one of the provided Strings.
 * An empty List yields true.
 *
 * @param strings A List of Strings which the TextBlock might contain
 * @return true, if this node's TextBlock contains all strings
 */
default boolean containsStrings(List<String> strings) {
    for (String candidate : strings) {
        if (!containsString(candidate)) {
            return false;
        }
    }
    return true;
}
/**
 * Checks whether this SemanticNode contains the provided String ignoring case.
 * Both the search text and the provided String are lower-cased with Locale.ROOT before comparing.
 *
 * @param string A String which the TextBlock might contain
 * @return true, if this node's TextBlock contains the string ignoring case
 */
default boolean containsStringIgnoreCase(String string) {
    String lowerCaseSearchText = getTextBlock().getSearchText().toLowerCase(Locale.ROOT);
    String lowerCaseString = string.toLowerCase(Locale.ROOT);
    return lowerCaseSearchText.contains(lowerCaseString);
}
/**
 * Checks whether this SemanticNode contains at least one of the provided Strings.
 * An empty List yields false.
 *
 * @param strings A List of Strings which the TextBlock might contain
 * @return true, if this node's TextBlock contains any of the strings
 */
default boolean containsAnyString(List<String> strings) {
    for (String candidate : strings) {
        if (containsString(candidate)) {
            return true;
        }
    }
    return false;
}
/**
 * Checks whether this SemanticNode contains at least one of the provided Strings ignoring case.
 * An empty List yields false.
 *
 * @param strings A List of Strings which the TextBlock might contain
 * @return true, if this node's TextBlock contains any of the strings ignoring case
 */
default boolean containsAnyStringIgnoreCase(List<String> strings) {
    for (String candidate : strings) {
        if (containsStringIgnoreCase(candidate)) {
            return true;
        }
    }
    return false;
}
/**
 * This function is used during insertion of a RedactionEntity into the graph.
 * If the boundary of this node's TextBlock intersects the boundary of the RedactionEntity, this node is added to the entity as an intersecting node.
 * If the TextBlock additionally contains the entire boundary, this node is set as the deepest fully containing node of the entity.
 * Afterwards the function is called recursively on all children which intersect the entity; children are visited after their parent, so a deeper containing node replaces the one set before.
 *
 * @param redactionEntity RedactionEntity, which is being inserted into the graph
 */
default void addThisToEntityIfIntersects(RedactionEntity redactionEntity) {
    TextBlock ownTextBlock = getTextBlock();
    if (!ownTextBlock.getBoundary().intersects(redactionEntity.getBoundary())) {
        return;
    }
    if (ownTextBlock.containsBoundary(redactionEntity.getBoundary())) {
        redactionEntity.setDeepestFullyContainingNode(this);
    }
    redactionEntity.addIntersectingNode(this);
    streamChildren().forEach(child -> {
        if (child.getBoundary().intersects(redactionEntity.getBoundary())) {
            child.addThisToEntityIfIntersects(redactionEntity);
        }
    });
}
/**
 * Returns the Set of LayoutEngines associated with this node.
 * The default addEngine implementation adds to the returned Set directly, so implementations must return a mutable Set.
 *
 * @return Set of LayoutEngines.
 */
Set<LayoutEngine> getEngines();
/**
 * Adds the provided LayoutEngine to the Set of LayoutEngines of this node.
 *
 * @param engine the LayoutEngine to add
 */
default void addEngine(LayoutEngine engine) {
    Set<LayoutEngine> ownEngines = getEngines();
    ownEngines.add(engine);
}
/**
 * Streams all children the DocumentTree holds directly underneath this node.
 *
 * @return Stream of all children
 */
default Stream<SemanticNode> streamChildren() {
    DocumentTree tree = getDocumentTree();
    return tree.childNodes(getTreeId());
}
/**
 * Streams all children of the provided type the DocumentTree holds directly underneath this node.
 *
 * @param nodeType the NodeType the children must have
 * @return Stream of all children of the provided type
 */
default Stream<SemanticNode> streamChildrenOfType(NodeType nodeType) {
    DocumentTree tree = getDocumentTree();
    return tree.childNodesOfType(getTreeId(), nodeType);
}
/**
 * Recursively streams all SemanticNodes located underneath this node, in the order the DocumentTree returns their entries.
 *
 * @return Stream of all SubNodes
 */
default Stream<SemanticNode> streamAllSubNodes() {
    DocumentTree tree = getDocumentTree();
    return tree.allSubEntriesInOrder(getTreeId())
            .map(entry -> entry.getNode());
}
/**
 * Recursively streams all SemanticNodes of the provided type located underneath this node, in the order the DocumentTree returns their entries.
 *
 * @param nodeType the NodeType the sub nodes must have
 * @return Stream of all SubNodes of the provided type
 */
default Stream<SemanticNode> streamAllSubNodesOfType(NodeType nodeType) {
    DocumentTree tree = getDocumentTree();
    return tree.allSubEntriesInOrder(getTreeId())
            .filter(entry -> entry.getType().equals(nodeType))
            .map(entry -> entry.getNode());
}
/**
 * Returns the Boundary of this node's TextBlock, i.e. the start and end string offsets in the reading order of the document.
 *
 * @return Boundary of this Node's TextBlock
 */
default Boundary getBoundary() {
    TextBlock ownTextBlock = getTextBlock();
    return ownTextBlock.getBoundary();
}
/**
 * Computes the bounding box of this node for each page.
 * A leaf uses the bounding boxes of its own TextBlock, any other node uses the union of the bounding boxes of all its children.
 * According to the existing documentation, the Document returns the cropbox of each page instead (that override is not part of this interface).
 *
 * @return Rectangle2D fully encapsulating this Node for each page.
 */
default Map<Page, Rectangle2D> getBBox() {
    return isLeaf() ? getBBoxFromLeafTextBlock() : getBBoxFromChildren();
}
/**
 * Checks whether the Bounding Box of this SemanticNode contains the provided rectangle on the provided page.
 * The page is looked up with a helper Page which only has its number set — NOTE(review): this presumes that Page equality is based on the page number, confirm against Page.
 *
 * @param rectangle2D The rectangle to check if it is contained
 * @param pageNumber The Page number on which the rectangle should be checked
 * @return true, if this node appears on the page and its bounding box on that page contains the rectangle;
 * false, if the node is not on the page or no bounding box is available for that page
 */
default boolean containsRectangle(Rectangle2D rectangle2D, Integer pageNumber) {
    Page helperPage = Page.builder().number(pageNumber).build();
    if (!getPages().contains(helperPage)) {
        return false;
    }
    // getPages() and getBBox() are computed independently and implementations may store a null box for a page,
    // so guard the lookup instead of failing with a NullPointerException.
    Rectangle2D bBoxOnPage = getBBox().get(helperPage);
    return bBoxOnPage != null && bBoxOnPage.contains(rectangle2D);
}
/**
 * TODO: this produces unwanted results for sections spanning multiple columns.
 * Asks every direct child for its bounding boxes and merges, page by page, all boxes the children report for that page.
 *
 * @return The union of the BoundingBoxes of all children
 */
private Map<Page, Rectangle2D> getBBoxFromChildren() {
    List<Map<Page, Rectangle2D>> bBoxesOfChildren = streamChildren().map(child -> child.getBBox()).toList();
    Set<Page> coveredPages = bBoxesOfChildren.stream()
            .flatMap(bBoxPerPage -> bBoxPerPage.keySet().stream())
            .collect(Collectors.toSet());
    Map<Page, Rectangle2D> unionPerPage = new HashMap<>();
    coveredPages.forEach(page -> unionPerPage.put(page,
            bBoxesOfChildren.stream()
                    .filter(bBoxPerPage -> bBoxPerPage.containsKey(page))
                    .map(bBoxPerPage -> bBoxPerPage.get(page))
                    .collect(RectangleTransformations.collectBBox())));
    return unionPerPage;
}
/**
 * Groups the AtomicTextBlocks of this node's TextBlock by their page and computes one bounding box per page from them.
 *
 * @return The union of all BoundingBoxes of the TextBlock of this node
 */
private Map<Page, Rectangle2D> getBBoxFromLeafTextBlock() {
    Map<Page, List<AtomicTextBlock>> atomicTextBlocksByPage = getTextBlock().getAtomicTextBlocks()
            .stream()
            .collect(Collectors.groupingBy(AtomicTextBlock::getPage));
    Map<Page, Rectangle2D> unionPerPage = new HashMap<>();
    for (Map.Entry<Page, List<AtomicTextBlock>> pageWithBlocks : atomicTextBlocksByPage.entrySet()) {
        unionPerPage.put(pageWithBlocks.getKey(), RectangleTransformations.bBoxUnionAtomicTextBlock(pageWithBlocks.getValue()));
    }
    return unionPerPage;
}
}

View File

@ -0,0 +1,40 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
/**
 * SemanticNode of type SUPER_SECTION.
 * It adds no state of its own to AbstractSemanticNode and only customizes the type, the Headline lookup and the String representation.
 */
@Data
@SuperBuilder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(callSuper = true)
public class SuperSection extends AbstractSemanticNode {

    @Override
    public NodeType getType() {

        return NodeType.SUPER_SECTION;
    }


    /**
     * Returns the first direct child of type HEADLINE. If this SuperSection has no Headline child, the lookup continues at the parent.
     *
     * @return the first Headline found
     */
    @Override
    public Headline getHeadline() {

        return streamChildrenOfType(NodeType.HEADLINE).map(node -> (Headline) node)
                .findFirst()
                .orElseGet(() -> getParent().getHeadline());
    }


    @Override
    public String toString() {

        return getTreeId() + ": " + NodeType.SUPER_SECTION + ": " + this.getTextBlock().buildSummary();
    }

}

View File

@ -0,0 +1,356 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import static java.lang.String.format;
import java.awt.geom.Rectangle2D;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.stream.IntStream;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
/**
 * SemanticNode representing a table.
 * Cells are addressed by zero-based row and column; {@link #getCell(int, int)} resolves them from the children of this table's tree entry, which it expects to be stored row by row.
 * The TextBlock and the bounding boxes are computed lazily and cached in the corresponding fields.
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Table implements SemanticNode {

    @Builder.Default
    Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
    List<Integer> treeId;
    DocumentTree documentTree;

    int numberOfRows;
    int numberOfCols;

    // built on first access in getTextBlock(), if it was not set before
    TextBlock textBlock;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<RedactionEntity> entities = new HashSet<>();

    // computed on first access in getBBox()
    @EqualsAndHashCode.Exclude
    Map<Page, Rectangle2D> bBoxCache;


    /**
     * Streams all entities in this table, that appear in a row, which contains all the provided strings ignoring case.
     *
     * @param strings Strings to check whether a row contains them
     * @return Stream of all entities in this table, that appear in a row, which contains all the provided strings
     */
    public Stream<RedactionEntity> streamEntitiesWhereRowContainsStringsIgnoreCase(List<String> strings) {

        return IntStream.range(0, numberOfRows).boxed()
                .filter(row -> rowContainsStringsIgnoreCase(row, strings))
                .flatMap(this::streamRow)
                .map(TableCell::getEntities)
                .flatMap(Collection::stream);
    }


    /**
     * Checks whether the specified row contains all the provided strings ignoring case.
     * The text of all cells of the row is concatenated before searching.
     *
     * @param row     the row to check as an Integer, must be smaller than numberOfRows
     * @param strings a list of strings to check for
     * @return true, if all strings appear in the provided row
     */
    public boolean rowContainsStringsIgnoreCase(Integer row, List<String> strings) {

        String rowText = streamRow(row).map(TableCell::getTextBlock)
                .collect(new TextBlockCollector()).getSearchText().toLowerCase(Locale.ROOT);

        // lower-case the needles with the same locale as the row text, otherwise the result depends on the default locale
        return strings.stream()
                .map(string -> string.toLowerCase(Locale.ROOT))
                .allMatch(rowText::contains);
    }


    /**
     * Streams all entities which appear in a row where at least one cell has the provided header and the provided value.
     *
     * @param header the header value to search for
     * @param value  the string which the table cell should contain
     * @return a stream of all entities, which appear in a row where at least one cell has the provided header and the provided value.
     */
    public Stream<RedactionEntity> streamEntitiesWhereRowHasHeaderAndValue(String header, String value) {

        List<Integer> colsWithHeader = streamHeaders().filter(headerNode -> headerNode.containsString(header))
                .map(TableCell::getCol)
                .toList();

        // a cell is kept when, in its own row, any cell of a matching column contains the value
        return streamTableCells().filter(tableCellNode -> colsWithHeader.stream()
                        .anyMatch(colWithHeader -> getCell(tableCellNode.getRow(), colWithHeader).containsString(value)))
                .map(TableCell::getEntities)
                .flatMap(Collection::stream);
    }


    /**
     * Streams all entities which appear in a row where at least one cell has the provided header and any provided value.
     *
     * @param header the header value to search for
     * @param values the strings which the table cell should contain
     * @return a stream of all entities, which appear in a row where at least one cell has the provided header and any provided value.
     */
    public Stream<RedactionEntity> streamEntitiesWhereRowHasHeaderAndAnyValue(String header, List<String> values) {

        List<Integer> colsWithHeader = streamHeaders().filter(headerNode -> headerNode.containsString(header))
                .map(TableCell::getCol)
                .toList();

        return streamTableCells().filter(tableCellNode -> colsWithHeader.stream()
                        .anyMatch(colWithHeader -> getCell(tableCellNode.getRow(), colWithHeader).containsAnyString(values)))
                .map(TableCell::getEntities)
                .flatMap(Collection::stream);
    }


    /**
     * Streams all entities in this table, that appear in a row, which contains at least one entity with any of the provided types.
     *
     * @param types type strings to check whether a row contains an entity like them
     * @return Stream of all entities in this table, that appear in a row, which contains at least one entity with any of the provided types.
     */
    public Stream<RedactionEntity> streamEntitiesWhereRowContainsEntitiesOfType(List<String> types) {

        List<Integer> rowsWithEntityOfType = IntStream.range(0, numberOfRows).boxed()
                .filter(rowNumber -> streamEntityTypesInRow(rowNumber).anyMatch(existingType -> types.stream()
                        .anyMatch(typeToCheck -> typeToCheck.equals(existingType))))
                .toList();

        return rowsWithEntityOfType.stream()
                .flatMap(this::streamRow)
                .map(TableCell::getEntities)
                .flatMap(Collection::stream);
    }


    /**
     * Streams all entities in this table, that appear in a row, which does not contain any entity with any of the provided types.
     *
     * @param types type strings to check whether a row doesn't contain an entity like it
     * @return Stream of all entities in this table, that appear in a row, which does not contain any entity with any of the provided types.
     */
    public Stream<RedactionEntity> streamEntitiesWhereRowContainsNoEntitiesOfType(List<String> types) {

        List<Integer> rowsWithNoEntityOfType = IntStream.range(0, numberOfRows).boxed()
                .filter(rowNumber -> streamEntityTypesInRow(rowNumber).noneMatch(existingType -> types.stream()
                        .anyMatch(typeToCheck -> typeToCheck.equals(existingType))))
                .toList();

        return rowsWithNoEntityOfType.stream()
                .flatMap(this::streamRow)
                .map(TableCell::getEntities)
                .flatMap(Collection::stream);
    }


    /**
     * Streams the distinct types of all entities appearing in the cells of the provided row.
     *
     * @param rowNumber the row to collect the entity types from
     * @return Stream of distinct entity types in the row
     */
    private Stream<String> streamEntityTypesInRow(Integer rowNumber) {

        return streamRow(rowNumber).map(TableCell::getEntities)
                .flatMap(Collection::stream)
                .map(RedactionEntity::getType)
                .distinct();
    }


    /**
     * Returns a TableCell at the provided row and column location.
     *
     * @param row int representing the zero-based row, must not be negative and must be smaller than numberOfRows
     * @param col int representing the zero-based col, must not be negative and must be smaller than numberOfCols
     * @return TableCell at the provided location in the table
     * @throws IllegalArgumentException if row or col is out of bounds
     */
    public TableCell getCell(int row, int col) {

        // Reject every index outside [0, size): a negative col or row == numberOfRows would otherwise
        // resolve to a cell of a different row or run past the children of this table.
        if (row < 0 || row >= numberOfRows || col < 0 || col >= numberOfCols) {
            throw new IllegalArgumentException(format("row %d, col %d is out of bounds for number of rows of %d and number of cols %d", row, col, numberOfRows, numberOfCols));
        }
        // children are addressed row by row
        int idx = row * numberOfCols + col;
        return (TableCell) documentTree.getEntryById(treeId).getChildren()
                .get(idx).getNode();
    }


    /**
     * Streams all children of type TABLE_CELL of this Table, in the order the DocumentTree returns them.
     *
     * @return Stream of all TableCells
     */
    public Stream<TableCell> streamTableCells() {

        return streamChildrenOfType(NodeType.TABLE_CELL).map(node -> (TableCell) node);
    }


    /**
     * Streams all non-header TableCells of every column which has a header cell containing the provided string.
     *
     * @param header string the header cell must contain
     * @return Stream of all TableCells which have the provided header
     */
    public Stream<TableCell> streamTableCellsWithHeader(String header) {

        return streamHeaders().filter(tableCellNode -> tableCellNode.getTextBlock().getSearchText().contains(header))
                .map(TableCell::getCol)
                .flatMap(this::streamCol)
                .filter(tableCellNode -> !tableCellNode.isHeader());
    }


    /**
     * Streams all TableCells belonging to the provided column from top down.
     *
     * @param col int representing the column
     * @return Stream of all TableCell in the provided column
     */
    public Stream<TableCell> streamCol(int col) {

        return IntStream.range(0, numberOfRows)
                .mapToObj(row -> getCell(row, col));
    }


    /**
     * Streams all TableCells belonging to the provided row from left to right.
     *
     * @param row int representing the row
     * @return Stream of all TableCell in the provided row
     */
    public Stream<TableCell> streamRow(int row) {

        return IntStream.range(0, numberOfCols)
                .mapToObj(col -> getCell(row, col));
    }


    /**
     * Streams all TableCells and filters them with header == true.
     *
     * @return Stream of all TableCells with header == true
     */
    public Stream<TableCell> streamHeaders() {

        return streamTableCells().filter(TableCell::isHeader);
    }


    /**
     * Streams all TableCells of the provided row and column and filters them with header == true.
     * The cells of the row are streamed before the cells of the column.
     *
     * @param row int representing the row
     * @param col int representing the column
     * @return Stream of all TableCells with header == true in the provided row or col
     */
    public Stream<TableCell> streamHeadersForCell(int row, int col) {

        return Stream.concat(streamRow(row), streamCol(col))
                .filter(TableCell::isHeader);
    }


    /**
     * Streams all Headers and checks if the stripped text of any header equals the provided string.
     *
     * @param header string to check the headers for
     * @return true, if at least one header equals the provided string
     */
    public boolean hasHeader(String header) {

        return streamHeaders().anyMatch(tableCellNode -> tableCellNode.getTextBlock().getSearchText().strip().equals(header));
    }


    /**
     * Checks if this table has a column with the provided header and any of the table cells in that column contain the provided value.
     *
     * @param header string to find header cells
     * @param value  string to check cells with provided header
     * @return true, if this table has a column with the provided header and any of the table cells in that column contain the provided value
     */
    public boolean hasRowWithHeaderAndValue(String header, String value) {

        return streamTableCellsWithHeader(header).anyMatch(tableCellNode -> tableCellNode.containsString(value));
    }


    /**
     * Checks if this table has a column with the provided header and any of the table cells in that column contains any of the provided values.
     *
     * @param header string to find header cells
     * @param values List of strings to check cells with provided header
     * @return true, if this table has a column with the provided header and any of the table cells in that column contains any of the provided values.
     */
    public boolean hasRowWithHeaderAndAnyValue(String header, List<String> values) {

        return streamTableCellsWithHeader(header).anyMatch(tableCellNode -> tableCellNode.containsAnyString(values));
    }


    /**
     * Finds all entities of the provided type, which appear in the same row that the provided entity appears in.
     * The rows are determined from every TableCell among the intersecting nodes of the provided entity.
     *
     * @param type            the type of entities to search for
     * @param redactionEntity the entity, which appears in the row to search
     * @return List of all entities of the provided type, which appear in the same row that the provided entity appears in.
     */
    public List<RedactionEntity> getEntitiesOfTypeInSameRow(String type, RedactionEntity redactionEntity) {

        return redactionEntity.getIntersectingNodes()
                .stream()
                .filter(node -> node instanceof TableCell)
                .map(node -> (TableCell) node)
                .flatMap(tableCellNode -> streamRow(tableCellNode.getRow()))
                .map(cell -> cell.getEntitiesOfType(type))
                .flatMap(Collection::stream)
                .toList();
    }


    @Override
    public NodeType getType() {

        return NodeType.TABLE;
    }


    /**
     * Returns the cached TextBlock of this table, building it with the default SemanticNode implementation on first access.
     *
     * @return TextBlock of this table
     */
    @Override
    public TextBlock getTextBlock() {

        if (textBlock == null) {
            textBlock = SemanticNode.super.getTextBlock();
        }
        return textBlock;
    }


    @Override
    public String toString() {

        return treeId.toString() + ": " + NodeType.TABLE + ": #cols: " + numberOfCols + ", #rows: " + numberOfRows + ", " + this.getTextBlock().buildSummary();
    }


    /**
     * Returns the cached bounding boxes of this table, computing them with the default SemanticNode implementation on first access.
     *
     * @return bounding box of this table for each page
     */
    @Override
    public Map<Page, Rectangle2D> getBBox() {

        if (bBoxCache == null) {
            bBoxCache = SemanticNode.super.getBBox();
        }
        return bBoxCache;
    }

}

View File

@ -0,0 +1,87 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.HashMap;
import java.util.Map;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
/**
 * SemanticNode of type TABLE_CELL, located at a row and column of a table.
 * A cell without children in the DocumentTree is a leaf and serves its own leafTextBlock, otherwise its TextBlock is built from the leaves underneath it and cached.
 */
@Data
@SuperBuilder
@AllArgsConstructor
@EqualsAndHashCode(callSuper = true)
@FieldDefaults(level = AccessLevel.PRIVATE)
public class TableCell extends AbstractSemanticNode {

    int row;
    int col;
    boolean header;

    Rectangle2D bBox;

    TextBlock leafTextBlock;

    // cache for the TextBlock of a cell which is not a leaf
    TextBlock textBlock;


    /**
     * Maps every page this cell appears on to the bounding box stored for this cell.
     *
     * @return the bBox of this cell for each of its pages
     */
    @Override
    public Map<Page, Rectangle2D> getBBox() {

        Map<Page, Rectangle2D> sameBBoxOnEveryPage = new HashMap<>();
        for (Page page : getPages()) {
            sameBBoxOnEveryPage.put(page, bBox);
        }
        return sameBBoxOnEveryPage;
    }


    @Override
    public NodeType getType() {

        return NodeType.TABLE_CELL;
    }


    /**
     * A cell is a leaf exactly when its entry in the DocumentTree has no children.
     *
     * @return true, if this cell has no children
     */
    @Override
    public boolean isLeaf() {

        var ownEntry = getDocumentTree().getEntryById(getTreeId());
        return ownEntry.getChildren().isEmpty();
    }


    /**
     * Returns the leafTextBlock for a leaf, otherwise the TextBlock built from all leaves underneath this cell, which is built once and then reused.
     *
     * @return TextBlock of this cell
     */
    @Override
    public TextBlock getTextBlock() {

        if (!isLeaf()) {
            if (textBlock == null) {
                textBlock = buildTextBlock();
            }
            return textBlock;
        }
        return leafTextBlock;
    }


    /**
     * Collects the LeafTextBlocks of all leaves underneath this cell into a single TextBlock.
     *
     * @return TextBlock built from all leaves underneath this cell
     */
    private TextBlock buildTextBlock() {

        return streamAllSubNodes().filter(node -> node.isLeaf())
                .map(node -> node.getLeafTextBlock())
                .collect(new TextBlockCollector());
    }


    @Override
    public String toString() {

        var id = getTreeId();
        var summary = getTextBlock().buildSummary();
        return id + ": " + NodeType.TABLE_CELL + ": " + summary;
    }

}

View File

@ -0,0 +1,232 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock;
import static java.lang.String.format;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class AtomicTextBlock implements TextBlock {
Long id;
Integer numberOnPage;
Page page;
//string coordinates
Boundary boundary;
String searchText;
@Builder.Default
List<Integer> lineBreaks = new ArrayList<>();
@Builder.Default
List<Boundary> boldTextBoundaries = new ArrayList<>();
@Builder.Default
List<Boundary> italicTextBoundaries = new ArrayList<>();
String orientation;
int textDirection;
//position coordinates
@Builder.Default
List<Integer> stringIdxToPositionIdx = new ArrayList<>();
@Builder.Default
List<Rectangle2D> positions = new ArrayList<>();
@EqualsAndHashCode.Exclude
SemanticNode parent;
@Override
public int numberOfLines() {
return lineBreaks.size() + 1;
}
public static AtomicTextBlock fromSearchTextWithTextPosition(String searchText,
List<Integer> lineBreaks,
List<Boundary> boldTextBoundaries,
List<Boundary> italicTextBoundaries,
List<Rectangle2D> positions,
List<Integer> stringIdxToPositionIdx,
long idx,
SemanticNode parent,
int numberOnPage,
Page page,
int offset,
String orientation,
int textDirection) {
return AtomicTextBlock.builder()
.id(idx)
.parent(parent)
.searchText(searchText)
.numberOnPage(numberOnPage)
.page(page)
.lineBreaks(lineBreaks)
.boldTextBoundaries(boldTextBoundaries)
.italicTextBoundaries(italicTextBoundaries)
.positions(positions)
.stringIdxToPositionIdx(stringIdxToPositionIdx)
.boundary(new Boundary(offset, offset + searchText.length()))
.textDirection(textDirection)
.orientation(orientation)
.build();
}
public static AtomicTextBlock empty(Long textBlockIdx, int stringOffset, Page page, int numberOnPage, SemanticNode parent) {
return AtomicTextBlock.builder()
.id(textBlockIdx)
.boundary(new Boundary(stringOffset, stringOffset))
.searchText("")
.page(page)
.numberOnPage(numberOnPage)
.parent(parent)
.build();
}
public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextData documentTextData,
DocumentPositionData documentPositionData,
SemanticNode parent,
Page page) {
return AtomicTextBlock.builder()
.id(documentTextData.getId())
.numberOnPage(documentTextData.getNumberOnPage())
.page(page)
.boundary(new Boundary(documentTextData.getStart(), documentTextData.getEnd()))
.searchText(documentTextData.getSearchText())
.lineBreaks(Arrays.stream(documentTextData.getLineBreaks()).boxed().toList())
.stringIdxToPositionIdx(Arrays.stream(documentPositionData.getStringIdxToPositionIdx()).boxed().toList())
.positions(toRectangle2DList(documentPositionData.getPositions()))
.parent(parent)
.build();
}
private static List<Rectangle2D> toRectangle2DList(float[][] positions) {
return Arrays.stream(positions).map(floatArr -> (Rectangle2D) new Rectangle2D.Float(floatArr[0], floatArr[1], floatArr[2], floatArr[3])).toList();
}
public CharSequence getLine(int lineNumber) {
if (lineNumber >= numberOfLines() || lineNumber < 0) {
throw new IndexOutOfBoundsException(format("line %d out of range for AtomicTextBlock with %d lines", lineNumber, numberOfLines()));
}
if (lineNumber == 0) {
return subSequence(boundary.start(), lineBreaks.get(0) + boundary.start());
} else if (lineNumber == numberOfLines() - 1) {
return subSequence(lineBreaks.get(lineBreaks.size() - 1) + boundary.start(), boundary.end());
}
return subSequence(lineBreaks.get(lineNumber - 1) + boundary.start(), lineBreaks.get(lineNumber) + boundary.start());
}
@Override
public List<AtomicTextBlock> getAtomicTextBlocks() {
return List.of(this);
}
@Override
public int getNextLinebreak(int fromIndex) {
return lineBreaks.stream()//
.filter(linebreak -> linebreak > fromIndex - boundary.start()) //
.findFirst() //
.orElse(searchText.length()) + boundary.start();
}
@Override
public int getPreviousLinebreak(int fromIndex) {
return lineBreaks.stream()//
.filter(linebreak -> linebreak <= fromIndex - boundary.start())//
.reduce((a, b) -> b)//
.orElse(0) + boundary.start();
}
@Override
public Rectangle2D getPosition(int stringIdx) {
return positions.get(stringIdxToPositionIdx.get(stringIdx - boundary.start()));
}
@Override
public List<Rectangle2D> getPositions(Boundary stringBoundary) {
if (!containsBoundary(stringBoundary)) {
throw new IndexOutOfBoundsException(format("%s is out of bounds for %s", stringBoundary, this.boundary));
}
if (stringBoundary.length() == 0) {
return Collections.emptyList();
}
int startPositionIdx = stringIdxToPositionIdx.get(stringBoundary.start() - this.boundary.start());
if (stringBoundary.end() == this.boundary.end()) {
return positions.subList(startPositionIdx, positions.size());
}
return positions.subList(startPositionIdx, stringIdxToPositionIdx.get(stringBoundary.end() - this.boundary.start()));
}
public Map<Page, List<Rectangle2D>> getPositionsPerPage(Boundary stringBoundary) {
List<Rectangle2D> rectanglesPerLine = stringBoundary.split(getAllLineBreaksInBoundary(stringBoundary))
.stream()
.map(this::getPositions)
.map(RectangleTransformations::rectangleBBoxWithGaps)
.flatMap(Collection::stream)
.toList();
Map<Page, List<Rectangle2D>> rectanglePerLinePerPage = new HashMap<>();
rectanglePerLinePerPage.put(page, rectanglesPerLine);
return rectanglePerLinePerPage;
}
private List<Integer> getAllLineBreaksInBoundary(Boundary boundary) {
return getLineBreaks().stream().map(linebreak -> linebreak + this.boundary.start()).filter(boundary::contains).toList();
}
@Override
public String toString() {
return searchText;
}
}

View File

@ -0,0 +1,222 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock;
import static java.lang.String.format;
import java.awt.geom.Rectangle2D;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import lombok.AccessLevel;
import lombok.Data;
import lombok.experimental.FieldDefaults;
/**
 * A {@link TextBlock} made up of consecutive {@link AtomicTextBlock}s.
 * <p>
 * The block keeps the list of its atomic parts and a boundary running from the start of the first
 * part to the end of the last one. The search text is built lazily from the parts and cached until
 * the next {@link #concat(TextBlock)}. An empty instance has the boundary {@code (-1, -1)}.
 */
@Data
@FieldDefaults(level = AccessLevel.PRIVATE)
public class ConcatenatedTextBlock implements TextBlock {

    List<AtomicTextBlock> atomicTextBlocks;
    // Lazily built by getSearchText(); reset to null whenever concat() appends blocks.
    String searchText;
    Boundary boundary;


    /**
     * @return a new block without any atomic text blocks and with the boundary {@code (-1, -1)}
     */
    public static ConcatenatedTextBlock empty() {

        return new ConcatenatedTextBlock(Collections.emptyList());
    }


    /**
     * Creates a block from the given atomic text blocks, in list order.
     *
     * @param atomicTextBlocks the parts to concatenate; may be empty
     * @throws UnsupportedOperationException if a part does not start where its predecessor ends
     */
    public ConcatenatedTextBlock(List<AtomicTextBlock> atomicTextBlocks) {

        this.atomicTextBlocks = new LinkedList<>();
        if (atomicTextBlocks.isEmpty()) {
            boundary = new Boundary(-1, -1);
            return;
        }
        var firstTextBlock = atomicTextBlocks.get(0);
        this.atomicTextBlocks.add(firstTextBlock);
        boundary = new Boundary(firstTextBlock.getBoundary().start(), firstTextBlock.getBoundary().end());

        atomicTextBlocks.subList(1, atomicTextBlocks.size()).forEach(this::concat);
    }


    /**
     * Appends all atomic text blocks of {@code textBlock} to this block and extends the boundary.
     * <p>
     * A {@code textBlock} without atomic text blocks (for example {@link #empty()}) contributes
     * nothing and leaves this block untouched.
     *
     * @param textBlock the block to append; unless this block is still empty, it must start exactly
     *                  where this block ends
     * @return this instance, to allow chaining
     * @throws UnsupportedOperationException if {@code textBlock} is not consecutive to this block
     */
    public ConcatenatedTextBlock concat(TextBlock textBlock) {

        List<AtomicTextBlock> blocksToAppend = textBlock.getAtomicTextBlocks();
        if (blocksToAppend.isEmpty()) {
            // An empty block carries the placeholder boundary (-1, -1), which can never be
            // "consecutive"; treating it as the identity element keeps merging partial results safe.
            return this;
        }

        int start = textBlock.getBoundary().start();
        int end = textBlock.getBoundary().end();

        if (this.atomicTextBlocks.isEmpty()) {
            boundary.setStart(start);
            boundary.setEnd(end);
        } else if (boundary.end() != start) {
            throw new UnsupportedOperationException(format("Can only concat consecutive TextBlocks, trying to concat %s and %s", boundary, textBlock.getBoundary()));
        }
        this.atomicTextBlocks.addAll(blocksToAppend);
        boundary.setEnd(end);
        // The cached text no longer matches the parts; rebuild it on the next getSearchText().
        this.searchText = null;
        return this;
    }


    /**
     * Finds an atomic text block whose boundary contains the given string index.
     *
     * @throws IndexOutOfBoundsException if no atomic text block contains the index
     */
    private AtomicTextBlock getAtomicTextBlockByStringIndex(int stringIdx) {

        return atomicTextBlocks.stream().filter(textBlock -> textBlock.getBoundary().contains(stringIdx)).findAny().orElseThrow(IndexOutOfBoundsException::new);
    }


    /**
     * @return all atomic text blocks whose boundary intersects the given boundary, in list order
     */
    private List<AtomicTextBlock> getAllAtomicTextBlocksPartiallyInStringBoundary(Boundary boundary) {

        return atomicTextBlocks.stream().filter(tb -> tb.getBoundary().intersects(boundary)).toList();
    }


    /**
     * @return the search texts of all atomic text blocks joined without separator; the result is
     *         cached until the next {@link #concat(TextBlock)}
     */
    @Override
    public String getSearchText() {

        if (searchText == null) {
            StringBuilder sb = new StringBuilder();
            getAtomicTextBlocks().forEach(atb -> sb.append(atb.getSearchText()));
            searchText = sb.toString();
        }
        return searchText;
    }


    /**
     * @return the total number of line breaks reported by the atomic text blocks
     */
    @Override
    public int numberOfLines() {

        return atomicTextBlocks.stream().map(AtomicTextBlock::getLineBreaks).mapToInt(List::size).sum();
    }


    /**
     * Delegates to the atomic text block that contains {@code fromIndex}.
     *
     * @throws IndexOutOfBoundsException if no atomic text block contains {@code fromIndex}
     */
    @Override
    public int getNextLinebreak(int fromIndex) {

        return getAtomicTextBlockByStringIndex(fromIndex).getNextLinebreak(fromIndex);
    }


    /**
     * Delegates to the atomic text block that contains {@code fromIndex}.
     *
     * @throws IndexOutOfBoundsException if no atomic text block contains {@code fromIndex}
     */
    @Override
    public int getPreviousLinebreak(int fromIndex) {

        return getAtomicTextBlockByStringIndex(fromIndex).getPreviousLinebreak(fromIndex);
    }


    /**
     * @return the line breaks of all atomic text blocks, concatenated in block order exactly as
     *         each block reports them
     */
    @Override
    public List<Integer> getLineBreaks() {

        return getAtomicTextBlocks().stream().flatMap(atomicTextBlock -> atomicTextBlock.getLineBreaks().stream()).toList();
    }


    /**
     * Delegates to the atomic text block that contains {@code stringIdx}.
     *
     * @throws IndexOutOfBoundsException if no atomic text block contains {@code stringIdx}
     */
    @Override
    public Rectangle2D getPosition(int stringIdx) {

        return getAtomicTextBlockByStringIndex(stringIdx).getPosition(stringIdx);
    }


    /**
     * Collects the positions for {@code stringBoundary} across all atomic text blocks it touches.
     * The first and last touched block are queried with the clipped part of the boundary, the
     * blocks in between contribute all of their positions.
     *
     * @throws IndexOutOfBoundsException if no atomic text block intersects {@code stringBoundary}
     */
    @Override
    public List<Rectangle2D> getPositions(Boundary stringBoundary) {

        List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringBoundary);

        if (textBlocks.isEmpty()) {
            // Fail with a descriptive message instead of the bare exception from List.get(0).
            throw new IndexOutOfBoundsException(format("%s is out of bounds for %s", stringBoundary, this.boundary));
        }

        if (textBlocks.size() == 1) {
            return textBlocks.get(0).getPositions(stringBoundary);
        }

        AtomicTextBlock firstTextBlock = textBlocks.get(0);
        List<Rectangle2D> positions = new LinkedList<>(firstTextBlock.getPositions(new Boundary(stringBoundary.start(), firstTextBlock.getBoundary().end())));

        for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
            positions.addAll(textBlock.getPositions());
        }

        var lastTextBlock = textBlocks.get(textBlocks.size() - 1);
        positions.addAll(lastTextBlock.getPositions(new Boundary(lastTextBlock.getBoundary().start(), stringBoundary.end())));

        return positions;
    }


    /**
     * Same traversal as {@link #getPositions(Boundary)}, but the rectangles are grouped by page and
     * the per-block results are merged page by page.
     *
     * @throws IndexOutOfBoundsException if no atomic text block intersects {@code stringBoundary}
     */
    @Override
    public Map<Page, List<Rectangle2D>> getPositionsPerPage(Boundary stringBoundary) {

        List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringBoundary);

        if (textBlocks.isEmpty()) {
            // Fail with a descriptive message instead of the bare exception from List.get(0).
            throw new IndexOutOfBoundsException(format("%s is out of bounds for %s", stringBoundary, this.boundary));
        }

        if (textBlocks.size() == 1) {
            return textBlocks.get(0).getPositionsPerPage(stringBoundary);
        }

        AtomicTextBlock firstTextBlock = textBlocks.get(0);
        Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = firstTextBlock.getPositionsPerPage(new Boundary(stringBoundary.start(), firstTextBlock.getBoundary().end()));

        for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
            rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage, textBlock.getPositionsPerPage(textBlock.getBoundary()));
        }

        AtomicTextBlock lastTextBlock = textBlocks.get(textBlocks.size() - 1);
        rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage,
                lastTextBlock.getPositionsPerPage(new Boundary(lastTextBlock.getBoundary().start(), stringBoundary.end())));

        return rectanglesPerLinePerPage;
    }


    /**
     * Merges two page-to-rectangles maps into a new map; lists of pages present in both maps are
     * concatenated ({@code map1} entries first). Neither input map is modified.
     */
    private Map<Page, List<Rectangle2D>> mergeEntityPositionsWithSamePageNode(Map<Page, List<Rectangle2D>> map1, Map<Page, List<Rectangle2D>> map2) {

        Map<Page, List<Rectangle2D>> mergedMap = new HashMap<>(map1);
        map2.forEach((pageNode, rectangles) -> mergedMap.merge(pageNode, rectangles, (l1, l2) -> Stream.concat(l1.stream(), l2.stream()).toList()));
        return mergedMap;
    }


    @Override
    public String toString() {

        return getSearchText();
    }


    /**
     * @return the bold text boundaries of all atomic text blocks, in block order
     */
    @Override
    public List<Boundary> getBoldTextBoundaries() {

        return getAtomicTextBlocks().stream().map(AtomicTextBlock::getBoldTextBoundaries).flatMap(Collection::stream).toList();
    }


    /**
     * @return the italic text boundaries of all atomic text blocks, in block order
     */
    @Override
    public List<Boundary> getItalicTextBoundaries() {

        return getAtomicTextBlocks().stream().map(AtomicTextBlock::getItalicTextBoundaries).flatMap(Collection::stream).toList();
    }


    /**
     * @return the orientation of the first atomic text block, or {@code ""} if this block is empty
     */
    @Override
    public String getOrientation() {

        if (atomicTextBlocks.isEmpty()) {
            return "";
        }
        return atomicTextBlocks.get(0).getOrientation();
    }


    /**
     * @return the text direction of the first atomic text block, or {@code 0} if this block is empty
     */
    @Override
    public int getTextDirection() {

        if (atomicTextBlocks.isEmpty()) {
            return 0;
        }
        return atomicTextBlocks.get(0).getTextDirection();
    }

}

View File

@ -0,0 +1,148 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock;
import static java.lang.String.format;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
/**
 * A piece of document text together with its boundary, line breaks and positions.
 * <p>
 * The default methods treat string indices as offsets shifted by {@code getBoundary().start()}:
 * that value is subtracted from every incoming index before the search text is accessed and added
 * back to every index that is returned. This also applies to the {@link CharSequence} methods
 * {@link #charAt(int)} and {@link #subSequence(int, int)}.
 */
public interface TextBlock extends CharSequence {

    String getSearchText();


    List<AtomicTextBlock> getAtomicTextBlocks();


    List<Boundary> getBoldTextBoundaries();


    List<Boundary> getItalicTextBoundaries();


    String getOrientation();


    int getTextDirection();


    Boundary getBoundary();


    int getNextLinebreak(int fromIndex);


    int getPreviousLinebreak(int fromIndex);


    List<Integer> getLineBreaks();


    Rectangle2D getPosition(int stringIdx);


    List<Rectangle2D> getPositions(Boundary stringBoundary);


    Map<Page, List<Rectangle2D>> getPositionsPerPage(Boundary stringBoundary);


    int numberOfLines();


    /**
     * Searches the whole block, i.e. starts at the boundary start.
     *
     * @return the index of the first occurrence (shifted by the boundary start), or {@code -1}
     */
    default int indexOf(String searchTerm) {

        int blockStart = getBoundary().start();
        return indexOf(searchTerm, blockStart);
    }


    /**
     * @return an unmodifiable set with the page of every atomic text block
     */
    default Set<Page> getPages() {

        return getAtomicTextBlocks().stream()
                .map(atomic -> atomic.getPage())
                .collect(Collectors.toUnmodifiableSet());
    }


    /**
     * @return an unmodifiable set with the page of every atomic text block whose boundary
     *         intersects {@code boundary}
     */
    default Set<Page> getPages(Boundary boundary) {

        return getAtomicTextBlocks().stream()
                .filter(atomic -> atomic.getBoundary().intersects(boundary))
                .map(atomic -> atomic.getPage())
                .collect(Collectors.toUnmodifiableSet());
    }


    /**
     * Searches the search text for {@code searchTerm}, beginning at {@code startOffset}.
     *
     * @param startOffset the index to start at; the boundary start is subtracted before searching
     * @return the index of the first occurrence with the boundary start added back, or {@code -1}
     */
    default int indexOf(String searchTerm, int startOffset) {

        int relativeHit = getSearchText().indexOf(searchTerm, startOffset - getBoundary().start());
        return relativeHit == -1 ? -1 : relativeHit + getBoundary().start();
    }


    /**
     * @return the text from the boundary start up to the next line break after it
     */
    default CharSequence getFirstLine() {

        int lineEnd = getNextLinebreak(getBoundary().start());
        return subSequence(getBoundary().start(), lineEnd);
    }


    /**
     * @return whether this block's boundary contains {@code boundary}
     * @throws IllegalArgumentException if the end of {@code boundary} lies before its start
     */
    default boolean containsBoundary(Boundary boundary) {

        boolean inverted = boundary.start() > boundary.end();
        if (inverted) {
            throw new IllegalArgumentException(format("Invalid %s, StartIndex must be smaller than EndIndex", boundary));
        }
        return getBoundary().contains(boundary);
    }


    /**
     * @return whether this block's boundary contains {@code stringIndex}
     */
    default boolean containsIndex(int stringIndex) {

        return getBoundary().contains(stringIndex);
    }


    /**
     * Convenience overload of {@link #subSequence(int, int)} taking both indices from a boundary.
     */
    default CharSequence subSequence(Boundary boundary) {

        int from = boundary.start();
        int to = boundary.end();
        return subSequence(from, to);
    }


    /**
     * @return at most the first four space-separated words of the search text, joined by a space
     */
    default String buildSummary() {

        List<String> words = Arrays.asList(getSearchText().split(" "));
        int wordCount = Math.min(words.size(), 4);
        return String.join(" ", new ArrayList<>(words.subList(0, wordCount)));
    }


    /**
     * Both indices have the boundary start subtracted before the search text is cut.
     */
    @Override
    default CharSequence subSequence(int start, int end) {

        String text = getSearchText();
        int relativeStart = start - getBoundary().start();
        int relativeEnd = end - getBoundary().start();
        return text.substring(relativeStart, relativeEnd);
    }


    /**
     * @return the length of the boundary
     */
    @Override
    default int length() {

        return getBoundary().length();
    }


    /**
     * The index has the boundary start subtracted before the search text is accessed.
     */
    @Override
    default char charAt(int index) {

        String text = getSearchText();
        return text.charAt(index - getBoundary().start());
    }

}

View File

@ -0,0 +1,49 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock;
import java.util.Set;
import java.util.function.BiConsumer;
import java.util.function.BinaryOperator;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collector;
import lombok.NoArgsConstructor;
/**
 * {@link Collector} that concatenates the {@link TextBlock}s of a stream into a single
 * {@link ConcatenatedTextBlock}.
 * <p>
 * The result container is a {@link ConcatenatedTextBlock}; elements are appended with
 * {@link ConcatenatedTextBlock#concat(TextBlock)}, which requires them to arrive in consecutive
 * order. The collector therefore declares neither {@code CONCURRENT} nor {@code UNORDERED}.
 */
@NoArgsConstructor
public class TextBlockCollector implements Collector<TextBlock, ConcatenatedTextBlock, TextBlock> {

    /**
     * @return a supplier creating a fresh, empty result container
     */
    @Override
    public Supplier<ConcatenatedTextBlock> supplier() {

        return ConcatenatedTextBlock::empty;
    }


    /**
     * @return a function appending one text block to the result container
     */
    @Override
    public BiConsumer<ConcatenatedTextBlock, TextBlock> accumulator() {

        return ConcatenatedTextBlock::concat;
    }


    /**
     * @return a function appending the right partial result to the left one
     */
    @Override
    public BinaryOperator<ConcatenatedTextBlock> combiner() {

        return ConcatenatedTextBlock::concat;
    }


    /**
     * @return the identity function; the container itself is the result
     */
    @Override
    public Function<ConcatenatedTextBlock, TextBlock> finisher() {

        return a -> a;
    }


    /**
     * Only {@code IDENTITY_FINISH} is declared. {@code CONCURRENT} must not be advertised: it
     * would allow the stream library to call the accumulator from several threads on one shared
     * container, and {@link ConcatenatedTextBlock#concat(TextBlock)} mutates its list and boundary
     * without any synchronization.
     */
    @Override
    public Set<Characteristics> characteristics() {

        return Set.of(Characteristics.IDENTITY_FINISH);
    }

}

View File

@ -2,7 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model.image;
import java.awt.geom.Rectangle2D; import java.awt.geom.Rectangle2D;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.ImageType; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import lombok.Data; import lombok.Data;
@ -10,8 +10,8 @@ import lombok.NonNull;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
@Data @Data
@AllArgsConstructor
@RequiredArgsConstructor @RequiredArgsConstructor
@AllArgsConstructor
public class ClassifiedImage { public class ClassifiedImage {
@NonNull @NonNull
@ -20,18 +20,11 @@ public class ClassifiedImage {
private ImageType imageType; private ImageType imageType;
private boolean sourceByAi; private boolean sourceByAi;
private boolean isAppendedToSection; private boolean isAppendedToSection;
@NonNull
private boolean hasTransparency; private boolean hasTransparency;
@NonNull
private int page; private int page;
@NonNull
private String representation; private String representation;
public ClassifiedImage(@NonNull Rectangle2D position, @NonNull ImageType imageType, boolean hasTransparency, int page, String representation) {
this.position = position;
this.imageType = imageType;
this.hasTransparency = hasTransparency;
this.page = page;
this.representation = representation;
}
} }

View File

@ -1,6 +1,5 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline; package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D; import java.awt.geom.Point2D;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
@ -27,9 +26,6 @@ import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocume
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem; import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
@ -86,20 +82,11 @@ public class OutlineExtractorService {
String title = item.getTitle(); String title = item.getTitle();
PDPage page; PDPage page = item.findDestinationPage(document);
try { if (page == null) {
// Can throw: "Error: can't convert to Destination COSArray" for some OCR'd PDFs
page = item.findDestinationPage(document);
if (page == null) {
return Optional.empty();
}
} catch (IOException e) {
log.info(String.format("Error occurred during position resolution for outline item with title %s: " + e, title));
return Optional.empty(); return Optional.empty();
} }
int pageNumber = document.getPages().indexOf(page);
int pageNumber = document.getPages().indexOf(page) + 1;
AffineTransform userSpaceToPageCoords = CoordinateTransforms.calculateInitialUserSpaceCoordsToPageCoords(PageInformation.fromPDPage(pageNumber, page));
Optional<Point2D> outlinePosition = Optional.empty(); Optional<Point2D> outlinePosition = Optional.empty();
@ -128,15 +115,8 @@ public class OutlineExtractorService {
log.info(String.format("Error occurred during position resolution for outline item on page %s with title %s: " + e, pageNumber, title)); log.info(String.format("Error occurred during position resolution for outline item on page %s with title %s: " + e, pageNumber, title));
} }
return Optional.of(new OutlineObjectTreeNode(new OutlineObject(title, return Optional.of(new OutlineObjectTreeNode(new OutlineObject(title, pageNumber, outlinePosition.orElse(new Point2D.Float(0, 0)), depth)));
pageNumber,
transformPointToPageCoords(outlinePosition, userSpaceToPageCoords), depth)));
}
private static Point2D transformPointToPageCoords(Optional<Point2D> outlinePosition, AffineTransform userSpaceToPageCoords) {
return outlinePosition.map(point -> userSpaceToPageCoords.transform(point, null)).orElse(null);
} }

View File

@ -1,34 +1,27 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline; package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.awt.geom.Point2D; import java.awt.geom.Point2D;
import java.util.Optional;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.Getter; import lombok.RequiredArgsConstructor;
import lombok.Setter;
@Data
@RequiredArgsConstructor
@AllArgsConstructor
public class OutlineObject { public class OutlineObject {
@Getter
private final String title; private final String title;
@Getter
private final int pageNumber; private final int pageNumber;
@Getter private Point2D point;
private final int treeDepth; private final int treeDepth;
private Point2D point; // java coordinates, (0, 0) is always top left
@Getter
@Setter
private boolean found; private boolean found;
public OutlineObject(String title, int pageNumber, Point2D point2D, int depth) { public OutlineObject(String title, int pageNumber, Point2D point2D, int depth) {
this.title = title; this(title, pageNumber, depth);
this.pageNumber = pageNumber;
this.treeDepth = depth;
this.point = point2D; this.point = point2D;
} }
@ -39,39 +32,4 @@ public class OutlineObject {
return "OutlineObject{" + "title='" + title + '\'' + '}'; return "OutlineObject{" + "title='" + title + '\'' + '}';
} }
public Optional<Point2D> getPoint() {
return Optional.ofNullable(point);
}
public boolean isAbove(BoundingBox boundingBox) {
if (point == null) {
return true;
}
return point.getY() <= boundingBox.getMaxY();
}
public double distance(BoundingBox boundingBox) {
if (point == null) {
return 0;
}
if (boundingBox.getBBox().contains(point)) {
return 0;
}
double deltaX = Math.min(Math.abs(boundingBox.getMinX() - point.getX()), Math.abs(boundingBox.getMaxX() - point.getX()));
double deltaY = Math.min(Math.abs(boundingBox.getMinY() - point.getY()), Math.abs(boundingBox.getMaxY() - point.getY()));
return Math.sqrt(deltaX * deltaX + deltaY * deltaY);
}
public void resetPoint() {
this.point = null;
}
} }

View File

@ -39,28 +39,4 @@ public class OutlineObjectTree {
} }
} }
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append("OutlineObjectTree(\n");
for (OutlineObjectTreeNode node : rootNodes) {
buildString(node, sb, 1);
}
sb.append(")");
return sb.toString();
}
private void buildString(OutlineObjectTreeNode node, StringBuilder sb, int depth) {
for (int i = 0; i < depth; i++) {
sb.append(" ");
}
sb.append(node.getOutlineObject().getTitle()).append("\n");
for (OutlineObjectTreeNode child : node.getChildren()) {
buildString(child, sb, depth + 1);
}
}
} }

View File

@ -0,0 +1,61 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.getHeadlineNumber;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import lombok.extern.slf4j.Slf4j;
/**
 * Turns a flat, ordered list of headline blocks into a {@link TableOfContents} tree, using the
 * number returned by {@code getHeadlineNumber} for each headline's classification as its depth.
 */
@Service
@Slf4j
public class OutlineValidationService {

    /**
     * Builds the table of contents.
     * <p>
     * For every headline the deepest already seen depth that is smaller than the headline's own
     * depth is looked up. If there is none, the headline starts a new main section and the depth
     * bookkeeping starts over. Otherwise the headline is attached to the most recent item of the
     * chosen parent depth, where the choice is corrected using the previously processed item.
     *
     * @param headlines the headline blocks in document order
     * @return the table of contents holding the main sections
     */
    public TableOfContents createToC(List<TextPageBlock> headlines) {

        List<TableOfContentItem> roots = new ArrayList<>();
        Map<Integer, TableOfContentItem> latestItemByDepth = new HashMap<>();
        TreeSet<Integer> seenDepths = new TreeSet<>();
        TableOfContentItem previous = null;

        for (TextPageBlock headline : headlines) {
            int depth = getHeadlineNumber(headline.getClassification());
            Integer parentDepth = seenDepths.floor(depth - 1);
            TableOfContentItem item = new TableOfContentItem(headline);

            if (parentDepth != null) {
                assert previous != null;
                int previousDepth = getHeadlineNumber(previous.getHeadline().getClassification());

                if (previousDepth < parentDepth) {
                    // The previous item is shallower than the candidate parent: attach below it.
                    parentDepth = previousDepth;
                } else if (previousDepth == depth && previous.getParent() != null) {
                    // Same depth as the previous item: become its sibling by sharing its parent.
                    parentDepth = getHeadlineNumber(previous.getParent().getHeadline().getClassification());
                }

                TableOfContentItem parent = latestItemByDepth.get(parentDepth);
                parent.addChild(item);
            } else {
                // Nothing shallower has been seen since the last reset: new main section.
                roots.add(item);
                latestItemByDepth = new HashMap<>();
                seenDepths = new TreeSet<>();
            }

            previous = item;
            latestItemByDepth.put(depth, item);
            seenDepths.add(depth);
        }

        return new TableOfContents(roots);
    }

}

View File

@ -1,82 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.TABLE_OF_CONTENTS_HEADLINE;
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.getHeadlineNumber;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import io.micrometer.observation.annotation.Observed;
import lombok.extern.slf4j.Slf4j;
/**
 * Builds a {@link SectionTree} from the headline blocks of a {@link ClassificationDocument}, using
 * the number returned by {@code getHeadlineNumber} for each headline's classification as its depth.
 */
@Service
@Slf4j
public class SectionTreeBuilderService {

    /**
     * Builds the section tree.
     * <p>
     * For every headline the deepest already seen depth that is smaller than the headline's own
     * depth is looked up. If there is none, the headline starts a new main section and the depth
     * bookkeeping starts over. Otherwise the headline is attached to the most recent entry of the
     * chosen parent depth, where the choice is corrected using the previously processed entry.
     *
     * @param classificationDocument the document whose pages are scanned for headlines
     * @return the section tree holding the main sections
     */
    @Observed(name = "OutlineValidationService", contextualName = "create-toc")
    public SectionTree createSectionTree(ClassificationDocument classificationDocument) {

        List<TextPageBlock> headlines = extractHeadlines(classificationDocument);

        List<SectionTreeEntry> roots = new ArrayList<>();
        Map<Integer, SectionTreeEntry> latestEntryByDepth = new HashMap<>();
        TreeSet<Integer> seenDepths = new TreeSet<>();
        SectionTreeEntry previous = null;

        for (TextPageBlock headline : headlines) {
            int depth = getHeadlineNumber(headline.getClassification());
            Integer parentDepth = seenDepths.floor(depth - 1);
            SectionTreeEntry entry = new SectionTreeEntry(headline);

            if (parentDepth != null) {
                assert previous != null;
                int previousDepth = getHeadlineNumber(previous.getHeadline().getClassification());
                boolean previousIsTocHeadline = previous.getHeadline().getClassification().equals(TABLE_OF_CONTENTS_HEADLINE);

                if (previousIsTocHeadline && !headline.getClassification().equals(TABLE_OF_CONTENTS_HEADLINE)) {
                    // headline after toc should always start a main section
                    parentDepth = 1;
                } else if (previousDepth < parentDepth) {
                    // The previous entry is shallower than the candidate parent: attach below it.
                    parentDepth = previousDepth;
                } else if (previousDepth == depth && previous.getParent() != null) {
                    // Same depth as the previous entry: become its sibling by sharing its parent.
                    parentDepth = getHeadlineNumber(previous.getParent().getHeadline().getClassification());
                }

                SectionTreeEntry parent = latestEntryByDepth.get(parentDepth);
                parent.addChild(entry);
            } else {
                // Nothing shallower has been seen since the last reset: new main section.
                roots.add(entry);
                latestEntryByDepth = new HashMap<>();
                seenDepths = new TreeSet<>();
            }

            previous = entry;
            latestEntryByDepth.put(depth, entry);
            seenDepths.add(depth);
        }

        return new SectionTree(roots);
    }


    /**
     * Collects, page by page, every text block that is a {@link TextPageBlock} with a non-null
     * classification for which {@code isHeadline()} is true.
     */
    private static List<TextPageBlock> extractHeadlines(ClassificationDocument classificationDocument) {

        return classificationDocument.getPages()
                .stream()
                .flatMap(page -> page.getTextBlocks().stream())
                .filter(block -> block instanceof TextPageBlock)
                .filter(block -> block.getClassification() != null)
                .filter(block -> block.getClassification().isHeadline())
                .map(block -> (TextPageBlock) block)
                .toList();
    }

}

View File

@ -1,7 +1,6 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline; package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections; import java.util.Collections;
import java.util.Iterator; import java.util.Iterator;
import java.util.List; import java.util.List;
@ -23,28 +22,28 @@ import lombok.extern.slf4j.Slf4j;
@Slf4j @Slf4j
@Service @Service
public class SectionTreeEnhancementService { public class TOCEnrichmentService {
public void assignSectionBlocksAndImages(ClassificationDocument document) { public void assignSectionBlocksAndImages(ClassificationDocument document) {
SectionTree toc = document.getSectionTree(); TableOfContents toc = document.getTableOfContents();
Iterator<SectionTreeEntry> iterator = toc.iterator(); Iterator<TableOfContentItem> iterator = toc.iterator();
SectionTreeEntry currentTOCItem = null; TableOfContentItem currentTOCItem = null;
if (iterator.hasNext()) { if (iterator.hasNext()) {
currentTOCItem = iterator.next(); currentTOCItem = iterator.next();
} }
List<AbstractPageBlock> startBlocks = new ArrayList<>(); List<AbstractPageBlock> startBlocks = new ArrayList<>();
List<ClassifiedImage> startImages = new ArrayList<>(); List<ClassifiedImage> startImages = new ArrayList<>();
SectionTreeEntry currentSection = null; TableOfContentItem currentSection = null;
boolean foundFirstHeadline = false; boolean foundFirstHeadline = false;
List<ClassificationHeader> headers = new ArrayList<>(); List<ClassificationHeader> headers = new ArrayList<>();
List<ClassificationFooter> footers = new ArrayList<>(); List<ClassificationFooter> footers = new ArrayList<>();
TablePageBlock previousTable = null; TablePageBlock previousTable = null;
List<SectionTreeEntry> lastFoundTOCItems = new ArrayList<>(); List<TableOfContentItem> lastFoundTOCItems = new ArrayList<>();
for (ClassificationPage page : document.getPages()) { for (ClassificationPage page : document.getPages()) {
List<SectionTreeEntry> currentPageTOCItems = new ArrayList<>(); List<TableOfContentItem> currentPageTOCItems = new ArrayList<>();
List<TextPageBlock> header = new ArrayList<>(); List<TextPageBlock> header = new ArrayList<>();
List<TextPageBlock> footer = new ArrayList<>(); List<TextPageBlock> footer = new ArrayList<>();
for (AbstractPageBlock current : page.getTextBlocks()) { for (AbstractPageBlock current : page.getTextBlocks()) {
@ -101,7 +100,7 @@ public class SectionTreeEnhancementService {
Double xMax = null; Double xMax = null;
Double yMax = null; Double yMax = null;
for (SectionTreeEntry tocItem : lastFoundTOCItems) { for (TableOfContentItem tocItem : lastFoundTOCItems) {
var headline = tocItem.getHeadline(); var headline = tocItem.getHeadline();
if (headline.getPage() != page.getPageNumber()) { if (headline.getPage() != page.getPageNumber()) {
@ -168,11 +167,11 @@ public class SectionTreeEnhancementService {
} }
} }
if (!startBlocks.isEmpty() || !startImages.isEmpty()) { if (!startBlocks.isEmpty()) {
SectionTreeEntry unassigned = new SectionTreeEntry(null); TableOfContentItem unassigned = new TableOfContentItem(null);
unassigned.setSectionBlocks(startBlocks); unassigned.setSectionBlocks(startBlocks);
unassigned.setImages(startImages); unassigned.setImages(startImages);
document.getSectionTree().getMainSections().add(0, unassigned); document.getTableOfContents().getMainSections().add(0, unassigned);
} }
document.setHeaders(headers); document.setHeaders(headers);
document.setFooters(footers); document.setFooters(footers);
@ -186,8 +185,12 @@ public class SectionTreeEnhancementService {
List<Cell> previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable); List<Cell> previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable);
List<Cell> tableNonHeaderRow = getRowWithNonHeaderCells(currentTable); List<Cell> tableNonHeaderRow = getRowWithNonHeaderCells(currentTable);
// Allow merging of tables if header row is separated from first logical non-header row // Allow merging of tables if header row is separated from first logical non-header row
if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1 && previousTable.getRows().get(0).size() == tableNonHeaderRow.size()) { if (previousTableNonHeaderRow.isEmpty()
previousTableNonHeaderRow = previousTable.getRows().get(0) && previousTable.getRowCount() == 1
&& previousTable.getRows()
.get(0).size() == tableNonHeaderRow.size()) {
previousTableNonHeaderRow = previousTable.getRows()
.get(0)
.stream() .stream()
.map(cell -> { .map(cell -> {
Cell fakeCell = Cell.copy(cell); Cell fakeCell = Cell.copy(cell);
@ -198,7 +201,8 @@ public class SectionTreeEnhancementService {
} }
if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) { if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) {
for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
List<Cell> row = currentTable.getRows().get(i); List<Cell> row = currentTable.getRows()
.get(i);
if (row.size() == tableNonHeaderRow.size() && row.stream() if (row.size() == tableNonHeaderRow.size() && row.stream()
.allMatch(cell -> cell.getHeaderCells().isEmpty())) { .allMatch(cell -> cell.getHeaderCells().isEmpty())) {
for (int j = 0; j < row.size(); j++) { for (int j = 0; j < row.size(); j++) {
@ -221,15 +225,18 @@ public class SectionTreeEnhancementService {
return table.getRows() return table.getRows()
.stream() .stream()
.flatMap(Collection::stream) .flatMap(row -> row.stream()
.allMatch(cell -> cell.getHeaderCells().isEmpty()); .filter(cell -> !cell.getHeaderCells().isEmpty()))
.findAny().isEmpty();
} }
private List<Cell> getRowWithNonHeaderCells(TablePageBlock table) { private List<Cell> getRowWithNonHeaderCells(TablePageBlock table) {
for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
List<Cell> row = table.getRows().get(i); List<Cell> row = table.getRows()
.get(i);
if (row.size() == 1) { if (row.size() == 1) {
continue; continue;
} }

View File

@ -2,12 +2,10 @@ package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.GenericSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.AbstractSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
@ -16,68 +14,52 @@ import lombok.EqualsAndHashCode;
@Data @Data
@EqualsAndHashCode(onlyExplicitlyIncluded = true) @EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class SectionTreeEntry { public class TableOfContentItem {
public enum Type {
SECTION,
SUPER_SECTION,
TOC_SECTION
}
@EqualsAndHashCode.Include @EqualsAndHashCode.Include
private TextPageBlock headline; private TextPageBlock headline;
private List<SectionTreeEntry> children = new ArrayList<>(); private List<TableOfContentItem> children = new ArrayList<>();
private SectionTreeEntry parent; private TableOfContentItem parent;
private List<AbstractPageBlock> sectionBlocks = new ArrayList<>(); private List<AbstractPageBlock> sectionBlocks = new ArrayList<>();
private List<ClassifiedImage> images = new ArrayList<>(); private List<ClassifiedImage> images = new ArrayList<>();
private GenericSemanticNode section; private AbstractSemanticNode section;
public SectionTreeEntry(TextPageBlock headline) { public TableOfContentItem(TextPageBlock headline) {
this.headline = headline; this.headline = headline;
} }
public Type getType() { public void addChild(TableOfContentItem tableOfContentItem) {
if (!Objects.isNull(headline) && headline.getClassification().equals(PageBlockType.TABLE_OF_CONTENTS_HEADLINE)) { children.add(tableOfContentItem);
return Type.TOC_SECTION; tableOfContentItem.setParent(this);
}
if (children.isEmpty()) {
return Type.SECTION;
}
return Type.SUPER_SECTION;
} }
public void addChild(SectionTreeEntry sectionTreeEntry) { public TableOfContentItem getSiblingBefore() {
children.add(sectionTreeEntry);
sectionTreeEntry.setParent(this);
}
public SectionTreeEntry getSiblingBefore() {
if (parent != null) { if (parent != null) {
int index = parent.getChildren().indexOf(this); int index = parent.getChildren().indexOf(this);
if (index > 0) { if (index > 0) {
return parent.getChildren().get(index - 1); return parent.getChildren()
.get(index - 1);
} }
} }
return null; return null;
} }
public SectionTreeEntry getSiblingAfter() { public TableOfContentItem getSiblingAfter() {
if (parent != null) { if (parent != null) {
int index = parent.getChildren().indexOf(this); int index = parent.getChildren().indexOf(this);
if (index >= 0 && index < parent.getChildren().size() - 1) { if (index >= 0 && index < parent.getChildren().size() - 1) {
return parent.getChildren().get(index + 1); return parent.getChildren()
.get(index + 1);
} }
} }
return null; return null;
@ -89,7 +71,7 @@ public class SectionTreeEntry {
if (headline.equals(block)) { if (headline.equals(block)) {
return true; return true;
} }
for (SectionTreeEntry child : children) { for (TableOfContentItem child : children) {
if (child.contains(block)) { if (child.contains(block)) {
return true; return true;
} }
@ -98,12 +80,12 @@ public class SectionTreeEntry {
} }
public boolean contains(SectionTreeEntry tocItem) { public boolean contains(TableOfContentItem tocItem) {
if (this.equals(tocItem)) { if (this.equals(tocItem)) {
return true; return true;
} }
for (SectionTreeEntry child : children) { for (TableOfContentItem child : children) {
if (child.contains(tocItem)) { if (child.contains(tocItem)) {
return true; return true;
} }
@ -111,19 +93,17 @@ public class SectionTreeEntry {
return false; return false;
} }
public List<AbstractPageBlock> getNonEmptySectionBlocks() { public List<AbstractPageBlock> getNonEmptySectionBlocks() {
return sectionBlocks.stream() return sectionBlocks.stream().filter(pageBlock -> !pageBlock.isEmpty()).collect(Collectors.toList());
.filter(pageBlock -> !pageBlock.isEmpty())
.collect(Collectors.toList());
} }
@Override @Override
public String toString() { public String toString() {
return "OutlineObjectTreeNode{" + "textPageBlock=" + headline + '}'; return "OutlineObjectTreeNode{" + "textPageBlock=" + headline + '}';
} }
} }

View File

@ -14,12 +14,12 @@ import lombok.RequiredArgsConstructor;
@Data @Data
@RequiredArgsConstructor @RequiredArgsConstructor
public class SectionTree implements Iterable<SectionTreeEntry> { public class TableOfContents implements Iterable<TableOfContentItem> {
private List<SectionTreeEntry> mainSections = new ArrayList<>(); private List<TableOfContentItem> mainSections = new ArrayList<>();
public SectionTree(List<SectionTreeEntry> mainSections) { public TableOfContents(List<TableOfContentItem> mainSections) {
this.mainSections = mainSections; this.mainSections = mainSections;
} }
@ -28,36 +28,36 @@ public class SectionTree implements Iterable<SectionTreeEntry> {
public List<TextPageBlock> getAllTextPageBlocks() { public List<TextPageBlock> getAllTextPageBlocks() {
List<TextPageBlock> allTextPageBlocks = new ArrayList<>(); List<TextPageBlock> allTextPageBlocks = new ArrayList<>();
for (SectionTreeEntry item : mainSections) { for (TableOfContentItem item : mainSections) {
collectTextPageBlocks(item, allTextPageBlocks); collectTextPageBlocks(item, allTextPageBlocks);
} }
return allTextPageBlocks; return allTextPageBlocks;
} }
private void collectTextPageBlocks(SectionTreeEntry item, List<TextPageBlock> textPageBlocks) { private void collectTextPageBlocks(TableOfContentItem item, List<TextPageBlock> textPageBlocks) {
textPageBlocks.add(item.getHeadline()); textPageBlocks.add(item.getHeadline());
for (SectionTreeEntry child : item.getChildren()) { for (TableOfContentItem child : item.getChildren()) {
collectTextPageBlocks(child, textPageBlocks); collectTextPageBlocks(child, textPageBlocks);
} }
} }
public List<SectionTreeEntry> getAllTableOfContentItems() { public List<TableOfContentItem> getAllTableOfContentItems() {
List<SectionTreeEntry> allItems = new ArrayList<>(); List<TableOfContentItem> allItems = new ArrayList<>();
for (SectionTreeEntry item : mainSections) { for (TableOfContentItem item : mainSections) {
collectTableOfContentItems(item, allItems); collectTableOfContentItems(item, allItems);
} }
return allItems; return allItems;
} }
private void collectTableOfContentItems(SectionTreeEntry item, List<SectionTreeEntry> allItems) { private void collectTableOfContentItems(TableOfContentItem item, List<TableOfContentItem> allItems) {
allItems.add(item); allItems.add(item);
for (SectionTreeEntry child : item.getChildren()) { for (TableOfContentItem child : item.getChildren()) {
collectTableOfContentItems(child, allItems); collectTableOfContentItems(child, allItems);
} }
} }
@ -65,7 +65,7 @@ public class SectionTree implements Iterable<SectionTreeEntry> {
private boolean containsBlock(TextPageBlock block) { private boolean containsBlock(TextPageBlock block) {
for (SectionTreeEntry existingItem : this.getMainSections()) { for (TableOfContentItem existingItem : this.getMainSections()) {
if (existingItem.getHeadline().equals(block) || existingItem.contains(block)) { if (existingItem.getHeadline().equals(block) || existingItem.contains(block)) {
return true; return true;
} }
@ -74,9 +74,9 @@ public class SectionTree implements Iterable<SectionTreeEntry> {
} }
private boolean containsItem(SectionTreeEntry tocItem) { private boolean containsItem(TableOfContentItem tocItem) {
for (SectionTreeEntry existingItem : this.getMainSections()) { for (TableOfContentItem existingItem : this.getMainSections()) {
if (existingItem.equals(tocItem) || existingItem.contains(tocItem)) { if (existingItem.equals(tocItem) || existingItem.contains(tocItem)) {
return true; return true;
} }
@ -86,18 +86,18 @@ public class SectionTree implements Iterable<SectionTreeEntry> {
@Override @Override
public @NonNull Iterator<SectionTreeEntry> iterator() { public @NonNull Iterator<TableOfContentItem> iterator() {
return new SectionTreeEntryIterator(mainSections); return new TableOfContentItemIterator(mainSections);
} }
private static class SectionTreeEntryIterator implements Iterator<SectionTreeEntry> { private static class TableOfContentItemIterator implements Iterator<TableOfContentItem> {
private final Stack<Iterator<SectionTreeEntry>> stack = new Stack<>(); private final Stack<Iterator<TableOfContentItem>> stack = new Stack<>();
SectionTreeEntryIterator(List<SectionTreeEntry> mainSections) { TableOfContentItemIterator(List<TableOfContentItem> mainSections) {
stack.push(mainSections.iterator()); stack.push(mainSections.iterator());
} }
@ -112,10 +112,10 @@ public class SectionTree implements Iterable<SectionTreeEntry> {
@Override @Override
public SectionTreeEntry next() { public TableOfContentItem next() {
ensureStackTopIsCurrent(); ensureStackTopIsCurrent();
SectionTreeEntry currentItem = stack.peek().next(); TableOfContentItem currentItem = stack.peek().next();
if (currentItem.getChildren() != null && !currentItem.getChildren().isEmpty()) { if (currentItem.getChildren() != null && !currentItem.getChildren().isEmpty()) {
stack.push(currentItem.getChildren() stack.push(currentItem.getChildren()
.iterator()); .iterator());

View File

@ -9,7 +9,7 @@ import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities; import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
import lombok.Data; import lombok.Data;
@ -35,14 +35,14 @@ public class Cell extends BoundingBox {
public Cell(Point2D topLeft, Point2D bottomRight) { public Cell(Point2D topLeft, Point2D bottomRight) {
this.bBoxPdf = new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), (bottomRight.getX() - topLeft.getX()), (bottomRight.getY() - topLeft.getY())); this.bBoxInitialUserSpace = new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), (bottomRight.getX() - topLeft.getX()), (bottomRight.getY() - topLeft.getY()));
this.bBox = bBoxPdf; this.bBox = bBoxInitialUserSpace;
} }
public Cell(Rectangle2D bBoxInitialUserSpace, AffineTransform initialUserSpaceToJava) { public Cell(Rectangle2D bBoxInitialUserSpace, AffineTransform initialUserSpaceToJava) {
this.bBoxPdf = bBoxInitialUserSpace; this.bBoxInitialUserSpace = bBoxInitialUserSpace;
this.bBox = initialUserSpaceToJava.createTransformedShape(bBoxInitialUserSpace).getBounds2D(); this.bBox = initialUserSpaceToJava.createTransformedShape(bBoxInitialUserSpace).getBounds2D();
} }
@ -50,7 +50,7 @@ public class Cell extends BoundingBox {
public static Cell copy(Cell cell) { public static Cell copy(Cell cell) {
Cell copy = new Cell(); Cell copy = new Cell();
copy.bBoxPdf = cell.bBoxPdf; copy.bBoxInitialUserSpace = cell.bBoxInitialUserSpace;
copy.bBox = cell.bBox; copy.bBox = cell.bBox;
return copy; return copy;
} }
@ -68,12 +68,12 @@ public class Cell extends BoundingBox {
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
Iterator<TextPageBlock> itty = textBlocks.iterator(); Iterator<TextPageBlock> itty = textBlocks.iterator();
Word previous = null; TextPositionSequence previous = null;
while (itty.hasNext()) { while (itty.hasNext()) {
TextPageBlock textBlock = itty.next(); TextPageBlock textBlock = itty.next();
for (Word word : textBlock.getWords()) { for (TextPositionSequence word : textBlock.getSequences()) {
if (previous != null) { if (previous != null) {
if (Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight()) { if (Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight()) {
sb.append('\n'); sb.append('\n');
@ -87,7 +87,7 @@ public class Cell extends BoundingBox {
} }
return TextNormalizationUtilities.cleanString(sb.toString()); return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString()).replaceAll("\n", " ").replaceAll(" {2}", " ");
} }

View File

@ -70,7 +70,7 @@ public class CleanRulings {
public boolean lineBetween(BoundingBox a, BoundingBox b) { public boolean lineBetween(BoundingBox a, BoundingBox b) {
return lineBetween(a.getBBoxPdf(), b.getBBoxPdf()); return lineBetween(a.getBBoxInitialUserSpace(), b.getBBoxInitialUserSpace());
} }

View File

@ -263,8 +263,8 @@ public class TablePageBlock extends AbstractPageBlock {
cells.stream() cells.stream()
.map(originalCell -> new CellWithIntersection(originalCell, .map(originalCell -> new CellWithIntersection(originalCell,
RectangleTransformations.calculateIntersectedArea(cellFromGridStructure.getBBoxPdf(), RectangleTransformations.calculateIntersectedArea(cellFromGridStructure.getBBoxInitialUserSpace(),
originalCell.getBBoxPdf()))) originalCell.getBBoxInitialUserSpace())))
.filter(cellWithIntersection -> cellWithIntersection.intersectedArea > 0) .filter(cellWithIntersection -> cellWithIntersection.intersectedArea > 0)
.filter(cellWithIntersection -> cellWithIntersection.originalCell.getArea() > cellWithIntersection.intersectedArea * CELL_AREA_CONTAINED_THRESHOLD) .filter(cellWithIntersection -> cellWithIntersection.originalCell.getArea() > cellWithIntersection.intersectedArea * CELL_AREA_CONTAINED_THRESHOLD)
.max(Comparator.comparing(CellWithIntersection::intersectedArea)) .max(Comparator.comparing(CellWithIntersection::intersectedArea))

View File

@ -1,8 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
/**
 * Immutable pair of an {@link AbstractPageBlock} and a {@link ClassificationPage}.
 *
 * <p>NOTE(review): presumably {@code page} is the page on which {@code block} is located —
 * confirm against the callers that construct this record.
 *
 * @param block the page block
 * @param page  the page associated with the block
 */
public record AbstractBlockOnPage(AbstractPageBlock block, ClassificationPage page) {
}

View File

@ -1,9 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text;
/**
 * Font style variants: plain, bold, italic, or bold and italic combined.
 */
public enum FontStyle {
// neither bold nor italic
REGULAR,
BOLD,
ITALIC,
// both bold and italic
BOLD_ITALIC;
}

View File

@ -1,21 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Bundles the five frequency counters kept for a run of text: three float-valued counters
 * (line height, font size, space width) and two string-valued counters (font, style).
 *
 * <p>Lombok's {@code @FieldDefaults(makeFinal = true, level = PRIVATE)} makes every field below
 * {@code private final}; {@code @Getter} generates the accessors. Each counter is created empty
 * when the holder is instantiated and is filled through its own {@code add} methods.
 */
@Getter
@NoArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class FrequencyCounters {
// float-valued counters
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
// string-valued counters
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
}

View File

@ -1,107 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import java.util.List;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
/**
 * A numeric list marker such as {@code "1."} or {@code "(1)"} recognised at the start of a run of
 * words, together with the word it was found on, the page number and the parsed number.
 *
 * <p>Instances are created through {@link #parse(TextPageBlock, int)} or
 * {@link #parse(List, int)}; {@link #isInOrder(List)} checks whether a series of markers forms one
 * consistently formatted, ascending list.
 */
@AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ListIdentifier {

    /**
     * Leading {@code "N. "} marker. Group 1 captures N: one to four digits without a leading zero
     * (1-9999). The marker must be followed by whitespace.
     */
    public static final Pattern NUMBER_WITH_DOT = Pattern.compile("^\\s*([1-9][0-9]{0,3})\\.\\s+");

    /**
     * Leading {@code "(N) "} marker. Group 1 captures N: one to four digits without a leading zero
     * (1-9999). The marker must be followed by whitespace.
     */
    public static final Pattern NUMBER_IN_PARENTHESES = Pattern.compile("^\\s*\\(([1-9][0-9]{0,3})\\)\\s+");

    /** The notation the marker was written in. */
    enum Format {
        NUMBER_WITH_DOT,
        NUMBER_IN_PARENTHESES
    }

    Format format;
    @Getter
    Word word;
    @Getter
    int page;
    int representation;


    /**
     * Tries to recognise a list marker at the start of a text block. Only the first five words
     * (or fewer, if the block is shorter) are inspected.
     *
     * @param textPageBlock the block whose leading words are inspected
     * @param page          the page number to record on the result
     * @return the recognised marker, or empty if the block has no words or does not start with one
     */
    public static Optional<ListIdentifier> parse(TextPageBlock textPageBlock, int page) {

        List<Word> words = textPageBlock.getWords();
        return parse(words.subList(0, Math.min(5, words.size())), page);
    }


    /**
     * Tries to recognise a list marker at the start of the given words. The words are joined with
     * single spaces and matched against {@link #NUMBER_WITH_DOT} first, then
     * {@link #NUMBER_IN_PARENTHESES}. The first word of the list is stored as the marker's word.
     *
     * @param sequences the words to inspect, in reading order
     * @param page      the page number to record on the result
     * @return the recognised marker, or empty if {@code sequences} is empty or no pattern matches
     */
    public static Optional<ListIdentifier> parse(List<Word> sequences, int page) {

        // An empty list has no first word to anchor the marker on; previously this case failed
        // with a StringIndexOutOfBoundsException while trimming the trailing separator.
        if (sequences.isEmpty()) {
            return Optional.empty();
        }

        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < sequences.size(); i++) {
            if (i > 0) {
                sb.append(" ");
            }
            sb.append(sequences.get(i).toString());
        }
        String text = sb.toString();
        Word firstWord = sequences.get(0);

        Optional<ListIdentifier> withDot = match(NUMBER_WITH_DOT, Format.NUMBER_WITH_DOT, text, firstWord, page);
        if (withDot.isPresent()) {
            return withDot;
        }
        return match(NUMBER_IN_PARENTHESES, Format.NUMBER_IN_PARENTHESES, text, firstWord, page);
    }


    /** Applies one marker pattern to the text and builds the identifier from capture group 1. */
    private static Optional<ListIdentifier> match(Pattern pattern, Format format, String text, Word firstWord, int page) {

        Matcher matcher = pattern.matcher(text);
        if (!matcher.find()) {
            return Optional.empty();
        }
        return parseInteger(matcher.group(1)).map(number -> new ListIdentifier(format, firstWord, page, number));
    }


    /** Parses a decimal integer, returning empty instead of throwing on malformed input. */
    private static Optional<Integer> parseInteger(String text) {

        try {
            return Optional.of(Integer.parseInt(text));
        } catch (NumberFormatException e) {
            return Optional.empty();
        }
    }


    /**
     * Checks whether the given markers form one ordered list. Every consecutive pair must
     * <ul>
     * <li>share the same {@link Format},</li>
     * <li>have a strictly increasing number (gaps are allowed),</li>
     * <li>satisfy {@code current.word.intersectsXDirAdj(previous.word, 2)},</li>
     * <li>satisfy {@code current.word.isBelowDirAdj(previous.word)} when both are on the same page, and</li>
     * <li>not go back to an earlier page.</li>
     * </ul>
     *
     * @param listIdentifiers the markers in the order they are expected to appear
     * @return {@code true} if the list has at most one element or every pair passes all checks
     */
    public static boolean isInOrder(List<ListIdentifier> listIdentifiers) {

        if (listIdentifiers.size() <= 1) {
            return true;
        }
        for (int i = 1; i < listIdentifiers.size(); i++) {
            ListIdentifier current = listIdentifiers.get(i);
            ListIdentifier previous = listIdentifiers.get(i - 1);
            if (current.format != previous.format) {
                return false;
            }
            if (current.representation <= previous.representation) {
                return false;
            }
            if (!current.word.intersectsXDirAdj(previous.word, 2)) {
                return false;
            }
            if (current.page == previous.page && !current.word.isBelowDirAdj(previous.word)) {
                return false;
            }
            if (current.page < previous.page) {
                return false;
            }
        }
        return true;
    }

}

View File

@ -5,52 +5,64 @@ import java.awt.geom.Rectangle2D;
import org.apache.pdfbox.text.TextPosition; import org.apache.pdfbox.text.TextPosition;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox; import com.fasterxml.jackson.annotation.JsonIgnore;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import lombok.AccessLevel;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data; import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor; import lombok.NoArgsConstructor;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
@Data @Data
@SuperBuilder @Builder
@NoArgsConstructor @NoArgsConstructor
@AllArgsConstructor @AllArgsConstructor
@EqualsAndHashCode(callSuper = true) public class RedTextPosition extends BoundingBox {
@FieldDefaults(level = AccessLevel.PRIVATE)
public class RedTextPosition extends TextBoundingBox {
public final static int HEIGHT_PADDING = 2; public final static int HEIGHT_PADDING = 2;
String unicode; private Rectangle2D.Float bBoxDirAdj; // adjusted to text rotation
// estimated using the TextMatrix in radians @JsonIgnore
float exactDir; private int rotation;
float widthOfSpace; @JsonIgnore
private float pageHeight;
float fontSizeInPt; @JsonIgnore
private float pageWidth;
String fontName; private String unicode;
@JsonIgnore
private float dir;
// not used in reanalysis
@JsonIgnore
private float widthOfSpace;
// not used in reanalysis
@JsonIgnore
private float fontSizeInPt;
// not used in reanalysis
@JsonIgnore
private String fontName;
@SneakyThrows @SneakyThrows
public static RedTextPosition fromTextPosition(TextPosition textPosition) { public static RedTextPosition fromTextPosition(TextPosition textPosition) {
var pos = new RedTextPosition(); var pos = new RedTextPosition();
pos.setRotation(textPosition.getRotation());
pos.setPageHeight(textPosition.getPageHeight());
pos.setPageWidth(textPosition.getPageWidth());
pos.setUnicode(textPosition.getUnicode()); pos.setUnicode(textPosition.getUnicode());
pos.setDir(textPosition.getDir());
pos.setWidthOfSpace(textPosition.getWidthOfSpace()); pos.setWidthOfSpace(textPosition.getWidthOfSpace());
pos.setFontSizeInPt(textPosition.getFontSizeInPt()); pos.setFontSizeInPt(textPosition.getFontSizeInPt());
pos.setFontName(textPosition.getFont().getName()); pos.setFontName(textPosition.getFont().getName());
pos.setExactDir((float) FastAtan2.fastAtan2(textPosition.getTextMatrix().getShearY(), textPosition.getTextMatrix().getScaleX()));
pos.setDir(TextDirection.fromDegrees(textPosition.getDir()));
//TODO: There is a mismatch in the java coords of the text and the rulings, //TODO: There is a mismatch in the java coords of the text and the rulings,
// I guess if we start with the initial user space positions and transform them the same way we do the rulings it would work. // I guess if we start with the initial user space positions and transform them the same way we do the rulings it would work.
@ -61,18 +73,18 @@ public class RedTextPosition extends TextBoundingBox {
textPosition.getYDirAdj() - textHeight, textPosition.getYDirAdj() - textHeight,
textPosition.getWidthDirAdj(), textPosition.getWidthDirAdj(),
textHeight + HEIGHT_PADDING); textHeight + HEIGHT_PADDING);
pos.setBBoxDirAdj(dirAdjPosition); pos.setBBoxDirAdj(dirAdjPosition);
AffineTransform affineTransform = getRotationMatrix(TextDirection.fromDegrees(textPosition.getDir()), textPosition.getPageWidth(), textPosition.getPageHeight()); AffineTransform affineTransform = getRotationMatrix(TextDirection.fromDegrees(textPosition.getDir()), textPosition.getPageWidth(), textPosition.getPageHeight());
Rectangle2D bBoxInitialUserSpace = affineTransform.createTransformedShape(dirAdjPosition).getBounds2D(); Rectangle2D bBoxInitialUserSpace = affineTransform.createTransformedShape(dirAdjPosition).getBounds2D();
pos.setBBoxPdf(bBoxInitialUserSpace); // These are definitely correct pos.setBBoxInitialUserSpace(bBoxInitialUserSpace); // These are definitely correct
return pos; return pos;
} }
private static AffineTransform getRotationMatrix(TextDirection textDirection, float pageWidth, float pageHeight) { private static AffineTransform getRotationMatrix(TextDirection textDirection, float pageWidth, float pageHeight) {
AffineTransform transform = new AffineTransform(); AffineTransform transform = new AffineTransform();
@ -91,4 +103,32 @@ public class RedTextPosition extends TextBoundingBox {
return transform; return transform;
} }
@JsonIgnore
public float getXDirAdj() {
return this.bBoxDirAdj.x;
}
@JsonIgnore
public float getYDirAdj() {
return this.bBoxDirAdj.y;
}
@JsonIgnore
public float getWidthDirAdj() {
return this.bBoxDirAdj.width;
}
@JsonIgnore
public float getHeightDir() {
return this.bBoxDirAdj.height;
}
} }

View File

@ -2,7 +2,6 @@ package com.knecon.fforesight.service.layoutparser.processor.model.text;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities; import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
import lombok.Getter; import lombok.Getter;
@ -10,18 +9,18 @@ import lombok.Getter;
@Getter @Getter
public class SearchableText { public class SearchableText {
private final List<Word> sequences = new ArrayList<>(); private final List<TextPositionSequence> sequences = new ArrayList<>();
public void add(Word word) { public void add(TextPositionSequence textPositionSequence) {
sequences.add(word); sequences.add(textPositionSequence);
} }
public void addAll(List<Word> words) { public void addAll(List<TextPositionSequence> textPositionSequences) {
sequences.addAll(words); sequences.addAll(textPositionSequences);
} }
@ -32,14 +31,18 @@ public class SearchableText {
} }
public static String buildString(List<Word> sequences) { public static String buildString(List<TextPositionSequence> sequences) {
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
for (Word word : sequences) { for (TextPositionSequence word : sequences) {
sb.append(word); sb.append(word);
sb.append(' '); sb.append(' ');
} }
return TextNormalizationUtilities.cleanString(sb.toString()); String text = sb.toString();
text = TextNormalizationUtilities.removeHyphenLineBreaks(text);
text = TextNormalizationUtilities.removeLineBreaks(text);
text = TextNormalizationUtilities.removeRepeatingWhitespaces(text);
return text;
} }
} }

View File

@ -9,14 +9,10 @@ public class StringFrequencyCounter {
@Getter @Getter
private final Map<String, Integer> countPerValue = new HashMap<>(); private final Map<String, Integer> countPerValue = new HashMap<>();
boolean changed;
String mostPopularCache;
public void add(String value) { public void add(String value) {
changed = true;
if (!countPerValue.containsKey(value)) { if (!countPerValue.containsKey(value)) {
countPerValue.put(value, 1); countPerValue.put(value, 1);
} else { } else {
@ -27,8 +23,6 @@ public class StringFrequencyCounter {
public void addAll(Map<String, Integer> otherCounter) { public void addAll(Map<String, Integer> otherCounter) {
changed = true;
for (Map.Entry<String, Integer> entry : otherCounter.entrySet()) { for (Map.Entry<String, Integer> entry : otherCounter.entrySet()) {
if (countPerValue.containsKey(entry.getKey())) { if (countPerValue.containsKey(entry.getKey())) {
countPerValue.put(entry.getKey(), countPerValue.get(entry.getKey()) + entry.getValue()); countPerValue.put(entry.getKey(), countPerValue.get(entry.getKey()) + entry.getValue());
@ -41,18 +35,13 @@ public class StringFrequencyCounter {
public String getMostPopular() { public String getMostPopular() {
if (changed || mostPopularCache == null) { Map.Entry<String, Integer> mostPopular = null;
Map.Entry<String, Integer> mostPopular = null; for (Map.Entry<String, Integer> entry : countPerValue.entrySet()) {
for (Map.Entry<String, Integer> entry : countPerValue.entrySet()) { if (mostPopular == null || entry.getValue() > mostPopular.getValue()) {
if (mostPopular == null || entry.getValue() >= mostPopular.getValue()) { mostPopular = entry;
mostPopular = entry;
}
} }
mostPopularCache = mostPopular != null ? mostPopular.getKey() : null;
changed = false;
} }
return mostPopular != null ? mostPopular.getKey() : null;
return mostPopularCache;
} }
} }

View File

@ -1,7 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
/**
 * Immutable pair of a {@link TextPageBlock} and a {@link ClassificationPage}.
 *
 * <p>NOTE(review): presumably {@code page} is the page on which {@code textBlock} is located —
 * confirm against the callers that construct this record.
 *
 * @param textBlock the text block
 * @param page      the page associated with the text block
 */
public record TextBlockOnPage(TextPageBlock textBlock, ClassificationPage page) {
}

View File

@ -44,15 +44,4 @@ public enum TextDirection {
throw new IllegalArgumentException(String.format("A value of %f is not supported by TextDirection", degrees)); throw new IllegalArgumentException(String.format("A value of %f is not supported by TextDirection", degrees));
} }
public int getRotation() {
return switch (this) {
case ZERO -> 0;
case QUARTER_CIRCLE -> 1;
case HALF_CIRCLE -> 2;
case THREE_QUARTER_CIRCLE -> 3;
};
}
} }

View File

@ -2,13 +2,12 @@ package com.knecon.fforesight.service.layoutparser.processor.model.text;
import java.awt.geom.Rectangle2D; import java.awt.geom.Rectangle2D;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections;
import java.util.List; import java.util.List;
import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonIgnore;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities; import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
@ -25,59 +24,62 @@ import lombok.NoArgsConstructor;
public class TextPageBlock extends AbstractPageBlock { public class TextPageBlock extends AbstractPageBlock {
@Builder.Default @Builder.Default
private List<Word> words = new ArrayList<>(); private List<TextPositionSequence> sequences = new ArrayList<>();
@Builder.Default
private FrequencyCounters frequencyCounters = new FrequencyCounters();
private Rectangle2D bBoxDirAdj; private String mostPopularWordFont;
private boolean underlined; private String mostPopularWordStyle;
private float mostPopularWordFontSize;
private float mostPopularWordHeight;
private float mostPopularWordSpaceWidth;
private float highestFontSize;
private PageBlockType classification; private PageBlockType classification;
private boolean toDuplicate; private boolean toDuplicate;
private String text;
private boolean changed;
public TextPageBlock(List<TextPositionSequence> sequences) {
public TextPageBlock(List<Word> words) { this.sequences = sequences;
calculateFrequencyCounters();
this.words = new ArrayList<>(words);
this.frequencyCounters = new FrequencyCounters();
if (!words.isEmpty()) {
addToFrequencyCounters(words);
}
calculateBBox(); calculateBBox();
} }
public List<Word> getWords() { @JsonIgnore
public TextDirection getDir() {
return Collections.unmodifiableList(words); return sequences.get(0).getDir();
} }
public TextDirection getDir() { @JsonIgnore
public float getPageHeight() {
return words.get(0).getDir(); return sequences.get(0).getPageHeight();
}
@JsonIgnore
public float getPageWidth() {
return sequences.get(0).getPageWidth();
} }
private void calculateBBox() { private void calculateBBox() {
if (words == null) { if (sequences == null) {
this.bBox = new Rectangle2D.Double(); this.bBox = new Rectangle2D.Double();
this.bBoxPdf = new Rectangle2D.Double(); this.bBoxInitialUserSpace = new Rectangle2D.Double();
this.bBoxDirAdj = new Rectangle2D.Double();
return; return;
} }
this.bBoxDirAdj = words.stream() setToBBoxOfComponents(sequences);
.map(Word::getBBoxDirAdj)
.collect(RectangleTransformations.collectBBox());
setToBBoxOfComponents(words);
} }
@ -99,8 +101,8 @@ public class TextPageBlock extends AbstractPageBlock {
throw new IllegalArgumentException("Cannot merge textBlocks on different pages."); throw new IllegalArgumentException("Cannot merge textBlocks on different pages.");
} }
List<Word> sequences = textBlocksToMerge.stream() List<TextPositionSequence> sequences = textBlocksToMerge.stream()
.map(TextPageBlock::getWords) .map(TextPageBlock::getSequences)
.flatMap(java.util.Collection::stream) .flatMap(java.util.Collection::stream)
.toList(); .toList();
sequences = new ArrayList<>(sequences); sequences = new ArrayList<>(sequences);
@ -109,27 +111,38 @@ public class TextPageBlock extends AbstractPageBlock {
} }
private void addToFrequencyCounters(List<Word> sequences) { private void calculateFrequencyCounters() {
for (Word wordBlock : sequences) { FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
for (TextPositionSequence wordBlock : sequences) {
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
spaceFrequencyCounter.add(wordBlock.getSpaceWidth());
fontFrequencyCounter.add(wordBlock.getFont());
styleFrequencyCounter.add(wordBlock.getFontStyle());
frequencyCounters.getLineHeightFrequencyCounter().add(wordBlock.getTextHeight());
frequencyCounters.getFontSizeFrequencyCounter().add(wordBlock.getFontSize());
frequencyCounters.getSpaceFrequencyCounter().add(wordBlock.getSpaceWidth());
frequencyCounters.getFontFrequencyCounter().add(wordBlock.getFont());
frequencyCounters.getStyleFrequencyCounter().add(wordBlock.getFontStyle());
} }
setUnderlined(this.words.stream() setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
.allMatch(Word::isUnderline)); setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
setHighestFontSize(fontSizeFrequencyCounter.getHighest());
} }
public TextPageBlock union(Word r) { public TextPageBlock union(TextPositionSequence r) {
TextPageBlock union = this.copy(); TextPageBlock union = this.copy();
union.add(r); union.getSequences().add(r);
addToFrequencyCounters(List.of(r)); calculateFrequencyCounters();
calculateBBox(); calculateBBox();
return union; return union;
} }
@ -138,50 +151,51 @@ public class TextPageBlock extends AbstractPageBlock {
public TextPageBlock union(TextPageBlock r) { public TextPageBlock union(TextPageBlock r) {
TextPageBlock union = this.copy(); TextPageBlock union = this.copy();
union.addAll(r.getWords()); union.getSequences().addAll(r.getSequences());
addToFrequencyCounters(r.getWords()); calculateFrequencyCounters();
calculateBBox(); calculateBBox();
return union; return union;
} }
public void add(TextPageBlock textPageBlock) { public void add(TextPageBlock r) {
changed = true; sequences.addAll(r.getSequences());
words.addAll(textPageBlock.getWords()); calculateFrequencyCounters();
addToFrequencyCounters(textPageBlock.getWords());
calculateBBox(); calculateBBox();
} }
public void add(Word word) { public void add(TextPositionSequence r) {
changed = true; sequences.add(r);
words.add(word); calculateFrequencyCounters();
addToFrequencyCounters(List.of(word));
calculateBBox();
}
public void addAll(List<Word> words) {
changed = true;
this.words.addAll(words);
addToFrequencyCounters(words);
calculateBBox(); calculateBBox();
} }
public TextPageBlock copy() { public TextPageBlock copy() {
return new TextPageBlock(new ArrayList<>(words)); return new TextPageBlock(new ArrayList<>(sequences));
} }
@Override @Override
public String toString() { public String toString() {
return getText(); StringBuilder builder = new StringBuilder();
for (int i = 0; i < sequences.size(); i++) {
String sequenceAsString = sequences.get(i).toString();
// Fix for missing Whitespace. This is recognized in getSequences method. See PDFTextStripper Line 1730.
if (i != 0 && sequences.get(i - 1).charAt(sequences.get(i - 1).length() - 1) != ' ' && sequenceAsString.charAt(0) != ' ') {
builder.append(' ');
}
builder.append(sequenceAsString);
}
return builder.toString();
} }
@ -189,36 +203,30 @@ public class TextPageBlock extends AbstractPageBlock {
@JsonIgnore @JsonIgnore
public String getText() { public String getText() {
if (text == null || changed) { StringBuilder sb = new StringBuilder();
StringBuilder sb = new StringBuilder(); TextPositionSequence previous = null;
for (TextPositionSequence word : sequences) {
Word previous = null; if (previous != null) {
for (Word word : words) { if (Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight()) {
if (previous != null) { sb.append('\n');
if (Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight()) { } else {
sb.append('\n'); sb.append(' ');
} else {
sb.append(' ');
}
} }
sb.append(word.toString());
previous = word;
} }
sb.append(word.toString());
text = TextNormalizationUtilities.removeHyphenLinebreaks(sb.toString()); previous = word;
changed = false;
} }
return text; return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString());
} }
public int getNumberOfLines() { public int getNumberOfLines() {
int numberOfLines = 1; int numberOfLines = 1;
Word previous = null; TextPositionSequence previous = null;
for (Word word : words) { for (TextPositionSequence word : sequences) {
if (previous != null) { if (previous != null) {
if (word.getMaxYDirAdj() - previous.getMaxYDirAdj() > word.getTextHeight()) { if (word.getMaxYDirAdj() - previous.getMaxYDirAdj() > word.getTextHeight()) {
numberOfLines++; numberOfLines++;
@ -230,47 +238,10 @@ public class TextPageBlock extends AbstractPageBlock {
} }
public String getMostPopularWordFont() {
return frequencyCounters.getFontFrequencyCounter().getMostPopular();
}
public String getMostPopularWordStyle() {
return frequencyCounters.getStyleFrequencyCounter().getMostPopular();
}
public double getMostPopularWordFontSize() {
return frequencyCounters.getFontSizeFrequencyCounter().getMostPopular();
}
public double getMostPopularWordHeight() {
return frequencyCounters.getLineHeightFrequencyCounter().getMostPopular();
}
public double getMostPopularWordSpaceWidth() {
return frequencyCounters.getSpaceFrequencyCounter().getMostPopular();
}
public double getHighestFontSize() {
Double highest = frequencyCounters.getFontSizeFrequencyCounter().getHighest();
return highest == null ? 0 : highest;
}
@Override @Override
public boolean isEmpty() { public boolean isEmpty() {
return words.isEmpty(); return sequences.isEmpty();
} }
} }

View File

@ -0,0 +1,271 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.stream.Collectors;
import org.apache.pdfbox.text.TextPosition;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = false)
public class TextPositionSequence extends BoundingBox implements CharSequence {
public static final int HEIGHT_PADDING = 2;
@EqualsAndHashCode.Include
private int page;
@EqualsAndHashCode.Include
private List<RedTextPosition> textPositions = new ArrayList<>();
private Rectangle2D bBoxDirAdj;
@EqualsAndHashCode.Include
private TextDirection dir;
private int rotation;
private float pageHeight;
private float pageWidth;
private boolean isParagraphStart;
private boolean strikethrough;
private boolean underline;
public TextPositionSequence(List<TextPosition> textPositions, int pageNumber, boolean isParagraphStart) {
this.textPositions = textPositions.stream()
.map(RedTextPosition::fromTextPosition)
.collect(Collectors.toList());
this.page = pageNumber;
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
this.rotation = textPositions.get(0).getRotation();
this.pageHeight = textPositions.get(0).getPageHeight();
this.pageWidth = textPositions.get(0).getPageWidth();
this.isParagraphStart = isParagraphStart;
calculateBBox();
}
private void calculateBBox() {
this.bBoxDirAdj = textPositions.stream()
.map(RedTextPosition::getBBoxDirAdj)
.collect(RectangleTransformations.collectBBox());
setToBBoxOfComponents(getTextPositions());
}
public TextPositionSequence(List<RedTextPosition> textPositions, int page) {
this.textPositions = textPositions;
this.page = page;
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
this.rotation = textPositions.get(0).getRotation();
this.pageHeight = textPositions.get(0).getPageHeight();
this.pageWidth = textPositions.get(0).getPageWidth();
calculateBBox();
}
@Override
public int length() {
return textPositions.size();
}
@Override
public char charAt(int index) {
RedTextPosition textPosition = textPositionAt(index);
String text = textPosition.getUnicode();
return text.charAt(0);
}
public char charAt(int index, boolean caseInSensitive) {
RedTextPosition textPosition = textPositionAt(index);
String text = textPosition.getUnicode();
return caseInSensitive ? text.toLowerCase(Locale.ROOT).charAt(0) : text.charAt(0);
}
@Override
public TextPositionSequence subSequence(int start, int end) {
var textPositionSequence = new TextPositionSequence();
textPositionSequence.textPositions = textPositions.subList(start, end);
textPositionSequence.page = page;
textPositionSequence.dir = dir;
textPositionSequence.rotation = rotation;
textPositionSequence.pageHeight = pageHeight;
textPositionSequence.pageWidth = pageWidth;
textPositionSequence.setToBBoxOfComponents(getTextPositions());
return textPositionSequence;
}
@Override
public String toString() {
StringBuilder builder = new StringBuilder(length());
for (int i = 0; i < length(); i++) {
builder.append(charAt(i));
}
return builder.toString();
}
public RedTextPosition textPositionAt(int index) {
return textPositions.get(index);
}
public void add(TextPositionSequence textPositionSequence, RedTextPosition textPosition) {
this.textPositions.add(textPosition);
this.page = textPositionSequence.getPage();
this.dir = textPositionSequence.getDir();
this.rotation = textPositionSequence.getRotation();
this.pageHeight = textPositionSequence.getPageHeight();
this.pageWidth = textPositionSequence.getPageWidth();
calculateBBox();
}
public void add(TextPosition textPosition) {
this.textPositions.add(RedTextPosition.fromTextPosition(textPosition));
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
this.rotation = textPositions.get(0).getRotation();
this.pageHeight = textPositions.get(0).getPageHeight();
this.pageWidth = textPositions.get(0).getPageWidth();
calculateBBox();
}
/**
* This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
* This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
*
* @return the text direction adjusted minX value
*/
public float getMinXDirAdj() {
return textPositions.get(0).getXDirAdj();
}
/**
* This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
* This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
*
* @return the text direction adjusted maxX value
*/
public float getMaxXDirAdj() {
return textPositions.get(textPositions.size() - 1).getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidthDirAdj() + HEIGHT_PADDING;
}
/**
* This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
* This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
*
* @return the text direction adjusted minY value. The upper border of the bounding box of the word.
*/
public float getMinYDirAdj() {
return textPositions.get(0).getYDirAdj() - getTextHeight();
}
/**
* This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
* This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
*
* @return the text direction adjusted maxY value. The lower border of the bounding box of the word.
*/
public float getMaxYDirAdj() {
return textPositions.get(0).getYDirAdj();
}
public float getTextHeightNoPadding() {
return textPositions.get(0).getHeightDir();
}
public float getTextHeight() {
return textPositions.get(0).getHeightDir() + HEIGHT_PADDING;
}
public String getFont() {
if (textPositions.get(0).getFontName() == null) {
return "none";
}
return textPositions.get(0).getFontName().toLowerCase(Locale.ROOT).replaceAll(",bold", "").replaceAll(",italic", "");
}
public String getFontStyle() {
if (textPositions.get(0).getFontName() == null) {
return "standard";
}
String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase(Locale.ROOT);
if (lowercaseFontName.contains("bold") && lowercaseFontName.contains("italic")) {
return "bold, italic";
} else if (lowercaseFontName.contains("bold")) {
return "bold";
} else if (lowercaseFontName.contains("italic")) {
return "italic";
} else {
return "standard";
}
}
public float getFontSize() {
return textPositions.get(0).getFontSizeInPt();
}
public float getSpaceWidth() {
return textPositions.get(0).getWidthOfSpace();
}
}

View File

@ -1,36 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import java.util.Comparator;
import java.util.HashMap;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.NumberWord;
/**
 * Orders {@link NumberWord}s by the page of the text block they were found in, then by the y coordinate of
 * their word, then by their numeric value.
 */
public class TocNumberComparator implements Comparator<NumberWord> {

    /** Maps every number to the text block (and thereby page) it was found on. */
    private final HashMap<NumberWord, TextBlockOnPage> lookup;


    /**
     * @param lookup the text block for every number that will be compared; must contain each compared number
     */
    public TocNumberComparator(HashMap<NumberWord, TextBlockOnPage> lookup) {

        this.lookup = lookup;
    }


    @Override
    public int compare(NumberWord number1, NumberWord number2) {

        int firstPage = lookup.get(number1).page().getPageNumber();
        int secondPage = lookup.get(number2).page().getPageNumber();

        if (firstPage == secondPage) {
            // Same page: the vertical position decides, the numeric value is the final tie breaker.
            boolean sameY = number1.word().getY() == number2.word().getY();
            return sameY //
                    ? Integer.compare(number1.number(), number2.number()) //
                    : Double.compare(number1.word().getY(), number2.word().getY());
        }

        return Integer.compare(firstPage, secondPage);
    }

}

View File

@ -1,272 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import static com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition.HEIGHT_PADDING;
import java.awt.geom.AffineTransform;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.pdfbox.text.TextPosition;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.extern.slf4j.Slf4j;
/**
 * A word made of {@link Character}s on a single page, exposed as a {@link CharSequence}.
 * <p>
 * Every character contributes exactly one {@code char} (the first char of the unicode string of its text
 * position), so {@link #length()} equals the number of characters. Height and font accessors read their
 * value from the FIRST character only and therefore require a non-empty word.
 * <p>
 * The hash code is cached in {@link #hashcodeCache} and invalidated whenever the bounding box is
 * recalculated.
 */
@Slf4j
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@SuppressWarnings("pmd")
public class Word extends TextBoundingBox implements CharSequence {

    public static final String STANDARD = "standard";
    public static final String BOLD_ITALIC = "bold, italic";
    public static final String BOLD = "bold";
    public static final String ITALIC = "italic";

    /** Removes the {@code ",bold"} / {@code ",italic"} suffixes from a lower-cased font name. */
    public static final Pattern FONT_CLEANER = Pattern.compile(",bold|,italic");

    private int page;

    @Builder.Default
    private List<Character> characters = new ArrayList<>();

    private boolean isParagraphStart;
    private boolean strikethrough;
    private boolean underline;

    /** Lazily computed hash code; {@code null} means "not computed yet". Not part of the object's identity. */
    private Integer hashcodeCache;


    /**
     * Creates a word from PDFBox text positions.
     *
     * @param textPositions    the PDFBox text positions of the word
     * @param pageNumber       the page this word belongs to
     * @param isParagraphStart whether this word starts a paragraph
     */
    public Word(List<TextPosition> textPositions, int pageNumber, boolean isParagraphStart) {

        this.characters = textPositions.stream()
                .map(RedTextPosition::fromTextPosition)
                .map(Character::new)
                .collect(Collectors.toList());
        this.page = pageNumber;
        this.isParagraphStart = isParagraphStart;
        calculateBBoxAndHashcode();
    }


    /**
     * Creates a word from a copy of the given characters.
     *
     * @param textPositions the characters of the word
     * @param page          the page this word belongs to
     */
    public Word(List<Character> textPositions, int page) {

        this.characters = new ArrayList<>(textPositions);
        this.page = page;
        calculateBBoxAndHashcode();
    }


    /**
     * Recomputes the bounding box from the current characters and invalidates the cached hash code.
     */
    private void calculateBBoxAndHashcode() {

        setToBBoxOfComponents(getTextPositions());
        hashcodeCache = null;
    }


    @Override
    public int length() {

        return characters.size();
    }


    /**
     * @return the first char of the unicode string of the text position at {@code index}
     */
    @Override
    public char charAt(int index) {

        RedTextPosition textPosition = textPositionAt(index);
        String text = textPosition.getUnicode();
        return text.charAt(0);
    }


    /**
     * Same as {@link #charAt(int)}, optionally lower-casing the unicode string (using {@link Locale#ROOT})
     * before its first char is taken.
     */
    public char charAt(int index, boolean caseInSensitive) {

        RedTextPosition textPosition = textPositionAt(index);
        String text = textPosition.getUnicode();
        return caseInSensitive ? text.toLowerCase(Locale.ROOT).charAt(0) : text.charAt(0);
    }


    /**
     * Returns the word covering the characters in {@code [start, end)}. Page and direction are copied from
     * this word; the characters are a {@link List#subList} view.
     * <p>
     * The bounding box is computed from the characters of the SUB word. Previously the box of the whole
     * parent word was assigned. For an empty range no box is computed.
     */
    @Override
    public Word subSequence(int start, int end) {

        var textPositionSequence = new Word();
        textPositionSequence.characters = characters.subList(start, end);
        textPositionSequence.page = page;
        textPositionSequence.dir = dir;
        if (!textPositionSequence.characters.isEmpty()) {
            textPositionSequence.setToBBoxOfComponents(textPositionSequence.getTextPositions());
        }
        return textPositionSequence;
    }


    @Override
    public String toString() {

        StringBuilder builder = new StringBuilder(length());
        for (int i = 0; i < length(); i++) {
            builder.append(charAt(i));
        }
        return builder.toString();
    }


    public RedTextPosition textPositionAt(int index) {

        return characters.get(index).getTextPosition();
    }


    /**
     * Appends {@code textPosition} as a new character and takes over the page of {@code word}.
     */
    public void add(Word word, RedTextPosition textPosition) {

        this.characters.add(new Character(textPosition));
        this.page = word.getPage();
        calculateBBoxAndHashcode();
    }


    public void add(Character current) {

        characters.add(current);
        calculateBBoxAndHashcode();
    }


    public void add(TextPosition textPosition) {

        add(new Character(RedTextPosition.fromTextPosition(textPosition)));
    }


    /**
     * @return the direction adjusted height of the first character, without {@code HEIGHT_PADDING}
     */
    public double getTextHeightNoPadding() {

        return characters.get(0).getTextPosition().getHeightDirAdj();
    }


    /**
     * @return the direction adjusted height of the first character plus {@code HEIGHT_PADDING}
     */
    public double getTextHeight() {

        return characters.get(0).getTextPosition().getHeightDirAdj() + HEIGHT_PADDING;
    }


    /**
     * @return the lower-cased font name of the first character with the {@code ",bold"} and
     * {@code ",italic"} suffixes removed, or {@code "none"} if no font name is available
     */
    public String getFont() {

        String fontName = characters.get(0).getTextPosition().getFontName();
        if (fontName == null) {
            return "none";
        }
        return FONT_CLEANER.matcher(fontName.toLowerCase(Locale.ROOT)).replaceAll("");
    }


    /**
     * Derives the style from the font name of the first character.
     *
     * @return {@link #BOLD_ITALIC}, {@link #BOLD}, {@link #ITALIC} or {@link #STANDARD}
     */
    public String getFontStyle() {

        String fontName = characters.get(0).getTextPosition().getFontName();
        if (fontName == null) {
            return STANDARD;
        }

        String lowercaseFontName = fontName.toLowerCase(Locale.ROOT);

        if (lowercaseFontName.contains(BOLD) && lowercaseFontName.contains(ITALIC)) {
            return BOLD_ITALIC;
        } else if (lowercaseFontName.contains(BOLD)) {
            return BOLD;
        } else if (lowercaseFontName.contains(ITALIC)) {
            return ITALIC;
        } else {
            return STANDARD;
        }
    }


    /**
     * @return the font size in pt of the first character
     */
    public float getFontSize() {

        return characters.get(0).getTextPosition().getFontSizeInPt();
    }


    /**
     * @return the width of a space as reported by the first character
     */
    public float getSpaceWidth() {

        return characters.get(0).getTextPosition().getWidthOfSpace();
    }


    /**
     * Two words are equal if the super class considers them equal and they have the same page and text
     * positions.
     * <p>
     * The cached hash code is deliberately NOT compared: it is filled lazily, so comparing it made two
     * identical words unequal as soon as {@link #hashCode()} had been called on only one of them.
     */
    @Override
    public boolean equals(final Object o) {

        if (o == this) {
            return true;
        }
        if (!(o instanceof Word other)) {
            return false;
        }
        if (!other.canEqual((Object) this)) {
            return false;
        }
        if (!super.equals(o)) {
            return false;
        }
        if (this.getPage() != other.getPage()) {
            return false;
        }
        return Objects.equals(this.getTextPositions(), other.getTextPositions());
    }


    protected boolean canEqual(final Object other) {

        return other instanceof Word;
    }


    /**
     * @return the cached hash code, computing it first if the cache is empty
     */
    @Override
    public int hashCode() {

        if (hashcodeCache == null) {
            hashcodeCache = hashcodeCalculation();
        }
        return hashcodeCache;
    }


    /**
     * Combines the super class hash code, the page and the text positions.
     */
    private int hashcodeCalculation() {

        final int PRIME = 59;
        int result = super.hashCode();
        result = result * PRIME + this.getPage();
        final Object $textPositions = this.getTextPositions();
        result = result * PRIME + ($textPositions == null ? 43 : $textPositions.hashCode());
        return result;
    }


    /**
     * @return the text positions of all characters, in order, as a new unmodifiable list
     */
    private List<RedTextPosition> getTextPositions() {

        return characters.stream()
                .map(Character::getTextPosition)
                .toList();
    }


    /**
     * Replaces the direction adjusted bounding box of every text position with the bounds of its transformed
     * shape, then recomputes the word's bounding box and invalidates the cached hash code.
     *
     * @param rotateInstance the transform to apply
     */
    public void transform(AffineTransform rotateInstance) {

        for (RedTextPosition textPosition : getTextPositions()) {
            Rectangle2D exactDirAdjCoordinates = rotateInstance.createTransformedShape(textPosition.getBBoxDirAdj()).getBounds2D();
            textPosition.setBBoxDirAdj(exactDirAdjCoordinates);
        }
        calculateBBoxAndHashcode();
    }

}

View File

@ -9,9 +9,9 @@ import java.util.Map;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
@ -56,7 +56,7 @@ public class ImageServiceResponseAdapter {
classificationPage.getImages().forEach(image -> { classificationPage.getImages().forEach(image -> {
if (image.getImageType().equals(ImageType.OTHER)) { if (image.getImageType().equals(ImageType.OTHER)) {
for (AbstractPageBlock textblock : classificationPage.getTextBlocks()) { for (AbstractPageBlock textblock : classificationPage.getTextBlocks()) {
if (image.getPosition().contains(textblock.getBBoxPdf())) { if (image.getPosition().contains(textblock.getBBoxInitialUserSpace())) {
image.setImageType(ImageType.OCR); image.setImageType(ImageType.OCR);
return; return;
} }

View File

@ -8,7 +8,7 @@ import java.util.Map;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.ImageType; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingBox; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingBox;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
@ -79,7 +79,7 @@ public class VisualLayoutParsingAdapter {
ClassifiedImage signature = new ClassifiedImage(new Rectangle2D.Float(t.getBox().getX1(), ClassifiedImage signature = new ClassifiedImage(new Rectangle2D.Float(t.getBox().getX1(),
t.getBox().getY1(), t.getBox().getY1(),
t.getBox().getX2() - t.getBox().getX1(), t.getBox().getX2() - t.getBox().getX1(),
t.getBox().getY2() - t.getBox().getY1()), ImageType.SIGNATURE, true, false, false, pageNumber, ""); t.getBox().getY2() - t.getBox().getY1()), ImageType.SIGNATURE, true, false, false, pageNumber,"");
signatures.add(signature); signatures.add(signature);
} }

View File

@ -14,7 +14,6 @@ import lombok.NoArgsConstructor;
@AllArgsConstructor @AllArgsConstructor
public class Classification { public class Classification {
@Builder.Default
private Map<String, Float> probabilities = new HashMap<>(); private Map<String, Float> probabilities = new HashMap<>();
private String label; private String label;

View File

@ -22,10 +22,8 @@ public class ImageServiceResponse {
@JsonProperty(value = "imageMetadata") @JsonProperty(value = "imageMetadata")
@JsonAlias("data") @JsonAlias("data")
@Builder.Default
private List<ImageMetadata> data = new ArrayList<>(); private List<ImageMetadata> data = new ArrayList<>();
@Builder.Default
private List<ImageMetadata> dataCV = new ArrayList<>(); private List<ImageMetadata> dataCV = new ArrayList<>();

View File

@ -15,7 +15,6 @@ import lombok.NoArgsConstructor;
public class TableData { public class TableData {
private PageInfo pageInfo; private PageInfo pageInfo;
@Builder.Default
private List<TableCells> tableCells = new ArrayList<>(); private List<TableCells> tableCells = new ArrayList<>();
} }

View File

@ -19,7 +19,7 @@ public class TableServiceResponse {
private String operation; private String operation;
private String targetFileExtension; private String targetFileExtension;
private String responseFileExtension; private String responseFileExtension;
@Builder.Default
private List<TableData> data = new ArrayList<>(); private List<TableData> data = new ArrayList<>();
} }

View File

@ -33,7 +33,7 @@ public class BodyTextFrameService {
for (ClassificationPage page : classificationDocument.getPages()) { for (ClassificationPage page : classificationDocument.getPages()) {
var updatedBodyTextFrame = getBodyTextFrameFromRulings(page, bodyTextFrame, landscapeBodyTextFrame); var updatedBodyTextFrame = getBodyTextFrameFromRulings(page, bodyTextFrame, landscapeBodyTextFrame);
setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame); setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
classificationDocument.getLayoutDebugLayer().addMainBodyVisualization(page.isLandscape() ? landscapeBodyTextFrame : bodyTextFrame, page.getPageNumber()); classificationDocument.getVisualizations().addMainBodyVisualization(page.isLandscape() ? landscapeBodyTextFrame : bodyTextFrame, page.getPageNumber());
} }
} }

View File

@ -23,11 +23,11 @@ public class DividingColumnDetectionService {
public List<Rectangle2D> detectColumns(PageContents pageContents) { public List<Rectangle2D> detectColumns(PageContents pageContents) {
if (pageContents.getSortedWords().size() < 2) { if (pageContents.getSortedTextPositionSequences().size() < 2) {
return List.of(pageContents.getCropBox()); return List.of(pageContents.getCropBox());
} }
GapInformation linesWithGapInformation = GapDetectionService.findGapsInLines(pageContents.getSortedWords(), pageContents.getCropBox()); GapInformation linesWithGapInformation = GapDetectionService.findGapsInLines(pageContents.getSortedTextPositionSequences(), pageContents.getCropBox());
return detectColumnsFromLines(linesWithGapInformation.getXGaps(), pageContents.getCropBox()); return detectColumnsFromLines(linesWithGapInformation.getXGaps(), pageContents.getCropBox());
} }

View File

@ -5,7 +5,8 @@ import java.util.LinkedList;
import java.util.List; import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation; import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import lombok.experimental.UtilityClass; import lombok.experimental.UtilityClass;
@ -18,26 +19,26 @@ public class GapDetectionService {
private static final double NEW_LINE_FACTOR = 0.2; private static final double NEW_LINE_FACTOR = 0.2;
public static GapInformation findGapsInLines(List<Word> sortedWords, Rectangle2D mainBodyTextFrame) { public static GapInformation findGapsInLines(List<TextPositionSequence> sortedTextPositionSequences, Rectangle2D mainBodyTextFrame) {
if (sortedWords.isEmpty()) { if (sortedTextPositionSequences.isEmpty()) {
return new GapInformation(); return new GapInformation();
} }
final double avgTextPositionHeight = getAvgTextPositionHeight(sortedWords); final double avgTextPositionHeight = getAvgTextPositionHeight(sortedTextPositionSequences);
XGapsContext xGapContext = XGapsContext.init(mainBodyTextFrame); XGapsContext xGapContext = XGapsContext.init(mainBodyTextFrame);
YGapsContext yGapContext = YGapsContext.init(mainBodyTextFrame); YGapsContext yGapContext = YGapsContext.init(mainBodyTextFrame);
var previousTextPosition = sortedWords.get(0); var previousTextPosition = sortedTextPositionSequences.get(0);
Rectangle2D rectangle = toRectangle2D(previousTextPosition); Rectangle2D rectangle = toRectangle2D(previousTextPosition);
xGapContext.addGapFromLeftEdgeOfMainBody(rectangle); xGapContext.addGapFromLeftEdgeOfMainBody(rectangle);
for (Word currentTextPosition : sortedWords.subList(1, sortedWords.size())) { for (TextPositionSequence currentTextPosition : sortedTextPositionSequences.subList(1, sortedTextPositionSequences.size())) {
double yDifference = Math.abs(currentTextPosition.getMaxYDirAdj() - previousTextPosition.getMaxYDirAdj()); double yDifference = Math.abs(currentTextPosition.getMaxYDirAdj() - previousTextPosition.getMaxYDirAdj());
double xGap = Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getXDirAdj()); double xGap = Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getMinXDirAdj());
Rectangle2D previousTextPositionBBox = toRectangle2D(previousTextPosition); Rectangle2D previousTextPositionBBox = toRectangle2D(previousTextPosition);
Rectangle2D currentTextPositionBBox = toRectangle2D(currentTextPosition); Rectangle2D currentTextPositionBBox = toRectangle2D(currentTextPosition);
@ -59,14 +60,14 @@ public class GapDetectionService {
} }
previousTextPosition = currentTextPosition; previousTextPosition = currentTextPosition;
} }
xGapContext.addGapToRightEdgeOfMainBody(toRectangle2D(sortedWords.get(sortedWords.size() - 1))); xGapContext.addGapToRightEdgeOfMainBody(toRectangle2D(sortedTextPositionSequences.get(sortedTextPositionSequences.size() - 1)));
xGapContext.gapsPerLine.add(xGapContext.gapsInCurrentLine); xGapContext.gapsPerLine.add(xGapContext.gapsInCurrentLine);
return new GapInformation(xGapContext.gapsPerLine, yGapContext.gapsPerLine); return new GapInformation(xGapContext.gapsPerLine, yGapContext.gapsPerLine);
} }
private static Rectangle2D toRectangle2D(Word textPosition) { private static Rectangle2D toRectangle2D(TextPositionSequence textPosition) {
return mirrorY(textPosition.getBBox()); return mirrorY(textPosition.getBBox());
} }
@ -87,18 +88,18 @@ public class GapDetectionService {
} }
private static void assertAllTextPositionsHaveSameDir(List<Word> words) { private static void assertAllTextPositionsHaveSameDir(List<TextPositionSequence> textPositionSequences) {
assert words.stream() assert textPositionSequences.stream()
.map(Word::getDir) .map(TextPositionSequence::getDir)
.allMatch(a -> a.equals(words.get(0).getDir())); .allMatch(a -> a.equals(textPositionSequences.get(0).getDir()));
} }
private static double getAvgTextPositionHeight(List<Word> words) { private static double getAvgTextPositionHeight(List<TextPositionSequence> textPositionSequences) {
return words.stream() return textPositionSequences.stream()
.mapToDouble(Word::getHeight).average().orElseThrow(); .mapToDouble(TextPositionSequence::getHeight).average().orElseThrow();
} }

View File

@ -7,17 +7,17 @@ import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation; import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation; import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import lombok.experimental.UtilityClass; import lombok.experimental.UtilityClass;
@UtilityClass @UtilityClass
public class InvisibleTableDetectionService { public class InvisibleTableDetectionService {
public List<List<Rectangle2D>> detectTable(List<Word> words, Rectangle2D tableBBox) { public List<List<Rectangle2D>> detectTable(List<TextPositionSequence> textPositionSequences, Rectangle2D tableBBox) {
LineInformation lineInformation = LineDetectionService.calculateLineInformation(words); LineInformation lineInformation = LineDetectionService.calculateLineInformation(textPositionSequences);
GapInformation gaps = GapDetectionService.findGapsInLines(words, tableBBox); GapInformation gaps = GapDetectionService.findGapsInLines(textPositionSequences, tableBBox);
List<Rectangle2D> gapsAcrossLines = GapsAcrossLinesService.detectXGapsAcrossLines(gaps, tableBBox); List<Rectangle2D> gapsAcrossLines = GapsAcrossLinesService.detectXGapsAcrossLines(gaps, tableBBox);
List<Double> columnXCoords = gapsAcrossLines.stream().map(RectangularShape::getCenterX).toList(); List<Double> columnXCoords = gapsAcrossLines.stream().map(RectangularShape::getCenterX).toList();
int colCount = gapsAcrossLines.size(); int colCount = gapsAcrossLines.size();

Some files were not shown because too many files have changed in this diff Show More