Compare commits
98 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ef23ee0ade | ||
|
|
af31f52b47 | ||
|
|
b5152112ee | ||
|
|
85ea4ef455 | ||
|
|
01f8c01fff | ||
|
|
0b6a292c75 | ||
|
|
e24020589c | ||
|
|
c619b845e8 | ||
|
|
ed0371ca11 | ||
|
|
89b5be8d67 | ||
|
|
077ce60c9d | ||
|
|
ab171be6e2 | ||
|
|
664b47b4c3 | ||
|
|
8005c1f25f | ||
|
|
42185a95a0 | ||
|
|
51b42efaf6 | ||
|
|
6a50d45947 | ||
|
|
073ac12cf7 | ||
|
|
84b054a4cc | ||
|
|
905b65a5fa | ||
|
|
7617c1f308 | ||
|
|
2b3936c09b | ||
|
|
6e5b1f1978 | ||
|
|
cf846d18bc | ||
|
|
25c46f16ac | ||
|
|
96acefed78 | ||
|
|
366241e6c6 | ||
|
|
7f472ccc52 | ||
|
|
6f807c7d94 | ||
|
|
6e04c15f3d | ||
|
|
1384584e2f | ||
|
|
e58011e111 | ||
|
|
a821570065 | ||
|
|
7ee1f9e360 | ||
|
|
f9b25c8157 | ||
|
|
c90874da7a | ||
|
|
4683c696a5 | ||
|
|
95c02ce3cf | ||
|
|
b2d62e32fe | ||
|
|
65c1f03ea3 | ||
|
|
2219519a2b | ||
|
|
af05218e37 | ||
|
|
736f531df3 | ||
|
|
c64445d54b | ||
|
|
af29233b10 | ||
|
|
5f04b45554 | ||
|
|
6c41533f0b | ||
|
|
9d2596e5ef | ||
|
|
e7b01161ac | ||
|
|
7b073eb4f3 | ||
|
|
4b0c041d84 | ||
|
|
6c7442ac6d | ||
|
|
23e23328ee | ||
|
|
9d1ffdd779 | ||
|
|
3109a30ae1 | ||
|
|
fe2ed1807e | ||
|
|
31de229fa5 | ||
|
|
8a80abfff1 | ||
|
|
7c08905eda | ||
|
|
4f40c9dbc9 | ||
|
|
32381b4472 | ||
|
|
469da38952 | ||
|
|
0f8c4674b3 | ||
|
|
8e165a41d7 | ||
|
|
ed7a701ad9 | ||
|
|
393103e074 | ||
|
|
bd02066e2c | ||
|
|
fec19f4afb | ||
|
|
c726a643f0 | ||
|
|
519e95735c | ||
|
|
b52af2637f | ||
|
|
46ea7edc4c | ||
|
|
9650195afd | ||
|
|
ce628a99f7 | ||
|
|
b66afe135c | ||
|
|
dc892d0fec | ||
|
|
af45f2cd8c | ||
|
|
befb6b1df6 | ||
|
|
61efb4cae9 | ||
|
|
4a06059258 | ||
|
|
292e5b215e | ||
|
|
7c2db6c3c5 | ||
|
|
4395074b21 | ||
|
|
8e14b74da2 | ||
|
|
3b91639ea9 | ||
|
|
c5178ea5c2 | ||
|
|
cf39d4dfcc | ||
|
|
bb40345f79 | ||
|
|
e3e9d16145 | ||
|
|
f6ca5a3c17 | ||
|
|
15e3dced35 | ||
|
|
933054b332 | ||
|
|
ab86714cb3 | ||
|
|
8626b106d0 | ||
|
|
52e948e66c | ||
|
|
3b33405cbf | ||
|
|
b2fa14dde2 | ||
|
|
62e07686d7 |
@ -21,5 +21,6 @@ deploy:
|
||||
dotenv: version.env
|
||||
rules:
|
||||
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
|
||||
- if: $CI_COMMIT_BRANCH =~ /^feature/ && $CI_COMMIT_TAG == ""
|
||||
- if: $CI_COMMIT_BRANCH =~ /^release/
|
||||
- if: $CI_COMMIT_TAG
|
||||
|
||||
@ -8,6 +8,8 @@ plugins {
|
||||
|
||||
group = "com.knecon.fforesight"
|
||||
|
||||
val documentVersion by rootProject.extra { "4.433.0" }
|
||||
|
||||
java.sourceCompatibility = JavaVersion.VERSION_17
|
||||
java.targetCompatibility = JavaVersion.VERSION_17
|
||||
|
||||
@ -51,6 +53,10 @@ allprojects {
|
||||
}
|
||||
}
|
||||
|
||||
pmd {
|
||||
setConsoleOutput(true)
|
||||
}
|
||||
|
||||
publishing {
|
||||
publications {
|
||||
create<MavenPublication>(name) {
|
||||
|
||||
@ -1,28 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import io.swagger.v3.oas.annotations.media.Schema;
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
/**
 * Top-level container for the result of layout parsing: the pages, the text blocks, the
 * text-position blocks and the semantic structure tree of one document.
 *
 * All fields are private and final (see the FieldDefaults annotation below); instances are
 * created through the generated builder or the all-args constructor. No no-args constructor
 * is declared on this class.
 */
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
@Schema(description = "Object containing the complete document layout parsing information. It is split into 4 categories, structure, text, positions and pages: " + "The document tree structure of SemanticNodes such as Section, Paragraph, Headline, etc. " + "The text, which is stored as separate blocks of data. " + "The text positions, which are also stored as separate blocks. The Blocks are equal to the text blocks in length and order. " + "The page information.")
|
||||
public class DocumentData implements Serializable {
|
||||
|
||||
@Schema(description = "Contains information about the document's pages.")
|
||||
DocumentPage[] documentPages;
|
||||
@Schema(description = "Contains information about the document's text.")
|
||||
DocumentTextData[] documentTextData;
|
||||
@Schema(description = "Contains information about the document's text positions.")
|
||||
DocumentPositionData[] documentPositions;
|
||||
@Schema(description = "Contains information about the document's semantic structure.")
|
||||
DocumentStructure documentStructure;
|
||||
|
||||
}
|
||||
@ -1,30 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import io.swagger.v3.oas.annotations.media.Schema;
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
/**
 * Serializable description of a single page: its number, its height and width, and its
 * rotation. All four values are stored as plain ints in private, mutable fields.
 */
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
@Schema(description = "Object containing information about the document's pages.")
|
||||
public class DocumentPage implements Serializable {
|
||||
|
||||
@Schema(description = "The page number, starting with 1.")
|
||||
int number;
|
||||
@Schema(description = "The page height in PDF user units.", example = "792")
|
||||
int height;
|
||||
@Schema(description = "The page width in PDF user units.", example = "694")
|
||||
int width;
|
||||
@Schema(description = "The page rotation as specified by the PDF.", example = "90", allowableValues = {"0", "90", "180", "270"})
|
||||
int rotation;
|
||||
|
||||
}
|
||||
@ -1,28 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import io.swagger.v3.oas.annotations.media.Schema;
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
/**
 * Serializable positional data of one text block: the block id, a lookup array from string
 * index to position index, and one float[] rectangle per glyph.
 *
 * NOTE(review): nothing in this class enforces that each row of positions has exactly four
 * entries or that stringIdxToPositionIdx stays within bounds of positions; the schema text
 * below states the intended layout, so producers must guarantee it.
 */
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
@Schema(description = "Object containing text positional information of a specific text block. A document is split into multiple text blocks, which are supposed to be read in order. Every text block can only occur on a single page.")
|
||||
public class DocumentPositionData implements Serializable {
|
||||
|
||||
@Schema(description = "Identifier of the text block.")
|
||||
Long id;
|
||||
@Schema(description = "For each string coordinate in the search text of the text block, the array contains an entry relating the string coordinate to the position coordinate. This is required due to the text and position coordinates not being equal.")
|
||||
int[] stringIdxToPositionIdx;
|
||||
@Schema(description = "The bounding box for each glyph as a rectangle. This matrix is of size (n,4), where n is the number of glyphs in the text block. The second dimension specifies the rectangle with the value x, y, width, height, with x, y specifying the lower left corner. In order to access this information, the stringIdxToPositionIdx array must be used to transform the coordinates.")
|
||||
float[][] positions;
|
||||
|
||||
}
|
||||
@ -1,172 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.Serializable;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import io.swagger.v3.oas.annotations.media.Schema;
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
@Schema(description = "Object containing information about the parsed tree structure of the SemanticNodes, such as Section, Paragraph, Headline etc inside of the document.")
|
||||
public class DocumentStructure implements Serializable {
|
||||
|
||||
@Schema(description = "The root EntryData represents the Document.")
|
||||
EntryData root;
|
||||
|
||||
@Schema(description = "Object containing the extra field names, a table has in its properties field.")
|
||||
public static class TableProperties implements Serializable {
|
||||
|
||||
public static final String NUMBER_OF_ROWS = "numberOfRows";
|
||||
public static final String NUMBER_OF_COLS = "numberOfCols";
|
||||
|
||||
}
|
||||
|
||||
@Schema(description = "Object containing the extra field names, an Image has in its properties field.")
|
||||
public static class ImageProperties implements Serializable {
|
||||
|
||||
public static final String TRANSPARENT = "transparent";
|
||||
public static final String IMAGE_TYPE = "imageType";
|
||||
public static final String POSITION = "position";
|
||||
public static final String ID = "id";
|
||||
|
||||
public static final String REPRESENTATION_HASH = "representationHash";
|
||||
|
||||
}
|
||||
|
||||
@Schema(description = "Object containing the extra field names, a table cell has in its properties field.")
|
||||
public static class TableCellProperties implements Serializable {
|
||||
|
||||
public static final String B_BOX = "bBox";
|
||||
public static final String ROW = "row";
|
||||
public static final String COL = "col";
|
||||
public static final String HEADER = "header";
|
||||
|
||||
}
|
||||
|
||||
@Schema(description = "Object containing the extra field names, a duplicate paragraph has in its properties field.")
|
||||
public static class DuplicateParagraphProperties implements Serializable {
|
||||
|
||||
public static final String UNSORTED_TEXTBLOCK_ID = "utbid";
|
||||
|
||||
}
|
||||
|
||||
public static final String RECTANGLE_DELIMITER = ";";
|
||||
|
||||
|
||||
public static Rectangle2D parseRectangle2D(String bBox) {
|
||||
|
||||
List<Float> floats = Arrays.stream(bBox.split(RECTANGLE_DELIMITER))
|
||||
.map(Float::parseFloat)
|
||||
.toList();
|
||||
return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
|
||||
}
|
||||
|
||||
|
||||
public static double[] parseRepresentationVector(String representationHash) {
|
||||
|
||||
String[] stringArray = representationHash.split("[,\\s]+");
|
||||
double[] doubleArray = new double[stringArray.length];
|
||||
for (int i = 0; i < stringArray.length; i++) {
|
||||
doubleArray[i] = Double.parseDouble(stringArray[i]);
|
||||
}
|
||||
|
||||
return doubleArray;
|
||||
}
|
||||
|
||||
|
||||
public EntryData get(List<Integer> tocId) {
|
||||
|
||||
if (tocId.isEmpty()) {
|
||||
return root;
|
||||
}
|
||||
EntryData entry = root.children.get(tocId.get(0));
|
||||
for (int id : tocId.subList(1, tocId.size())) {
|
||||
entry = entry.children.get(id);
|
||||
}
|
||||
return entry;
|
||||
}
|
||||
|
||||
|
||||
public Stream<EntryData> streamAllEntries() {
|
||||
|
||||
return Stream.concat(Stream.of(root), root.children.stream())
|
||||
.flatMap(DocumentStructure::flatten);
|
||||
}
|
||||
|
||||
|
||||
public String toString() {
|
||||
|
||||
return String.join("\n",
|
||||
streamAllEntries().map(EntryData::toString)
|
||||
.toList());
|
||||
}
|
||||
|
||||
|
||||
private static Stream<EntryData> flatten(EntryData entry) {
|
||||
|
||||
return Stream.concat(Stream.of(entry),
|
||||
entry.children.stream()
|
||||
.flatMap(DocumentStructure::flatten));
|
||||
}
|
||||
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
@Schema(description = "Object containing information of a SemanticNode and also structuring the layout with children.")
|
||||
public static class EntryData implements Serializable {
|
||||
|
||||
@Schema(description = "Type of the semantic node.", allowableValues = {"DOCUMENT", "SECTION", "PARAGRAPH", "HEADLINE", "TABLE", "TABLE_CELL", "HEADER", "FOOTER", "IMAGE"})
|
||||
NodeType type;
|
||||
@Schema(description = "Specifies the position in the parsed tree structure.", example = "[1, 0, 2]")
|
||||
int[] treeId;
|
||||
@Schema(description = "Specifies the text block IDs associated with this semantic node. The value should be joined with the DocumentTextData/DocumentPositionData. Is empty, if no text block is directly associated with this semantic node. Only Paragraph, Headline, Header or Footer is directly associated with a text block.", example = "[1]")
|
||||
Long[] atomicBlockIds;
|
||||
@Schema(description = "Specifies the pages this semantic node appears on. The value should be joined with the PageData.", example = "[1, 2, 3]")
|
||||
Long[] pageNumbers;
|
||||
@Schema(description = "Some semantic nodes have additional information, this information is stored in this Map. The extra fields are specified by the Properties subclasses.", example = "For a Table: {\"numberOfRows\": 3, \"numberOfCols\": 4}")
|
||||
Map<String, String> properties;
|
||||
@Schema(description = "All child Entries of this Entry.", example = "[1, 2, 3]")
|
||||
List<EntryData> children;
|
||||
@Schema(description = "Describes the origin of the semantic node", example = "[ALGORITHM]")
|
||||
Set<LayoutEngine> engines;
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append("[");
|
||||
for (int i : treeId) {
|
||||
sb.append(i);
|
||||
sb.append(",");
|
||||
}
|
||||
sb.delete(sb.length() - 1, sb.length());
|
||||
sb.append("]: ");
|
||||
|
||||
sb.append(type);
|
||||
sb.append(" atbs = ");
|
||||
sb.append(atomicBlockIds.length);
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,36 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import io.swagger.v3.oas.annotations.media.Schema;
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
@Schema(description = "Object containing text information of a specific text block. A document is split into multiple text blocks, which are supposed to be read in order. Every text block can only occur on a single page.")
|
||||
public class DocumentTextData implements Serializable {
|
||||
|
||||
@Schema(description = "Identifier of the text block.")
|
||||
Long id;
|
||||
@Schema(description = "The page the text block occurs on.")
|
||||
Long page;
|
||||
@Schema(description = "The text the text block.")
|
||||
String searchText;
|
||||
@Schema(description = "Each text block is assigned a number on a page, starting from 0.")
|
||||
int numberOnPage;
|
||||
@Schema(description = "The text blocks are ordered, this number represents the start of the text block as a string offset.")
|
||||
int start;
|
||||
@Schema(description = "The text blocks are ordered, this number represents the end of the text block as a string offset.")
|
||||
int end;
|
||||
@Schema(description = "The line breaks in the text of this semantic node in string offsets. They are exclusive end. At the end of each semantic node there is an implicit linebreak.", example = "[5, 10]")
|
||||
int[] lineBreaks;
|
||||
|
||||
}
|
||||
@ -1,7 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||
|
||||
/**
 * Identifies where a semantic node originated. The values are collected per node in
 * DocumentStructure.EntryData#engines.
 *
 * NOTE(review): the exact meaning of each constant is not visible here; presumably
 * ALGORITHM = rule-based parsing, AI = a model-based service, OUTLINE = the PDF outline.
 * Confirm against the producers.
 */
public enum LayoutEngine {
|
||||
ALGORITHM,
|
||||
AI,
|
||||
OUTLINE
|
||||
}
|
||||
@ -1,23 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Locale;
|
||||
|
||||
/**
 * The kinds of semantic nodes a document structure tree can contain.
 */
public enum NodeType implements Serializable {
    DOCUMENT,
    SECTION,
    SUPER_SECTION,
    HEADLINE,
    PARAGRAPH,
    TABLE,
    TABLE_CELL,
    IMAGE,
    HEADER,
    FOOTER;


    /**
     * Returns the constant name with its first character unchanged and the rest lower-cased
     * (locale-independent), e.g. {@code "Table_cell"} for {@link #TABLE_CELL}.
     *
     * <p>Underscores are kept as they are. Use {@link #name()} when the exact constant name
     * is needed, e.g. for {@link #valueOf(String)}.
     */
    @Override
    public String toString() {

        String constantName = name();
        return constantName.charAt(0) + constantName.substring(1).toLowerCase(Locale.ROOT);
    }
}
|
||||
@ -21,5 +21,14 @@ public class SimplifiedText {
|
||||
@Schema(description = "A List of simplified Sections, which contains almost exclusively text.")
|
||||
@Builder.Default
|
||||
private List<SimplifiedSectionText> sectionTexts = new ArrayList<>();
|
||||
@Schema(description = "A list of the main section numbers ")
|
||||
@Builder.Default
|
||||
private List<String> mainSectionNumbers = new ArrayList<>();
|
||||
@Schema(description = "A list of the header section numbers ")
|
||||
@Builder.Default
|
||||
private List<String> headerSectionNumbers = new ArrayList<>();
|
||||
@Schema(description = "A list of the footer section numbers ")
|
||||
@Builder.Default
|
||||
private List<String> footerSectionNumbers = new ArrayList<>();
|
||||
|
||||
}
|
||||
|
||||
@ -8,13 +8,20 @@ import lombok.Builder;
|
||||
@Builder
|
||||
@Schema(description = "Object containing information about the layout parsing.")
|
||||
public record LayoutParsingFinishedEvent(
|
||||
@Schema(description = "General purpose identifier. It is returned exactly the same way it is inserted with the LayoutParsingRequest.")
|
||||
Map<String, String> identifier,//
|
||||
@Schema(description = "The duration of a single layout parsing in ms.")
|
||||
long duration,//
|
||||
@Schema(description = "The number of pages of the parsed document.")
|
||||
int numberOfPages,//
|
||||
@Schema(description = "A general message. It contains some information useful for a developer, like the paths where the files are stored. Not meant to be machine readable.")
|
||||
String message) {
|
||||
@Schema(description = "General purpose identifier. It is returned exactly the same way it is inserted with the LayoutParsingRequest.") //
|
||||
Map<String, String> identifier,
|
||||
|
||||
@Schema(description = "The duration of a single layout parsing in ms.") //
|
||||
long duration,
|
||||
|
||||
@Schema(description = "The number of pages of the parsed document.") //
|
||||
int numberOfPages,
|
||||
|
||||
@Schema(description = "A general message. It contains some information useful for a developer, like the paths where the files are stored. Not meant to be machine readable.") //
|
||||
String message,
|
||||
|
||||
@Schema(description = "The app version of the layout parser.") //
|
||||
String layoutParserVersion
|
||||
) {
|
||||
|
||||
}
|
||||
|
||||
@ -2,6 +2,9 @@ package com.knecon.fforesight.service.layoutparser.internal.api.queue;
|
||||
|
||||
public class LayoutParsingQueueNames {
|
||||
|
||||
public static final String LAYOUT_PARSING_REQUEST_QUEUE = "layout_parsing_request_queue";
|
||||
public static final String LAYOUT_PARSING_FINISHED_EVENT_QUEUE = "layout_parsing_response_queue";
|
||||
public static final String LAYOUT_PARSING_REQUEST_QUEUE_PREFIX = "layout_parsing_request";
|
||||
public static final String LAYOUT_PARSING_REQUEST_EXCHANGE = "layout_parsing_request_exchange";
|
||||
public static final String LAYOUT_PARSING_RESPONSE_QUEUE_PREFIX = "layout_parsing_response";
|
||||
public static final String LAYOUT_PARSING_RESPONSE_EXCHANGE = "layout_parsing_response_exchange";
|
||||
public static final String LAYOUT_PARSING_DLQ = "layout_parsing_error";
|
||||
}
|
||||
|
||||
@ -8,16 +8,20 @@ description = "layoutparser-service-processor"
|
||||
val jacksonVersion = "2.15.2"
|
||||
val pdfBoxVersion = "3.0.0"
|
||||
|
||||
|
||||
dependencies {
|
||||
implementation(project(":layoutparser-service-internal-api"))
|
||||
implementation(project(":viewer-doc-processor"))
|
||||
|
||||
implementation("com.iqser.red.service:persistence-service-shared-api-v1:2.144.0") {
|
||||
implementation("com.knecon.fforesight:document:${rootProject.extra.get("documentVersion")}")
|
||||
implementation("com.iqser.red.service:persistence-service-shared-api-v1:2.564.0-RED9010.0") {
|
||||
exclude("org.springframework.boot", "spring-boot-starter-security")
|
||||
exclude("org.springframework.boot", "spring-boot-starter-validation")
|
||||
}
|
||||
implementation("com.knecon.fforesight:tenant-commons:0.21.0")
|
||||
implementation("com.iqser.red.commons:storage-commons:2.45.0")
|
||||
implementation("com.knecon.fforesight:tenant-commons:0.30.0") {
|
||||
exclude("com.iqser.red.commons", "storage-commons")
|
||||
}
|
||||
implementation("com.iqser.red.commons:storage-commons:2.50.0")
|
||||
|
||||
implementation("org.apache.pdfbox:pdfbox:${pdfBoxVersion}")
|
||||
implementation("org.apache.pdfbox:pdfbox-tools:${pdfBoxVersion}")
|
||||
@ -25,9 +29,12 @@ dependencies {
|
||||
implementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:${jacksonVersion}")
|
||||
implementation("org.springframework.boot:spring-boot-starter-web:3.1.3")
|
||||
implementation("org.jgrapht:jgrapht-core:1.5.2")
|
||||
implementation("org.apache.pdfbox:jbig2-imageio:3.0.4")
|
||||
implementation("com.github.jai-imageio:jai-imageio-core:1.4.0")
|
||||
implementation("com.github.jai-imageio:jai-imageio-jpeg2000:1.4.0")
|
||||
implementation("org.tinspin:tinspin-indexes:2.1.3")
|
||||
implementation("org.commonmark:commonmark:0.22.0")
|
||||
implementation("org.commonmark:commonmark-ext-gfm-tables:0.22.0")
|
||||
implementation("com.pdftron:PDFNet:10.11.0")
|
||||
|
||||
implementation("org.apache.commons:commons-text:1.12.0")
|
||||
}
|
||||
|
||||
@ -13,9 +13,8 @@ import lombok.experimental.FieldDefaults;
|
||||
@Configuration
|
||||
@ConfigurationProperties("layoutparser")
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class LayoutparserSettings {
|
||||
public class LayoutParserSettings {
|
||||
|
||||
boolean debug;
|
||||
LayoutParsingType layoutParsingTypeOverride;
|
||||
String pdftronLicense;
|
||||
}
|
||||
@ -2,12 +2,13 @@ package com.knecon.fforesight.service.layoutparser.processor;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
@ -19,28 +20,35 @@ import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.iqser.red.service.redaction.v1.server.mapper.DocumentDataMapper;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.ImageType;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.markdown.MarkdownMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.DocumentWithVisualization;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.MarkdownMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineValidationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TOCEnrichmentService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeEnhancementService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.VisualLayoutParsingAdapter;
|
||||
@ -48,7 +56,6 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.ima
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
|
||||
@ -58,13 +65,9 @@ import com.knecon.fforesight.service.layoutparser.processor.services.blockificat
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.ClarifyndClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.Box;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.GraphicExtractorService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
|
||||
@ -85,32 +88,32 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class LayoutParsingPipeline {
|
||||
|
||||
ImageServiceResponseAdapter imageServiceResponseAdapter;
|
||||
CvTableParsingAdapter cvTableParsingAdapter;
|
||||
LayoutParsingStorageService layoutParsingStorageService;
|
||||
SectionsBuilderService sectionsBuilderService;
|
||||
RedactManagerClassificationService redactManagerClassificationService;
|
||||
DocuMineClassificationService docuMineClassificationService;
|
||||
SimplifiedSectionTextService simplifiedSectionTextService;
|
||||
BodyTextFrameService bodyTextFrameService;
|
||||
RulingCleaningService rulingCleaningService;
|
||||
TableExtractionService tableExtractionService;
|
||||
DocuMineBlockificationService docuMineBlockificationService;
|
||||
RedactManagerBlockificationService redactManagerBlockificationService;
|
||||
BlockificationPostprocessingService blockificationPostprocessingService;
|
||||
DocstrumBlockificationService docstrumBlockificationService;
|
||||
LayoutGridService layoutGridService;
|
||||
ObservationRegistry observationRegistry;
|
||||
VisualLayoutParsingAdapter visualLayoutParsingAdapter;
|
||||
ClarifyndClassificationService clarifyndClassificationService;
|
||||
GraphicExtractorService graphicExtractorService;
|
||||
OutlineExtractorService outlineExtractorService;
|
||||
OutlineValidationService outlineValidationService;
|
||||
TOCEnrichmentService tocEnrichmentService;
|
||||
LayoutparserSettings settings;
|
||||
final ImageServiceResponseAdapter imageServiceResponseAdapter;
|
||||
final CvTableParsingAdapter cvTableParsingAdapter;
|
||||
final LayoutParsingStorageService layoutParsingStorageService;
|
||||
final SectionsBuilderService sectionsBuilderService;
|
||||
final SimplifiedSectionTextService simplifiedSectionTextService;
|
||||
final RulingCleaningService rulingCleaningService;
|
||||
final TableExtractionService tableExtractionService;
|
||||
final DocuMineBlockificationService docuMineBlockificationService;
|
||||
final RedactManagerBlockificationService redactManagerBlockificationService;
|
||||
final BlockificationPostprocessingService blockificationPostprocessingService;
|
||||
final DocstrumBlockificationService docstrumBlockificationService;
|
||||
final LayoutGridService layoutGridService;
|
||||
final ObservationRegistry observationRegistry;
|
||||
final VisualLayoutParsingAdapter visualLayoutParsingAdapter;
|
||||
final GraphicExtractorService graphicExtractorService;
|
||||
final OutlineExtractorService outlineExtractorService;
|
||||
final SectionTreeBuilderService sectionTreeBuilderService;
|
||||
final SectionTreeEnhancementService sectionTreeEnhancementService;
|
||||
final LayoutParserSettings settings;
|
||||
final ClassificationService classificationService;
|
||||
|
||||
@Value("${LAYOUT_PARSER_VERSION:}")
|
||||
private String layoutParserVersion;
|
||||
|
||||
|
||||
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
|
||||
@ -119,17 +122,23 @@ public class LayoutParsingPipeline {
|
||||
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
|
||||
|
||||
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
|
||||
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile);
|
||||
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId())
|
||||
.orElse(originFile);
|
||||
|
||||
VisualLayoutParsingResponse visualLayoutParsingResponse = layoutParsingRequest.visualLayoutParsingFileId()
|
||||
.map(layoutParsingStorageService::getVisualLayoutParsingFile).orElse(new VisualLayoutParsingResponse());
|
||||
.map(layoutParsingStorageService::getVisualLayoutParsingFile)
|
||||
.orElse(new VisualLayoutParsingResponse());
|
||||
ImageServiceResponse imageServiceResponse = layoutParsingRequest.imagesFileStorageId()
|
||||
.map(layoutParsingStorageService::getImagesFile).orElse(new ImageServiceResponse());
|
||||
.map(layoutParsingStorageService::getImagesFile)
|
||||
.orElse(new ImageServiceResponse());
|
||||
TableServiceResponse tableServiceResponse = layoutParsingRequest.tablesFileStorageId()
|
||||
.map(layoutParsingStorageService::getTablesFile).orElse(new TableServiceResponse());
|
||||
.map(layoutParsingStorageService::getTablesFile)
|
||||
.orElse(new TableServiceResponse());
|
||||
|
||||
ClassificationDocument classificationDocument = parseLayout(settings.getLayoutParsingTypeOverride() == null //
|
||||
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(),
|
||||
LayoutParsingType layoutParsingType = settings.getLayoutParsingTypeOverride() == null //
|
||||
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride();
|
||||
|
||||
ClassificationDocument classificationDocument = parseLayout(layoutParsingType,
|
||||
originFile,
|
||||
imageServiceResponse,
|
||||
tableServiceResponse,
|
||||
@ -138,36 +147,37 @@ public class LayoutParsingPipeline {
|
||||
|
||||
log.info("Building document graph for {}", layoutParsingRequest.identifier());
|
||||
|
||||
Document documentGraph = observeBuildDocumentGraph(settings.getLayoutParsingTypeOverride() == null //
|
||||
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(), classificationDocument);
|
||||
DocumentWithVisualization documentWithVisualization = observeBuildDocumentGraph(layoutParsingType, classificationDocument);
|
||||
|
||||
log.info("Creating viewer document for {}", layoutParsingRequest.identifier());
|
||||
|
||||
layoutGridService.addLayoutGrid(viewerDocumentFile, documentGraph, viewerDocumentFile, false, layoutParsingRequest.visualLayoutParsingFileId().isPresent());
|
||||
layoutGridService.addLayoutGrid(viewerDocumentFile, documentWithVisualization, viewerDocumentFile, layoutParsingType, layoutParserVersion, false);
|
||||
|
||||
log.info("Storing resulting files for {}", layoutParsingRequest.identifier());
|
||||
|
||||
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph));
|
||||
if (layoutParsingRequest.documentMarkdownFileStorageId().isPresent()) {
|
||||
layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId().get(), new MarkdownMapper().toMarkdownContent(documentGraph));
|
||||
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentWithVisualization.document()));
|
||||
if (layoutParsingRequest.documentMarkdownFileStorageId()
|
||||
.isPresent()) {
|
||||
layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId()
|
||||
.get(), new MarkdownMapper().toMarkdownContent(documentWithVisualization.document()));
|
||||
}
|
||||
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph));
|
||||
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentWithVisualization.document()));
|
||||
layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile);
|
||||
|
||||
if (layoutParsingRequest.researchDocumentStorageId() != null) {
|
||||
log.info("Building research document data for {}", layoutParsingRequest.identifier());
|
||||
var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentGraph);
|
||||
var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentWithVisualization.document());
|
||||
layoutParsingStorageService.storeResearchDocumentData(layoutParsingRequest, researchDocumentData);
|
||||
}
|
||||
|
||||
if (!viewerDocumentFile.equals(originFile)) {
|
||||
viewerDocumentFile.delete();
|
||||
assert !viewerDocumentFile.exists() || viewerDocumentFile.delete();
|
||||
}
|
||||
originFile.delete();
|
||||
assert !originFile.exists() || originFile.delete();
|
||||
|
||||
return LayoutParsingFinishedEvent.builder()
|
||||
.identifier(layoutParsingRequest.identifier())
|
||||
.numberOfPages(documentGraph.getNumberOfPages())
|
||||
.numberOfPages(documentWithVisualization.document().getNumberOfPages())
|
||||
.duration(System.currentTimeMillis() - start)
|
||||
.message(format("""
|
||||
Layout parsing has finished in %.02f s.
|
||||
@ -182,21 +192,22 @@ public class LayoutParsingPipeline {
|
||||
Viewer Doc: %s""",
|
||||
((float) (System.currentTimeMillis() - start)) / 1000,
|
||||
layoutParsingRequest.identifier(),
|
||||
buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()),
|
||||
buildSemanticNodeCountMessage(documentWithVisualization.document().getNumberOfPages(), documentWithVisualization.buildSemanticNodeCounts()),
|
||||
layoutParsingRequest.structureFileStorageId(),
|
||||
layoutParsingRequest.textBlockFileStorageId(),
|
||||
layoutParsingRequest.positionBlockFileStorageId(),
|
||||
layoutParsingRequest.pageFileStorageId(),
|
||||
layoutParsingRequest.simplifiedTextStorageId(),
|
||||
layoutParsingRequest.viewerDocumentStorageId()))
|
||||
.layoutParserVersion(layoutParserVersion)
|
||||
.build();
|
||||
|
||||
}
|
||||
|
||||
|
||||
private Document observeBuildDocumentGraph(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument) {
|
||||
private DocumentWithVisualization observeBuildDocumentGraph(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument) {
|
||||
|
||||
AtomicReference<Document> documentReference = new AtomicReference<>();
|
||||
AtomicReference<DocumentWithVisualization> documentReference = new AtomicReference<>();
|
||||
|
||||
Observation.createNotStarted("LayoutParsingPipeline", observationRegistry)
|
||||
.contextualName("build-document-graph")
|
||||
@ -243,12 +254,8 @@ public class LayoutParsingPipeline {
|
||||
}
|
||||
|
||||
List<ClassificationPage> classificationPages = new ArrayList<>();
|
||||
OutlineObject lastProcessedOutlineObject = null;
|
||||
|
||||
// parsing the structure elements could be useful as well
|
||||
if (layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) {
|
||||
classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));
|
||||
}
|
||||
classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));
|
||||
|
||||
long pageCount = originDocument.getNumberOfPages();
|
||||
|
||||
@ -273,22 +280,22 @@ public class LayoutParsingPipeline {
|
||||
stripper.setEndPage(pageNumber);
|
||||
stripper.setPdpage(pdPage);
|
||||
stripper.getText(originDocument);
|
||||
List<TextPositionSequence> words = stripper.getTextPositionSequences();
|
||||
List<Word> words = stripper.getWords();
|
||||
|
||||
// rotateDirAdjExactly(words, pdPage); // works really well for many highly rotated documents (e.g. VV-331340.pdf), but it decreases the headline performance by 1.3%, so I am leaving it out for now
|
||||
|
||||
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) {
|
||||
var lines = TextPositionOperations.groupByLine(new HashSet<>(words));
|
||||
classificationDocument.getLayoutDebugLayer().addLineVisualizationsFromNestedTextPosition(lines, pageNumber);
|
||||
words = TextPositionOperations.sortLines(lines);
|
||||
words = TextPositionOperations.sortWords(lines);
|
||||
}
|
||||
classificationDocument.getLayoutDebugLayer().addTextVisualizations(words, pageNumber);
|
||||
|
||||
PDRectangle pdr = pdPage.getMediaBox();
|
||||
|
||||
int rotation = pdPage.getRotation();
|
||||
boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270);
|
||||
|
||||
PDRectangle cropbox = pdPage.getCropBox();
|
||||
classificationDocument.getLayoutDebugLayer().addRulingVisualization(stripper.getRulings(), pageNumber);
|
||||
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
|
||||
List<Ruling> rulings = stripper.getRulings();
|
||||
classificationDocument.getLayoutDebugLayer().addRulingVisualization(rulings, pageNumber);
|
||||
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), rulings);
|
||||
|
||||
PageInformation pageInformation = PageInformation.fromPDPage(pageNumber, pdPage);
|
||||
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals(), pageInformation);
|
||||
@ -296,7 +303,7 @@ public class LayoutParsingPipeline {
|
||||
|
||||
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings);
|
||||
|
||||
List<Box> graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getTextPositionSequences(), false);
|
||||
List<Box> graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getWords(), false);
|
||||
|
||||
pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>())
|
||||
.addAll(graphics.stream()
|
||||
@ -308,8 +315,7 @@ public class LayoutParsingPipeline {
|
||||
.toList());
|
||||
|
||||
ClassificationPage classificationPage = switch (layoutParsingType) {
|
||||
case REDACT_MANAGER_OLD ->
|
||||
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings, classificationDocument.getLayoutDebugLayer());
|
||||
case REDACT_MANAGER_OLD -> redactManagerBlockificationService.blockify(stripper.getWords(), cleanRulings, classificationDocument.getLayoutDebugLayer());
|
||||
case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
|
||||
case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH ->
|
||||
docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getLayoutDebugLayer(), layoutParsingType);
|
||||
@ -317,26 +323,9 @@ public class LayoutParsingPipeline {
|
||||
docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getLayoutDebugLayer(), layoutParsingType);
|
||||
};
|
||||
|
||||
classificationPage.setCleanRulings(cleanRulings);
|
||||
classificationPage.setRotation(rotation);
|
||||
classificationPage.setLandscape(isLandscape);
|
||||
classificationPage.setPageNumber(pageNumber);
|
||||
classificationPage.setPageWidth(cropbox.getWidth());
|
||||
classificationPage.setPageHeight(cropbox.getHeight());
|
||||
updateClassificationPage(pdPage, pdr, classificationPage, cleanRulings, pageNumber, pageInformation);
|
||||
|
||||
if (layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) {
|
||||
List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage().getOrDefault(pageNumber - 1, new ArrayList<>());
|
||||
|
||||
OutlineObject notFoundOutlineObject = null;
|
||||
if (lastProcessedOutlineObject != null && !lastProcessedOutlineObject.isFound()) {
|
||||
lastProcessedOutlineObject.setPoint(new Point2D.Float(0, cropbox.getHeight()));
|
||||
notFoundOutlineObject = lastProcessedOutlineObject;
|
||||
}
|
||||
if (!outlineObjects.isEmpty()) {
|
||||
classificationPage.setOutlineObjects(outlineObjects);
|
||||
lastProcessedOutlineObject = blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage, notFoundOutlineObject);
|
||||
}
|
||||
}
|
||||
blockificationPostprocessingService.findHeadlinesFromOutline(classificationDocument, pageNumber, classificationPage, pageInformation);
|
||||
|
||||
classificationDocument.getLayoutDebugLayer().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber);
|
||||
// MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox.
|
||||
@ -366,40 +355,67 @@ public class LayoutParsingPipeline {
|
||||
|
||||
originDocument.close();
|
||||
|
||||
log.info("Calculating BodyTextFrame for {}", identifier);
|
||||
bodyTextFrameService.setBodyTextFrames(classificationDocument, layoutParsingType);
|
||||
for (ClassificationPage page : classificationDocument.getPages()) {
|
||||
classificationDocument.getLayoutDebugLayer().addCleanRulingVisualization(page.getCleanRulings(), page.getPageNumber());
|
||||
}
|
||||
log.info("Classify TextBlocks for {}", identifier);
|
||||
switch (layoutParsingType) {
|
||||
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH ->
|
||||
redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||
case DOCUMINE_OLD, DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
|
||||
case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument);
|
||||
}
|
||||
classificationService.classify(classificationDocument, layoutParsingType, identifier);
|
||||
|
||||
List<TextPageBlock> headlines = classificationDocument.getPages()
|
||||
.stream()
|
||||
.flatMap(classificationPage -> classificationPage.getTextBlocks()
|
||||
.stream()
|
||||
.filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline())
|
||||
.map(tb -> (TextPageBlock) tb))
|
||||
.toList();
|
||||
TableOfContents tableOfContents = outlineValidationService.createToC(headlines);
|
||||
classificationDocument.setTableOfContents(tableOfContents);
|
||||
SectionTree sectionTree = sectionTreeBuilderService.createSectionTree(classificationDocument);
|
||||
classificationDocument.setSectionTree(sectionTree);
|
||||
|
||||
log.info("Building Sections for {}", identifier);
|
||||
|
||||
switch (layoutParsingType) {
|
||||
case CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_PARAGRAPH_DEBUG -> sectionsBuilderService.buildParagraphDebugSections(classificationDocument);
|
||||
default -> tocEnrichmentService.assignSectionBlocksAndImages(classificationDocument);
|
||||
default -> sectionTreeEnhancementService.assignSectionBlocksAndImages(classificationDocument);
|
||||
}
|
||||
|
||||
return classificationDocument;
|
||||
}
|
||||
|
||||
|
||||
private static void updateClassificationPage(PDPage pdPage,
|
||||
PDRectangle pdr,
|
||||
ClassificationPage classificationPage,
|
||||
CleanRulings cleanRulings,
|
||||
int pageNumber,
|
||||
PageInformation pageInformation) {
|
||||
|
||||
int rotation = pdPage.getRotation();
|
||||
boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270);
|
||||
classificationPage.setCleanRulings(cleanRulings);
|
||||
classificationPage.setRotation(rotation);
|
||||
classificationPage.setLandscape(isLandscape);
|
||||
classificationPage.setPageNumber(pageNumber);
|
||||
classificationPage.setPageWidth((float) pageInformation.width());
|
||||
classificationPage.setPageHeight((float) pageInformation.height());
|
||||
}
|
||||
|
||||
|
||||
private static void rotateDirAdjExactly(List<Word> words, PDPage pdPage) {
|
||||
|
||||
for (TextDirection dir : TextDirection.values()) {
|
||||
double averageRotation = words.stream()
|
||||
.map(Word::getCharacters)
|
||||
.flatMap(Collection::stream)
|
||||
.map(Character::getTextPosition)
|
||||
.filter(pos -> pos.getDir().equals(dir))
|
||||
.mapToDouble(RedTextPosition::getExactDir).average()
|
||||
.orElse(0);
|
||||
|
||||
if (averageRotation == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
AffineTransform rotateInstance = AffineTransform.getRotateInstance(averageRotation, pdPage.getMediaBox().getWidth() / 2, pdPage.getMediaBox().getHeight() / 2);
|
||||
|
||||
for (Word word : words) {
|
||||
if (!dir.equals(word.getDir())) {
|
||||
continue;
|
||||
}
|
||||
word.transform(rotateInstance);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void addNumberOfPagesToTrace(int numberOfPages, long size) {
|
||||
|
||||
if (observationRegistry.getCurrentObservation() != null) {
|
||||
@ -441,10 +457,10 @@ public class LayoutParsingPipeline {
|
||||
// Collect all statistics for the classificationPage, except from blocks inside tables, as tables will always be added to BodyTextFrame.
|
||||
for (AbstractPageBlock textBlock : classificationPage.getTextBlocks()) {
|
||||
if (textBlock instanceof TextPageBlock) {
|
||||
if (((TextPageBlock) textBlock).getSequences() == null) {
|
||||
if (((TextPageBlock) textBlock).getWords() == null) {
|
||||
continue;
|
||||
}
|
||||
for (TextPositionSequence word : ((TextPageBlock) textBlock).getSequences()) {
|
||||
for (Word word : ((TextPageBlock) textBlock).getWords()) {
|
||||
classificationPage.getTextHeightCounter().add(word.getTextHeight());
|
||||
classificationPage.getFontCounter().add(word.getFont());
|
||||
classificationPage.getFontSizeCounter().add(word.getFontSize());
|
||||
|
||||
@ -11,12 +11,14 @@ import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.util.Optional;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
|
||||
import org.springframework.core.task.TaskExecutor;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.iqser.red.service.redaction.v1.server.data.DocumentData;
|
||||
import com.iqser.red.storage.commons.service.StorageService;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedText;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||
@ -39,6 +41,8 @@ public class LayoutParsingStorageService {
|
||||
private final StorageService storageService;
|
||||
private final ObjectMapper objectMapper;
|
||||
|
||||
private final TaskExecutor taskExecutor;
|
||||
|
||||
|
||||
@Observed(name = "LayoutParsingStorageService", contextualName = "get-origin-file")
|
||||
public File getOriginFile(String storageId) throws IOException {
|
||||
@ -100,13 +104,35 @@ public class LayoutParsingStorageService {
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@Observed(name = "LayoutParsingStorageService", contextualName = "store-document-data")
|
||||
public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, DocumentData documentData) {
|
||||
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.structureFileStorageId(), documentData.getDocumentStructure());
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.textBlockFileStorageId(), documentData.getDocumentTextData());
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.positionBlockFileStorageId(), documentData.getDocumentPositions());
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.pageFileStorageId(), documentData.getDocumentPages());
|
||||
Runnable storeDocumentStructureRunnable = () -> storageService.storeProtoObject(TenantContext.getTenantId(),
|
||||
layoutParsingRequest.structureFileStorageId(),
|
||||
documentData.getDocumentStructure());
|
||||
|
||||
CompletableFuture<Void> storeDocumentStructureFuture = CompletableFuture.runAsync(storeDocumentStructureRunnable, taskExecutor);
|
||||
|
||||
Runnable storeDocumentTextDataRunnable = () -> storageService.storeProtoObject(TenantContext.getTenantId(),
|
||||
layoutParsingRequest.textBlockFileStorageId(),
|
||||
documentData.getDocumentTextData());
|
||||
|
||||
CompletableFuture<Void> storeDocumentTextDataFuture = CompletableFuture.runAsync(storeDocumentTextDataRunnable, taskExecutor);
|
||||
|
||||
Runnable storeDocumentPositionsRunnable = () -> storageService.storeProtoObject(TenantContext.getTenantId(),
|
||||
layoutParsingRequest.positionBlockFileStorageId(),
|
||||
documentData.getDocumentPositionData());
|
||||
|
||||
CompletableFuture<Void> storeDocumentPositionsFuture = CompletableFuture.runAsync(storeDocumentPositionsRunnable, taskExecutor);
|
||||
|
||||
Runnable storeDocumentPagesRunnable = () -> storageService.storeProtoObject(TenantContext.getTenantId(),
|
||||
layoutParsingRequest.pageFileStorageId(),
|
||||
documentData.getDocumentPages());
|
||||
|
||||
CompletableFuture<Void> storeDocumentPagesFuture = CompletableFuture.runAsync(storeDocumentPagesRunnable, taskExecutor);
|
||||
|
||||
CompletableFuture.allOf(storeDocumentStructureFuture, storeDocumentTextDataFuture, storeDocumentPositionsFuture, storeDocumentPagesFuture).join();
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -18,7 +18,7 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.Zon
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
@ -35,7 +35,7 @@ public class DocstrumSegmentationService {
|
||||
private final ReadingOrderService readingOrderService;
|
||||
|
||||
|
||||
public List<Zone> segmentPage(List<TextPositionSequence> textPositions, boolean xyOrder, CleanRulings usedRulings, LayoutDebugLayer visualizations) {
|
||||
public List<Zone> segmentPage(List<Word> textPositions, boolean xyOrder, CleanRulings usedRulings, LayoutDebugLayer visualizations) {
|
||||
|
||||
EnumMap<TextDirection, Integer> directionCounts = new EnumMap<>(TextDirection.class);
|
||||
|
||||
@ -78,18 +78,14 @@ public class DocstrumSegmentationService {
|
||||
}
|
||||
|
||||
|
||||
private List<Zone> computeZones(List<TextPositionSequence> textPositions, CleanRulings rulings, LayoutDebugLayer visualizations, TextDirection direction) {
|
||||
private List<Zone> computeZones(List<Word> textPositions, CleanRulings rulings, LayoutDebugLayer visualizations, TextDirection direction) {
|
||||
|
||||
List<RedTextPosition> positions = textPositions.stream()
|
||||
List<Character> characters = textPositions.stream()
|
||||
.filter(t -> t.getDir() == direction)
|
||||
.map(TextPositionSequence::getTextPositions)
|
||||
.map(Word::getCharacters)
|
||||
.flatMap(List::stream)
|
||||
.toList();
|
||||
|
||||
List<Character> characters = positions.stream()
|
||||
.map(Character::new)
|
||||
.collect(Collectors.toList());
|
||||
|
||||
nearestNeighbourService.findNearestNeighbors(characters);
|
||||
|
||||
double characterSpacing = spacingService.computeCharacterSpacing(characters);
|
||||
|
||||
@ -133,7 +133,7 @@ public abstract class BoundingBox {
|
||||
}
|
||||
|
||||
|
||||
private boolean intersectsX(BoundingBox other, float threshold) {
|
||||
public boolean intersectsX(BoundingBox other, float threshold) {
|
||||
|
||||
return this.getX() - threshold <= other.getMaxX() && this.getMaxX() + threshold >= other.getX();
|
||||
}
|
||||
@ -225,33 +225,31 @@ public abstract class BoundingBox {
|
||||
|
||||
public double horizontalDistance(BoundingBox other) {
|
||||
|
||||
Rectangle2D left;
|
||||
Rectangle2D right;
|
||||
if (this.leftOf(other)) {
|
||||
left = this.getBBox();
|
||||
right = other.getBBox();
|
||||
} else {
|
||||
left = other.getBBox();
|
||||
right = this.getBBox();
|
||||
}
|
||||
double rect1Right = getMaxX();
|
||||
double rect1Left = getMinX();
|
||||
double rect2Right = other.getMaxX();
|
||||
double rect2Left = other.getMinX();
|
||||
|
||||
return Math.max(0, right.getMinX() - left.getMaxX());
|
||||
if (rect1Left > rect2Right || rect2Left > rect1Right) {
|
||||
return Math.max(rect2Left - rect1Right, rect1Left - rect2Right);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public double verticalDistance(BoundingBox other) {
|
||||
|
||||
Rectangle2D bottom;
|
||||
Rectangle2D top;
|
||||
if (this.isAbove(other)) {
|
||||
top = this.getBBox();
|
||||
bottom = other.getBBox();
|
||||
} else {
|
||||
bottom = this.getBBox();
|
||||
top = other.getBBox();
|
||||
}
|
||||
double rect1Top = getMaxY();
|
||||
double rect1Bottom = getMinY();
|
||||
double rect2Top = other.getMaxY();
|
||||
double rect2Bottom = other.getMinY();
|
||||
|
||||
return Math.max(0, bottom.getMinY() - top.getMaxY());
|
||||
if (rect1Bottom > rect2Top || rect2Bottom > rect1Top) {
|
||||
return Math.max(rect2Bottom - rect1Top, rect1Bottom - rect2Top);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -1,9 +1,9 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
||||
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence.BOLD;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence.BOLD_ITALIC;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence.ITALIC;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence.STANDARD;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.model.text.Word.BOLD;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.model.text.Word.BOLD_ITALIC;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.model.text.Word.ITALIC;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.model.text.Word.STANDARD;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
@ -14,7 +14,7 @@ import java.util.Map;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.FontStyle;
|
||||
|
||||
import lombok.Data;
|
||||
@ -24,7 +24,7 @@ import lombok.EqualsAndHashCode;
|
||||
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = false)
|
||||
public class Line extends TextBoundingBox {
|
||||
|
||||
private static final double WORD_DISTANCE_MULTIPLIER = 0.18;
|
||||
private static final double WORD_DISTANCE_MULTIPLIER = 0.17;
|
||||
|
||||
@EqualsAndHashCode.Include
|
||||
private final double x0;
|
||||
@ -36,18 +36,13 @@ public class Line extends TextBoundingBox {
|
||||
@EqualsAndHashCode.Include
|
||||
private final double y1;
|
||||
|
||||
private final double height;
|
||||
|
||||
private FontStyle fontStyle;
|
||||
|
||||
private final List<Character> characters;
|
||||
private final List<TextPositionSequence> words = new ArrayList<>();
|
||||
private final List<Word> words;
|
||||
|
||||
|
||||
public Line(List<Character> characters, double wordSpacing) {
|
||||
|
||||
this.characters = characters;
|
||||
|
||||
if (characters.size() >= 2) {
|
||||
// linear regression
|
||||
double sx = 0.0;
|
||||
@ -76,20 +71,32 @@ public class Line extends TextBoundingBox {
|
||||
this.y0 = character.getY() - dy;
|
||||
this.y1 = character.getY() + dy;
|
||||
}
|
||||
height = computeHeight();
|
||||
computeWords(wordSpacing * WORD_DISTANCE_MULTIPLIER);
|
||||
this.words = new ArrayList<>();
|
||||
computeWords(characters, wordSpacing * WORD_DISTANCE_MULTIPLIER);
|
||||
buildBBox();
|
||||
computeFontStyle();
|
||||
}
|
||||
|
||||
|
||||
public Line(List<Word> words) {
|
||||
|
||||
this.words = words;
|
||||
buildBBox();
|
||||
x0 = getMinX();
|
||||
y0 = getMinY();
|
||||
x1 = getMaxX();
|
||||
y1 = getMaxY();
|
||||
computeFontStyle();
|
||||
}
|
||||
|
||||
|
||||
private void computeFontStyle() {
|
||||
|
||||
EnumMap<FontStyle, AtomicInteger> fontStyleCounter = new EnumMap<>(FontStyle.class);
|
||||
for (FontStyle fontStyle : FontStyle.values()) {
|
||||
fontStyleCounter.put(fontStyle, new AtomicInteger(0));
|
||||
}
|
||||
for (TextPositionSequence word : words) {
|
||||
for (Word word : words) {
|
||||
switch (word.getFontStyle()) {
|
||||
case STANDARD -> fontStyleCounter.get(FontStyle.REGULAR).getAndIncrement();
|
||||
case BOLD -> fontStyleCounter.get(FontStyle.BOLD).getAndIncrement();
|
||||
@ -100,8 +107,7 @@ public class Line extends TextBoundingBox {
|
||||
fontStyle = fontStyleCounter.entrySet()
|
||||
.stream()
|
||||
.max(Comparator.comparing(entry -> entry.getValue().get()))
|
||||
.map(Map.Entry::getKey)
|
||||
.orElse(FontStyle.REGULAR);
|
||||
.map(Map.Entry::getKey).orElse(FontStyle.REGULAR);
|
||||
}
|
||||
|
||||
|
||||
@ -117,14 +123,6 @@ public class Line extends TextBoundingBox {
|
||||
}
|
||||
|
||||
|
||||
private double computeHeight() {
|
||||
|
||||
return characters.stream()
|
||||
.map(Character::getHeight)
|
||||
.reduce(0d, Double::sum) / characters.size();
|
||||
}
|
||||
|
||||
|
||||
public double angularDifference(Line j) {
|
||||
|
||||
double diff = Math.abs(getAngle() - j.getAngle());
|
||||
@ -157,19 +155,22 @@ public class Line extends TextBoundingBox {
|
||||
}
|
||||
|
||||
|
||||
private void computeWords(double wordSpacing) {
|
||||
private void computeWords(List<Character> characters, double wordSpacing) {
|
||||
|
||||
TextPositionSequence word = new TextPositionSequence();
|
||||
// Imo, the width of space should be scaled with the font size, but it only depends on the median distance between horizontal neighbours.
|
||||
// If there are large differences in fontsize on a page, this might lead to missing spaces for the smaller fonts and too many for larger fonts.
|
||||
// I've just now changed the scaling factor. If you come across this comment with missing whitespaces again, try scaling the fontsize instead of simply changing the factor again.
|
||||
Word word = new Word();
|
||||
Character previous = null;
|
||||
for (Character current : characters) {
|
||||
if (previous != null) {
|
||||
double dist = current.getTextPosition().getXDirAdj() - previous.getTextPosition().getXDirAdj() - previous.getTextPosition().getWidthDirAdj();
|
||||
if (dist > wordSpacing) {
|
||||
words.add(word);
|
||||
word = new TextPositionSequence();
|
||||
word = new Word();
|
||||
}
|
||||
}
|
||||
word.getTextPositions().add(current.getTextPosition());
|
||||
word.add(current);
|
||||
previous = current;
|
||||
}
|
||||
words.add(word);
|
||||
@ -178,9 +179,7 @@ public class Line extends TextBoundingBox {
|
||||
|
||||
private void buildBBox() {
|
||||
|
||||
this.setToBBoxOfComponents(characters.stream()
|
||||
.map(Character::getTextPosition)
|
||||
.toList());
|
||||
this.setToBBoxOfComponents(words);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -99,4 +99,82 @@ public abstract class TextBoundingBox extends BoundingBox {
|
||||
return this.bBoxDirAdj.getCenterX();
|
||||
}
|
||||
|
||||
|
||||
public double horizontalDistanceDirAdj(TextBoundingBox other) {
|
||||
|
||||
double rect1Right = getMaxXDirAdj();
|
||||
double rect1Left = getXDirAdj();
|
||||
double rect2Right = other.getMaxXDirAdj();
|
||||
double rect2Left = other.getXDirAdj();
|
||||
|
||||
if (rect1Left > rect2Right || rect2Left > rect1Right) {
|
||||
return Math.max(rect2Left - rect1Right, rect1Left - rect2Right);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public double verticalDistanceDirAdj(TextBoundingBox other) {
|
||||
|
||||
double rect1Top = getMaxYDirAdj();
|
||||
double rect1Bottom = getYDirAdj();
|
||||
double rect2Top = other.getMaxYDirAdj();
|
||||
double rect2Bottom = other.getYDirAdj();
|
||||
|
||||
if (rect1Bottom > rect2Top || rect2Bottom > rect1Top) {
|
||||
return Math.max(rect2Bottom - rect1Top, rect1Bottom - rect2Top);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsDirAdj(TextBoundingBox other) {
|
||||
|
||||
return this.intersectsXDirAdj(other) && this.intersectsYDirAdj(other);
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsDirAdj(TextBoundingBox other, float yThreshold, float xThreshold) {
|
||||
|
||||
return this.intersectsXDirAdj(other, xThreshold) && this.intersectsYDirAdj(other, yThreshold);
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsXDirAdj(TextBoundingBox other, float threshold) {
|
||||
|
||||
return this.getXDirAdj() - threshold <= other.getMaxXDirAdj() && this.getMaxXDirAdj() + threshold >= other.getXDirAdj();
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsXDirAdj(TextBoundingBox other) {
|
||||
|
||||
return this.getXDirAdj() <= other.getMaxXDirAdj() && this.getMaxXDirAdj() >= other.getXDirAdj();
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsYDirAdj(TextBoundingBox other) {
|
||||
|
||||
return this.getYDirAdj() <= other.getMaxYDirAdj() && this.getMaxYDirAdj() >= other.getYDirAdj();
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsYDirAdj(TextBoundingBox other, float threshold) {
|
||||
|
||||
return this.getYDirAdj() - threshold <= other.getMaxYDirAdj() && this.getMaxYDirAdj() + threshold >= other.getYDirAdj();
|
||||
}
|
||||
|
||||
|
||||
public boolean isAboveDirAdj(TextBoundingBox other) {
|
||||
|
||||
return other.isBelow(this);
|
||||
}
|
||||
|
||||
|
||||
public boolean isBelowDirAdj(TextBoundingBox other) {
|
||||
|
||||
return this.intersectsXDirAdj(other) && this.getYDirAdj() >= other.getMaxYDirAdj();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -28,4 +28,10 @@ public class UnionFind<T> extends org.jgrapht.alg.util.UnionFind<T> {
|
||||
return setRep.values();
|
||||
}
|
||||
|
||||
|
||||
public Collection<T> getElements() {
|
||||
|
||||
return getParentMap().keySet();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -18,7 +18,6 @@ public class Zone extends TextBoundingBox {
|
||||
@SuppressWarnings("PMD.ConstructorCallsOverridableMethod")
|
||||
public Zone(List<Line> lines) {
|
||||
|
||||
lines.sort(Comparator.comparingDouble(Line::getY0));
|
||||
this.lines = lines;
|
||||
setToBBoxOfComponents(lines);
|
||||
}
|
||||
|
||||
@ -17,7 +17,7 @@ public class LineBuilderService {
|
||||
|
||||
private static final double CHARACTER_SPACING_DISTANCE_MULTIPLIER = 3.5;
|
||||
private static final double LINE_SPACING_THRESHOLD_MULTIPLIER = 0.67;
|
||||
private static final double ANGLE_TOLERANCE = Math.PI / 6;
|
||||
private static final double ANGLE_TOLERANCE = Math.toRadians(5);
|
||||
|
||||
|
||||
public List<Line> buildLines(List<Character> characters, double characterSpacing, double lineSpacing, CleanRulings rulings) {
|
||||
|
||||
@ -1,9 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
|
||||
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentifier.numericalIdentifierPattern;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.Collection;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
@ -11,11 +9,12 @@ import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
|
||||
|
||||
@Service
|
||||
public class ZoneBuilderService {
|
||||
@ -31,7 +30,7 @@ public class ZoneBuilderService {
|
||||
|
||||
private static final double MAX_LINE_SIZE_SCALE = 2.5;
|
||||
|
||||
private static final double ANGLE_TOLERANCE = Math.PI / 6;
|
||||
private static final double ANGLE_TOLERANCE = Math.toRadians(5);
|
||||
|
||||
private static final double MAX_VERTICAL_MERGE_DISTANCE = 0.5;
|
||||
|
||||
@ -114,64 +113,14 @@ public class ZoneBuilderService {
|
||||
|
||||
private Zone mergeLinesInZone(List<Line> lines, double characterSpacing, double lineSpacing) {
|
||||
|
||||
double maxHorizontalDistance = 0;
|
||||
double minVerticalDistance = 0;
|
||||
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE;
|
||||
Set<Word> words = lines.stream()
|
||||
.map(Line::getWords)
|
||||
.flatMap(Collection::stream)
|
||||
.collect(Collectors.toSet());
|
||||
Collection<Set<Word>> groupedLines = TextPositionOperations.groupByLine(words);
|
||||
|
||||
UnionFind<Line> unionFind = new UnionFind<>(new HashSet<>(lines));
|
||||
|
||||
lines.forEach(outer -> {
|
||||
lines.forEach(inner -> {
|
||||
if (inner == outer) {
|
||||
return;
|
||||
}
|
||||
|
||||
double horizontalDistance = outer.horizontalDistance(inner);
|
||||
double verticalDistance = outer.verticalDistance(inner);
|
||||
|
||||
if (horizontalDistance <= maxHorizontalDistance && minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance) {
|
||||
|
||||
unionFind.union(outer, inner);
|
||||
|
||||
} else if (minVerticalDistance <= verticalDistance
|
||||
&& verticalDistance <= maxVerticalDistance
|
||||
&& Math.abs(horizontalDistance - Math.min(outer.getLength(), inner.getLength())) < 0.1) {
|
||||
|
||||
boolean characterOverlap = false;
|
||||
int overlappingCount = 0;
|
||||
for (Character outerCharacter : outer.getCharacters()) {
|
||||
for (Character innerCharacter : inner.getCharacters()) {
|
||||
double characterOverlapDistance = outerCharacter.overlappingDistance(innerCharacter);
|
||||
if (characterOverlapDistance > 2) {
|
||||
characterOverlap = true;
|
||||
}
|
||||
if (characterOverlapDistance > 0) {
|
||||
overlappingCount++;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!characterOverlap && overlappingCount <= 2) {
|
||||
unionFind.union(outer, inner);
|
||||
}
|
||||
}
|
||||
|
||||
});
|
||||
});
|
||||
|
||||
List<Line> outputZone = new ArrayList<>();
|
||||
for (Set<Line> group : unionFind.getGroups()) {
|
||||
List<Character> characters = new ArrayList<>();
|
||||
for (Line line : group) {
|
||||
characters.addAll(line.getCharacters());
|
||||
}
|
||||
characters.sort(Comparator.comparingDouble(Character::getX));
|
||||
|
||||
outputZone.add(new Line(characters, characterSpacing));
|
||||
}
|
||||
|
||||
return new Zone(outputZone.stream()
|
||||
.sorted(Comparator.comparing(Line::getY0))
|
||||
.collect(Collectors.toList()));
|
||||
List<Line> sortedLines = TextPositionOperations.sortLines(groupedLines);
|
||||
return new Zone(sortedLines);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -4,7 +4,7 @@ import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
|
||||
@ -4,7 +4,7 @@ import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
|
||||
@ -31,6 +31,6 @@ public class ClassificationDocument {
|
||||
private long rulesVersion;
|
||||
|
||||
private OutlineObjectTree outlineObjectTree;
|
||||
private TableOfContents tableOfContents;
|
||||
private SectionTree sectionTree;
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,19 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.NodeType;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
|
||||
|
||||
public record DocumentWithVisualization(Document document, LayoutDebugLayer layoutDebugLayer) {
|
||||
|
||||
public Map<NodeType, Long> buildSemanticNodeCounts() {
|
||||
|
||||
return document.streamAllSubNodes()
|
||||
.collect(Collectors.groupingBy(SemanticNode::getType, Collectors.counting()));
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,6 +1,5 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
@ -13,10 +12,14 @@ import lombok.Getter;
|
||||
public class FloatFrequencyCounter {
|
||||
|
||||
Map<Double, Integer> countPerValue = new HashMap<>();
|
||||
boolean changed;
|
||||
Double mostPopularCache;
|
||||
|
||||
|
||||
public void add(double value) {
|
||||
|
||||
changed = true;
|
||||
|
||||
if (!countPerValue.containsKey(value)) {
|
||||
countPerValue.put(value, 1);
|
||||
} else {
|
||||
@ -27,6 +30,8 @@ public class FloatFrequencyCounter {
|
||||
|
||||
public void addAll(Map<Double, Integer> otherCounter) {
|
||||
|
||||
changed = true;
|
||||
|
||||
for (Map.Entry<Double, Integer> entry : otherCounter.entrySet()) {
|
||||
if (countPerValue.containsKey(entry.getKey())) {
|
||||
countPerValue.put(entry.getKey(), countPerValue.get(entry.getKey()) + entry.getValue());
|
||||
@ -39,27 +44,27 @@ public class FloatFrequencyCounter {
|
||||
|
||||
public Double getMostPopular() {
|
||||
|
||||
Map.Entry<Double, Integer> mostPopular = null;
|
||||
for (Map.Entry<Double, Integer> entry : countPerValue.entrySet()) {
|
||||
if (mostPopular == null || entry.getValue() >= mostPopular.getValue()) {
|
||||
mostPopular = entry;
|
||||
if (changed || mostPopularCache == null) {
|
||||
Map.Entry<Double, Integer> mostPopular = null;
|
||||
for (Map.Entry<Double, Integer> entry : countPerValue.entrySet()) {
|
||||
if (mostPopular == null || entry.getValue() >= mostPopular.getValue()) {
|
||||
mostPopular = entry;
|
||||
}
|
||||
}
|
||||
mostPopularCache = mostPopular != null ? mostPopular.getKey() : 0;
|
||||
changed = false;
|
||||
}
|
||||
return mostPopular != null ? mostPopular.getKey() : null;
|
||||
|
||||
return mostPopularCache;
|
||||
}
|
||||
|
||||
|
||||
public List<Double> getHigherThanMostPopular() {
|
||||
public List<Double> getValuesInReverseOrder() {
|
||||
|
||||
Double mostPopular = getMostPopular();
|
||||
List<Double> higher = new ArrayList<>();
|
||||
for (Double value : countPerValue.keySet()) {
|
||||
if (value > mostPopular) {
|
||||
higher.add(value);
|
||||
}
|
||||
}
|
||||
|
||||
return higher.stream().sorted(Collections.reverseOrder()).collect(Collectors.toList());
|
||||
return countPerValue.keySet()
|
||||
.stream()
|
||||
.sorted(Collections.reverseOrder())
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -3,7 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
@ -16,8 +16,8 @@ import lombok.experimental.FieldDefaults;
|
||||
public class LineInformation {
|
||||
|
||||
List<Rectangle2D> lineBBox;
|
||||
List<List<TextPositionSequence>> sequencesByLines;
|
||||
List<List<Word>> sequencesByLines;
|
||||
List<List<Rectangle2D>> bBoxWithGapsByLines;
|
||||
List<List<List<TextPositionSequence>>> sequencesWithGapsByLines;
|
||||
List<List<List<Word>>> sequencesWithGapsByLines;
|
||||
|
||||
}
|
||||
|
||||
@ -9,12 +9,14 @@ public enum PageBlockType {
|
||||
H6,
|
||||
HEADER,
|
||||
FOOTER,
|
||||
TITLE,
|
||||
PARAGRAPH,
|
||||
PARAGRAPH_BOLD,
|
||||
PARAGRAPH_ITALIC,
|
||||
PARAGRAPH_UNKNOWN,
|
||||
OTHER,
|
||||
TABLE_OF_CONTENTS_HEADLINE,
|
||||
TABLE_OF_CONTENTS_ITEM,
|
||||
LIST_ITEM,
|
||||
TABLE;
|
||||
|
||||
|
||||
@ -34,7 +36,7 @@ public enum PageBlockType {
|
||||
public static int getHeadlineNumber(PageBlockType pageBlockType) {
|
||||
|
||||
return switch (pageBlockType) {
|
||||
case H1 -> 1;
|
||||
case H1, TABLE_OF_CONTENTS_HEADLINE -> 1;
|
||||
case H2 -> 2;
|
||||
case H3 -> 3;
|
||||
case H4 -> 4;
|
||||
@ -46,6 +48,6 @@ public enum PageBlockType {
|
||||
|
||||
public boolean isHeadline() {
|
||||
|
||||
return this.equals(H1) || this.equals(H2) || this.equals(H3) || this.equals(H4) || this.equals(H5) || this.equals(H6);
|
||||
return this.equals(H1) || this.equals(H2) || this.equals(H3) || this.equals(H4) || this.equals(H5) || this.equals(H6) || this.equals(TABLE_OF_CONTENTS_HEADLINE);
|
||||
}
|
||||
}
|
||||
|
||||
@ -4,7 +4,7 @@ import java.awt.geom.Rectangle2D;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
@ -15,7 +15,7 @@ import lombok.Getter;
|
||||
@AllArgsConstructor
|
||||
public class PageContents {
|
||||
|
||||
List<TextPositionSequence> sortedTextPositionSequences;
|
||||
List<Word> sortedWords;
|
||||
Rectangle2D cropBox;
|
||||
Rectangle2D mediaBox;
|
||||
List<Ruling> rulings;
|
||||
|
||||
@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
@ -15,11 +16,13 @@ import lombok.experimental.FieldDefaults;
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class SectionIdentifier {
|
||||
|
||||
public static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?");
|
||||
public static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d{1,2})(?:[\\s.,;](\\d{1,2}))?(?:[\\s.,;](\\d{1,2}))?(?:[\\s.,;](\\d{1,2}))?");
|
||||
public static Pattern alphanumericIdentifierPattern = Pattern.compile("^[\\s]?[A-Za-z][\\s.,;]?(\\d{1,2})[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?");
|
||||
|
||||
public enum Format {
|
||||
EMPTY,
|
||||
NUMERICAL,
|
||||
ALPHANUMERIC,
|
||||
DOCUMENT
|
||||
}
|
||||
|
||||
@ -41,6 +44,10 @@ public class SectionIdentifier {
|
||||
if (numericalIdentifierMatcher.find()) {
|
||||
return buildNumericalSectionIdentifier(headline, numericalIdentifierMatcher);
|
||||
}
|
||||
Matcher alphanumericIdentifierMatcher = alphanumericIdentifierPattern.matcher(headline);
|
||||
if (alphanumericIdentifierMatcher.find()) {
|
||||
return buildAlphanumericSectionIdentifier(headline, alphanumericIdentifierMatcher);
|
||||
}
|
||||
// more formats here
|
||||
return SectionIdentifier.empty();
|
||||
}
|
||||
@ -75,7 +82,36 @@ public class SectionIdentifier {
|
||||
}
|
||||
identifiers.add(Integer.parseInt(numericalIdentifier.trim()));
|
||||
}
|
||||
return new SectionIdentifier(Format.NUMERICAL, identifierString, identifiers.stream().toList(), false);
|
||||
return new SectionIdentifier(Format.NUMERICAL,
|
||||
identifierString,
|
||||
identifiers.stream()
|
||||
.toList(),
|
||||
false);
|
||||
}
|
||||
|
||||
|
||||
private static SectionIdentifier buildAlphanumericSectionIdentifier(String headline, Matcher alphanumericIdentifierMatcher) {
|
||||
|
||||
String identifierString = headline.substring(alphanumericIdentifierMatcher.start(), alphanumericIdentifierMatcher.end());
|
||||
|
||||
String alphanumericIdentifier = alphanumericIdentifierMatcher.group(0).substring(0, 1).toUpperCase(Locale.ENGLISH);
|
||||
int mappedCharacterValue = alphanumericIdentifier.charAt(0) - 'A' + 1;
|
||||
List<Integer> identifiers = new LinkedList<>();
|
||||
identifiers.add(mappedCharacterValue);
|
||||
|
||||
for (int i = 1; i <= 3; i++) {
|
||||
String numericalIdentifier = alphanumericIdentifierMatcher.group(i);
|
||||
if (numericalIdentifier == null || numericalIdentifier.equals("0") || numericalIdentifier.isEmpty() || numericalIdentifier.isBlank()) {
|
||||
break;
|
||||
}
|
||||
identifiers.add(Integer.parseInt(numericalIdentifier.trim()));
|
||||
}
|
||||
|
||||
return new SectionIdentifier(Format.ALPHANUMERIC,
|
||||
identifierString,
|
||||
identifiers.stream()
|
||||
.toList(),
|
||||
false);
|
||||
}
|
||||
|
||||
|
||||
@ -123,4 +159,22 @@ public class SectionIdentifier {
|
||||
return identifierString;
|
||||
}
|
||||
|
||||
|
||||
public boolean isEmpty() {
|
||||
|
||||
return this.format.equals(Format.EMPTY);
|
||||
}
|
||||
|
||||
|
||||
public int level() {
|
||||
|
||||
return identifiers.size();
|
||||
}
|
||||
|
||||
|
||||
protected List<Integer> getIdentifiers() {
|
||||
|
||||
return identifiers;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,94 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Header;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
|
||||
|
||||
public abstract class AbstractNodeVisitor implements NodeVisitor {
|
||||
|
||||
@Override
|
||||
public void visit(Document document) {
|
||||
|
||||
visitChildren(document);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void visit(SuperSection superSection) {
|
||||
|
||||
visitChildren(superSection);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void visit(Section section) {
|
||||
|
||||
visitChildren(section);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void visit(Headline headline) {
|
||||
|
||||
visitChildren(headline);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void visit(Paragraph paragraph) {
|
||||
|
||||
visitChildren(paragraph);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void visit(Footer footer) {
|
||||
|
||||
visitChildren(footer);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void visit(Header header) {
|
||||
|
||||
visitChildren(header);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void visit(Image image) {
|
||||
|
||||
visitChildren(image);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void visit(Table table) {
|
||||
|
||||
visitChildren(table);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void visit(TableCell tableCell) {
|
||||
|
||||
visitChildren(tableCell);
|
||||
}
|
||||
|
||||
|
||||
private void visitChildren(SemanticNode semanticNode) {
|
||||
|
||||
semanticNode.streamChildren()
|
||||
.forEach(node -> node.accept(this));
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,230 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.Getter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@EqualsAndHashCode
|
||||
public class DocumentTree {
|
||||
|
||||
private final Entry root;
|
||||
|
||||
|
||||
/**
 * Creates a tree whose root entry wraps {@code document}, has an empty tree id
 * and starts with an empty child list.
 *
 * @param document the node stored in the root entry
 */
public DocumentTree(Document document) {

    root = Entry.builder()
            .treeId(Collections.emptyList())
            .children(new LinkedList<>())
            .node(document)
            .build();
}
|
||||
|
||||
|
||||
public TextBlock buildTextBlock() {
|
||||
|
||||
return allEntriesInOrder().map(Entry::getNode)
|
||||
.filter(SemanticNode::isLeaf)
|
||||
.map(SemanticNode::getLeafTextBlock)
|
||||
.collect(new TextBlockCollector());
|
||||
}
|
||||
|
||||
|
||||
public List<Integer> createNewMainEntryAndReturnId(GenericSemanticNode node) {
|
||||
|
||||
return createNewChildEntryAndReturnIdImpl(Collections.emptyList(), node);
|
||||
}
|
||||
|
||||
|
||||
public List<Integer> createNewChildEntryAndReturnId(GenericSemanticNode parentNode, GenericSemanticNode node) {
|
||||
|
||||
return createNewChildEntryAndReturnIdImpl(parentNode.getTreeId(), node);
|
||||
}
|
||||
|
||||
|
||||
public List<Integer> createNewChildEntryAndReturnId(GenericSemanticNode parentNode, Table node) {
|
||||
|
||||
return createNewChildEntryAndReturnIdImpl(parentNode.getTreeId(), node);
|
||||
}
|
||||
|
||||
|
||||
public List<Integer> createNewTableChildEntryAndReturnId(Table parentTable, TableCell tableCell) {
|
||||
|
||||
return createNewChildEntryAndReturnIdImpl(parentTable.getTreeId(), tableCell);
|
||||
}
|
||||
|
||||
|
||||
@SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong
|
||||
private List<Integer> createNewChildEntryAndReturnIdImpl(List<Integer> parentId, SemanticNode node) {
|
||||
|
||||
if (!entryExists(parentId)) {
|
||||
throw new IllegalArgumentException(format("parentId %s does not exist!", parentId));
|
||||
}
|
||||
|
||||
Entry parent = getEntryById(parentId);
|
||||
List<Integer> newId = new LinkedList<>(parentId);
|
||||
newId.add(parent.children.size());
|
||||
parent.children.add(Entry.builder().treeId(newId).node(node).build());
|
||||
|
||||
return newId;
|
||||
}
|
||||
|
||||
|
||||
private boolean entryExists(List<Integer> treeId) {
|
||||
|
||||
if (treeId.isEmpty()) {
|
||||
return root != null;
|
||||
}
|
||||
Entry entry = root.children.get(treeId.get(0));
|
||||
for (int id : treeId.subList(1, treeId.size())) {
|
||||
if (id >= entry.children.size() || 0 > id) {
|
||||
return false;
|
||||
}
|
||||
entry = entry.children.get(id);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
public Entry getParentEntryById(List<Integer> treeId) {
|
||||
|
||||
return getEntryById(getParentId(treeId));
|
||||
}
|
||||
|
||||
|
| ||||