Compare commits

...

34 Commits

Author SHA1 Message Date
Dominique Eifländer
ef23ee0ade Merge branch 'RED-10752-main' into 'main'
RED-10752: Enabled prometheus

See merge request fforesight/layout-parser!267
2025-01-29 13:34:01 +01:00
Dominique Eifländer
af31f52b47 RED-10752: Enabled prometheus 2025-01-29 11:09:29 +01:00
Kilian Schüttler
b5152112ee Merge branch 'RM-231' into 'main'
RM-231: missing whitespace in name

See merge request fforesight/layout-parser!264
2025-01-14 13:04:10 +01:00
Kilian Schuettler
85ea4ef455 RM-231: missing whitespace in name 2025-01-14 12:59:01 +01:00
Kilian Schüttler
01f8c01fff Merge branch 'RED-10714' into 'main'
RED-10714: fix IndexOutOfBoundsException

See merge request fforesight/layout-parser!262
2025-01-10 12:33:18 +01:00
Kilian Schuettler
0b6a292c75 RED-10714: fix IndexOutOfBoundsException 2025-01-10 12:12:14 +01:00
Maverick Studer
e24020589c Merge branch 'feature/RED-9998' into 'main'
RED-9998: App version history (for conditional re-analyzing the layout of a file)

See merge request fforesight/layout-parser!259
2024-12-12 09:58:46 +01:00
Maverick Studer
c619b845e8 RED-9998: App version history (for conditional re-analyzing the layout of a file) 2024-12-12 09:58:46 +01:00
Kilian Schüttler
ed0371ca11 Merge branch 'RED-10127' into 'main'
RED-10127: Paragraphs with multiple table, appendix, figure can't be headlines

See merge request fforesight/layout-parser!257
2024-12-06 14:49:48 +01:00
Kilian Schuettler
89b5be8d67 RED-10127: Paragraphs with multiple table, appendix, figure can't be headlines 2024-12-06 13:41:44 +01:00
Kilian Schuettler
077ce60c9d RED-9139: update document version 2024-11-15 16:48:56 +01:00
Kilian Schüttler
ab171be6e2 Merge branch 'feature/RED-9139' into 'main'
RED-9139: more robust TOC detection

See merge request fforesight/layout-parser!253
2024-11-14 16:50:52 +01:00
Kilian Schuettler
664b47b4c3 RED-9139: move document to module in redaction-service
* add feature version
2024-11-14 16:39:49 +01:00
Kilian Schuettler
8005c1f25f RED-9139: move document to module in redaction-service
* add feature version
2024-11-14 16:39:48 +01:00
Kilian Schuettler
42185a95a0 RED-9139: move document to module in redaction-service
* add feature version
2024-11-14 16:39:48 +01:00
Kilian Schuettler
51b42efaf6 RED-9139: move document to module in redaction-service
* add feature version
2024-11-14 16:39:48 +01:00
Kilian Schuettler
6a50d45947 RED-9139: move document to module in redaction-service
* add feature version
2024-11-14 16:39:48 +01:00
Kilian Schuettler
073ac12cf7 RED-9139: move document to module in redaction-service
* add feature version
2024-11-14 16:39:48 +01:00
Kilian Schuettler
84b054a4cc RED-9139: move document to module in redaction-service
* add feature version
2024-11-14 16:39:48 +01:00
Kilian Schuettler
905b65a5fa RED-9139: move document to module in redaction-service
* add feature version
2024-11-14 16:39:48 +01:00
Kilian Schuettler
7617c1f308 RED-9139: move document to module in redaction-service
* add feature version
2024-11-14 16:39:48 +01:00
Kilian Schuettler
2b3936c09b RED-9139: move document to module in redaction-service
* add feature version
2024-11-14 16:39:48 +01:00
Kilian Schuettler
6e5b1f1978 RED-9139: move document to module in redaction-service
* add feature version
2024-11-14 16:39:48 +01:00
Kilian Schuettler
cf846d18bc RED-9139: move document to module in redaction-service
* add feature version
2024-11-14 16:39:48 +01:00
Kilian Schuettler
25c46f16ac RED-9139: move document to module in redaction-service
* add feature version
2024-11-14 16:39:48 +01:00
Kilian Schuettler
96acefed78 RED-9139: move document to module in redaction-service
* add TableOfContents node
2024-11-14 16:39:48 +01:00
Kilian Schuettler
366241e6c6 RED-9139: move document to module in redaction-service
* add TableOfContents node
2024-11-14 16:39:48 +01:00
Kilian Schuettler
7f472ccc52 RED-9139: move document to module in redaction-service
* add TableOfContents node
2024-11-14 16:39:48 +01:00
Kilian Schuettler
6f807c7d94 RED-9139: add new TableOfContents Node
* rename previous TableOfContent to SectionTree
* added protobuf compile script
2024-11-14 16:39:48 +01:00
Kilian Schuettler
6e04c15f3d RED-9139: add new TableOfContents Node
* rename previous TableOfContent to SectionTree
* added protobuf compile script
2024-11-14 16:39:48 +01:00
Kilian Schuettler
1384584e2f RED-9139: more robust TOC detection
* detect numbers in words, and not just whole words that are numbers
2024-11-14 16:39:46 +01:00
Kilian Schuettler
e58011e111 RED-9139: more robust TOC detection
* detect numbers in words, and not just whole words that are numbers
2024-11-14 16:39:21 +01:00
Kilian Schüttler
a821570065 Merge branch 'RED-9139-bp' into 'main'
RED-9139: more robust TOC detection

See merge request fforesight/layout-parser!254
2024-11-13 10:54:39 +01:00
Kilian Schüttler
7ee1f9e360 RED-9139: more robust TOC detection 2024-11-13 10:54:39 +01:00
130 changed files with 956 additions and 16955 deletions

View File

@@ -8,6 +8,8 @@ plugins {
 group = "com.knecon.fforesight"
+val documentVersion by rootProject.extra { "4.433.0" }
 java.sourceCompatibility = JavaVersion.VERSION_17
 java.targetCompatibility = JavaVersion.VERSION_17

View File

@@ -7,5 +7,4 @@ description = "layoutparser-service-internal-api"
 dependencies {
 implementation("io.swagger.core.v3:swagger-annotations:2.2.15")
-implementation("com.google.protobuf:protobuf-java-util:4.27.1")
 }

View File

@@ -1,43 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
import java.io.Serializable;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPageProto.AllDocumentPages;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto.AllDocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureProto.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto.AllDocumentTextData;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.experimental.FieldDefaults;
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
@Schema(description = "Object containing the complete document layout parsing information. It is split into 4 categories, structure, text, positions and pages: "
+ "The document tree structure of SemanticNodes such as Section, Paragraph, Headline, etc. "
+ "The text, which is stored as separate blocks of data. "
+ "The text positions, which are also stored as separate blocks. The Blocks are equal to the text blocks in length and order. "
+ "The page information.")
public class DocumentData implements Serializable {
@Schema(description = "Contains information about the document's pages.")
AllDocumentPages documentPages;
@Schema(description = "Contains information about the document's text.")
AllDocumentTextData documentTextData;
@Schema(description = "Contains information about the document's text positions.")
AllDocumentPositionData documentPositions;
@Schema(description = "Contains information about the document's semantic structure.")
DocumentStructureWrapper documentStructureWrapper;
public DocumentStructure getDocumentStructure() {
return documentStructureWrapper.getDocumentStructure();
}
}
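
The four parts are meant to be joined by ID. A hedged illustration (not code from this change set; the helper name textOf is hypothetical, and it assumes the protobuf accessors generated from the .proto files shown further below, in the same package as the classes above):

// Hypothetical sketch: resolve the text of one structure entry by joining its
// atomicBlockIds against the text blocks carried in DocumentData.
static String textOf(DocumentData data, EntryDataProto.EntryData entry) {
    java.util.Map<Long, String> textById = new java.util.HashMap<>();
    for (DocumentTextDataProto.DocumentTextData block : data.getDocumentTextData().getDocumentTextDataList()) {
        textById.put(block.getId(), block.getSearchText());
    }
    StringBuilder text = new StringBuilder();
    for (long blockId : entry.getAtomicBlockIdsList()) {
        text.append(textById.get(blockId)).append('\n'); // each block ends with an implicit line break
    }
    return text.toString();
}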

View File

@@ -1,31 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
import java.io.Serializable;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
@Deprecated
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@Schema(description = "Object containing information about the document's pages.")
public class DocumentPage implements Serializable {
@Schema(description = "The page number, starting with 1.")
int number;
@Schema(description = "The page height in PDF user units.", example = "792")
int height;
@Schema(description = "The page width in PDF user units.", example = "694")
int width;
@Schema(description = "The page rotation as specified by the PDF.", example = "90", allowableValues = {"0", "90", "180", "270"})
int rotation;
}

View File

@@ -1,29 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
import java.io.Serializable;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
@Deprecated
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@Schema(description = "Object containing text positional information of a specific text block. A document is split into multiple text blocks, which are supposed to be read in order. Every text block can only occur on a single page.")
public class DocumentPositionData implements Serializable {
@Schema(description = "Identifier of the text block.")
Long id;
@Schema(description = "For each string coordinate in the search text of the text block, the array contains an entry relating the string coordinate to the position coordinate. This is required due to the text and position coordinates not being equal.")
int[] stringIdxToPositionIdx;
@Schema(description = "The bounding box for each glyph as a rectangle. This matrix is of size (n,4), where n is the number of glyphs in the text block. The second dimension specifies the rectangle with the value x, y, width, height, with x, y specifying the lower left corner. In order to access this information, the stringIdxToPositionIdx array must be used to transform the coordinates.")
float[][] positions;
}
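
The stringIdxToPositionIdx mapping is easiest to see in code. A hypothetical helper (not part of this change set; glyphBoxAt is an illustrative name) that looks up the glyph rectangle belonging to a string offset in the block's search text:

// String coordinate -> position coordinate -> {x, y, width, height}, with x, y
// at the lower left corner, as described in the Schema annotations above.
static float[] glyphBoxAt(DocumentPositionData positionData, int stringIdx) {
    int positionIdx = positionData.getStringIdxToPositionIdx()[stringIdx];
    return positionData.getPositions()[positionIdx];
}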

View File

@@ -1,173 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
import java.awt.geom.Rectangle2D;
import java.io.Serializable;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Stream;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
@Deprecated
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@Schema(description = "Object containing information about the parsed tree structure of the SemanticNodes, such as Section, Paragraph, Headline etc inside of the document.")
public class DocumentStructure implements Serializable {
@Schema(description = "The root EntryData represents the Document.")
EntryData root;
@Schema(description = "Object containing the extra field names, a table has in its properties field.")
public static class TableProperties implements Serializable {
public static final String NUMBER_OF_ROWS = "numberOfRows";
public static final String NUMBER_OF_COLS = "numberOfCols";
}
@Schema(description = "Object containing the extra field names, an Image has in its properties field.")
public static class ImageProperties implements Serializable {
public static final String TRANSPARENT = "transparent";
public static final String IMAGE_TYPE = "imageType";
public static final String POSITION = "position";
public static final String ID = "id";
public static final String REPRESENTATION_HASH = "representationHash";
}
@Schema(description = "Object containing the extra field names, a table cell has in its properties field.")
public static class TableCellProperties implements Serializable {
public static final String B_BOX = "bBox";
public static final String ROW = "row";
public static final String COL = "col";
public static final String HEADER = "header";
}
@Schema(description = "Object containing the extra field names, a duplicate paragraph has in its properties field.")
public static class DuplicateParagraphProperties implements Serializable {
public static final String UNSORTED_TEXTBLOCK_ID = "utbid";
}
public static final String RECTANGLE_DELIMITER = ";";
public static Rectangle2D parseRectangle2D(String bBox) {
List<Float> floats = Arrays.stream(bBox.split(RECTANGLE_DELIMITER))
.map(Float::parseFloat)
.toList();
return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
}
public static double[] parseRepresentationVector(String representationHash) {
String[] stringArray = representationHash.split("[,\\s]+");
double[] doubleArray = new double[stringArray.length];
for (int i = 0; i < stringArray.length; i++) {
doubleArray[i] = Double.parseDouble(stringArray[i]);
}
return doubleArray;
}
public EntryData get(List<Integer> tocId) {
if (tocId.isEmpty()) {
return root;
}
EntryData entry = root.children.get(tocId.get(0));
for (int id : tocId.subList(1, tocId.size())) {
entry = entry.children.get(id);
}
return entry;
}
public Stream<EntryData> streamAllEntries() {
return Stream.concat(Stream.of(root), root.children.stream())
.flatMap(DocumentStructure::flatten);
}
public String toString() {
return String.join("\n",
streamAllEntries().map(EntryData::toString)
.toList());
}
private static Stream<EntryData> flatten(EntryData entry) {
return Stream.concat(Stream.of(entry),
entry.children.stream()
.flatMap(DocumentStructure::flatten));
}
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@Schema(description = "Object containing information of a SemanticNode and also structuring the layout with children.")
public static class EntryData implements Serializable {
@Schema(description = "Type of the semantic node.", allowableValues = {"DOCUMENT", "SECTION", "PARAGRAPH", "HEADLINE", "TABLE", "TABLE_CELL", "HEADER", "FOOTER", "IMAGE"})
NodeType type;
@Schema(description = "Specifies the position in the parsed tree structure.", example = "[1, 0, 2]")
int[] treeId;
@Schema(description = "Specifies the text block IDs associated with this semantic node. The value should be joined with the DocumentTextData/DocumentPositionData. Is empty, if no text block is directly associated with this semantic node. Only Paragraph, Headline, Header or Footer is directly associated with a text block.", example = "[1]")
Long[] atomicBlockIds;
@Schema(description = "Specifies the pages this semantic node appears on. The value should be joined with the PageData.", example = "[1, 2, 3]")
Long[] pageNumbers;
@Schema(description = "Some semantic nodes have additional information, this information is stored in this Map. The extra fields are specified by the Properties subclasses.", example = "For a Table: {\"numberOfRows\": 3, \"numberOfCols\": 4}")
Map<String, String> properties;
@Schema(description = "All child Entries of this Entry.", example = "[1, 2, 3]")
List<EntryData> children;
@Schema(description = "Describes the origin of the semantic node", example = "[ALGORITHM]")
Set<LayoutEngine> engines;
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append("[");
for (int i : treeId) {
sb.append(i);
sb.append(",");
}
sb.delete(sb.length() - 1, sb.length());
sb.append("]: ");
sb.append(type);
sb.append(" atbs = ");
sb.append(atomicBlockIds.length);
return sb.toString();
}
}
}
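
For orientation, a hypothetical usage sketch (not part of this change set) of the accessors above: get(tocId) walks root.children by index, and streamAllEntries() flattens the whole tree.

// Address the entry at tree id [1] (root.children.get(1)) and count paragraphs.
static void inspect(DocumentStructure structure) {
    DocumentStructure.EntryData secondChild = structure.get(java.util.List.of(1));
    long paragraphs = structure.streamAllEntries()
            .filter(entry -> entry.getType() == NodeType.PARAGRAPH)
            .count();
    System.out.println(secondChild + " | paragraphs: " + paragraphs);
}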

View File

@@ -1,799 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
// Generated by the protocol buffer compiler. DO NOT EDIT!
// NO CHECKED-IN PROTOBUF GENCODE
// source: DocumentStructure.proto
// Protobuf Java Version: 4.27.1
@SuppressWarnings("all")
public final class DocumentStructureProto {
private DocumentStructureProto() {}
static {
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
/* major= */ 4,
/* minor= */ 27,
/* patch= */ 1,
/* suffix= */ "", DocumentStructureProto.class.getName());
}
public static void registerAllExtensions(com.google.protobuf.ExtensionRegistryLite registry) {
}
public static void registerAllExtensions(com.google.protobuf.ExtensionRegistry registry) {
registerAllExtensions((com.google.protobuf.ExtensionRegistryLite) registry);
}
public interface DocumentStructureOrBuilder extends
// @@protoc_insertion_point(interface_extends:DocumentStructure)
com.google.protobuf.MessageOrBuilder {
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
*
* @return Whether the root field is set.
*/
boolean hasRoot();
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
*
* @return The root.
*/
EntryDataProto.EntryData getRoot();
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
*/
EntryDataProto.EntryDataOrBuilder getRootOrBuilder();
}
/**
* Protobuf type {@code DocumentStructure}
*/
public static final class DocumentStructure extends com.google.protobuf.GeneratedMessage implements
// @@protoc_insertion_point(message_implements:DocumentStructure)
DocumentStructureOrBuilder {
private static final long serialVersionUID = 0L;
static {
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
/* major= */ 4,
/* minor= */ 27,
/* patch= */ 1,
/* suffix= */ "", DocumentStructure.class.getName());
}
// Use DocumentStructure.newBuilder() to construct.
private DocumentStructure(com.google.protobuf.GeneratedMessage.Builder<?> builder) {
super(builder);
}
private DocumentStructure() {
}
public static final com.google.protobuf.Descriptors.Descriptor getDescriptor() {
return DocumentStructureProto.internal_static_DocumentStructure_descriptor;
}
@Override
protected FieldAccessorTable internalGetFieldAccessorTable() {
return DocumentStructureProto.internal_static_DocumentStructure_fieldAccessorTable.ensureFieldAccessorsInitialized(DocumentStructure.class, Builder.class);
}
private int bitField0_;
public static final int ROOT_FIELD_NUMBER = 1;
private EntryDataProto.EntryData root_;
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
*
* @return Whether the root field is set.
*/
@Override
public boolean hasRoot() {
return ((bitField0_ & 0x00000001) != 0);
}
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
*
* @return The root.
*/
@Override
public EntryDataProto.EntryData getRoot() {
return root_ == null ? EntryDataProto.EntryData.getDefaultInstance() : root_;
}
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
*/
@Override
public EntryDataProto.EntryDataOrBuilder getRootOrBuilder() {
return root_ == null ? EntryDataProto.EntryData.getDefaultInstance() : root_;
}
private byte memoizedIsInitialized = -1;
@Override
public final boolean isInitialized() {
byte isInitialized = memoizedIsInitialized;
if (isInitialized == 1) {
return true;
}
if (isInitialized == 0) {
return false;
}
memoizedIsInitialized = 1;
return true;
}
@Override
public void writeTo(com.google.protobuf.CodedOutputStream output) throws java.io.IOException {
if (((bitField0_ & 0x00000001) != 0)) {
output.writeMessage(1, getRoot());
}
getUnknownFields().writeTo(output);
}
@Override
public int getSerializedSize() {
int size = memoizedSize;
if (size != -1) {
return size;
}
size = 0;
if (((bitField0_ & 0x00000001) != 0)) {
size += com.google.protobuf.CodedOutputStream.computeMessageSize(1, getRoot());
}
size += getUnknownFields().getSerializedSize();
memoizedSize = size;
return size;
}
@Override
public boolean equals(final Object obj) {
if (obj == this) {
return true;
}
if (!(obj instanceof DocumentStructure)) {
return super.equals(obj);
}
DocumentStructure other = (DocumentStructure) obj;
if (hasRoot() != other.hasRoot()) {
return false;
}
if (hasRoot()) {
if (!getRoot().equals(other.getRoot())) {
return false;
}
}
if (!getUnknownFields().equals(other.getUnknownFields())) {
return false;
}
return true;
}
@Override
public int hashCode() {
if (memoizedHashCode != 0) {
return memoizedHashCode;
}
int hash = 41;
hash = (19 * hash) + getDescriptor().hashCode();
if (hasRoot()) {
hash = (37 * hash) + ROOT_FIELD_NUMBER;
hash = (53 * hash) + getRoot().hashCode();
}
hash = (29 * hash) + getUnknownFields().hashCode();
memoizedHashCode = hash;
return hash;
}
public static DocumentStructure parseFrom(java.nio.ByteBuffer data) throws com.google.protobuf.InvalidProtocolBufferException {
return PARSER.parseFrom(data);
}
public static DocumentStructure parseFrom(java.nio.ByteBuffer data,
com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws com.google.protobuf.InvalidProtocolBufferException {
return PARSER.parseFrom(data, extensionRegistry);
}
public static DocumentStructure parseFrom(com.google.protobuf.ByteString data) throws com.google.protobuf.InvalidProtocolBufferException {
return PARSER.parseFrom(data);
}
public static DocumentStructure parseFrom(com.google.protobuf.ByteString data,
com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws com.google.protobuf.InvalidProtocolBufferException {
return PARSER.parseFrom(data, extensionRegistry);
}
public static DocumentStructure parseFrom(byte[] data) throws com.google.protobuf.InvalidProtocolBufferException {
return PARSER.parseFrom(data);
}
public static DocumentStructure parseFrom(byte[] data,
com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws com.google.protobuf.InvalidProtocolBufferException {
return PARSER.parseFrom(data, extensionRegistry);
}
public static DocumentStructure parseFrom(java.io.InputStream input) throws java.io.IOException {
return com.google.protobuf.GeneratedMessage.parseWithIOException(PARSER, input);
}
public static DocumentStructure parseFrom(java.io.InputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws java.io.IOException {
return com.google.protobuf.GeneratedMessage.parseWithIOException(PARSER, input, extensionRegistry);
}
public static DocumentStructure parseDelimitedFrom(java.io.InputStream input) throws java.io.IOException {
return com.google.protobuf.GeneratedMessage.parseDelimitedWithIOException(PARSER, input);
}
public static DocumentStructure parseDelimitedFrom(java.io.InputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws java.io.IOException {
return com.google.protobuf.GeneratedMessage.parseDelimitedWithIOException(PARSER, input, extensionRegistry);
}
public static DocumentStructure parseFrom(com.google.protobuf.CodedInputStream input) throws java.io.IOException {
return com.google.protobuf.GeneratedMessage.parseWithIOException(PARSER, input);
}
public static DocumentStructure parseFrom(com.google.protobuf.CodedInputStream input,
com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws java.io.IOException {
return com.google.protobuf.GeneratedMessage.parseWithIOException(PARSER, input, extensionRegistry);
}
@Override
public Builder newBuilderForType() {return newBuilder();}
public static Builder newBuilder() {
return DEFAULT_INSTANCE.toBuilder();
}
public static Builder newBuilder(DocumentStructure prototype) {
return DEFAULT_INSTANCE.toBuilder().mergeFrom(prototype);
}
@Override
public Builder toBuilder() {
return this == DEFAULT_INSTANCE ? new Builder() : new Builder().mergeFrom(this);
}
@Override
protected Builder newBuilderForType(BuilderParent parent) {
Builder builder = new Builder(parent);
return builder;
}
/**
* Protobuf type {@code DocumentStructure}
*/
public static final class Builder extends com.google.protobuf.GeneratedMessage.Builder<Builder> implements
// @@protoc_insertion_point(builder_implements:DocumentStructure)
DocumentStructureOrBuilder {
public static final com.google.protobuf.Descriptors.Descriptor getDescriptor() {
return DocumentStructureProto.internal_static_DocumentStructure_descriptor;
}
@Override
protected FieldAccessorTable internalGetFieldAccessorTable() {
return DocumentStructureProto.internal_static_DocumentStructure_fieldAccessorTable.ensureFieldAccessorsInitialized(DocumentStructure.class, Builder.class);
}
// Construct using DocumentStructureOuterClass.DocumentStructure.newBuilder()
private Builder() {
maybeForceBuilderInitialization();
}
private Builder(BuilderParent parent) {
super(parent);
maybeForceBuilderInitialization();
}
private void maybeForceBuilderInitialization() {
if (com.google.protobuf.GeneratedMessage.alwaysUseFieldBuilders) {
getRootFieldBuilder();
}
}
@Override
public Builder clear() {
super.clear();
bitField0_ = 0;
root_ = null;
if (rootBuilder_ != null) {
rootBuilder_.dispose();
rootBuilder_ = null;
}
return this;
}
@Override
public com.google.protobuf.Descriptors.Descriptor getDescriptorForType() {
return DocumentStructureProto.internal_static_DocumentStructure_descriptor;
}
@Override
public DocumentStructure getDefaultInstanceForType() {
return DocumentStructure.getDefaultInstance();
}
@Override
public DocumentStructure build() {
DocumentStructure result = buildPartial();
if (!result.isInitialized()) {
throw newUninitializedMessageException(result);
}
return result;
}
@Override
public DocumentStructure buildPartial() {
DocumentStructure result = new DocumentStructure(this);
if (bitField0_ != 0) {
buildPartial0(result);
}
onBuilt();
return result;
}
private void buildPartial0(DocumentStructure result) {
int from_bitField0_ = bitField0_;
int to_bitField0_ = 0;
if (((from_bitField0_ & 0x00000001) != 0)) {
result.root_ = rootBuilder_ == null ? root_ : rootBuilder_.build();
to_bitField0_ |= 0x00000001;
}
result.bitField0_ |= to_bitField0_;
}
@Override
public Builder mergeFrom(com.google.protobuf.Message other) {
if (other instanceof DocumentStructure) {
return mergeFrom((DocumentStructure) other);
} else {
super.mergeFrom(other);
return this;
}
}
public Builder mergeFrom(DocumentStructure other) {
if (other == DocumentStructure.getDefaultInstance()) {
return this;
}
if (other.hasRoot()) {
mergeRoot(other.getRoot());
}
this.mergeUnknownFields(other.getUnknownFields());
onChanged();
return this;
}
@Override
public final boolean isInitialized() {
return true;
}
@Override
public Builder mergeFrom(com.google.protobuf.CodedInputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws java.io.IOException {
if (extensionRegistry == null) {
throw new NullPointerException();
}
try {
boolean done = false;
while (!done) {
int tag = input.readTag();
switch (tag) {
case 0:
done = true;
break;
case 10: {
input.readMessage(getRootFieldBuilder().getBuilder(), extensionRegistry);
bitField0_ |= 0x00000001;
break;
} // case 10
default: {
if (!super.parseUnknownField(input, extensionRegistry, tag)) {
done = true; // was an endgroup tag
}
break;
} // default:
} // switch (tag)
} // while (!done)
} catch (com.google.protobuf.InvalidProtocolBufferException e) {
throw e.unwrapIOException();
} finally {
onChanged();
} // finally
return this;
}
private int bitField0_;
private EntryDataProto.EntryData root_;
private com.google.protobuf.SingleFieldBuilder<EntryDataProto.EntryData, EntryDataProto.EntryData.Builder, EntryDataProto.EntryDataOrBuilder> rootBuilder_;
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
*
* @return Whether the root field is set.
*/
public boolean hasRoot() {
return ((bitField0_ & 0x00000001) != 0);
}
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
*
* @return The root.
*/
public EntryDataProto.EntryData getRoot() {
if (rootBuilder_ == null) {
return root_ == null ? EntryDataProto.EntryData.getDefaultInstance() : root_;
} else {
return rootBuilder_.getMessage();
}
}
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
*/
public Builder setRoot(EntryDataProto.EntryData value) {
if (rootBuilder_ == null) {
if (value == null) {
throw new NullPointerException();
}
root_ = value;
} else {
rootBuilder_.setMessage(value);
}
bitField0_ |= 0x00000001;
onChanged();
return this;
}
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
*/
public Builder setRoot(EntryDataProto.EntryData.Builder builderForValue) {
if (rootBuilder_ == null) {
root_ = builderForValue.build();
} else {
rootBuilder_.setMessage(builderForValue.build());
}
bitField0_ |= 0x00000001;
onChanged();
return this;
}
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
*/
public Builder mergeRoot(EntryDataProto.EntryData value) {
if (rootBuilder_ == null) {
if (((bitField0_ & 0x00000001) != 0) && root_ != null && root_ != EntryDataProto.EntryData.getDefaultInstance()) {
getRootBuilder().mergeFrom(value);
} else {
root_ = value;
}
} else {
rootBuilder_.mergeFrom(value);
}
if (root_ != null) {
bitField0_ |= 0x00000001;
onChanged();
}
return this;
}
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
*/
public Builder clearRoot() {
bitField0_ = (bitField0_ & ~0x00000001);
root_ = null;
if (rootBuilder_ != null) {
rootBuilder_.dispose();
rootBuilder_ = null;
}
onChanged();
return this;
}
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
*/
public EntryDataProto.EntryData.Builder getRootBuilder() {
bitField0_ |= 0x00000001;
onChanged();
return getRootFieldBuilder().getBuilder();
}
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
*/
public EntryDataProto.EntryDataOrBuilder getRootOrBuilder() {
if (rootBuilder_ != null) {
return rootBuilder_.getMessageOrBuilder();
} else {
return root_ == null ? EntryDataProto.EntryData.getDefaultInstance() : root_;
}
}
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
*/
private com.google.protobuf.SingleFieldBuilder<EntryDataProto.EntryData, EntryDataProto.EntryData.Builder, EntryDataProto.EntryDataOrBuilder> getRootFieldBuilder() {
if (rootBuilder_ == null) {
rootBuilder_ = new com.google.protobuf.SingleFieldBuilder<EntryDataProto.EntryData, EntryDataProto.EntryData.Builder, EntryDataProto.EntryDataOrBuilder>(getRoot(),
getParentForChildren(),
isClean());
root_ = null;
}
return rootBuilder_;
}
// @@protoc_insertion_point(builder_scope:DocumentStructure)
}
// @@protoc_insertion_point(class_scope:DocumentStructure)
private static final DocumentStructure DEFAULT_INSTANCE;
static {
DEFAULT_INSTANCE = new DocumentStructure();
}
public static DocumentStructure getDefaultInstance() {
return DEFAULT_INSTANCE;
}
private static final com.google.protobuf.Parser<DocumentStructure> PARSER = new com.google.protobuf.AbstractParser<DocumentStructure>() {
@Override
public DocumentStructure parsePartialFrom(com.google.protobuf.CodedInputStream input,
com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws com.google.protobuf.InvalidProtocolBufferException {
Builder builder = newBuilder();
try {
builder.mergeFrom(input, extensionRegistry);
} catch (com.google.protobuf.InvalidProtocolBufferException e) {
throw e.setUnfinishedMessage(builder.buildPartial());
} catch (com.google.protobuf.UninitializedMessageException e) {
throw e.asInvalidProtocolBufferException().setUnfinishedMessage(builder.buildPartial());
} catch (java.io.IOException e) {
throw new com.google.protobuf.InvalidProtocolBufferException(e).setUnfinishedMessage(builder.buildPartial());
}
return builder.buildPartial();
}
};
public static com.google.protobuf.Parser<DocumentStructure> parser() {
return PARSER;
}
@Override
public com.google.protobuf.Parser<DocumentStructure> getParserForType() {
return PARSER;
}
@Override
public DocumentStructure getDefaultInstanceForType() {
return DEFAULT_INSTANCE;
}
}
private static final com.google.protobuf.Descriptors.Descriptor internal_static_DocumentStructure_descriptor;
private static final com.google.protobuf.GeneratedMessage.FieldAccessorTable internal_static_DocumentStructure_fieldAccessorTable;
public static com.google.protobuf.Descriptors.FileDescriptor getDescriptor() {
return descriptor;
}
private static com.google.protobuf.Descriptors.FileDescriptor descriptor;
static {
String[] descriptorData = {"\n\027DocumentStructure.proto\032\017EntryData.pro"
+ "to\"-\n\021DocumentStructure\022\030\n\004root\030\001 \001(\0132\n."
+ "EntryDatab\006proto3"};
descriptor = com.google.protobuf.Descriptors.FileDescriptor.internalBuildGeneratedFileFrom(descriptorData,
new com.google.protobuf.Descriptors.FileDescriptor[]{EntryDataProto.getDescriptor(),});
internal_static_DocumentStructure_descriptor = getDescriptor().getMessageTypes()
.get(0);
internal_static_DocumentStructure_fieldAccessorTable = new com.google.protobuf.GeneratedMessage.FieldAccessorTable(internal_static_DocumentStructure_descriptor,
new String[]{"Root",});
descriptor.resolveAllFeaturesImmutable();
EntryDataProto.getDescriptor();
}
// @@protoc_insertion_point(outer_class_scope)
}

View File

@@ -1,126 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureProto.DocumentStructure;
import java.awt.geom.Rectangle2D;
import java.io.ObjectStreamException;
import java.io.Serializable;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.EntryDataProto.EntryData;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.AllArgsConstructor;
import lombok.Getter;
@Getter
@AllArgsConstructor
public class DocumentStructureWrapper implements Serializable {
private final DocumentStructure documentStructure;
@Schema(description = "Object containing the extra field names, a table has in its properties field.")
public static class TableProperties implements Serializable {
public static final String NUMBER_OF_ROWS = "numberOfRows";
public static final String NUMBER_OF_COLS = "numberOfCols";
}
@Schema(description = "Object containing the extra field names, an Image has in its properties field.")
public static class ImageProperties implements Serializable {
public static final String TRANSPARENT = "transparent";
public static final String IMAGE_TYPE = "imageType";
public static final String POSITION = "position";
public static final String ID = "id";
public static final String REPRESENTATION_HASH = "representationHash";
}
@Schema(description = "Object containing the extra field names, a table cell has in its properties field.")
public static class TableCellProperties implements Serializable {
public static final String B_BOX = "bBox";
public static final String ROW = "row";
public static final String COL = "col";
public static final String HEADER = "header";
}
@Schema(description = "Object containing the extra field names, a duplicate paragraph has in its properties field.")
public static class DuplicateParagraphProperties implements Serializable {
public static final String UNSORTED_TEXTBLOCK_ID = "utbid";
}
public static final String RECTANGLE_DELIMITER = ";";
public static Rectangle2D parseRectangle2D(String bBox) {
List<Float> floats = Arrays.stream(bBox.split(RECTANGLE_DELIMITER))
.map(Float::parseFloat)
.toList();
return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
}
public static double[] parseRepresentationVector(String representationHash) {
String[] stringArray = representationHash.split("[,\\s]+");
double[] doubleArray = new double[stringArray.length];
for (int i = 0; i < stringArray.length; i++) {
doubleArray[i] = Double.parseDouble(stringArray[i]);
}
return doubleArray;
}
public EntryData get(List<Integer> tocId) {
if (tocId.isEmpty()) {
return documentStructure.getRoot();
}
EntryData entry = documentStructure.getRoot().getChildrenList()
.get(tocId.get(0));
for (int id : tocId.subList(1, tocId.size())) {
entry = entry.getChildrenList()
.get(id);
}
return entry;
}
public Stream<EntryData> streamAllEntries() {
return Stream.concat(Stream.of(documentStructure.getRoot()),
documentStructure.getRoot().getChildrenList()
.stream())
.flatMap(DocumentStructureWrapper::flatten);
}
public String toString() {
return String.join("\n",
streamAllEntries().map(EntryData::toString)
.toList());
}
private static Stream<EntryData> flatten(EntryData entry) {
return Stream.concat(Stream.of(entry),
entry.getChildrenList()
.stream()
.flatMap(DocumentStructureWrapper::flatten));
}
}
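
The property constants are meant to be used together with the parse helpers. A hypothetical example (not part of this change set; it assumes the entry actually carries the bBox property written by the parser):

// Read a TABLE_CELL entry's bounding box ("x;y;width;height") from the
// protobuf-backed structure.
static java.awt.geom.Rectangle2D cellBoundingBox(EntryDataProto.EntryData cell) {
    String bBox = cell.getPropertiesMap().get(DocumentStructureWrapper.TableCellProperties.B_BOX);
    return DocumentStructureWrapper.parseRectangle2D(bBox);
}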

View File

@@ -1,37 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
import java.io.Serializable;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
@Deprecated
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@Schema(description = "Object containing text information of a specific text block. A document is split into multiple text blocks, which are supposed to be read in order. Every text block can only occur on a single page.")
public class DocumentTextData implements Serializable {
@Schema(description = "Identifier of the text block.")
Long id;
@Schema(description = "The page the text block occurs on.")
Long page;
@Schema(description = "The text the text block.")
String searchText;
@Schema(description = "Each text block is assigned a number on a page, starting from 0.")
int numberOnPage;
@Schema(description = "The text blocks are ordered, this number represents the start of the text block as a string offset.")
int start;
@Schema(description = "The text blocks are ordered, this number represents the end of the text block as a string offset.")
int end;
@Schema(description = "The line breaks in the text of this semantic node in string offsets. They are exclusive end. At the end of each semantic node there is an implicit linebreak.", example = "[5, 10]")
int[] lineBreaks;
}
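
One way to read the lineBreaks contract described above (a hedged sketch, not part of this change set; it assumes the offsets are relative to the block's own searchText, and linesOf is an illustrative name):

// Split a block's text at its line breaks; the block end acts as the implicit
// final break.
static java.util.List<String> linesOf(DocumentTextData block) {
    java.util.List<String> lines = new java.util.ArrayList<>();
    int lineStart = 0;
    for (int breakOffset : block.getLineBreaks()) {
        lines.add(block.getSearchText().substring(lineStart, breakOffset));
        lineStart = breakOffset;
    }
    lines.add(block.getSearchText().substring(lineStart));
    return lines;
}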

View File

@@ -1,8 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
@Deprecated
public enum LayoutEngine {
ALGORITHM,
AI,
OUTLINE
}

View File

@@ -1,193 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
// Generated by the protocol buffer compiler. DO NOT EDIT!
// NO CHECKED-IN PROTOBUF GENCODE
// source: LayoutEngine.proto
// Protobuf Java Version: 4.27.1
@SuppressWarnings("all")
public final class LayoutEngineProto {
private LayoutEngineProto() {}
static {
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
/* major= */ 4,
/* minor= */ 27,
/* patch= */ 1,
/* suffix= */ "", LayoutEngineProto.class.getName());
}
public static void registerAllExtensions(com.google.protobuf.ExtensionRegistryLite registry) {
}
public static void registerAllExtensions(com.google.protobuf.ExtensionRegistry registry) {
registerAllExtensions((com.google.protobuf.ExtensionRegistryLite) registry);
}
/**
* Protobuf enum {@code LayoutEngine}
*/
public enum LayoutEngine implements com.google.protobuf.ProtocolMessageEnum {
/**
* <code>ALGORITHM = 0;</code>
*/
ALGORITHM(0),
/**
* <code>AI = 1;</code>
*/
AI(1),
/**
* <code>OUTLINE = 2;</code>
*/
OUTLINE(2),
UNRECOGNIZED(-1),
;
static {
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
/* major= */ 4,
/* minor= */ 27,
/* patch= */ 1,
/* suffix= */ "", LayoutEngine.class.getName());
}
/**
* <code>ALGORITHM = 0;</code>
*/
public static final int ALGORITHM_VALUE = 0;
/**
* <code>AI = 1;</code>
*/
public static final int AI_VALUE = 1;
/**
* <code>OUTLINE = 2;</code>
*/
public static final int OUTLINE_VALUE = 2;
public final int getNumber() {
if (this == UNRECOGNIZED) {
throw new IllegalArgumentException("Can't get the number of an unknown enum value.");
}
return value;
}
/**
* @param value The numeric wire value of the corresponding enum entry.
* @return The enum associated with the given numeric wire value.
* @deprecated Use {@link #forNumber(int)} instead.
*/
@Deprecated
public static LayoutEngine valueOf(int value) {
return forNumber(value);
}
/**
* @param value The numeric wire value of the corresponding enum entry.
* @return The enum associated with the given numeric wire value.
*/
public static LayoutEngine forNumber(int value) {
switch (value) {
case 0:
return ALGORITHM;
case 1:
return AI;
case 2:
return OUTLINE;
default:
return null;
}
}
public static com.google.protobuf.Internal.EnumLiteMap<LayoutEngine> internalGetValueMap() {
return internalValueMap;
}
private static final com.google.protobuf.Internal.EnumLiteMap<LayoutEngine> internalValueMap = new com.google.protobuf.Internal.EnumLiteMap<LayoutEngine>() {
public LayoutEngine findValueByNumber(int number) {
return LayoutEngine.forNumber(number);
}
};
public final com.google.protobuf.Descriptors.EnumValueDescriptor getValueDescriptor() {
if (this == UNRECOGNIZED) {
throw new IllegalStateException("Can't get the descriptor of an unrecognized enum value.");
}
return getDescriptor().getValues()
.get(ordinal());
}
public final com.google.protobuf.Descriptors.EnumDescriptor getDescriptorForType() {
return getDescriptor();
}
public static final com.google.protobuf.Descriptors.EnumDescriptor getDescriptor() {
return LayoutEngineProto.getDescriptor().getEnumTypes()
.get(0);
}
private static final LayoutEngine[] VALUES = values();
public static LayoutEngine valueOf(com.google.protobuf.Descriptors.EnumValueDescriptor desc) {
if (desc.getType() != getDescriptor()) {
throw new IllegalArgumentException("EnumValueDescriptor is not for this type.");
}
if (desc.getIndex() == -1) {
return UNRECOGNIZED;
}
return VALUES[desc.getIndex()];
}
private final int value;
private LayoutEngine(int value) {
this.value = value;
}
// @@protoc_insertion_point(enum_scope:LayoutEngine)
}
public static com.google.protobuf.Descriptors.FileDescriptor getDescriptor() {
return descriptor;
}
private static com.google.protobuf.Descriptors.FileDescriptor descriptor;
static {
String[] descriptorData = {"\n\022LayoutEngine.proto*2\n\014LayoutEngine\022\r\n\t" + "ALGORITHM\020\000\022\006\n\002AI\020\001\022\013\n\007OUTLINE\020\002b\006proto3"};
descriptor = com.google.protobuf.Descriptors.FileDescriptor.internalBuildGeneratedFileFrom(descriptorData, new com.google.protobuf.Descriptors.FileDescriptor[]{});
descriptor.resolveAllFeaturesImmutable();
}
// @@protoc_insertion_point(outer_class_scope)
}

View File

@@ -1,24 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
import java.io.Serializable;
import java.util.Locale;
@Deprecated
public enum NodeType implements Serializable {
DOCUMENT,
SECTION,
SUPER_SECTION,
HEADLINE,
PARAGRAPH,
TABLE,
TABLE_CELL,
IMAGE,
HEADER,
FOOTER;
public String toString() {
return this.name().charAt(0) + this.name().substring(1).toLowerCase(Locale.ROOT);
}
}

View File

@@ -1,274 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
import java.util.Locale;
// Generated by the protocol buffer compiler. DO NOT EDIT!
// NO CHECKED-IN PROTOBUF GENCODE
// source: NodeType.proto
// Protobuf Java Version: 4.27.1
@SuppressWarnings("all")
public final class NodeTypeProto {
private NodeTypeProto() {}
static {
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
/* major= */ 4,
/* minor= */ 27,
/* patch= */ 1,
/* suffix= */ "", NodeTypeProto.class.getName());
}
public static void registerAllExtensions(com.google.protobuf.ExtensionRegistryLite registry) {
}
public static void registerAllExtensions(com.google.protobuf.ExtensionRegistry registry) {
registerAllExtensions((com.google.protobuf.ExtensionRegistryLite) registry);
}
/**
* Protobuf enum {@code NodeType}
*/
public enum NodeType implements com.google.protobuf.ProtocolMessageEnum {
/**
* <code>DOCUMENT = 0;</code>
*/
DOCUMENT(0),
/**
* <code>SECTION = 1;</code>
*/
SECTION(1),
/**
* <code>SUPER_SECTION = 2;</code>
*/
SUPER_SECTION(2),
/**
* <code>HEADLINE = 3;</code>
*/
HEADLINE(3),
/**
* <code>PARAGRAPH = 4;</code>
*/
PARAGRAPH(4),
/**
* <code>TABLE = 5;</code>
*/
TABLE(5),
/**
* <code>TABLE_CELL = 6;</code>
*/
TABLE_CELL(6),
/**
* <code>IMAGE = 7;</code>
*/
IMAGE(7),
/**
* <code>HEADER = 8;</code>
*/
HEADER(8),
/**
* <code>FOOTER = 9;</code>
*/
FOOTER(9),
UNRECOGNIZED(-1),
;
public String toString() {
return this.name().charAt(0) + this.name().substring(1).toLowerCase(Locale.ROOT);
}
static {
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
/* major= */ 4,
/* minor= */ 27,
/* patch= */ 1,
/* suffix= */ "", NodeType.class.getName());
}
/**
* <code>DOCUMENT = 0;</code>
*/
public static final int DOCUMENT_VALUE = 0;
/**
* <code>SECTION = 1;</code>
*/
public static final int SECTION_VALUE = 1;
/**
* <code>SUPER_SECTION = 2;</code>
*/
public static final int SUPER_SECTION_VALUE = 2;
/**
* <code>HEADLINE = 3;</code>
*/
public static final int HEADLINE_VALUE = 3;
/**
* <code>PARAGRAPH = 4;</code>
*/
public static final int PARAGRAPH_VALUE = 4;
/**
* <code>TABLE = 5;</code>
*/
public static final int TABLE_VALUE = 5;
/**
* <code>TABLE_CELL = 6;</code>
*/
public static final int TABLE_CELL_VALUE = 6;
/**
* <code>IMAGE = 7;</code>
*/
public static final int IMAGE_VALUE = 7;
/**
* <code>HEADER = 8;</code>
*/
public static final int HEADER_VALUE = 8;
/**
* <code>FOOTER = 9;</code>
*/
public static final int FOOTER_VALUE = 9;
public final int getNumber() {
if (this == UNRECOGNIZED) {
throw new IllegalArgumentException("Can't get the number of an unknown enum value.");
}
return value;
}
/**
* @param value The numeric wire value of the corresponding enum entry.
* @return The enum associated with the given numeric wire value.
* @deprecated Use {@link #forNumber(int)} instead.
*/
@Deprecated
public static NodeType valueOf(int value) {
return forNumber(value);
}
/**
* @param value The numeric wire value of the corresponding enum entry.
* @return The enum associated with the given numeric wire value.
*/
public static NodeType forNumber(int value) {
switch (value) {
case 0:
return DOCUMENT;
case 1:
return SECTION;
case 2:
return SUPER_SECTION;
case 3:
return HEADLINE;
case 4:
return PARAGRAPH;
case 5:
return TABLE;
case 6:
return TABLE_CELL;
case 7:
return IMAGE;
case 8:
return HEADER;
case 9:
return FOOTER;
default:
return null;
}
}
public static com.google.protobuf.Internal.EnumLiteMap<NodeType> internalGetValueMap() {
return internalValueMap;
}
private static final com.google.protobuf.Internal.EnumLiteMap<NodeType> internalValueMap = new com.google.protobuf.Internal.EnumLiteMap<NodeType>() {
public NodeType findValueByNumber(int number) {
return NodeType.forNumber(number);
}
};
public final com.google.protobuf.Descriptors.EnumValueDescriptor getValueDescriptor() {
if (this == UNRECOGNIZED) {
throw new IllegalStateException("Can't get the descriptor of an unrecognized enum value.");
}
return getDescriptor().getValues()
.get(ordinal());
}
public final com.google.protobuf.Descriptors.EnumDescriptor getDescriptorForType() {
return getDescriptor();
}
public static final com.google.protobuf.Descriptors.EnumDescriptor getDescriptor() {
return NodeTypeProto.getDescriptor().getEnumTypes()
.get(0);
}
private static final NodeType[] VALUES = values();
public static NodeType valueOf(com.google.protobuf.Descriptors.EnumValueDescriptor desc) {
if (desc.getType() != getDescriptor()) {
throw new IllegalArgumentException("EnumValueDescriptor is not for this type.");
}
if (desc.getIndex() == -1) {
return UNRECOGNIZED;
}
return VALUES[desc.getIndex()];
}
private final int value;
private NodeType(int value) {
this.value = value;
}
// @@protoc_insertion_point(enum_scope:NodeType)
}
public static com.google.protobuf.Descriptors.FileDescriptor getDescriptor() {
return descriptor;
}
private static com.google.protobuf.Descriptors.FileDescriptor descriptor;
static {
String[] descriptorData = {"\n\016NodeType.proto*\223\001\n\010NodeType\022\014\n\010DOCUMEN"
+ "T\020\000\022\013\n\007SECTION\020\001\022\021\n\rSUPER_SECTION\020\002\022\014\n\010H"
+ "EADLINE\020\003\022\r\n\tPARAGRAPH\020\004\022\t\n\005TABLE\020\005\022\016\n\nT"
+ "ABLE_CELL\020\006\022\t\n\005IMAGE\020\007\022\n\n\006HEADER\020\010\022\n\n\006FO"
+ "OTER\020\tb\006proto3"};
descriptor = com.google.protobuf.Descriptors.FileDescriptor.internalBuildGeneratedFileFrom(descriptorData, new com.google.protobuf.Descriptors.FileDescriptor[]{});
descriptor.resolveAllFeaturesImmutable();
}
// @@protoc_insertion_point(outer_class_scope)
}

View File

@@ -8,13 +8,20 @@ import lombok.Builder;
 @Builder
 @Schema(description = "Object containing information about the layout parsing.")
 public record LayoutParsingFinishedEvent(
-@Schema(description = "General purpose identifier. It is returned exactly the same way it is inserted with the LayoutParsingRequest.")
-Map<String, String> identifier,//
-@Schema(description = "The duration of a single layout parsing in ms.")
-long duration,//
-@Schema(description = "The number of pages of the parsed document.")
-int numberOfPages,//
-@Schema(description = "A general message. It contains some information useful for a developer, like the paths where the files are stored. Not meant to be machine readable.")
-String message) {
+@Schema(description = "General purpose identifier. It is returned exactly the same way it is inserted with the LayoutParsingRequest.") //
+Map<String, String> identifier,
+@Schema(description = "The duration of a single layout parsing in ms.") //
+long duration,
+@Schema(description = "The number of pages of the parsed document.") //
+int numberOfPages,
+@Schema(description = "A general message. It contains some information useful for a developer, like the paths where the files are stored. Not meant to be machine readable.") //
+String message,
+@Schema(description = "The app version of the layout parser.") //
+String layoutParserVersion
+) {
 }
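
A hedged sketch of how a producer populates the record after this change, including the new layoutParserVersion component (all values are placeholders; the builder comes from Lombok's @Builder on the record):

LayoutParsingFinishedEvent event = LayoutParsingFinishedEvent.builder()
        .identifier(java.util.Map.of("fileId", "example-file")) // echoed back from the LayoutParsingRequest
        .duration(1234L)                                        // duration in ms
        .numberOfPages(12)
        .message("debug artifacts stored under <path>")
        .layoutParserVersion("0.0.0-example")                   // app version of the layout parser
        .build();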

View File

@@ -1,21 +0,0 @@
syntax = "proto3";
message AllDocumentPages {
repeated DocumentPage documentPages = 1;
}
message DocumentPage {
// The page number, starting with 1.
int32 number = 1;
// The page height in PDF user units.
int32 height = 2;
// The page width in PDF user units.
int32 width = 3;
// The page rotation as specified by the PDF.
int32 rotation = 4;
}

View File

@@ -1,25 +0,0 @@
syntax = "proto3";
message AllDocumentPositionData {
repeated DocumentPositionData documentPositionData = 1;
}
message DocumentPositionData {
// Identifier of the text block.
int64 id = 1;
// For each string coordinate in the search text of the text block, the array contains an entry relating the string coordinate to the position coordinate.
// This is required due to the text and position coordinates not being equal.
repeated int32 stringIdxToPositionIdx = 2;
// The bounding box for each glyph as a rectangle. This matrix is of size (n,4), where n is the number of glyphs in the text block.
// The second dimension specifies the rectangle with the value x, y, width, height, with x, y specifying the lower left corner.
// In order to access this information, the stringIdxToPositionIdx array must be used to transform the coordinates.
repeated Position positions = 3;
// Definition of a BoundingBox that contains x, y, width, and height.
message Position {
repeated float value = 1;
}
}
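
For orientation, a hedged Java sketch (placeholder values; standard protobuf builders, assuming the generated outer class DocumentPositionDataProto referenced in the deleted Java files above) of how the (n,4) glyph matrix maps onto repeated Position rows:

// Two string offsets can share one glyph (e.g. a ligature), hence the explicit
// stringIdxToPositionIdx mapping; each Position row holds {x, y, width, height}.
DocumentPositionDataProto.DocumentPositionData block =
        DocumentPositionDataProto.DocumentPositionData.newBuilder()
                .setId(1L)
                .addAllStringIdxToPositionIdx(java.util.List.of(0, 0, 1))
                .addPositions(DocumentPositionDataProto.DocumentPositionData.Position.newBuilder()
                        .addValue(56.7f).addValue(708.0f).addValue(11.2f).addValue(12.0f))
                .addPositions(DocumentPositionDataProto.DocumentPositionData.Position.newBuilder()
                        .addValue(67.9f).addValue(708.0f).addValue(10.4f).addValue(12.0f))
                .build();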

View File

@@ -1,8 +0,0 @@
syntax = "proto3";
import "EntryData.proto";
message DocumentStructure {
// The root EntryData represents the Document.
EntryData root = 1;
}

View File

@@ -1,29 +0,0 @@
syntax = "proto3";
message AllDocumentTextData {
repeated DocumentTextData documentTextData = 1;
}
message DocumentTextData {
// Identifier of the text block.
int64 id = 1;
// The page the text block occurs on.
int64 page = 2;
// The text of the text block.
string searchText = 3;
// Each text block is assigned a number on a page, starting from 0.
int32 numberOnPage = 4;
// The text blocks are ordered, this number represents the start of the text block as a string offset.
int32 start = 5;
// The text blocks are ordered, this number represents the end of the text block as a string offset.
int32 end = 6;
// The line breaks in the text of this semantic node in string offsets. They are exclusive end. At the end of each semantic node there is an implicit linebreak.
repeated int32 lineBreaks = 7;
}

View File

@@ -1,27 +0,0 @@
syntax = "proto3";
import "LayoutEngine.proto";
import "NodeType.proto";
message EntryData {
// Type of the semantic node.
NodeType type = 1;
// Specifies the position in the parsed tree structure.
repeated int32 treeId = 2;
// Specifies the text block IDs associated with this semantic node.
repeated int64 atomicBlockIds = 3;
// Specifies the pages this semantic node appears on.
repeated int64 pageNumbers = 4;
// Some semantic nodes have additional information, this information is stored in this Map.
map<string, string> properties = 5;
// All child Entries of this Entry.
repeated EntryData children = 6;
// Describes the origin of the semantic node.
repeated LayoutEngine engines = 7;
}

View File

@@ -1,7 +0,0 @@
syntax = "proto3";
enum LayoutEngine {
ALGORITHM = 0;
AI = 1;
OUTLINE = 2;
}

View File

@@ -1,14 +0,0 @@
syntax = "proto3";
enum NodeType {
DOCUMENT = 0;
SECTION = 1;
SUPER_SECTION = 2;
HEADLINE = 3;
PARAGRAPH = 4;
TABLE = 5;
TABLE_CELL = 6;
IMAGE = 7;
HEADER = 8;
FOOTER = 9;
}

View File

@@ -8,10 +8,12 @@ description = "layoutparser-service-processor"
 val jacksonVersion = "2.15.2"
 val pdfBoxVersion = "3.0.0"
 dependencies {
 implementation(project(":layoutparser-service-internal-api"))
 implementation(project(":viewer-doc-processor"))
+implementation("com.knecon.fforesight:document:${rootProject.extra.get("documentVersion")}")
 implementation("com.iqser.red.service:persistence-service-shared-api-v1:2.564.0-RED9010.0") {
 exclude("org.springframework.boot", "spring-boot-starter-security")
 exclude("org.springframework.boot", "spring-boot-starter-validation")
@@ -35,6 +37,4 @@ dependencies {
 implementation("org.commonmark:commonmark-ext-gfm-tables:0.22.0")
 implementation("com.pdftron:PDFNet:10.11.0")
 implementation("org.apache.commons:commons-text:1.12.0")
-implementation("com.google.protobuf:protobuf-java-util:4.27.1")
 }

View File

@@ -13,9 +13,8 @@ import lombok.experimental.FieldDefaults;
 @Configuration
 @ConfigurationProperties("layoutparser")
 @FieldDefaults(level = AccessLevel.PRIVATE)
-public class LayoutparserSettings {
+public class LayoutParserSettings {
 boolean debug;
 LayoutParsingType layoutParsingTypeOverride;
-String pdftronLicense;
 }

View File

@ -20,13 +20,17 @@ import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
+ import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
- import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
+ import com.iqser.red.service.redaction.v1.server.mapper.DocumentDataMapper;
+ import com.iqser.red.service.redaction.v1.server.model.document.nodes.ImageType;
+ import com.iqser.red.service.redaction.v1.server.model.document.nodes.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
+ import com.knecon.fforesight.service.layoutparser.processor.model.DocumentWithVisualization;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
@ -35,14 +39,12 @@ import com.knecon.fforesight.service.layoutparser.processor.services.mapper.Mark
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
- import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
- import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
- import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
+ import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeBuilderService;
- import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineValidationService;
+ import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeEnhancementService;
- import com.knecon.fforesight.service.layoutparser.processor.model.outline.TOCEnrichmentService;
+ import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTree;
- import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
@ -66,7 +68,6 @@ import com.knecon.fforesight.service.layoutparser.processor.services.blockificat
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.Box;
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.GraphicExtractorService;
- import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
@ -87,29 +88,32 @@ import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@RequiredArgsConstructor
- @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
+ @FieldDefaults(level = AccessLevel.PRIVATE)
public class LayoutParsingPipeline {
- ImageServiceResponseAdapter imageServiceResponseAdapter;
+ final ImageServiceResponseAdapter imageServiceResponseAdapter;
- CvTableParsingAdapter cvTableParsingAdapter;
+ final CvTableParsingAdapter cvTableParsingAdapter;
- LayoutParsingStorageService layoutParsingStorageService;
+ final LayoutParsingStorageService layoutParsingStorageService;
- SectionsBuilderService sectionsBuilderService;
+ final SectionsBuilderService sectionsBuilderService;
- SimplifiedSectionTextService simplifiedSectionTextService;
+ final SimplifiedSectionTextService simplifiedSectionTextService;
- RulingCleaningService rulingCleaningService;
+ final RulingCleaningService rulingCleaningService;
- TableExtractionService tableExtractionService;
+ final TableExtractionService tableExtractionService;
- DocuMineBlockificationService docuMineBlockificationService;
+ final DocuMineBlockificationService docuMineBlockificationService;
- RedactManagerBlockificationService redactManagerBlockificationService;
+ final RedactManagerBlockificationService redactManagerBlockificationService;
- BlockificationPostprocessingService blockificationPostprocessingService;
+ final BlockificationPostprocessingService blockificationPostprocessingService;
- DocstrumBlockificationService docstrumBlockificationService;
+ final DocstrumBlockificationService docstrumBlockificationService;
- LayoutGridService layoutGridService;
+ final LayoutGridService layoutGridService;
- ObservationRegistry observationRegistry;
+ final ObservationRegistry observationRegistry;
- VisualLayoutParsingAdapter visualLayoutParsingAdapter;
+ final VisualLayoutParsingAdapter visualLayoutParsingAdapter;
- GraphicExtractorService graphicExtractorService;
+ final GraphicExtractorService graphicExtractorService;
- OutlineExtractorService outlineExtractorService;
+ final OutlineExtractorService outlineExtractorService;
- OutlineValidationService outlineValidationService;
+ final SectionTreeBuilderService sectionTreeBuilderService;
- TOCEnrichmentService tocEnrichmentService;
+ final SectionTreeEnhancementService sectionTreeEnhancementService;
- LayoutparserSettings settings;
+ final LayoutParserSettings settings;
- ClassificationService classificationService;
+ final ClassificationService classificationService;
+ @Value("${LAYOUT_PARSER_VERSION:}")
+ private String layoutParserVersion;
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
@ -118,17 +122,23 @@ public class LayoutParsingPipeline {
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier()); log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId()); File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile); File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId())
.orElse(originFile);
VisualLayoutParsingResponse visualLayoutParsingResponse = layoutParsingRequest.visualLayoutParsingFileId() VisualLayoutParsingResponse visualLayoutParsingResponse = layoutParsingRequest.visualLayoutParsingFileId()
.map(layoutParsingStorageService::getVisualLayoutParsingFile).orElse(new VisualLayoutParsingResponse()); .map(layoutParsingStorageService::getVisualLayoutParsingFile)
.orElse(new VisualLayoutParsingResponse());
ImageServiceResponse imageServiceResponse = layoutParsingRequest.imagesFileStorageId() ImageServiceResponse imageServiceResponse = layoutParsingRequest.imagesFileStorageId()
.map(layoutParsingStorageService::getImagesFile).orElse(new ImageServiceResponse()); .map(layoutParsingStorageService::getImagesFile)
.orElse(new ImageServiceResponse());
TableServiceResponse tableServiceResponse = layoutParsingRequest.tablesFileStorageId() TableServiceResponse tableServiceResponse = layoutParsingRequest.tablesFileStorageId()
.map(layoutParsingStorageService::getTablesFile).orElse(new TableServiceResponse()); .map(layoutParsingStorageService::getTablesFile)
.orElse(new TableServiceResponse());
ClassificationDocument classificationDocument = parseLayout(settings.getLayoutParsingTypeOverride() == null // LayoutParsingType layoutParsingType = settings.getLayoutParsingTypeOverride() == null //
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(), ? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride();
ClassificationDocument classificationDocument = parseLayout(layoutParsingType,
originFile, originFile,
imageServiceResponse, imageServiceResponse,
tableServiceResponse, tableServiceResponse,
@ -137,25 +147,26 @@ public class LayoutParsingPipeline {
log.info("Building document graph for {}", layoutParsingRequest.identifier()); log.info("Building document graph for {}", layoutParsingRequest.identifier());
Document documentGraph = observeBuildDocumentGraph(settings.getLayoutParsingTypeOverride() == null // DocumentWithVisualization documentWithVisualization = observeBuildDocumentGraph(layoutParsingType, classificationDocument);
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(), classificationDocument);
log.info("Creating viewer document for {}", layoutParsingRequest.identifier()); log.info("Creating viewer document for {}", layoutParsingRequest.identifier());
layoutGridService.addLayoutGrid(viewerDocumentFile, documentGraph, viewerDocumentFile, false); layoutGridService.addLayoutGrid(viewerDocumentFile, documentWithVisualization, viewerDocumentFile, layoutParsingType, layoutParserVersion, false);
log.info("Storing resulting files for {}", layoutParsingRequest.identifier()); log.info("Storing resulting files for {}", layoutParsingRequest.identifier());
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph)); layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentWithVisualization.document()));
if (layoutParsingRequest.documentMarkdownFileStorageId().isPresent()) { if (layoutParsingRequest.documentMarkdownFileStorageId()
layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId().get(), new MarkdownMapper().toMarkdownContent(documentGraph)); .isPresent()) {
layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId()
.get(), new MarkdownMapper().toMarkdownContent(documentWithVisualization.document()));
} }
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph)); layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentWithVisualization.document()));
layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile); layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile);
if (layoutParsingRequest.researchDocumentStorageId() != null) { if (layoutParsingRequest.researchDocumentStorageId() != null) {
log.info("Building research document data for {}", layoutParsingRequest.identifier()); log.info("Building research document data for {}", layoutParsingRequest.identifier());
var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentGraph); var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentWithVisualization.document());
layoutParsingStorageService.storeResearchDocumentData(layoutParsingRequest, researchDocumentData); layoutParsingStorageService.storeResearchDocumentData(layoutParsingRequest, researchDocumentData);
} }
@ -166,7 +177,7 @@ public class LayoutParsingPipeline {
return LayoutParsingFinishedEvent.builder()
.identifier(layoutParsingRequest.identifier())
- .numberOfPages(documentGraph.getNumberOfPages())
+ .numberOfPages(documentWithVisualization.document().getNumberOfPages())
.duration(System.currentTimeMillis() - start)
.message(format("""
Layout parsing has finished in %.02f s.
@ -181,21 +192,22 @@ public class LayoutParsingPipeline {
Viewer Doc: %s""",
((float) (System.currentTimeMillis() - start)) / 1000,
layoutParsingRequest.identifier(),
- buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()),
+ buildSemanticNodeCountMessage(documentWithVisualization.document().getNumberOfPages(), documentWithVisualization.buildSemanticNodeCounts()),
layoutParsingRequest.structureFileStorageId(),
layoutParsingRequest.textBlockFileStorageId(),
layoutParsingRequest.positionBlockFileStorageId(),
layoutParsingRequest.pageFileStorageId(),
layoutParsingRequest.simplifiedTextStorageId(),
layoutParsingRequest.viewerDocumentStorageId()))
+ .layoutParserVersion(layoutParserVersion)
.build();
}
- private Document observeBuildDocumentGraph(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument) {
+ private DocumentWithVisualization observeBuildDocumentGraph(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument) {
- AtomicReference<Document> documentReference = new AtomicReference<>();
+ AtomicReference<DocumentWithVisualization> documentReference = new AtomicReference<>();
Observation.createNotStarted("LayoutParsingPipeline", observationRegistry)
.contextualName("build-document-graph")
@ -345,14 +357,14 @@ public class LayoutParsingPipeline {
classificationService.classify(classificationDocument, layoutParsingType, identifier);
- TableOfContents tableOfContents = outlineValidationService.createToC(classificationDocument);
+ SectionTree sectionTree = sectionTreeBuilderService.createSectionTree(classificationDocument);
- classificationDocument.setTableOfContents(tableOfContents);
+ classificationDocument.setSectionTree(sectionTree);
log.info("Building Sections for {}", identifier);
switch (layoutParsingType) {
case CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_PARAGRAPH_DEBUG -> sectionsBuilderService.buildParagraphDebugSections(classificationDocument);
- default -> tocEnrichmentService.assignSectionBlocksAndImages(classificationDocument);
+ default -> sectionTreeEnhancementService.assignSectionBlocksAndImages(classificationDocument);
}
return classificationDocument;
@ -385,7 +397,8 @@ public class LayoutParsingPipeline {
.flatMap(Collection::stream)
.map(Character::getTextPosition)
.filter(pos -> pos.getDir().equals(dir))
- .mapToDouble(RedTextPosition::getExactDir).average().orElse(0);
+ .mapToDouble(RedTextPosition::getExactDir).average()
+ .orElse(0);
if (averageRotation == 0) {
continue;

View File

@ -17,8 +17,8 @@ import org.springframework.core.task.TaskExecutor;
import org.springframework.stereotype.Service;
import com.fasterxml.jackson.databind.ObjectMapper;
+ import com.iqser.red.service.redaction.v1.server.data.DocumentData;
import com.iqser.red.storage.commons.service.StorageService;
- import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedText;
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
@ -122,7 +122,7 @@ public class LayoutParsingStorageService {
Runnable storeDocumentPositionsRunnable = () -> storageService.storeProtoObject(TenantContext.getTenantId(),
layoutParsingRequest.positionBlockFileStorageId(),
- documentData.getDocumentPositions());
+ documentData.getDocumentPositionData());
CompletableFuture<Void> storeDocumentPositionsFuture = CompletableFuture.runAsync(storeDocumentPositionsRunnable, taskExecutor);

View File

@ -24,7 +24,7 @@ import lombok.EqualsAndHashCode;
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = false)
public class Line extends TextBoundingBox {
- private static final double WORD_DISTANCE_MULTIPLIER = 0.18;
+ private static final double WORD_DISTANCE_MULTIPLIER = 0.17;
@EqualsAndHashCode.Include
private final double x0;
@ -157,6 +157,9 @@ public class Line extends TextBoundingBox {
private void computeWords(List<Character> characters, double wordSpacing) {
+ // Imo, the width of space should be scaled with the font size, but it only depends on the median distance between horizontal neighbours.
+ // If there are large differences in fontsize on a page, this might lead to missing spaces for the smaller fonts and too many for larger fonts.
+ // I've just now changed the scaling factor. If you come across this comment with missing whitespaces again, try scaling the fontsize instead of simply changing the factor again.
Word word = new Word();
Character previous = null;
for (Character current : characters) {
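The new comment proposes deriving the word gap from the glyph font size rather than from a page-wide median distance. A minimal, hypothetical sketch of that idea follows; the class, method and parameter names are illustrative and not part of this merge request:

final class WordSpacingHeuristic {

    // Hypothetical sketch of the comment's suggestion: treat a gap as a word
    // boundary when it exceeds a multiple of the neighbouring glyph's font size,
    // so small fonts tolerate narrow gaps and large fonts require wider ones.
    static boolean isWordBoundary(double gapToPrevious, double fontSize, double multiplier) {
        return gapToPrevious > multiplier * fontSize;
    }
}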

View File

@ -4,7 +4,7 @@ import java.util.HashSet;
import java.util.Set;
import com.fasterxml.jackson.annotation.JsonIgnore;
- import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto.LayoutEngine;
+ import com.iqser.red.service.redaction.v1.server.model.document.nodes.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;

View File

@ -4,7 +4,7 @@ import java.util.ArrayList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
- import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
+ import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTree;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
@ -31,6 +31,6 @@ public class ClassificationDocument {
private long rulesVersion;
private OutlineObjectTree outlineObjectTree;
- private TableOfContents tableOfContents;
+ private SectionTree sectionTree;
}

View File

@ -0,0 +1,19 @@
package com.knecon.fforesight.service.layoutparser.processor.model;
import java.util.Map;
import java.util.stream.Collectors;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.NodeType;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
public record DocumentWithVisualization(Document document, LayoutDebugLayer layoutDebugLayer) {
public Map<NodeType, Long> buildSemanticNodeCounts() {
return document.streamAllSubNodes()
.collect(Collectors.groupingBy(SemanticNode::getType, Collectors.counting()));
}
}

View File

@ -14,6 +14,7 @@ public enum PageBlockType {
PARAGRAPH_ITALIC,
PARAGRAPH_UNKNOWN,
OTHER,
+ TABLE_OF_CONTENTS_HEADLINE,
TABLE_OF_CONTENTS_ITEM,
LIST_ITEM,
TABLE;
@ -35,7 +36,7 @@ public enum PageBlockType {
public static int getHeadlineNumber(PageBlockType pageBlockType) {
return switch (pageBlockType) {
- case H1 -> 1;
+ case H1, TABLE_OF_CONTENTS_HEADLINE -> 1;
case H2 -> 2;
case H3 -> 3;
case H4 -> 4;
@ -47,6 +48,6 @@ public enum PageBlockType {
public boolean isHeadline() {
- return this.equals(H1) || this.equals(H2) || this.equals(H3) || this.equals(H4) || this.equals(H5) || this.equals(H6);
+ return this.equals(H1) || this.equals(H2) || this.equals(H3) || this.equals(H4) || this.equals(H5) || this.equals(H6) || this.equals(TABLE_OF_CONTENTS_HEADLINE);
}
}

View File

@ -16,7 +16,7 @@ import lombok.experimental.FieldDefaults;
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class SectionIdentifier {
- public static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d{1,2})[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?");
+ public static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d{1,2})(?:[\\s.,;](\\d{1,2}))?(?:[\\s.,;](\\d{1,2}))?(?:[\\s.,;](\\d{1,2}))?");
public static Pattern alphanumericIdentifierPattern = Pattern.compile("^[\\s]?[A-Za-z][\\s.,;]?(\\d{1,2})[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?");
public enum Format {
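The tightened numerical pattern now requires an explicit separator before each sub-level, so an unbroken run of digits no longer decomposes into multiple levels (the old pattern would read "123" as level "12" with sub-level "3"). A small, self-contained check of that behaviour; only the new pattern from this diff is used, the class name and sample input are illustrative:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Illustrative demo: with the updated pattern, "123 Scope" yields "12" as the
// first level and no second level, because no separator follows the digits.
final class NumericalIdentifierPatternDemo {

    static final Pattern NUMERICAL = Pattern.compile("^[\\s]?(\\d{1,2})(?:[\\s.,;](\\d{1,2}))?(?:[\\s.,;](\\d{1,2}))?(?:[\\s.,;](\\d{1,2}))?");

    public static void main(String[] args) {
        Matcher matcher = NUMERICAL.matcher("123 Scope");
        if (matcher.find()) {
            System.out.println(matcher.group(1)); // prints 12
            System.out.println(matcher.group(2)); // prints null - no separator, so no sub-level
        }
    }
}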

View File

@ -1,94 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Header;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
public abstract class AbstractNodeVisitor implements NodeVisitor {
@Override
public void visit(Document document) {
visitChildren(document);
}
@Override
public void visit(SuperSection superSection) {
visitChildren(superSection);
}
@Override
public void visit(Section section) {
visitChildren(section);
}
@Override
public void visit(Headline headline) {
visitChildren(headline);
}
@Override
public void visit(Paragraph paragraph) {
visitChildren(paragraph);
}
@Override
public void visit(Footer footer) {
visitChildren(footer);
}
@Override
public void visit(Header header) {
visitChildren(header);
}
@Override
public void visit(Image image) {
visitChildren(image);
}
@Override
public void visit(Table table) {
visitChildren(table);
}
@Override
public void visit(TableCell tableCell) {
visitChildren(tableCell);
}
protected void visitChildren(SemanticNode semanticNode) {
semanticNode.streamChildren()
.forEach(node -> node.accept(this));
}
}

View File

@ -1,230 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph;
import static java.lang.String.format;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
@Data
@EqualsAndHashCode
public class DocumentTree {
private final Entry root;
public DocumentTree(Document document) {
root = Entry.builder().treeId(Collections.emptyList()).children(new LinkedList<>()).node(document).build();
}
public TextBlock buildTextBlock() {
return allEntriesInOrder().map(Entry::getNode)
.filter(SemanticNode::isLeaf)
.map(SemanticNode::getLeafTextBlock)
.collect(new TextBlockCollector());
}
public List<Integer> createNewMainEntryAndReturnId(GenericSemanticNode node) {
return createNewChildEntryAndReturnIdImpl(Collections.emptyList(), node);
}
public List<Integer> createNewChildEntryAndReturnId(GenericSemanticNode parentNode, GenericSemanticNode node) {
return createNewChildEntryAndReturnIdImpl(parentNode.getTreeId(), node);
}
public List<Integer> createNewChildEntryAndReturnId(GenericSemanticNode parentNode, Table node) {
return createNewChildEntryAndReturnIdImpl(parentNode.getTreeId(), node);
}
public List<Integer> createNewTableChildEntryAndReturnId(Table parentTable, TableCell tableCell) {
return createNewChildEntryAndReturnIdImpl(parentTable.getTreeId(), tableCell);
}
@SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong
private List<Integer> createNewChildEntryAndReturnIdImpl(List<Integer> parentId, SemanticNode node) {
if (!entryExists(parentId)) {
throw new IllegalArgumentException(format("parentId %s does not exist!", parentId));
}
Entry parent = getEntryById(parentId);
List<Integer> newId = new LinkedList<>(parentId);
newId.add(parent.children.size());
parent.children.add(Entry.builder().treeId(newId).node(node).build());
return newId;
}
private boolean entryExists(List<Integer> treeId) {
if (treeId.isEmpty()) {
return root != null;
}
Entry entry = root.children.get(treeId.get(0));
for (int id : treeId.subList(1, treeId.size())) {
if (id >= entry.children.size() || 0 > id) {
return false;
}
entry = entry.children.get(id);
}
return true;
}
public Entry getParentEntryById(List<Integer> treeId) {
return getEntryById(getParentId(treeId));
}
public boolean hasParentById(List<Integer> treeId) {
return !treeId.isEmpty();
}
public Stream<SemanticNode> childNodes(List<Integer> treeId) {
return getEntryById(treeId).children.stream()
.map(Entry::getNode);
}
public Stream<SemanticNode> childNodesOfType(List<Integer> treeId, NodeType nodeType) {
return getEntryById(treeId).children.stream()
.filter(entry -> entry.node.getType().equals(nodeType))
.map(Entry::getNode);
}
private static List<Integer> getParentId(List<Integer> treeId) {
if (treeId.isEmpty()) {
throw new UnsupportedOperationException("Root has no parent!");
}
if (treeId.size() < 2) {
return Collections.emptyList();
}
return treeId.subList(0, treeId.size() - 1);
}
public Entry getEntryById(List<Integer> treeId) {
if (treeId.isEmpty()) {
return root;
}
Entry entry = root;
for (int id : treeId) {
entry = entry.children.get(id);
}
return entry;
}
public Stream<Entry> mainEntries() {
return root.children.stream();
}
public Stream<Entry> allEntriesInOrder() {
return Stream.of(root)
.flatMap(DocumentTree::flatten);
}
public Stream<Entry> allSubEntriesInOrder(List<Integer> parentId) {
return getEntryById(parentId).children.stream()
.flatMap(DocumentTree::flatten);
}
@Override
public String toString() {
return String.join("\n",
allEntriesInOrder().map(Entry::toString)
.toList());
}
private static Stream<Entry> flatten(Entry entry) {
return Stream.concat(Stream.of(entry),
entry.children.stream()
.flatMap(DocumentTree::flatten));
}
public SemanticNode getHighestParentById(List<Integer> treeId) {
if (treeId.isEmpty()) {
return root.node;
}
return root.children.get(treeId.get(0)).node;
}
@Builder
@Getter
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true)
public static class Entry {
List<Integer> treeId;
SemanticNode node;
@Builder.Default
List<Entry> children = new LinkedList<>();
@Override
public String toString() {
return node.toString();
}
public NodeType getType() {
return node.getType();
}
}
}

View File

@ -1,45 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Header;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
public interface NodeVisitor {
void visit(Document document);
void visit(SuperSection superSection);
void visit(Section section);
void visit(Headline headline);
void visit(Paragraph paragraph);
void visit(Footer footer);
void visit(Header header);
void visit(Image image);
void visit(Table table);
void visit(TableCell tableCell);
}

View File

@ -1,166 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph;
import static java.lang.String.format;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
import java.util.stream.IntStream;
import lombok.EqualsAndHashCode;
import lombok.Setter;
@Setter
@EqualsAndHashCode
@SuppressWarnings("PMD.AvoidFieldNameMatchingMethodName")
public class TextRange implements Comparable<TextRange> {
private int start;
private int end;
public TextRange(int start, int end) {
if (start > end) {
throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
}
this.start = start;
this.end = end;
}
public int length() {
return end - start;
}
public int start() {
return start;
}
public int end() {
return end;
}
public boolean contains(TextRange textRange) {
return start <= textRange.start() && textRange.end() <= end;
}
public boolean containedBy(TextRange textRange) {
return textRange.contains(this);
}
public boolean contains(int start, int end) {
if (start > end) {
throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
}
return this.start <= start && end <= this.end;
}
public boolean containedBy(int start, int end) {
if (start > end) {
throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
}
return start <= this.start && this.end <= end;
}
public boolean contains(int index) {
return start <= index && index <= end;
}
public boolean containsExclusive(int index) {
return start <= index && index < end;
}
public boolean intersects(TextRange textRange) {
return textRange.start() < this.end && this.start < textRange.end();
}
public List<TextRange> split(List<Integer> splitIndices) {
if (splitIndices.stream()
.anyMatch(idx -> !this.contains(idx))) {
throw new IndexOutOfBoundsException(format("%s splitting indices are out of range for %s",
splitIndices.stream()
.filter(idx -> !this.contains(idx))
.toList(),
this));
}
List<TextRange> splitBoundaries = new LinkedList<>();
int previousIndex = start;
for (int splitIndex : splitIndices) {
// skip split if it would produce a boundary of length 0
if (splitIndex == previousIndex) {
continue;
}
splitBoundaries.add(new TextRange(previousIndex, splitIndex));
previousIndex = splitIndex;
}
if (previousIndex != end) {
splitBoundaries.add(new TextRange(previousIndex, end));
}
return splitBoundaries;
}
public IntStream intStream() {
return IntStream.range(start, end);
}
public static TextRange merge(Collection<TextRange> boundaries) {
int minStart = boundaries.stream()
.mapToInt(TextRange::start)
.min()
.orElseThrow(IllegalArgumentException::new);
int maxEnd = boundaries.stream()
.mapToInt(TextRange::end)
.max()
.orElseThrow(IllegalArgumentException::new);
return new TextRange(minStart, maxEnd);
}
@Override
public String toString() {
return format("Boundary [%d|%d)", start, end);
}
@Override
public int compareTo(TextRange textRange) {
if (end < textRange.end() && start < textRange.start()) {
return -1;
}
if (start > textRange.start() && end > textRange.end()) {
return 1;
}
return 0;
}
}

View File

@ -1,8 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.entity;
public enum EntityType {
ENTITY,
RECOMMENDATION,
FALSE_POSITIVE,
FALSE_RECOMMENDATION
}

View File

@ -1,24 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.entity;
import java.awt.geom.Rectangle2D;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.experimental.FieldDefaults;
@Data
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class RedactionPosition {
final String id;
Page page;
// Each entry in this list corresponds to an entry in the redaction log, this means:
// An entity might be represented by multiple redaction log entries
List<Rectangle2D> rectanglePerLine;
}

View File

@ -1,228 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.entity;
import java.awt.geom.Rectangle2D;
import java.util.Collection;
import java.util.Comparator;
import java.util.Deque;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Engine;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class TextEntity {
// initial values
@EqualsAndHashCode.Include
final TextRange textRange;
@EqualsAndHashCode.Include
final String type;
@EqualsAndHashCode.Include
final EntityType entityType;
// empty defaults
boolean redaction;
boolean removed;
boolean ignored;
boolean resized;
boolean skipRemoveEntitiesContainedInLarger;
boolean dictionaryEntry;
boolean dossierDictionaryEntry;
Set<Engine> engines;
Set<TextEntity> references;
@Builder.Default
Deque<Integer> matchedRules = new LinkedList<>();
String redactionReason;
String legalBasis;
// inferred on graph insertion
@EqualsAndHashCode.Include
String value;
String textBefore;
String textAfter;
@Builder.Default
Set<Page> pages = new HashSet<>();
List<RedactionPosition> redactionPositionsPerPage;
@Builder.Default
List<SemanticNode> intersectingNodes = new LinkedList<>();
SemanticNode deepestFullyContainingNode;
public static TextEntity initialEntityNode(TextRange textRange, String type, EntityType entityType) {
return TextEntity.builder().type(type).entityType(entityType).textRange(textRange).engines(new HashSet<>()).references(new HashSet<>()).build();
}
public boolean occursInNodeOfType(Class<? extends SemanticNode> clazz) {
return intersectingNodes.stream().anyMatch(clazz::isInstance);
}
public boolean occursInNode(SemanticNode semanticNode) {
return intersectingNodes.stream().anyMatch(node -> node.equals(semanticNode));
}
public boolean isType(String type) {
return this.type.equals(type);
}
public boolean isAnyType(List<String> types) {
return types.contains(type);
}
public void addIntersectingNode(SemanticNode containingNode) {
intersectingNodes.add(containingNode);
}
public void removeFromGraph() {
intersectingNodes.forEach(node -> node.getEntities().remove(this));
pages.forEach(page -> page.getEntities().remove(this));
intersectingNodes = new LinkedList<>();
deepestFullyContainingNode = null;
pages = new HashSet<>();
removed = true;
ignored = true;
}
public void addMatchedRule(int ruleNumber) {
matchedRules.add(ruleNumber);
}
public int getMatchedRule() {
if (matchedRules.isEmpty()) {
return 0;
}
return matchedRules.getLast();
}
public List<RedactionPosition> getRedactionPositionsPerPage() {
if (redactionPositionsPerPage == null || redactionPositionsPerPage.isEmpty()) {
Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = deepestFullyContainingNode.getTextBlock().getPositionsPerPage(textRange);
Page firstPage = rectanglesPerLinePerPage.keySet()
.stream()
.min(Comparator.comparingInt(Page::getNumber))
.orElseThrow(() -> new RuntimeException("No Positions found on any page!"));
String id = IdBuilder.buildId(pages, rectanglesPerLinePerPage.values().stream().flatMap(Collection::stream).toList());
redactionPositionsPerPage = rectanglesPerLinePerPage.entrySet().stream().map(entry -> buildRedactionPosition(firstPage, id, entry)).toList();
}
return redactionPositionsPerPage;
}
private static RedactionPosition buildRedactionPosition(Page firstPage, String id, Map.Entry<Page, List<Rectangle2D>> entry) {
if (entry.getKey().equals(firstPage)) {
return new RedactionPosition(id, entry.getKey(), entry.getValue());
} else {
return new RedactionPosition(id + "-" + entry.getKey().getNumber(), entry.getKey(), entry.getValue());
}
}
public boolean containedBy(TextEntity textEntity) {
return this.textRange.containedBy(textEntity.getTextRange());
}
public boolean contains(TextEntity textEntity) {
return this.textRange.contains(textEntity.getTextRange());
}
public boolean intersects(TextEntity textEntity) {
return this.textRange.intersects(textEntity.getTextRange());
}
public void addEngine(Engine engine) {
engines.add(engine);
}
public void addEngines(Set<Engine> engines) {
this.engines.addAll(engines);
}
public void addReference(TextEntity reference) {
references.add(reference);
}
public void addReferences(List<TextEntity> references) {
this.references.addAll(references);
}
public boolean matchesAnnotationId(String manualRedactionId) {
return getRedactionPositionsPerPage().stream().anyMatch(entityPosition -> entityPosition.getId().equals(manualRedactionId));
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append("Entity[\"");
sb.append(value);
sb.append("\", ");
sb.append(textRange);
sb.append(", pages[");
pages.forEach(page -> {
sb.append(page.getNumber());
sb.append(", ");
});
sb.delete(sb.length() - 2, sb.length());
sb.append("], type = \"");
sb.append(type);
sb.append("\", EntityType.");
sb.append(entityType);
sb.append("]");
return sb.toString();
}
}

View File

@ -1,74 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.TextEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Data
@SuperBuilder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public abstract class AbstractSemanticNode implements GenericSemanticNode {
@Builder.Default
Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
List<Integer> treeId;
TextBlock textBlock;
@EqualsAndHashCode.Exclude
DocumentTree documentTree;
@Builder.Default
@EqualsAndHashCode.Exclude
Set<TextEntity> entities = new HashSet<>();
@EqualsAndHashCode.Exclude
Map<Page, Rectangle2D> bBoxCache;
@Override
public TextBlock getTextBlock() {
if (textBlock == null) {
textBlock = GenericSemanticNode.super.getTextBlock();
}
return textBlock;
}
@Override
public String toString() {
return treeId.toString() + ": " + getType() + ": " + this.getTextBlock().buildSummary();
}
@Override
public Map<Page, Rectangle2D> getBBox() {
if (bBoxCache == null) {
bBoxCache = GenericSemanticNode.super.getBBox();
}
return bBoxCache;
}
}

View File

@ -1,173 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
@Data
@SuperBuilder
@AllArgsConstructor
@NoArgsConstructor
@EqualsAndHashCode(callSuper = true)
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Document extends AbstractSemanticNode {
Set<Page> pages;
Integer numberOfPages;
LayoutDebugLayer layoutDebugLayer;
@Override
public NodeType getType() {
return NodeType.DOCUMENT;
}
/**
* Gets the sections of the document as a list.
*
* @return A list of all sections within the document.
*/
public List<Section> getAllSections() {
return streamAllSubNodesOfType(NodeType.SECTION).map(node -> (Section) node)
.collect(Collectors.toList());
}
/**
* Gets the main sections of the document as a list.
*
* @return A list of main sections within the document
* @deprecated This method is marked for removal.
* Use {@link #streamChildrenOfType(NodeType)} instead,
* or {@link #getChildrenOfTypeSectionOrSuperSection()} which returns children of type SECTION as well as SUPER_SECTION.
*/
@Deprecated(forRemoval = true)
public List<Section> getMainSections() {
return streamChildrenOfType(NodeType.SECTION).map(node -> (Section) node)
.collect(Collectors.toList());
}
/**
* Gets the direct children of type SECTION or SUPER_SECTION of the document as a list of SemanticNode objects.
*
* @return A list of all children of type SECTION or SUPER_SECTION.
*/
public List<SemanticNode> getChildrenOfTypeSectionOrSuperSection() {
return streamChildren().filter(semanticNode -> semanticNode.getType().equals(NodeType.SECTION) || semanticNode.getType().equals(NodeType.SUPER_SECTION))
.toList();
}
public List<Header> getHeaders() {
return streamChildrenOfType(NodeType.HEADER).map(node -> (Header) node)
.collect(Collectors.toList());
}
public List<Footer> getFooters() {
return streamChildrenOfType(NodeType.FOOTER).map(node -> (Footer) node)
.collect(Collectors.toList());
}
@Override
public Headline getHeadline() {
return streamAllSubNodesOfType(NodeType.HEADLINE).map(node -> (Headline) node)
.findFirst().orElse(Headline.builder().build());
}
public Stream<TextBlock> streamTerminalTextBlocksInOrder() {
return streamAllNodes().filter(SemanticNode::isLeaf)
.map(SemanticNode::getTextBlock);
}
@Override
public List<Integer> getTreeId() {
return Collections.emptyList();
}
@Override
public void setTreeId(List<Integer> tocId) {
throw new UnsupportedOperationException("Document is always the root of the TablePageBlock of Contents");
}
private Stream<SemanticNode> streamAllNodes() {
return getDocumentTree().allEntriesInOrder()
.map(DocumentTree.Entry::getNode);
}
public Stream<Image> streamAllImages() {
return streamAllSubNodesOfType(NodeType.IMAGE).map(node -> (Image) node);
}
public Map<NodeType, Long> buildSemanticNodeCounts() {
return streamAllSubNodes().collect(Collectors.groupingBy(SemanticNode::getType, Collectors.counting()));
}
@Override
public String toString() {
return NodeType.DOCUMENT + ": " + this.getTextBlock().buildSummary();
}
@Override
public Map<Page, Rectangle2D> getBBox() {
Map<Page, Rectangle2D> bBox = new HashMap<>();
for (Page page : pages) {
bBox.put(page, new Rectangle2D.Double(0, 0, page.getWidth(), page.getHeight()));
}
return bBox;
}
@Override
public void accept(NodeVisitor visitor) {
visitor.visit(this);
}
}

View File

@ -1,28 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.SuperBuilder;
@Data
@EqualsAndHashCode(callSuper = true)
@SuperBuilder
public class DuplicatedParagraph extends Paragraph {
TextBlock unsortedLeafTextBlock;
@Override
public TextBlock getTextBlock() {
return Stream.of(super.getLeafTextBlock(), unsortedLeafTextBlock)
.collect(new TextBlockCollector());
}
}

View File

@ -1,57 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
@Data
@SuperBuilder
@AllArgsConstructor
@EqualsAndHashCode(callSuper = true)
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Footer extends AbstractSemanticNode {
TextBlock leafTextBlock;
@Override
public NodeType getType() {
return NodeType.FOOTER;
}
@Override
public boolean isLeaf() {
return true;
}
@Override
public void accept(NodeVisitor visitor) {
visitor.visit(this);
}
@Override
public TextBlock getTextBlock() {
return leafTextBlock;
}
@Override
public String toString() {
return getTreeId() + ": " + NodeType.FOOTER + ": " + leafTextBlock.buildSummary();
}
}

View File

@ -1,5 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
public interface GenericSemanticNode extends SemanticNode {
}

View File

@ -1,58 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
@Data
@SuperBuilder
@AllArgsConstructor
@EqualsAndHashCode(callSuper = true)
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Header extends AbstractSemanticNode {
TextBlock leafTextBlock;
@Override
public boolean isLeaf() {
return true;
}
@Override
public void accept(NodeVisitor visitor) {
visitor.visit(this);
}
@Override
public NodeType getType() {
return NodeType.HEADER;
}
@Override
public TextBlock getTextBlock() {
return leafTextBlock;
}
@Override
public String toString() {
return getTreeId() + ": " + NodeType.HEADER + ": " + leafTextBlock.buildSummary();
}
}

View File

@ -1,65 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
@Data
@SuperBuilder
@AllArgsConstructor
@EqualsAndHashCode(callSuper = true)
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Headline extends AbstractSemanticNode {
TextBlock leafTextBlock;
@Override
public NodeType getType() {
return NodeType.HEADLINE;
}
@Override
public boolean isLeaf() {
return true;
}
@Override
public void accept(NodeVisitor visitor) {
visitor.visit(this);
}
@Override
public TextBlock getTextBlock() {
return leafTextBlock;
}
@Override
public String toString() {
return getTreeId() + ": " + NodeType.HEADLINE + ": " + leafTextBlock.buildSummary();
}
@Override
public Headline getHeadline() {
return this;
}
}

View File

@ -1,115 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
@Data
@SuperBuilder
@AllArgsConstructor
@NoArgsConstructor
@EqualsAndHashCode(callSuper = true)
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Image extends AbstractSemanticNode {
String id;
String representationHash;
ImageType imageType;
boolean transparent;
Rectangle2D position;
TextBlock leafTextBlock;
boolean redaction;
boolean ignored;
@Builder.Default
String redactionReason = "";
@Builder.Default
String legalBasis = "";
@Builder.Default
int matchedRule = -1;
@EqualsAndHashCode.Exclude
Page page;
@Override
public NodeType getType() {
return NodeType.IMAGE;
}
@Override
public TextBlock getTextBlock() {
return leafTextBlock;
}
@Override
public Set<Page> getPages() {
return Collections.singleton(page);
}
@Override
public String toString() {
return getTreeId() + ": " + NodeType.IMAGE + ": " + imageType.toString() + " " + position;
}
@Override
public Map<Page, Rectangle2D> getBBox() {
Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
bBoxPerPage.put(page, position);
return bBoxPerPage;
}
@Override
public void accept(NodeVisitor visitor) {
visitor.visit(this);
}
@Override
public boolean isLeaf() {
return true;
}
public double getArea() {
return position.getWidth() * position.getHeight();
}
public boolean isFullPageImage() {
return imageType.equals(ImageType.OCR) || getArea() >= 0.5 * page.getArea();
}
}

View File

@ -1,26 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.util.Locale;
public enum ImageType {
LOGO,
FORMULA,
SIGNATURE,
SIGNATURE_VISUAL,
OTHER,
OCR,
GRAPHIC;
public static ImageType fromString(String imageType) {
return switch (imageType.toLowerCase(Locale.ROOT)) {
case "logo" -> ImageType.LOGO;
case "formula" -> ImageType.FORMULA;
case "signature" -> ImageType.SIGNATURE;
case "ocr" -> ImageType.OCR;
case "graphic" -> ImageType.GRAPHIC;
default -> ImageType.OTHER;
};
}
}

View File

@ -1,122 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.TextEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.Setter;
import lombok.experimental.FieldDefaults;
@Getter
@Setter
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Page {
Integer number;
Integer height;
Integer width;
Integer rotation;
@EqualsAndHashCode.Exclude
List<AtomicTextBlock> textBlocksOnPage;
@EqualsAndHashCode.Exclude
Header header;
@EqualsAndHashCode.Exclude
Footer footer;
@Builder.Default
@EqualsAndHashCode.Exclude
Set<TextEntity> entities = new HashSet<>();
@Builder.Default
@EqualsAndHashCode.Exclude
Set<Image> images = new HashSet<>();
public static Page fromClassificationPage(ClassificationPage classificationPage) {
return Page.builder()
.height((int) classificationPage.getPageHeight())
.width((int) classificationPage.getPageWidth())
.number(classificationPage.getPageNumber())
.rotation(classificationPage.getRotation())
.textBlocksOnPage(new LinkedList<>())
.build();
}
/**
* Constructs and returns a {@link TextBlock} representing the concatenated text of all leaf semantic nodes in the main body.
*
* @return The main body text block.
*/
public TextBlock getMainBodyTextBlock() {
return textBlocksOnPage.stream()
.filter(atb -> !atb.isEmpty())
.collect(new TextBlockCollector());
}
public List<SemanticNode> getMainBody() {
return textBlocksOnPage.stream()
.map(AtomicTextBlock::getParent)
.map(this::getHighestParentOnPage)
.distinct()
.toList();
}
private SemanticNode getHighestParentOnPage(SemanticNode node) {
SemanticNode currentNode = node;
while (currentNode.hasParent() && currentNode.getParent().onlyOnPage(this)) {
currentNode = currentNode.getParent();
}
return currentNode;
}
@Override
public String toString() {
return String.valueOf(number);
}
@Override
public int hashCode() {
return number;
}
@Override
public boolean equals(Object o) {
return o instanceof Page && o.hashCode() == this.hashCode();
}
public double getArea() {
return height * width;
}
}
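
A short usage sketch of the main-body helpers above; the method name is illustrative and assumes a document that has already been parsed into the graph.

static void printMainBody(Page page) {
    // Highest ancestors of the page's text blocks that stay on this page, without duplicates.
    for (SemanticNode node : page.getMainBody()) {
        System.out.println(node.getType() + ": " + node.getTextBlock().buildSummary());
    }
    // Concatenated text of every non-empty AtomicTextBlock on the page.
    System.out.println(page.getMainBodyTextBlock().getSearchText());
}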

View File

@ -1,51 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
@Data
@SuperBuilder
@AllArgsConstructor
@EqualsAndHashCode(callSuper = true)
@FieldDefaults(level = AccessLevel.PROTECTED)
public class Paragraph extends AbstractSemanticNode {
TextBlock leafTextBlock;
@Override
public NodeType getType() {
return NodeType.PARAGRAPH;
}
@Override
public boolean isLeaf() {
return true;
}
@Override
public void accept(NodeVisitor visitor) {
visitor.visit(this);
}
@Override
public TextBlock getTextBlock() {
return leafTextBlock;
}
}

View File

@ -1,54 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
@Data
@SuperBuilder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(callSuper = true)
public class Section extends AbstractSemanticNode {
@Override
public NodeType getType() {
return NodeType.SECTION;
}
public Headline getHeadline() {
return streamChildrenOfType(NodeType.HEADLINE).map(node -> (Headline) node)
.findFirst().orElseGet(() -> getParent().getHeadline());
}
@Override
public void accept(NodeVisitor visitor) {
visitor.visit(this);
}
public boolean hasTables() {
return streamAllSubNodesOfType(NodeType.TABLE).findAny().isPresent();
}
@Override
public String toString() {
return getTreeId() + ": " + NodeType.SECTION + ": " + this.getTextBlock().buildSummary();
}
}
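
A minimal sketch of the headline fallback above: a Section returns its first Headline child and otherwise delegates to its parent (illustrative usage only).

static String headlineText(Section section) {
    Headline headline = section.getHeadline(); // own Headline child, or the nearest ancestor's
    return headline.getTextBlock().getSearchText();
}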

View File

@ -1,521 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import static java.lang.String.format;
import java.awt.geom.Rectangle2D;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.EntityType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.TextEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
import com.knecon.fforesight.service.layoutparser.processor.utils.BBoxMergingUtility;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
public interface SemanticNode {
/**
* Returns the type of this node, such as Section, Paragraph, etc.
*
* @return NodeType of this node
*/
NodeType getType();
/**
* Searches all Nodes located underneath this Node in the DocumentTree and concatenates their AtomicTextBlocks into a single TextBlock.
* So, for a Section, all TextBlocks of its Subsections, Paragraphs, and Tables are concatenated into a single TextBlock.
* If the Node is a Leaf, the LeafTextBlock will be returned instead.
*
* @return TextBlock containing all AtomicTextBlocks that are located under this Node.
*/
default TextBlock getTextBlock() {
return streamAllSubNodes().filter(SemanticNode::isLeaf)
.map(SemanticNode::getTextBlock)
.collect(new TextBlockCollector());
}
/**
* Any Node maintains its own Set of Entities.
* This Set contains all Entities whose boundary intersects the boundary of this node.
*
* @return Set of all Entities associated with this Node
*/
Set<TextEntity> getEntities();
/**
* Each AtomicTextBlock is assigned a page, so to get the pages this node appears on, it collects the PageNodes from each AtomicTextBlock belonging to this node's TextBlock.
*
* @return Set of PageNodes this node appears on.
*/
default Set<Page> getPages() {
return getTextBlock().getPages();
}
default Page getFirstPage() {
return getTextBlock().getPages()
.stream()
.min(Comparator.comparingInt(Page::getNumber))
.orElseThrow(() -> new IllegalStateException("SemanticNode has no Page!"));
}
/**
* Each AtomicTextBlock is assigned a page, so to get the pages for this boundary, it collects the PageNodes from each AtomicTextBlock belonging to this node's TextBlock.
*
* @return Set of PageNodes this node appears on.
*/
default Set<Page> getPages(TextRange textRange) {
if (!getBoundary().contains(textRange)) {
throw new IllegalArgumentException(format("%s which was used to query for pages is not contained in the %s of this node!", textRange, getBoundary()));
}
return getTextBlock().getPages(textRange);
}
default boolean isOnPage(int pageNumber) {
return getPages().stream()
.anyMatch(page -> page.getNumber() == pageNumber);
}
/**
* Returns the DocumentTree Object.
*
* @return the DocumentTree of the Document this node belongs to
*/
DocumentTree getDocumentTree();
/**
* The id is a List of Integers uniquely identifying this node in the DocumentTree.
*
* @return the DocumentTree ID
*/
List<Integer> getTreeId();
/**
* This should only be used during graph construction.
*
* @param tocId List of Integers
*/
void setTreeId(List<Integer> tocId);
/**
* Traverses the tree upwards until it hits a Headline, or a Section which then returns the first Headline among its children.
* Throws NotFoundException if no Headline is found this way.
*
* @return First Headline found
*/
default Headline getHeadline() {
return getParent().getHeadline();
}
/**
* Checks if its TocId has a length greater than zero.
*
* @return boolean indicating whether this Node has a Parent in the DocumentTree
*/
default boolean hasParent() {
return getDocumentTree().hasParentById(getTreeId());
}
/**
* @return The SemanticNode representing the Parent in the DocumentTree
* Throws NotFoundException when no parent is present.
*/
default SemanticNode getParent() {
return getDocumentTree().getParentEntryById(getTreeId()).getNode();
}
/**
* @return The SemanticNode which sits directly underneath the document and under which this node lies;
* if this node is already such a top-level node, or the document itself, it returns itself.
*/
default SemanticNode getHighestParent() {
return getDocumentTree().getHighestParentById(getTreeId());
}
/**
* Leaf means a SemanticNode has direct access to a TextBlock; by default this is false and must be overridden.
* Currently Sections, SuperSections, and Tables are not leaves.
* A TableCell is only a leaf if it has no child nodes.
*
* @return boolean, indicating if a Node has direct access to a TextBlock
*/
default boolean isLeaf() {
return false;
}
/**
* Returns the TextBlock a leaf Node has direct access to.
* Non-leaf Nodes such as Sections and Tables do not own a LeafTextBlock and throw an UnsupportedOperationException instead.
*
* @return the LeafTextBlock of this Node
*/
default TextBlock getLeafTextBlock() {
throw new UnsupportedOperationException("Only leaf Nodes have access to LeafTextBlocks!");
}
/**
* Should only be used during construction of the Graph. Sets the LeafTextBlock of this SemanticNode.
*
* @param textBlock the TextBlock to set as the LeafTextBlock of this SemanticNode
*/
default void setLeafTextBlock(TextBlock textBlock) {
throw new UnsupportedOperationException();
}
/**
* Checks whether this SemanticNode has any Entity with EntityType.ENTITY of the provided type.
*
* @param type string representing the type of entity to check for
* @return true, if this SemanticNode has at least one Entity of the provided type
*/
default boolean hasEntitiesOfType(String type) {
return getEntities().stream()
.filter(entity -> entity.getEntityType().equals(EntityType.ENTITY))
.anyMatch(redactionEntity -> redactionEntity.getType().equals(type));
}
/**
* Returns a List of Entities in this SemanticNode which are of the provided type such as "CBI_author".
*
* @param type string representing the type of entities to return
* @return List of RedactionEntities of the provided type
*/
default List<TextEntity> getEntitiesOfType(String type) {
return getEntities().stream()
.filter(redactionEntity -> redactionEntity.getType().equals(type))
.toList();
}
/**
* Returns a List of Entities in this SemanticNode which have any of the provided types such as "CBI_author".
*
* @param types A list of strings representing the types of entities to return
* @return List of RedactionEntities of any provided type
*/
default List<TextEntity> getEntitiesOfType(List<String> types) {
return getEntities().stream()
.filter(redactionEntity -> redactionEntity.isAnyType(types))
.toList();
}
/**
* Each AtomicTextBlock has an index on its page; this returns the number of the first AtomicTextBlock underneath this node.
* If this node does not have any AtomicTextBlocks underneath it, e.g. an empty TableCell, it returns -1.
*
* @return Integer representing the number on the page
*/
default Integer getNumberOnPage() {
TextBlock textBlock = getTextBlock();
if (!textBlock.getAtomicTextBlocks().isEmpty()) {
return getTextBlock().getAtomicTextBlocks().get(0).getNumberOnPage();
} else {
return -1;
}
}
/**
* Checks if the SemanticNode contains any text.
*
* @return true, if this node's TextBlock is not empty
*/
default boolean hasText() {
return !getTextBlock().isEmpty();
}
/**
* Checks whether this SemanticNode contains the provided String.
*
* @param string A String which the TextBlock might contain
* @return true, if this node's TextBlock contains the string
*/
default boolean containsString(String string) {
return getTextBlock().getSearchText().contains(string);
}
/**
* Checks whether this SemanticNode contains all the provided Strings.
*
* @param strings A List of Strings which the TextBlock might contain
* @return true, if this node's TextBlock contains all strings
*/
default boolean containsStrings(List<String> strings) {
return strings.stream()
.allMatch(this::containsString);
}
/**
* Checks whether this SemanticNode contains the provided String ignoring case.
*
* @param string A String which the TextBlock might contain
* @return true, if this node's TextBlock contains the string ignoring case
*/
default boolean containsStringIgnoreCase(String string) {
return getTextBlock().getSearchText().toLowerCase(Locale.ROOT).contains(string.toLowerCase(Locale.ROOT));
}
/**
* Checks whether this SemanticNode contains any of the provided Strings.
*
* @param strings A List of Strings which the TextBlock might contain
* @return true, if this node's TextBlock contains any of the strings
*/
default boolean containsAnyString(List<String> strings) {
return strings.stream()
.anyMatch(this::containsString);
}
/**
* Checks whether this SemanticNode contains any of the provided Strings ignoring case.
*
* @param strings A List of Strings which the TextBlock might contain
* @return true, if this node's TextBlock contains any of the strings
*/
default boolean containsAnyStringIgnoreCase(List<String> strings) {
return strings.stream()
.anyMatch(this::containsStringIgnoreCase);
}
/**
* This function is used during insertion of EntityNodes into the graph; it checks whether this node's boundary intersects or even fully contains the RedactionEntity's boundary.
* It sets the fields accordingly and recursively calls this function on all its children.
*
* @param textEntity RedactionEntity, which is being inserted into the graph
*/
default void addThisToEntityIfIntersects(TextEntity textEntity) {
TextBlock textBlock = getTextBlock();
if (textBlock.getTextRange().intersects(textEntity.getTextRange())) {
if (textBlock.containsBoundary(textEntity.getTextRange())) {
textEntity.setDeepestFullyContainingNode(this);
}
textEntity.addIntersectingNode(this);
streamChildren().filter(semanticNode -> semanticNode.getBoundary().intersects(textEntity.getTextRange()))
.forEach(node -> node.addThisToEntityIfIntersects(textEntity));
}
}
/**
* Returns the set of LayoutEngines.
*
* @return Set of LayoutEngines
*/
Set<LayoutEngine> getEngines();
/**
* Adds a LayoutEngine to the set.
*/
default void addEngine(LayoutEngine engine) {
getEngines().add(engine);
}
/**
* Streams all children located directly underneath this node in the DocumentTree.
*
* @return Stream of all children
*/
default Stream<SemanticNode> streamChildren() {
return getDocumentTree().childNodes(getTreeId());
}
/**
* Streams all children located directly underneath this node in the DocumentTree of the provided type.
*
* @return Stream of all children
*/
default Stream<SemanticNode> streamChildrenOfType(NodeType nodeType) {
return getDocumentTree().childNodesOfType(getTreeId(), nodeType);
}
/**
* Recursively streams all SemanticNodes located underneath this node in the DocumentTree in order.
*
* @return Stream of all SubNodes
*/
default Stream<SemanticNode> streamAllSubNodes() {
return getDocumentTree().allSubEntriesInOrder(getTreeId())
.map(DocumentTree.Entry::getNode);
}
/**
* Recursively streams all SemanticNodes of the provided type located underneath this node in the DocumentTree in order.
*
* @return Stream of all SubNodes
*/
default Stream<SemanticNode> streamAllSubNodesOfType(NodeType nodeType) {
return getDocumentTree().allSubEntriesInOrder(getTreeId())
.filter(entry -> entry.getType().equals(nodeType))
.map(DocumentTree.Entry::getNode);
}
/**
* The Boundary is the start and end string offsets in the reading order of the document.
*
* @return Boundary of this Node's TextBlock
*/
default TextRange getBoundary() {
return getTextBlock().getTextRange();
}
/**
* If this Node is a Leaf it will calculate the boundingBox of its LeafTextBlock, otherwise it will calculate the Union of the BoundingBoxes of all its Children.
* If called on the Document, it will return the cropbox of each page.
*
* @return Rectangle2D fully encapsulating this Node for each page.
*/
default Map<Page, Rectangle2D> getBBox() {
if (isLeaf()) {
return getBBoxFromLeafTextBlock();
}
return getBBoxFromChildren();
}
/**
* Checks whether the Bounding Box of this SemanticNode contains the provided rectangle on the provided page.
*
* @param rectangle2D The rectangle to check if it is contained
* @param pageNumber The Page number on which the rectangle should be checked
* @return boolean
*/
default boolean containsRectangle(Rectangle2D rectangle2D, Integer pageNumber) {
Page helperPage = Page.builder().number(pageNumber).build();
if (!getPages().contains(helperPage)) {
return false;
}
return getBBox().get(helperPage).contains(rectangle2D);
}
/**
* TODO: this produces unwanted results for sections spanning multiple columns.
* Computes the Union of the bounding boxes of all children recursively.
*
* @return The union of the BoundingBoxes of all children
*/
private Map<Page, Rectangle2D> getBBoxFromChildren() {
List<Map<Page, Rectangle2D>> childrenBBoxes = streamChildren().filter(child -> !isFullPageImage(child))
.map(SemanticNode::getBBox)
.toList();
return BBoxMergingUtility.mergeBBoxes(childrenBBoxes);
}
private static boolean isFullPageImage(SemanticNode child) {
if (!child.getType().equals(NodeType.IMAGE)) {
return false;
}
return ((Image) child).isFullPageImage();
}
/**
* @return The union of all BoundingBoxes of the TextBlock of this node
*/
private Map<Page, Rectangle2D> getBBoxFromLeafTextBlock() {
Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
Map<Page, List<AtomicTextBlock>> atomicTextBlockPerPage = getTextBlock().getAtomicTextBlocks()
.stream()
.collect(Collectors.groupingBy(AtomicTextBlock::getPage));
atomicTextBlockPerPage.forEach((page, atbs) -> bBoxPerPage.put(page, RectangleTransformations.bBoxUnionAtomicTextBlock(atbs)));
return bBoxPerPage;
}
void accept(NodeVisitor visitor);
/**
* Checks whether this SemanticNode appears on a single page only, and if that page is the provided one.
*
* @param page the page to check
* @return true, when SemanticNode is on a single page only and the page is the provided page. Otherwise, false.
*/
default boolean onlyOnPage(Page page) {
Set<Page> pages = getPages();
return pages.size() == 1 && pages.contains(page);
}
}
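
A hedged usage sketch of the traversal and entity helpers defined above; "CBI_author" is the example type string from the Javadoc, everything else is illustrative.

static List<TextEntity> authorsInSectionsWithTables(SemanticNode documentRoot) {
    // Walk all Sections below the root, keep those containing a Table, and collect their author entities.
    return documentRoot.streamAllSubNodesOfType(NodeType.SECTION)
            .map(node -> (Section) node)
            .filter(Section::hasTables)
            .flatMap(section -> section.getEntitiesOfType("CBI_author").stream())
            .toList();
}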

View File

@ -1,47 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
@Data
@SuperBuilder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(callSuper = true)
public class SuperSection extends AbstractSemanticNode {
@Override
public NodeType getType() {
return NodeType.SUPER_SECTION;
}
public Headline getHeadline() {
return streamChildrenOfType(NodeType.HEADLINE).map(node -> (Headline) node)
.findFirst().orElseGet(() -> getParent().getHeadline());
}
@Override
public void accept(NodeVisitor visitor) {
visitor.visit(this);
}
@Override
public String toString() {
return getTreeId() + ": " + NodeType.SUPER_SECTION + ": " + this.getTextBlock().buildSummary();
}
}

View File

@ -1,363 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import static java.lang.String.format;
import java.awt.geom.Rectangle2D;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.stream.IntStream;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.TextEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Table implements SemanticNode {
@Builder.Default
Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
List<Integer> treeId;
DocumentTree documentTree;
int numberOfRows;
int numberOfCols;
TextBlock textBlock;
@Builder.Default
@EqualsAndHashCode.Exclude
Set<TextEntity> entities = new HashSet<>();
@EqualsAndHashCode.Exclude
Map<Page, Rectangle2D> bBoxCache;
/**
* Streams all entities in this table that appear in a row containing all of the provided strings (ignoring case).
*
* @param strings Strings to check whether a row contains them
* @return Stream of all entities in this table that appear in a row containing all of the provided strings
*/
public Stream<TextEntity> streamEntitiesWhereRowContainsStringsIgnoreCase(List<String> strings) {
return IntStream.range(0, numberOfRows).boxed()
.filter(row -> rowContainsStringsIgnoreCase(row, strings))
.flatMap(this::streamRow)
.map(TableCell::getEntities)
.flatMap(Collection::stream);
}
/**
* Checks whether the specified row contains all the provided strings.
*
* @param row the row to check as an Integer, must be smaller than numberOfRows
* @param strings a list of strings to check for
* @return true, if all strings appear in the provided row
*/
public boolean rowContainsStringsIgnoreCase(Integer row, List<String> strings) {
String rowText = streamRow(row).map(TableCell::getTextBlock)
.collect(new TextBlockCollector()).getSearchText().toLowerCase(Locale.ROOT);
return strings.stream()
.map(String::toLowerCase)
.allMatch(rowText::contains);
}
/**
* Streams all entities which appear in a row where at least one cell has the provided header and the provided value.
*
* @param header the header value to search for
* @param value the string which the table cell should contain
* @return a stream of all entities, which appear in a row where at least one cell has the provided header and the provided value.
*/
public Stream<TextEntity> streamEntitiesWhereRowHasHeaderAndValue(String header, String value) {
List<Integer> colsWithHeader = streamHeaders().filter(headerNode -> headerNode.containsString(header))
.map(TableCell::getCol)
.toList();
return streamTableCells().filter(tableCellNode -> colsWithHeader.stream()
.anyMatch(colWithHeader -> getCell(tableCellNode.getRow(), colWithHeader).containsString(value)))
.map(TableCell::getEntities)
.flatMap(Collection::stream);
}
/**
* Streams all entities which appear in a row where at least one cell has the provided header and any provided value.
*
* @param header the header value to search for
* @param values the strings which the table cell should contain
* @return a stream of all entities, which appear in a row where at least one cell has the provided header and any provided value.
*/
public Stream<TextEntity> streamEntitiesWhereRowHasHeaderAndAnyValue(String header, List<String> values) {
List<Integer> colsWithHeader = streamHeaders().filter(headerNode -> headerNode.containsString(header))
.map(TableCell::getCol)
.toList();
return streamTableCells().filter(tableCellNode -> colsWithHeader.stream()
.anyMatch(colWithHeader -> getCell(tableCellNode.getRow(), colWithHeader).containsAnyString(values)))
.map(TableCell::getEntities)
.flatMap(Collection::stream);
}
/**
* Streams all entities in this table, that appear in a row, which contains at least one entity with any of the provided types.
*
* @param types type strings to check whether a row contains an entity like them
* @return Stream of all entities in this table, that appear in a row, which contains at least one entity with any of the provided types.
*/
public Stream<TextEntity> streamEntitiesWhereRowContainsEntitiesOfType(List<String> types) {
List<Integer> rowsWithEntityOfType = IntStream.range(0, numberOfRows).boxed()
.filter(rowNumber -> streamEntityTypesInRow(rowNumber).anyMatch(existingType -> types.stream()
.anyMatch(typeToCheck -> typeToCheck.equals(existingType))))
.toList();
return rowsWithEntityOfType.stream()
.flatMap(this::streamRow)
.map(TableCell::getEntities)
.flatMap(Collection::stream);
}
/**
* Streams all entities in this table, that appear in a row, which does not contain any entity with any of the provided types.
*
* @param types type strings to check whether a row doesn't contain an entity like it
* @return Stream of all entities in this table, that appear in a row, which does not contain any entity with any of the provided types.
*/
public Stream<TextEntity> streamEntitiesWhereRowContainsNoEntitiesOfType(List<String> types) {
List<Integer> rowsWithNoEntityOfType = IntStream.range(0, numberOfRows).boxed()
.filter(rowNumber -> streamEntityTypesInRow(rowNumber).noneMatch(existingType -> types.stream()
.anyMatch(typeToCheck -> typeToCheck.equals(existingType))))
.toList();
return rowsWithNoEntityOfType.stream()
.flatMap(this::streamRow)
.map(TableCell::getEntities)
.flatMap(Collection::stream);
}
private Stream<String> streamEntityTypesInRow(Integer rowNumber) {
return streamRow(rowNumber).map(TableCell::getEntities)
.flatMap(Collection::stream)
.map(TextEntity::getType)
.distinct();
}
/**
* Returns a TableCell at the provided row and column location.
*
* @param row int representing the row, must be smaller than numberOfRows
* @param col int representing the col, must be smaller than numberOfCols
* @return TableCell at the provided location in the table
*/
public TableCell getCell(int row, int col) {
if (row < 0 || col < 0 || row >= numberOfRows || col >= numberOfCols) {
throw new IllegalArgumentException(format("row %d, col %d is out of bounds for number of rows of %d and number of cols %d", row, col, numberOfRows, numberOfCols));
}
int idx = row * numberOfCols + col;
return (TableCell) documentTree.getEntryById(treeId).getChildren()
.get(idx).getNode();
}
/**
* Streams all TableCells in this Table row-wise.
*
* @return Stream of all TableCells
*/
public Stream<TableCell> streamTableCells() {
return streamChildrenOfType(NodeType.TABLE_CELL).map(node -> (TableCell) node);
}
/**
* Streams all TableCells in this Table which have the provided header row-wise.
*
* @return Stream of all TableCells which have the provided header
*/
public Stream<TableCell> streamTableCellsWithHeader(String header) {
return streamHeaders().filter(tableCellNode -> tableCellNode.getTextBlock().getSearchText().contains(header))
.map(TableCell::getCol)
.flatMap(this::streamCol)
.filter(tableCellNode -> !tableCellNode.isHeader());
}
/**
* Streams all TableCells belonging to the provided column from top down.
*
* @param col int representing the column
* @return Stream of all TableCell in the provided column
*/
public Stream<TableCell> streamCol(int col) {
return IntStream.range(0, numberOfRows).boxed()
.map(row -> getCell(row, col));
}
/**
* Streams all TableCells belonging to the provided row from left to right.
*
* @param row int representing the row
* @return Stream of all TableCell in the provided row
*/
public Stream<TableCell> streamRow(int row) {
return IntStream.range(0, numberOfCols).boxed()
.map(col -> getCell(row, col));
}
/**
* Streams all TableCells row-wise and filters them with header == true.
*
* @return Stream of all TableCells with header == true
*/
public Stream<TableCell> streamHeaders() {
return streamTableCells().filter(TableCell::isHeader);
}
/**
* Streams all TableCells of the provided row and column and filters them with header == true.
*
* @param row int representing the row
* @param col int representing the column
* @return Stream of all TableCells with header == true in the provided row or col
*/
public Stream<TableCell> streamHeadersForCell(int row, int col) {
return Stream.concat(streamRow(row), streamCol(col))
.filter(TableCell::isHeader);
}
/**
* Streams all Headers and checks if any equal the provided string.
*
* @param header string to check the headers for
* @return true, if at least one header equals the provided string
*/
public boolean hasHeader(String header) {
return streamHeaders().anyMatch(tableCellNode -> tableCellNode.getTextBlock().getSearchText().strip().equals(header));
}
/**
* Checks if this table has a column with the provided header and any of the table cells in that column contain the provided value.
*
* @param header string to find header cells
* @param value string to check cells with provided header
* @return true, if this table has a column with the provided header and any of the table cells in that column contain the provided value
*/
public boolean hasRowWithHeaderAndValue(String header, String value) {
return streamTableCellsWithHeader(header).anyMatch(tableCellNode -> tableCellNode.containsString(value));
}
/**
* Checks if this table has a column with the provided header and any of the table cells in that column contains any of the provided values.
*
* @param header string to find header cells
* @param values List of strings to check cells with provided header
* @return true, if this table has a column with the provided header and any of the table cells in that column contains any of the provided values.
*/
public boolean hasRowWithHeaderAndAnyValue(String header, List<String> values) {
return streamTableCellsWithHeader(header).anyMatch(tableCellNode -> tableCellNode.containsAnyString(values));
}
/**
* Finds all entities of the provided type, which appear in the same row that the provided entity appears in.
*
* @param type the type of entities to search for
* @param textEntity the entity, which appears in the row to search
* @return List of all entities of the provided type, which appear in the same row that the provided entity appears in.
*/
public List<TextEntity> getEntitiesOfTypeInSameRow(String type, TextEntity textEntity) {
return textEntity.getIntersectingNodes()
.stream()
.filter(node -> node instanceof TableCell)
.map(node -> (TableCell) node)
.flatMap(tableCellNode -> streamRow(tableCellNode.getRow()))
.map(cell -> cell.getEntitiesOfType(type))
.flatMap(Collection::stream)
.toList();
}
@Override
public NodeType getType() {
return NodeType.TABLE;
}
@Override
public TextBlock getTextBlock() {
if (textBlock == null) {
textBlock = SemanticNode.super.getTextBlock();
}
return textBlock;
}
@Override
public String toString() {
return treeId.toString() + ": " + NodeType.TABLE + ": #cols: " + numberOfCols + ", #rows: " + numberOfRows + ", " + this.getTextBlock().buildSummary();
}
@Override
public Map<Page, Rectangle2D> getBBox() {
if (bBoxCache == null) {
bBoxCache = SemanticNode.super.getBBox();
}
return bBoxCache;
}
@Override
public void accept(NodeVisitor visitor) {
visitor.visit(this);
}
}
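
A usage sketch of the row-major cell addressing and the header/value queries above; the header and value strings are placeholders, not taken from the repository.

static void inspectTable(Table table) {
    // Cells are stored row-major as children of the table node: child index = row * numberOfCols + col.
    TableCell topLeft = table.getCell(0, 0);
    System.out.println(topLeft.getTextBlock().buildSummary());
    if (table.hasRowWithHeaderAndValue("Species", "rat")) {
        table.streamEntitiesWhereRowHasHeaderAndValue("Species", "rat")
                .forEach(entity -> System.out.println(entity.getType()));
    }
}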

View File

@ -1,95 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.HashMap;
import java.util.Map;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
@Data
@SuperBuilder
@AllArgsConstructor
@EqualsAndHashCode(callSuper = true)
@FieldDefaults(level = AccessLevel.PRIVATE)
public class TableCell extends AbstractSemanticNode {
int row;
int col;
boolean header;
Rectangle2D bBox;
TextBlock leafTextBlock;
TextBlock textBlock;
@Override
public Map<Page, Rectangle2D> getBBox() {
Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
getPages().forEach(page -> bBoxPerPage.put(page, bBox));
return bBoxPerPage;
}
@Override
public void accept(NodeVisitor visitor) {
visitor.visit(this);
}
@Override
public NodeType getType() {
return NodeType.TABLE_CELL;
}
@Override
public boolean isLeaf() {
return getDocumentTree().getEntryById(getTreeId()).getChildren().isEmpty();
}
@Override
public TextBlock getTextBlock() {
if (isLeaf()) {
return leafTextBlock;
}
if (textBlock == null) {
textBlock = buildTextBlock();
}
return textBlock;
}
private TextBlock buildTextBlock() {
return streamAllSubNodes().filter(SemanticNode::isLeaf)
.map(SemanticNode::getLeafTextBlock)
.collect(new TextBlockCollector());
}
@Override
public String toString() {
return getTreeId() + ": " + NodeType.TABLE_CELL + ": " + this.getTextBlock().buildSummary();
}
}

View File

@ -1,282 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock;
import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto.DocumentTextData;
import static java.lang.String.format;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto.DocumentPositionData.Position;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class AtomicTextBlock implements TextBlock {
Long id;
Integer numberOnPage;
Page page;
//string coordinates
TextRange textRange;
String searchText;
@Builder.Default
List<Integer> lineBreaks = new ArrayList<>();
@Builder.Default
List<TextRange> boldTextBoundaries = new ArrayList<>();
@Builder.Default
List<TextRange> italicTextBoundaries = new ArrayList<>();
String orientation;
int textDirection;
//position coordinates
@Builder.Default
List<Integer> stringIdxToPositionIdx = new ArrayList<>();
@Builder.Default
List<Rectangle2D> positions = new ArrayList<>();
@EqualsAndHashCode.Exclude
SemanticNode parent;
@Override
public int numberOfLines() {
return lineBreaks.size() + 1;
}
@Override
public String subSequenceWithLineBreaks(TextRange stringTextRange) {
if (stringTextRange.length() == 0 || !getTextRange().contains(stringTextRange)) {
return "";
}
Set<Integer> lbInBoundary = lineBreaks.stream()
.map(i -> i + this.textRange.start())
.filter(stringTextRange::contains)
.collect(Collectors.toSet());
if (stringTextRange.end() == getTextRange().end()) {
lbInBoundary.add(getTextRange().end());
}
StringBuilder sb = new StringBuilder();
for (int i = stringTextRange.start(); i < stringTextRange.end(); i++) {
char character = this.charAt(i);
if (lbInBoundary.contains(i + 1)) {
// always plus one, due to the linebreaks being an exclusive end index
if (!Character.isWhitespace(character)) {
lbInBoundary.remove(i + 1);
lbInBoundary.add(i + 2);
sb.append(character);
continue;
}
sb.append("\n");
} else {
sb.append(character);
}
}
return sb.toString();
}
public static AtomicTextBlock fromSearchTextWithTextPosition(String searchText,
List<Integer> lineBreaks,
List<TextRange> boldTextBoundaries,
List<TextRange> italicTextBoundaries,
List<Rectangle2D> positions,
List<Integer> stringIdxToPositionIdx,
long idx,
SemanticNode parent,
int numberOnPage,
Page page,
int offset,
String orientation,
int textDirection) {
return AtomicTextBlock.builder()
.id(idx)
.parent(parent)
.searchText(searchText)
.numberOnPage(numberOnPage)
.page(page)
.lineBreaks(lineBreaks)
.boldTextBoundaries(boldTextBoundaries)
.italicTextBoundaries(italicTextBoundaries)
.positions(positions)
.stringIdxToPositionIdx(stringIdxToPositionIdx)
.textRange(new TextRange(offset, offset + searchText.length()))
.textDirection(textDirection)
.orientation(orientation)
.build();
}
public static AtomicTextBlock empty(Long textBlockIdx, int stringOffset, Page page, int numberOnPage, SemanticNode parent) {
return AtomicTextBlock.builder()
.id(textBlockIdx)
.textRange(new TextRange(stringOffset, stringOffset))
.searchText("")
.page(page)
.numberOnPage(numberOnPage)
.parent(parent)
.build();
}
public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextData documentTextData, DocumentPositionData documentPositionData, SemanticNode parent, Page page) {
return AtomicTextBlock.builder()
.id(documentTextData.getId())
.numberOnPage(documentTextData.getNumberOnPage())
.page(page)
.textRange(new TextRange(documentTextData.getStart(), documentTextData.getEnd()))
.searchText(documentTextData.getSearchText())
.lineBreaks(documentTextData.getLineBreaksList())
.stringIdxToPositionIdx(documentPositionData.getStringIdxToPositionIdxList())
.positions(toRectangle2DList(documentPositionData.getPositionsList()))
.parent(parent)
.build();
}
private static List<Rectangle2D> toRectangle2DList(float[][] positions) {
return Arrays.stream(positions)
.map(floatArr -> (Rectangle2D) new Rectangle2D.Float(floatArr[0], floatArr[1], floatArr[2], floatArr[3]))
.toList();
}
private static List<Rectangle2D> toRectangle2DList(List<Position> positions) {
return positions.stream()
.map(pos -> (Rectangle2D) new Rectangle2D.Float(pos.getValue(0), pos.getValue(1), pos.getValue(2), pos.getValue(3)))
.toList();
}
public CharSequence getLine(int lineNumber) {
if (lineNumber >= numberOfLines() || lineNumber < 0) {
throw new IndexOutOfBoundsException(format("line %d out of range for AtomicTextBlock with %d lines", lineNumber, numberOfLines()));
}
if (lineNumber == 0) {
if (lineBreaks.isEmpty()) {
return searchText;
}
return subSequence(textRange.start(), lineBreaks.get(0) + textRange.start());
} else if (lineNumber == numberOfLines() - 1) {
return subSequence(lineBreaks.get(lineBreaks.size() - 1) + textRange.start(), textRange.end());
}
return subSequence(lineBreaks.get(lineNumber - 1) + textRange.start(), lineBreaks.get(lineNumber) + textRange.start());
}
@Override
public List<AtomicTextBlock> getAtomicTextBlocks() {
return List.of(this);
}
@Override
public int getNextLinebreak(int fromIndex) {
return lineBreaks.stream()//
.filter(linebreak -> linebreak > fromIndex - textRange.start()) //
.findFirst() //
.orElse(searchText.length()) + textRange.start();
}
@Override
public int getPreviousLinebreak(int fromIndex) {
return lineBreaks.stream()//
.filter(linebreak -> linebreak <= fromIndex - textRange.start())//
.reduce((a, b) -> b)//
.orElse(0) + textRange.start();
}
@Override
public Rectangle2D getPosition(int stringIdx) {
return positions.get(stringIdxToPositionIdx.get(stringIdx - textRange.start()));
}
@Override
public List<Rectangle2D> getPositions(TextRange stringTextRange) {
if (!containsBoundary(stringTextRange)) {
throw new IndexOutOfBoundsException(format("%s is out of bounds for %s", stringTextRange, this.textRange));
}
if (stringTextRange.length() == 0) {
return Collections.emptyList();
}
int startPositionIdx = stringIdxToPositionIdx.get(stringTextRange.start() - this.textRange.start());
if (stringTextRange.end() == this.textRange.end()) {
return positions.subList(startPositionIdx, positions.size());
}
return positions.subList(startPositionIdx, stringIdxToPositionIdx.get(stringTextRange.end() - this.textRange.start()));
}
public Map<Page, List<Rectangle2D>> getPositionsPerPage(TextRange stringTextRange) {
List<Rectangle2D> rectanglesPerLine = stringTextRange.split(getAllLineBreaksInBoundary(stringTextRange))
.stream()
.map(this::getPositions)
.map(RectangleTransformations::rectangleBBoxWithGaps)
.flatMap(Collection::stream)
.toList();
Map<Page, List<Rectangle2D>> rectanglePerLinePerPage = new HashMap<>();
rectanglePerLinePerPage.put(page, rectanglesPerLine);
return rectanglePerLinePerPage;
}
protected List<Integer> getAllLineBreaksInBoundary(TextRange textRange) {
return getLineBreaks().stream()
.map(linebreak -> linebreak + this.textRange.start())
.filter(textRange::contains)
.toList();
}
@Override
public String toString() {
return searchText;
}
}
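
A small sketch of the document-wide offset convention used above: every index is global, and the block subtracts its own textRange.start() internally (illustrative values only).

static void demo(AtomicTextBlock block) {
    TextRange range = block.getTextRange();                      // e.g. [1200, 1260) in document coordinates
    CharSequence firstLine = block.getLine(0);                   // text up to the first line break
    int nextBreak = block.getNextLinebreak(range.start());       // global index of the next line break
    String withBreaks = block.subSequenceWithLineBreaks(range);  // full text with '\n' re-inserted
    System.out.println(firstLine + " | " + nextBreak + "\n" + withBreaks);
}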

View File

@ -1,271 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock;
import static java.lang.String.format;
import java.awt.geom.Rectangle2D;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import lombok.AccessLevel;
import lombok.Data;
import lombok.experimental.FieldDefaults;
@Data
@FieldDefaults(level = AccessLevel.PRIVATE)
public class ConcatenatedTextBlock implements TextBlock {
List<AtomicTextBlock> atomicTextBlocks;
String searchText;
TextRange textRange;
public static ConcatenatedTextBlock empty() {
return new ConcatenatedTextBlock(Collections.emptyList());
}
public ConcatenatedTextBlock(List<AtomicTextBlock> atomicTextBlocks) {
this.atomicTextBlocks = new LinkedList<>();
if (atomicTextBlocks.isEmpty()) {
textRange = new TextRange(-1, -1);
return;
}
var firstTextBlock = atomicTextBlocks.get(0);
this.atomicTextBlocks.add(firstTextBlock);
textRange = new TextRange(firstTextBlock.getTextRange().start(), firstTextBlock.getTextRange().end());
atomicTextBlocks.subList(1, atomicTextBlocks.size())
.forEach(this::concat);
}
public ConcatenatedTextBlock concat(TextBlock textBlock) {
int start = textBlock.getTextRange().start();
int end = textBlock.getTextRange().end();
if (this.atomicTextBlocks.isEmpty()) {
textRange.setStart(start);
textRange.setEnd(end);
} else if (textRange.end() != start) {
throw new UnsupportedOperationException(format("Can only concat consecutive TextBlocks, trying to concat %s and %s", textRange, textBlock.getTextRange()));
}
this.atomicTextBlocks.addAll(textBlock.getAtomicTextBlocks());
textRange.setEnd(end);
this.searchText = null;
return this;
}
private AtomicTextBlock getAtomicTextBlockByStringIndex(int stringIdx) {
return atomicTextBlocks.stream()
.filter(textBlock -> textBlock.getTextRange().containsExclusive(stringIdx))
.findAny()
.orElseThrow(IndexOutOfBoundsException::new);
}
private List<AtomicTextBlock> getAllAtomicTextBlocksPartiallyInStringBoundary(TextRange textRange) {
return atomicTextBlocks.stream()
.filter(tb -> tb.getTextRange().intersects(textRange))
.toList();
}
@Override
public String getSearchText() {
if (searchText == null) {
StringBuilder sb = new StringBuilder();
getAtomicTextBlocks().forEach(atb -> sb.append(atb.getSearchText()));
searchText = sb.toString();
}
return searchText;
}
@Override
public int numberOfLines() {
return atomicTextBlocks.stream()
.map(AtomicTextBlock::getLineBreaks)
.mapToInt(List::size).sum();
}
@Override
public int getNextLinebreak(int fromIndex) {
return getAtomicTextBlockByStringIndex(fromIndex).getNextLinebreak(fromIndex);
}
@Override
public int getPreviousLinebreak(int fromIndex) {
return getAtomicTextBlockByStringIndex(fromIndex).getPreviousLinebreak(fromIndex);
}
@Override
public List<Integer> getLineBreaks() {
return getAtomicTextBlocks().stream()
.flatMap(atomicTextBlock -> atomicTextBlock.getLineBreaks()
.stream())
.toList();
}
@Override
public Rectangle2D getPosition(int stringIdx) {
return getAtomicTextBlockByStringIndex(stringIdx).getPosition(stringIdx);
}
@Override
public List<Rectangle2D> getPositions(TextRange stringTextRange) {
List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringTextRange);
if (textBlocks.size() == 1) {
return textBlocks.get(0).getPositions(stringTextRange);
}
AtomicTextBlock firstTextBlock = textBlocks.get(0);
List<Rectangle2D> positions = new LinkedList<>(firstTextBlock.getPositions(new TextRange(stringTextRange.start(), firstTextBlock.getTextRange().end())));
for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
positions.addAll(textBlock.getPositions());
}
var lastTextBlock = textBlocks.get(textBlocks.size() - 1);
positions.addAll(lastTextBlock.getPositions(new TextRange(lastTextBlock.getTextRange().start(), stringTextRange.end())));
return positions;
}
@Override
public Map<Page, List<Rectangle2D>> getPositionsPerPage(TextRange stringTextRange) {
List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringTextRange);
if (textBlocks.size() == 1) {
return textBlocks.get(0).getPositionsPerPage(stringTextRange);
}
AtomicTextBlock firstTextBlock = textBlocks.get(0);
Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = firstTextBlock.getPositionsPerPage(new TextRange(stringTextRange.start(), firstTextBlock.getTextRange().end()));
for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage, textBlock.getPositionsPerPage(textBlock.getTextRange()));
}
AtomicTextBlock lastTextBlock = textBlocks.get(textBlocks.size() - 1);
rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage,
lastTextBlock.getPositionsPerPage(new TextRange(lastTextBlock.getTextRange().start(),
stringTextRange.end())));
return rectanglesPerLinePerPage;
}
private Map<Page, List<Rectangle2D>> mergeEntityPositionsWithSamePageNode(Map<Page, List<Rectangle2D>> map1, Map<Page, List<Rectangle2D>> map2) {
Map<Page, List<Rectangle2D>> mergedMap = new HashMap<>(map1);
map2.forEach((pageNode, rectangles) -> mergedMap.merge(pageNode,
rectangles,
(l1, l2) -> Stream.concat(l1.stream(), l2.stream())
.toList()));
return mergedMap;
}
@Override
public String subSequenceWithLineBreaks(TextRange stringTextRange) {
if (stringTextRange.length() == 0 || !getTextRange().contains(stringTextRange)) {
return "";
}
List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringTextRange);
if (textBlocks.size() == 1) {
return textBlocks.get(0).subSequenceWithLineBreaks(stringTextRange);
}
StringBuilder sb = new StringBuilder();
AtomicTextBlock firstTextBlock = textBlocks.get(0);
sb.append(firstTextBlock.subSequenceWithLineBreaks(new TextRange(stringTextRange.start(), firstTextBlock.getTextRange().end())));
for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
sb.append(textBlock.searchTextWithLineBreaks());
}
var lastTextBlock = textBlocks.get(textBlocks.size() - 1);
sb.append(lastTextBlock.subSequenceWithLineBreaks(new TextRange(lastTextBlock.getTextRange().start(), stringTextRange.end())));
return sb.toString();
}
@Override
public String toString() {
return getSearchText();
}
@Override
public List<TextRange> getBoldTextBoundaries() {
return getAtomicTextBlocks().stream()
.map(AtomicTextBlock::getBoldTextBoundaries)
.flatMap(Collection::stream)
.toList();
}
@Override
public List<TextRange> getItalicTextBoundaries() {
return getAtomicTextBlocks().stream()
.map(AtomicTextBlock::getItalicTextBoundaries)
.flatMap(Collection::stream)
.toList();
}
@Override
public String getOrientation() {
if (atomicTextBlocks.isEmpty()) {
return "";
}
return atomicTextBlocks.get(0).getOrientation();
}
@Override
public int getTextDirection() {
if (atomicTextBlocks.isEmpty()) {
return 0;
}
return atomicTextBlocks.get(0).getTextDirection();
}
}
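
A minimal sketch of the consecutive-range contract of concat() above: the appended block must start exactly where the accumulated range ends, otherwise an UnsupportedOperationException is thrown (parameters are illustrative).

static ConcatenatedTextBlock join(AtomicTextBlock first, AtomicTextBlock second) {
    ConcatenatedTextBlock combined = new ConcatenatedTextBlock(List.of(first)); // e.g. covers [0, 40)
    return combined.concat(second); // only valid if second starts at 40; a gap or overlap throws
}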

View File

@ -1,158 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock;
import static java.lang.String.format;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
public interface TextBlock extends CharSequence {
String getSearchText();
List<AtomicTextBlock> getAtomicTextBlocks();
List<TextRange> getBoldTextBoundaries();
List<TextRange> getItalicTextBoundaries();
String getOrientation();
int getTextDirection();
TextRange getTextRange();
int getNextLinebreak(int fromIndex);
int getPreviousLinebreak(int fromIndex);
List<Integer> getLineBreaks();
Rectangle2D getPosition(int stringIdx);
List<Rectangle2D> getPositions(TextRange stringTextRange);
Map<Page, List<Rectangle2D>> getPositionsPerPage(TextRange stringTextRange);
int numberOfLines();
String subSequenceWithLineBreaks(TextRange stringTextRange);
default String searchTextWithLineBreaks() {
return subSequenceWithLineBreaks(getTextRange());
}
default int indexOf(String searchTerm) {
return indexOf(searchTerm, getTextRange().start());
}
default Set<Page> getPages() {
return getAtomicTextBlocks().stream()
.map(AtomicTextBlock::getPage)
.collect(Collectors.toUnmodifiableSet());
}
default Set<Page> getPages(TextRange textRange) {
return getAtomicTextBlocks().stream()
.filter(atomicTextBlock -> atomicTextBlock.getTextRange().intersects(textRange))
.map(AtomicTextBlock::getPage)
.collect(Collectors.toUnmodifiableSet());
}
default int indexOf(String searchTerm, int startOffset) {
int start = getSearchText().indexOf(searchTerm, startOffset - getTextRange().start());
if (start == -1) {
return -1;
}
return start + getTextRange().start();
}
default CharSequence getFirstLine() {
return subSequence(getTextRange().start(), getNextLinebreak(getTextRange().start()));
}
default boolean containsBoundary(TextRange textRange) {
if (textRange.end() < textRange.start()) {
throw new IllegalArgumentException(format("Invalid %s, StartIndex must be smaller than EndIndex", textRange));
}
return getTextRange().contains(textRange);
}
default boolean containsIndex(int stringIndex) {
return getTextRange().containsExclusive(stringIndex);
}
default CharSequence subSequence(TextRange textRange) {
return subSequence(textRange.start(), textRange.end());
}
default String buildSummary() {
String[] words = getSearchText().split(" ");
int bound = Math.min(words.length, 4);
List<String> list = new ArrayList<>(Arrays.asList(words).subList(0, bound));
return String.join(" ", list);
}
@Override
default CharSequence subSequence(int start, int end) {
return getSearchText().substring(start - getTextRange().start(), end - getTextRange().start());
}
@Override
default int length() {
return getTextRange().length();
}
@Override
default char charAt(int index) {
return getSearchText().charAt(index - getTextRange().start());
}
}
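
A sketch of how indexOf() bridges the local search text and the document-wide offsets above; the search term is a placeholder.

static void findTerm(TextBlock block) {
    int start = block.indexOf("Annex");              // document-wide index, or -1 if absent
    if (start >= 0) {
        TextRange hit = new TextRange(start, start + "Annex".length());
        System.out.println(block.subSequence(hit));  // the matched characters
        System.out.println(block.getPages(hit));     // pages the match touches
    }
}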

View File

@ -1,49 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock;
import java.util.Set;
import java.util.function.BiConsumer;
import java.util.function.BinaryOperator;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collector;
import lombok.NoArgsConstructor;
@NoArgsConstructor
public class TextBlockCollector implements Collector<TextBlock, ConcatenatedTextBlock, TextBlock> {
@Override
public Supplier<ConcatenatedTextBlock> supplier() {
return ConcatenatedTextBlock::empty;
}
@Override
public BiConsumer<ConcatenatedTextBlock, TextBlock> accumulator() {
return ConcatenatedTextBlock::concat;
}
@Override
public BinaryOperator<ConcatenatedTextBlock> combiner() {
return ConcatenatedTextBlock::concat;
}
@Override
public Function<ConcatenatedTextBlock, TextBlock> finisher() {
return a -> a;
}
@Override
public Set<Characteristics> characteristics() {
return Set.of(Characteristics.IDENTITY_FINISH, Characteristics.CONCURRENT);
}
}
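
A usage sketch mirroring how the nodes above build their text: the stream must deliver blocks in reading order, since the collector concatenates consecutive ranges.

static TextBlock collectText(SemanticNode node) {
    return node.streamAllSubNodes()
            .filter(SemanticNode::isLeaf)
            .map(SemanticNode::getTextBlock)
            .collect(new TextBlockCollector());
}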

View File

@ -2,7 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model.image;
import java.awt.geom.Rectangle2D;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.ImageType;
import lombok.AllArgsConstructor;
import lombok.Data;

View File

@ -14,12 +14,12 @@ import lombok.RequiredArgsConstructor;
@Data
@RequiredArgsConstructor
-public class TableOfContents implements Iterable<TableOfContentItem> {
+public class SectionTree implements Iterable<SectionTreeEntry> {
-private List<TableOfContentItem> mainSections = new ArrayList<>();
+private List<SectionTreeEntry> mainSections = new ArrayList<>();
-public TableOfContents(List<TableOfContentItem> mainSections) {
+public SectionTree(List<SectionTreeEntry> mainSections) {
this.mainSections = mainSections;
}
@ -28,36 +28,36 @@ public class TableOfContents implements Iterable<TableOfContentItem> {
public List<TextPageBlock> getAllTextPageBlocks() {
List<TextPageBlock> allTextPageBlocks = new ArrayList<>();
-for (TableOfContentItem item : mainSections) {
+for (SectionTreeEntry item : mainSections) {
collectTextPageBlocks(item, allTextPageBlocks);
}
return allTextPageBlocks;
}
-private void collectTextPageBlocks(TableOfContentItem item, List<TextPageBlock> textPageBlocks) {
+private void collectTextPageBlocks(SectionTreeEntry item, List<TextPageBlock> textPageBlocks) {
textPageBlocks.add(item.getHeadline());
-for (TableOfContentItem child : item.getChildren()) {
+for (SectionTreeEntry child : item.getChildren()) {
collectTextPageBlocks(child, textPageBlocks);
}
}
-public List<TableOfContentItem> getAllTableOfContentItems() {
+public List<SectionTreeEntry> getAllTableOfContentItems() {
-List<TableOfContentItem> allItems = new ArrayList<>();
+List<SectionTreeEntry> allItems = new ArrayList<>();
-for (TableOfContentItem item : mainSections) {
+for (SectionTreeEntry item : mainSections) {
collectTableOfContentItems(item, allItems);
}
return allItems;
}
-private void collectTableOfContentItems(TableOfContentItem item, List<TableOfContentItem> allItems) {
+private void collectTableOfContentItems(SectionTreeEntry item, List<SectionTreeEntry> allItems) {
allItems.add(item);
-for (TableOfContentItem child : item.getChildren()) {
+for (SectionTreeEntry child : item.getChildren()) {
collectTableOfContentItems(child, allItems);
}
}
@ -65,7 +65,7 @@ public class TableOfContents implements Iterable<TableOfContentItem> {
private boolean containsBlock(TextPageBlock block) {
-for (TableOfContentItem existingItem : this.getMainSections()) {
+for (SectionTreeEntry existingItem : this.getMainSections()) {
if (existingItem.getHeadline().equals(block) || existingItem.contains(block)) {
return true;
}
@ -74,9 +74,9 @@ public class TableOfContents implements Iterable<TableOfContentItem> {
}
-private boolean containsItem(TableOfContentItem tocItem) {
+private boolean containsItem(SectionTreeEntry tocItem) {
-for (TableOfContentItem existingItem : this.getMainSections()) {
+for (SectionTreeEntry existingItem : this.getMainSections()) {
if (existingItem.equals(tocItem) || existingItem.contains(tocItem)) {
return true;
}
@ -86,18 +86,18 @@ public class TableOfContents implements Iterable<TableOfContentItem> {
@Override
-public @NonNull Iterator<TableOfContentItem> iterator() {
+public @NonNull Iterator<SectionTreeEntry> iterator() {
-return new TableOfContentItemIterator(mainSections);
+return new SectionTreeEntryIterator(mainSections);
}
-private static class TableOfContentItemIterator implements Iterator<TableOfContentItem> {
+private static class SectionTreeEntryIterator implements Iterator<SectionTreeEntry> {
-private final Stack<Iterator<TableOfContentItem>> stack = new Stack<>();
+private final Stack<Iterator<SectionTreeEntry>> stack = new Stack<>();
-TableOfContentItemIterator(List<TableOfContentItem> mainSections) {
+SectionTreeEntryIterator(List<SectionTreeEntry> mainSections) {
stack.push(mainSections.iterator());
}
@ -112,10 +112,10 @@ public class TableOfContents implements Iterable<TableOfContentItem> {
@Override
-public TableOfContentItem next() {
+public SectionTreeEntry next() {
ensureStackTopIsCurrent();
-TableOfContentItem currentItem = stack.peek().next();
+SectionTreeEntry currentItem = stack.peek().next();
if (currentItem.getChildren() != null && !currentItem.getChildren().isEmpty()) {
stack.push(currentItem.getChildren()
.iterator());


@@ -1,5 +1,6 @@
 package com.knecon.fforesight.service.layoutparser.processor.model.outline;
+import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.TABLE_OF_CONTENTS_HEADLINE;
 import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.getHeadlineNumber;
 import java.util.ArrayList;
@@ -18,23 +19,23 @@ import lombok.extern.slf4j.Slf4j;
 @Service
 @Slf4j
-public class OutlineValidationService {
+public class SectionTreeBuilderService {
     @Observed(name = "OutlineValidationService", contextualName = "create-toc")
-    public TableOfContents createToC(ClassificationDocument classificationDocument) {
+    public SectionTree createSectionTree(ClassificationDocument classificationDocument) {
         List<TextPageBlock> headlines = extractHeadlines(classificationDocument);
-        List<TableOfContentItem> mainSections = new ArrayList<>();
+        List<SectionTreeEntry> mainSections = new ArrayList<>();
-        Map<Integer, TableOfContentItem> lastItemsPerDepth = new HashMap<>();
+        Map<Integer, SectionTreeEntry> lastItemsPerDepth = new HashMap<>();
-        TableOfContentItem last = null;
+        SectionTreeEntry last = null;
         TreeSet<Integer> depths = new TreeSet<>();
         for (TextPageBlock current : headlines) {
             int currentDepth = getHeadlineNumber(current.getClassification());
             Integer parentDepth = depths.floor(currentDepth - 1);
-            var tocItem = new TableOfContentItem(current);
+            var tocItem = new SectionTreeEntry(current);
             if (parentDepth == null) {
                 mainSections.add(tocItem);
@@ -44,14 +45,16 @@ public class OutlineValidationService {
             } else {
                 assert last != null;
                 int lastDepth = getHeadlineNumber(last.getHeadline().getClassification());
-                if (lastDepth < parentDepth) {
+                if (last.getHeadline().getClassification().equals(TABLE_OF_CONTENTS_HEADLINE) && !current.getClassification().equals(TABLE_OF_CONTENTS_HEADLINE)) {
+                    // headline after toc should always start a main section
+                    parentDepth = 1;
+                } else if (lastDepth < parentDepth) {
                     parentDepth = lastDepth;
                 } else if (lastDepth == currentDepth && last.getParent() != null) {
                     parentDepth = getHeadlineNumber(last.getParent().getHeadline().getClassification());
                 }
-                TableOfContentItem parent = lastItemsPerDepth.get(parentDepth);
+                SectionTreeEntry parent = lastItemsPerDepth.get(parentDepth);
                 parent.addChild(tocItem);
             }
@@ -60,7 +63,8 @@ public class OutlineValidationService {
             depths.add(currentDepth);
         }
-        return new TableOfContents(mainSections);
+        return new SectionTree(mainSections);
     }


@@ -23,28 +23,28 @@ import lombok.extern.slf4j.Slf4j;
 @Slf4j
 @Service
-public class TOCEnrichmentService {
+public class SectionTreeEnhancementService {
     public void assignSectionBlocksAndImages(ClassificationDocument document) {
-        TableOfContents toc = document.getTableOfContents();
+        SectionTree toc = document.getSectionTree();
-        Iterator<TableOfContentItem> iterator = toc.iterator();
+        Iterator<SectionTreeEntry> iterator = toc.iterator();
-        TableOfContentItem currentTOCItem = null;
+        SectionTreeEntry currentTOCItem = null;
         if (iterator.hasNext()) {
             currentTOCItem = iterator.next();
         }
         List<AbstractPageBlock> startBlocks = new ArrayList<>();
         List<ClassifiedImage> startImages = new ArrayList<>();
-        TableOfContentItem currentSection = null;
+        SectionTreeEntry currentSection = null;
         boolean foundFirstHeadline = false;
         List<ClassificationHeader> headers = new ArrayList<>();
         List<ClassificationFooter> footers = new ArrayList<>();
         TablePageBlock previousTable = null;
-        List<TableOfContentItem> lastFoundTOCItems = new ArrayList<>();
+        List<SectionTreeEntry> lastFoundTOCItems = new ArrayList<>();
         for (ClassificationPage page : document.getPages()) {
-            List<TableOfContentItem> currentPageTOCItems = new ArrayList<>();
+            List<SectionTreeEntry> currentPageTOCItems = new ArrayList<>();
             List<TextPageBlock> header = new ArrayList<>();
             List<TextPageBlock> footer = new ArrayList<>();
             for (AbstractPageBlock current : page.getTextBlocks()) {
@@ -101,7 +101,7 @@ public class TOCEnrichmentService {
             Double xMax = null;
             Double yMax = null;
-            for (TableOfContentItem tocItem : lastFoundTOCItems) {
+            for (SectionTreeEntry tocItem : lastFoundTOCItems) {
                 var headline = tocItem.getHeadline();
                 if (headline.getPage() != page.getPageNumber()) {
@@ -169,10 +169,10 @@ public class TOCEnrichmentService {
         }
         if (!startBlocks.isEmpty() || !startImages.isEmpty()) {
-            TableOfContentItem unassigned = new TableOfContentItem(null);
+            SectionTreeEntry unassigned = new SectionTreeEntry(null);
             unassigned.setSectionBlocks(startBlocks);
             unassigned.setImages(startImages);
-            document.getTableOfContents().getMainSections().add(0, unassigned);
+            document.getSectionTree().getMainSections().add(0, unassigned);
         }
         document.setHeaders(headers);
         document.setFooters(footers);


@@ -2,10 +2,12 @@ package com.knecon.fforesight.service.layoutparser.processor.model.outline;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Objects;
 import java.util.stream.Collectors;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.GenericSemanticNode;
 import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
+import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
 import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
 import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
@@ -14,12 +16,18 @@ import lombok.EqualsAndHashCode;
 @Data
 @EqualsAndHashCode(onlyExplicitlyIncluded = true)
-public class TableOfContentItem {
+public class SectionTreeEntry {
+    public enum Type {
+        SECTION,
+        SUPER_SECTION,
+        TOC_SECTION
+    }
     @EqualsAndHashCode.Include
     private TextPageBlock headline;
-    private List<TableOfContentItem> children = new ArrayList<>();
+    private List<SectionTreeEntry> children = new ArrayList<>();
-    private TableOfContentItem parent;
+    private SectionTreeEntry parent;
     private List<AbstractPageBlock> sectionBlocks = new ArrayList<>();
     private List<ClassifiedImage> images = new ArrayList<>();
@@ -27,20 +35,32 @@ public class TableOfContentItem {
     private GenericSemanticNode section;
-    public TableOfContentItem(TextPageBlock headline) {
+    public SectionTreeEntry(TextPageBlock headline) {
         this.headline = headline;
     }
-    public void addChild(TableOfContentItem tableOfContentItem) {
-        children.add(tableOfContentItem);
-        tableOfContentItem.setParent(this);
-    }
-    public TableOfContentItem getSiblingBefore() {
+    public Type getType() {
+        if (!Objects.isNull(headline) && headline.getClassification().equals(PageBlockType.TABLE_OF_CONTENTS_HEADLINE)) {
+            return Type.TOC_SECTION;
+        }
+        if (children.isEmpty()) {
+            return Type.SECTION;
+        }
+        return Type.SUPER_SECTION;
+    }
+    public void addChild(SectionTreeEntry sectionTreeEntry) {
+        children.add(sectionTreeEntry);
+        sectionTreeEntry.setParent(this);
+    }
+    public SectionTreeEntry getSiblingBefore() {
         if (parent != null) {
             int index = parent.getChildren().indexOf(this);
@@ -52,7 +72,7 @@ public class TableOfContentItem {
     }
-    public TableOfContentItem getSiblingAfter() {
+    public SectionTreeEntry getSiblingAfter() {
         if (parent != null) {
             int index = parent.getChildren().indexOf(this);
@@ -69,7 +89,7 @@ public class TableOfContentItem {
         if (headline.equals(block)) {
             return true;
         }
-        for (TableOfContentItem child : children) {
+        for (SectionTreeEntry child : children) {
            if (child.contains(block)) {
                return true;
            }
@@ -78,12 +98,12 @@ public class TableOfContentItem {
     }
-    public boolean contains(TableOfContentItem tocItem) {
+    public boolean contains(SectionTreeEntry tocItem) {
         if (this.equals(tocItem)) {
             return true;
         }
-        for (TableOfContentItem child : children) {
+        for (SectionTreeEntry child : children) {
             if (child.contains(tocItem)) {
                 return true;
             }


@@ -1,34 +0,0 @@
-package com.knecon.fforesight.service.layoutparser.processor.model.text;
-import java.util.Comparator;
-import java.util.HashMap;
-public class TextPositionSequenceComparator implements Comparator<Word> {
-    private HashMap<Word, TextBlockOnPage> lookup;
-    public TextPositionSequenceComparator(HashMap<Word, TextBlockOnPage> lookup) {
-        this.lookup = lookup;
-    }
-    @Override
-    public int compare(Word number1, Word number2) {
-        int page1 = lookup.get(number1).page().getPageNumber();
-        int page2 = lookup.get(number2).page().getPageNumber();
-        if (page1 != page2) {
-            return Integer.compare(page1, page2);
-        }
-        if (number1.getY() != number2.getY()) {
-            return Double.compare(number1.getY(), number2.getY());
-        }
-        return Integer.compare(Integer.parseInt(number1.toString()), Integer.parseInt(number2.toString()));
-    }
-}


@@ -0,0 +1,36 @@
+package com.knecon.fforesight.service.layoutparser.processor.model.text;
+import java.util.Comparator;
+import java.util.HashMap;
+import com.knecon.fforesight.service.layoutparser.processor.services.classification.NumberWord;
+public class TocNumberComparator implements Comparator<NumberWord> {
+    private HashMap<NumberWord, TextBlockOnPage> lookup;
+    public TocNumberComparator(HashMap<NumberWord, TextBlockOnPage> lookup) {
+        this.lookup = lookup;
+    }
+    @Override
+    public int compare(NumberWord number1, NumberWord number2) {
+        int page1 = lookup.get(number1).page().getPageNumber();
+        int page2 = lookup.get(number2).page().getPageNumber();
+        if (page1 != page2) {
+            return Integer.compare(page1, page2);
+        }
+        if (number1.word().getY() != number2.word().getY()) {
+            return Double.compare(number1.word().getY(), number2.word().getY());
+        }
+        return Integer.compare(number1.number(), number2.number());
+    }
+}


@@ -9,9 +9,9 @@ import java.util.Map;
 import org.springframework.stereotype.Service;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.ImageType;
 import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
 import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
 import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
 import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;


@@ -8,7 +8,7 @@ import java.util.Map;
 import org.springframework.stereotype.Service;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.ImageType;
 import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
 import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingBox;
 import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
@@ -79,7 +79,7 @@ public class VisualLayoutParsingAdapter {
             ClassifiedImage signature = new ClassifiedImage(new Rectangle2D.Float(t.getBox().getX1(),
                 t.getBox().getY1(),
                 t.getBox().getX2() - t.getBox().getX1(),
-                t.getBox().getY2() - t.getBox().getY1()), ImageType.SIGNATURE, true, false, false, pageNumber,"");
+                t.getBox().getY2() - t.getBox().getY1()), ImageType.SIGNATURE, true, false, false, pageNumber, "");
             signatures.add(signature);
         }


@@ -6,10 +6,11 @@ import java.util.stream.Stream;
 import org.springframework.stereotype.Service;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.NodeType;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNode;
 import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedSectionText;
 import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedText;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
 import lombok.extern.slf4j.Slf4j;
@@ -23,12 +24,10 @@ public class SimplifiedSectionTextService {
             .stream()
             .map(this::toSimplifiedSectionText)
             .toList();
-        List<SimplifiedSectionText> simplifiedHeadersList = document.getHeaders()
-            .stream()
+        List<SimplifiedSectionText> simplifiedHeadersList = document.streamAllSubNodesOfType(NodeType.HEADER)
             .map(this::toSimplifiedSectionText)
             .toList();
-        List<SimplifiedSectionText> simplifiedFootersList = document.getFooters()
-            .stream()
+        List<SimplifiedSectionText> simplifiedFootersList = document.streamAllSubNodesOfType(NodeType.FOOTER)
             .map(this::toSimplifiedSectionText)
             .toList();
         List<SimplifiedSectionText> simplifiedText = Stream.of(simplifiedMainSectionsList, simplifiedHeadersList, simplifiedFootersList)
@@ -41,12 +40,10 @@ public class SimplifiedSectionTextService {
                 .stream()
                 .map(this::getSectionNumber)
                 .toList())
-            .headerSectionNumbers(document.getHeaders()
-                .stream()
+            .headerSectionNumbers(document.streamAllSubNodesOfType(NodeType.HEADER)
                 .map(this::getSectionNumber)
                 .toList())
-            .footerSectionNumbers(document.getFooters()
-                .stream()
+            .footerSectionNumbers(document.streamAllSubNodesOfType(NodeType.FOOTER)
                 .map(this::getSectionNumber)
                 .toList())
             .build();


@@ -10,7 +10,7 @@ import java.util.Locale;
 import org.apache.commons.text.similarity.LevenshteinDistance;
 import org.springframework.stereotype.Service;
-import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto.LayoutEngine;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.LayoutEngine;
 import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
 import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
 import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;


@@ -9,7 +9,7 @@ import java.util.stream.Collectors;
 import org.springframework.stereotype.Service;
-import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.LayoutEngine;
 import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
 import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
 import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;


@@ -18,10 +18,14 @@ public class ClassificationPatterns {
-    public static final Pattern TABLE_OR_FIGURE_PATTERN = Pattern.compile(
+    public static final Pattern TABLE_OR_FIGURE_HEADLINE_PATTERN = Pattern.compile(
         "^\\s*(?:table|continued\\s+table|appendix|figure)\\s+(?:[xvi]+|[a-z0-9]{1,3}(?:\\.[0-9]{1,3})*(?:-[0-9]{1,3})?)\\b",
         Pattern.CASE_INSENSITIVE);
+    public static final Pattern TABLE_MID_SENTENCE_PATTERN = Pattern.compile(
+        "(?:table|continued\\s+table|appendix|figure)\\s+(?:[xvi]+|[a-z0-9]{1,3}(?:\\.[0-9]{1,3})*(?:-[0-9]{1,3})?)\\b",
+        Pattern.CASE_INSENSITIVE);
     public static final Pattern ALPHANUMERIC = Pattern.compile("[a-zA-Z0-9]");
     public static final Pattern NUMERIC = Pattern.compile("[0-9]+");


@@ -6,7 +6,8 @@ import static com.knecon.fforesight.service.layoutparser.processor.services.clas
 import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_PATTERN_WITH_SLASHES;
 import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_WITH_2_IDENTIFER_PATTERN;
 import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN;
-import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.TABLE_OR_FIGURE_PATTERN;
+import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.TABLE_MID_SENTENCE_PATTERN;
+import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.TABLE_OR_FIGURE_HEADLINE_PATTERN;
 import java.util.ArrayList;
 import java.util.Comparator;
@@ -83,7 +84,8 @@ public class DocuMineClassificationService {
         Matcher atLeast3Matcher = AT_LEAST_3_CHARS_PATTERN.matcher(textBlock.toString());
         Matcher headlineWithSlashesMatcher = HEADLINE_PATTERN_WITH_SLASHES.matcher(textBlock.toString());
         Matcher amountMatcher = AMOUNT_PATTERN.matcher(textBlock.toString());
-        Matcher tableOrFigureMatcher = TABLE_OR_FIGURE_PATTERN.matcher(textBlock.toString());
+        Matcher tableOrFigureMatcher = TABLE_OR_FIGURE_HEADLINE_PATTERN.matcher(textBlock.toString());
+        Matcher tableMidSentenceMatcher = TABLE_MID_SENTENCE_PATTERN.matcher(textBlock.toString());
         Matcher headlineWithSingleIdentifierMatcher = HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN.matcher(textBlock.toString());
         boolean isAtLeast3Characters = atLeast3Matcher.reset().find();
         boolean headlineWithSlashesMatches = headlineWithSlashesMatcher.reset().matches();
@@ -148,6 +150,8 @@ public class DocuMineClassificationService {
             && greaterOrEqualFontThanPageAverage(textBlock, page)//
             && PositionUtils.getApproxLineCount(textBlock) < 2.9//
             && (tableOrFigureMatcher.reset().find() || (headlineWithSingleIdentifierMatcher.reset().find() && listIdentifiers.isEmpty())) //
+            && tableMidSentenceMatcher.reset().results()
+                .count() <= 1 //
             && !isAmount//
             && !headlineWithSlashesMatches) {


@@ -0,0 +1,7 @@
+package com.knecon.fforesight.service.layoutparser.processor.services.classification;
+import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
+public record NumberWord(Word word, int number) {
+}


@@ -5,11 +5,11 @@ import java.util.regex.Pattern;
 import org.springframework.stereotype.Service;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.ImageType;
 import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
 import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
 import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
 import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
 import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
 import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;


@@ -14,6 +14,7 @@ import java.util.List;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
+import java.util.regex.Matcher;
 import java.util.stream.Collectors;
 import org.springframework.stereotype.Service;
@@ -23,10 +24,11 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
 import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
 import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
 import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
+import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentifier;
 import com.knecon.fforesight.service.layoutparser.processor.model.text.TextBlockOnPage;
 import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
 import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
-import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequenceComparator;
+import com.knecon.fforesight.service.layoutparser.processor.model.text.TocNumberComparator;
 import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
 import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
@@ -59,7 +61,7 @@ public class TableOfContentsClassificationService {
         if (end > i + 1) {
             if (textBlock.textBlock().getClassification() == null) {
-                textBlock.textBlock().setClassification(PageBlockType.H1);
+                textBlock.textBlock().setClassification(PageBlockType.TABLE_OF_CONTENTS_HEADLINE);
             }
             i = end;
         }
@@ -69,11 +71,14 @@
     private int identifyTOCItems(int start, List<TextBlockOnPage> textBlocks, ClassificationDocument document) {
+        if (start >= textBlocks.size()) {
+            return start;
+        }
         ClassificationPage startPage = textBlocks.get(start).page();
         List<TextBlockOnPage> initialLookAhead = textBlocks.subList(start, Math.min(start + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()));
-        HashMap<Word, TextBlockOnPage> lookup = new HashMap<>();
+        HashMap<NumberWord, TextBlockOnPage> numberToBlockLookup = new HashMap<>();
-        List<Word> numbers = extractNumbers(initialLookAhead, lookup, document.getPages().size());
+        List<NumberWord> numbers = extractNumbers(initialLookAhead, numberToBlockLookup, document.getPages().size());
-        TocNumberFinder tocNumberFinder = new TocNumberFinder(numbers, lookup);
+        TocNumberFinder tocNumberFinder = new TocNumberFinder(numbers, numberToBlockLookup);
         int lastCandidate = start;
         for (int i = start; i < Math.min(lastCandidate + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()); i++) {
@@ -93,28 +98,28 @@
                 break;
             }
-            List<Word> numbersFromBlock = extractNumbers(textBlockOnPage, lookup, document.getPages().size());
+            List<NumberWord> numbersFromBlock = extractNumbers(textBlockOnPage, numberToBlockLookup, document.getPages().size());
-            List<Word> currentRightmostCluster = tocNumberFinder.getCurrentRightmostCluster();
+            List<NumberWord> currentRightmostCluster = tocNumberFinder.getCurrentRightmostCluster();
             if (currentRightmostCluster.size() < MINIMUM_MATCHES) {
                 log.debug("No numbers indicating a table of contents here.");
                 return start;
             }
-            if (anyIntersection(currentRightmostCluster, numbersFromBlock, lookup)) {
+            if (anyIntersection(currentRightmostCluster, numbersFromBlock, numberToBlockLookup)) {
                 lastCandidate = i;
                 numbersFromBlock.forEach(tocNumberFinder::add);
             }
         }
-        addVisualization(document.getLayoutDebugLayer(), tocNumberFinder, lookup);
         Set<TextBlockOnPage> blocksWithNumberInCluster = tocNumberFinder.getCurrentRightmostCluster()
             .stream()
-            .map(lookup::get)
+            .map(numberToBlockLookup::get)
             .collect(Collectors.toSet());
+        addVisualization(document.getLayoutDebugLayer(), tocNumberFinder, numberToBlockLookup, blocksWithNumberInCluster, textBlocks.get(start - 1));
         int lastConfirmed = start;
         for (int i = start; i < lastCandidate + 1; i++) {
             TextBlockOnPage textBlockOnPage = textBlocks.get(i);
@@ -132,18 +137,22 @@
     }
-    private static void addVisualization(LayoutDebugLayer layoutDebugLayer, TocNumberFinder tocNumberFinder, Map<Word, TextBlockOnPage> lookup) {
+    private static void addVisualization(LayoutDebugLayer layoutDebugLayer,
+                                         TocNumberFinder tocNumberFinder,
+                                         Map<NumberWord, TextBlockOnPage> lookup,
+                                         Set<TextBlockOnPage> blocksWithNumberInCluster,
+                                         TextBlockOnPage startingHeadline) {
         tocNumberFinder.getCurrentRightmostCluster()
             .stream()
             .collect(Collectors.groupingBy(key -> lookup.get(key).page().getPageNumber()))
             .forEach((pageNumber, number) -> layoutDebugLayer.addTocPages(number, pageNumber));
+        layoutDebugLayer.addTocBlocks(blocksWithNumberInCluster);
+        layoutDebugLayer.addTocBlocks(Set.of(startingHeadline));
     }
-    private static boolean anyIntersection(Collection<Word> numbers1,
-                                           Collection<Word> numbers2,
-                                           Map<Word, TextBlockOnPage> lookup) {
+    private static boolean anyIntersection(Collection<NumberWord> numbers1, Collection<NumberWord> numbers2, Map<NumberWord, TextBlockOnPage> lookup) {
         return numbers1.stream()
             .anyMatch(numberFromCluster -> numbers2.stream()
@@ -151,9 +160,9 @@
     }
-    private static List<Word> extractNumbers(List<TextBlockOnPage> textBlocks, Map<Word, TextBlockOnPage> lookup, int numberOfPages) {
+    private static List<NumberWord> extractNumbers(List<TextBlockOnPage> textBlocks, Map<NumberWord, TextBlockOnPage> lookup, int numberOfPages) {
-        List<Word> blocks = new LinkedList<>();
+        List<NumberWord> blocks = new LinkedList<>();
         for (TextBlockOnPage textBlock : textBlocks) {
             blocks.addAll(extractNumbers(textBlock, lookup, numberOfPages));
         }
@@ -161,30 +170,40 @@
     }
-    private static List<Word> extractNumbers(TextBlockOnPage textBlock, Map<Word, TextBlockOnPage> lookup, int numberOfPages) {
+    private static List<NumberWord> extractNumbers(TextBlockOnPage textBlock, Map<NumberWord, TextBlockOnPage> lookup, int numberOfPages) {
-        List<Word> blocks = new LinkedList<>();
+        List<NumberWord> blocks = new LinkedList<>();
         TextPageBlock block = textBlock.textBlock();
-        List<Word> sequences = block.getWords();
+        List<Word> words = block.getWords();
-        for (int i = 0; i < sequences.size(); i++) {
+        for (int i = 0; i < words.size(); i++) {
-            Word word = sequences.get(i);
+            Word word = words.get(i);
+            if (!wordIsEndOfLine(i, words)) {
-            if (!NUMERIC.matcher(word).matches() || word.length() > 5) {
                 continue;
             }
-            if (AMOUNT_PATTERN.matcher(getSurroundingString(i, sequences)).matches()) {
+            if (AMOUNT_PATTERN.matcher(getSurroundingString(i, words)).matches()) {
                 continue;
             }
+            Matcher matcher = SectionIdentifier.numericalIdentifierPattern.matcher(word.toString());
+            if (matcher.find() && matcher.group(2) != null) {
+                continue;
+            }
+            Matcher numberFinder = NUMERIC.matcher(word);
+            if (!numberFinder.find() || word.length() > 5) {
+                continue;
+            }
             try {
-                int pageNumber = Integer.parseInt(word.toString());
+                int pageNumber = Integer.parseInt(numberFinder.group());
                 if (0 >= pageNumber || pageNumber > numberOfPages) {
                     continue;
                 }
-                lookup.put(word, textBlock);
-                blocks.add(word);
+                NumberWord numberWord = new NumberWord(word, pageNumber);
+                lookup.put(numberWord, textBlock);
+                blocks.add(numberWord);
             } catch (NumberFormatException e) {
                 log.debug("That wasn't a number! Should not happen, due to numeric check beforehand.");
             }
@@ -193,6 +212,17 @@
     }
+    private static boolean wordIsEndOfLine(int i, List<Word> words) {
+        if (i == words.size() - 1) {
+            return true;
+        }
+        Word word = words.get(i);
+        Word nextWord = words.get(i + 1);
+        return !nextWord.rightOf(word);
+    }
     private static CharSequence getSurroundingString(int i, List<Word> sequences) {
         int end = Math.min(i + 5, sequences.size());
@@ -203,13 +233,13 @@
     }
-    private static boolean matches(Word number1, Word number2, Map<Word, TextBlockOnPage> lookup) {
+    private static boolean matches(NumberWord number1, NumberWord number2, Map<NumberWord, TextBlockOnPage> lookup) {
-        if (number1.getDir() != number2.getDir()) {
+        if (number1.word().getDir() != number2.word().getDir()) {
             return false;
         }
-        return number1.intersectsXDirAdj(number2, INTERSECTION_TOLERANCE);
+        return number1.word().intersectsXDirAdj(number2.word(), INTERSECTION_TOLERANCE);
     }
@@ -247,11 +277,11 @@
     private static class TocNumberFinder {
-        final UnionFind<Word> numberClusters;
+        final UnionFind<NumberWord> numberClusters;
-        final HashMap<Word, TextBlockOnPage> lookup;
+        final HashMap<NumberWord, TextBlockOnPage> lookup;
-        TocNumberFinder(List<Word> blocks, HashMap<Word, TextBlockOnPage> lookup) {
+        TocNumberFinder(List<NumberWord> blocks, HashMap<NumberWord, TextBlockOnPage> lookup) {
             this.numberClusters = new UnionFind<>(new HashSet<>(blocks));
             for (int i = 0; i < blocks.size(); i++) {
@@ -265,14 +295,14 @@
         }
-        public void add(Word number) {
+        public void add(NumberWord number) {
             if (numberClusters.getElements().contains(number)) {
                 return;
             }
             numberClusters.addElement(number);
-            for (Word element : numberClusters.getElements()) {
+            for (NumberWord element : numberClusters.getElements()) {
                 if (matches(number, element, lookup)) {
                     numberClusters.union(element, number);
                 }
@@ -280,73 +310,100 @@
         }
-        public List<Word> getCurrentRightmostCluster() {
+        public List<NumberWord> getCurrentRightmostCluster() {
             return numberClusters.getGroups()
                 .stream()
                 .filter(cluster -> cluster.size() > MINIMUM_MATCHES)
                 .map(cluster -> cluster.stream()
-                    .sorted(new TextPositionSequenceComparator(lookup))
+                    .sorted(new TocNumberComparator(lookup))
                     .toList())
                 .map(this::removeOutliers)
-                // .map(this::filterByMinimumDensity)
+                .map(this::removeOnNonConsecutivePages)
+                .map(this::filterByWordNearTopOfPage)
                 .filter(cluster -> cluster.size() > MINIMUM_MATCHES)
-                .max(Comparator.comparingDouble(cluster -> cluster.get(0).getBBox().getMaxX())).orElse(Collections.emptyList());
+                .max(Comparator.comparingDouble(cluster -> cluster.get(0).word().getBBox().getMaxX())).orElse(Collections.emptyList());
         }
-        // does not seem to be doing much, ideally instead of using the height of the blocks i would like to use the height, beginning from the MainBody top,
-        // but as the MainBody is often wrong, this results in some numbers being discarded even though they are correct.
-        // private List<TextPositionSequence> filterByMinimumDensity(List<TextPositionSequence> numbers) {
-        //
-        //     Map<ClassificationPage, List<TextPositionSequence>> clustersPerPage = numbers.stream()
-        //         .collect(Collectors.groupingBy(number -> lookup.get(number).page()));
-        //
-        //     List<TextPositionSequence> result = new ArrayList<>(numbers.size());
-        //     clustersPerPage.keySet()
-        //         .stream()
-        //         .sorted(Comparator.comparingInt(ClassificationPage::getPageNumber))
-        //         .forEach(page -> {
-        //             var numbersOnPage = clustersPerPage.get(page);
-        //
-        //             double height = numbersOnPage.stream()
-        //                 .map(BoundingBox::getBBox)
-        //                 .collect(RectangleTransformations.collectBBox()).getHeight();
-        //
-        //             double count = numbersOnPage.size();
-        //
-        //             if ((count / height) >= (DENSITY_THRESHOLD_COUNT / page.getPageHeight())) {
-        //                 result.addAll(numbers);
-        //             }
-        //         });
-        //     return result;
-        // }
+        private List<NumberWord> removeOnNonConsecutivePages(List<NumberWord> numbers) {
+            List<NumberWord> result = new ArrayList<>();
+            result.add(numbers.get(0));
+            for (int i = 1; i < numbers.size(); i++) {
+                int prev = getPageNumber(numbers, i - 1);
+                int curr = getPageNumber(numbers, i);
+                if (Math.abs(prev - curr) > 1) {
+                    break;
+                } else {
+                    result.add(numbers.get(i));
+                }
+            }
+            return result;
+        }
+        private int getPageNumber(List<NumberWord> numbers, int i) {
+            return lookup.get(numbers.get(i)).page().getPageNumber();
+        }
+        private List<NumberWord> filterByWordNearTopOfPage(List<NumberWord> numbers) {
+            List<NumberWord> result = new ArrayList<>();
+            result.add(numbers.get(0));
+            for (int i = 1; i < numbers.size(); i++) {
+                NumberWord prev = numbers.get(i - 1);
+                NumberWord curr = numbers.get(i);
+                ClassificationPage prevPage = lookup.get(prev).page();
+                ClassificationPage currPage = lookup.get(curr).page();
+                if (prevPage.equals(currPage)) {
+                    result.add(curr);
+                } else if (curr.word().getBBox().getMinY() < currPage.getPageHeight() * 0.33) {
+                    result.add(curr);
+                }
+            }
+            return result;
+        }
-        public List<Word> removeOutliers(List<Word> numbers) {
+        public List<NumberWord> removeOutliers(List<NumberWord> numbers) {
-            List<Word> result = new ArrayList<>();
+            List<NumberWord> confirmedClusterNumbers = new ArrayList<>();
-            result.add(numbers.get(0));
+            confirmedClusterNumbers.add(numbers.get(0));
             for (int i = 1; i < numbers.size() - 1; i++) {
                 int prev = getNumberAsInt(numbers, i - 1);
                 int curr = getNumberAsInt(numbers, i);
                 int next = getNumberAsInt(numbers, i + 1);
                 if (!(curr <= prev || curr >= next) || !isBetterWithout(numbers, i)) {
-                    result.add(numbers.get(i));
+                    confirmedClusterNumbers.add(numbers.get(i));
                 }
             }
-            if (getNumberAsInt(numbers, numbers.size() - 1) >= getNumberAsInt(numbers, Math.max(0, numbers.size() - 2))) {
+            if (getNumberAsInt(numbers, numbers.size() - 1) >= getLatestNumber(confirmedClusterNumbers)) {
-                result.add(numbers.get(numbers.size() - 1));
+                confirmedClusterNumbers.add(numbers.get(numbers.size() - 1));
             }
-            return result;
+            return confirmedClusterNumbers;
         }
+        private static int getLatestNumber(List<NumberWord> confirmedClusterNumbers) {
+            return confirmedClusterNumbers.get(confirmedClusterNumbers.size() - 1).number();
+        }
         // Helper method to check if removing the current number results in a better order
-        public static boolean isBetterWithout(List<Word> numbers, int i) {
+        public static boolean isBetterWithout(List<NumberWord> numbers, int i) {
             if (i == 0 || i == numbers.size() - 1) {
                 return false;
@@ -362,9 +419,9 @@
         }
-        private static int getNumberAsInt(List<Word> numbers, int i) {
+        private static int getNumberAsInt(List<NumberWord> numbers, int i) {
-            return Integer.parseInt(numbers.get(i).toString());
+            return numbers.get(i).number();
         }
     }


@@ -12,33 +12,37 @@ import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
 import java.util.NoSuchElementException;
+import java.util.Objects;
 import java.util.Optional;
 import java.util.Set;
 import java.util.stream.Collectors;
-import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
+import com.iqser.red.service.redaction.v1.server.model.document.DocumentTree;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.AbstractSemanticNode;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.DuplicatedParagraph;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.Footer;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.GenericSemanticNode;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.Header;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.Headline;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.Image;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.NodeType;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.Paragraph;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNode;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.TableOfContentsItem;
+import com.iqser.red.service.redaction.v1.server.model.document.textblock.AtomicTextBlock;
+import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
 import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
 import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
 import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
 import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
 import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
 import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
+import com.knecon.fforesight.service.layoutparser.processor.model.DocumentWithVisualization;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.AbstractSemanticNode;
+import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Header;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
 import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
-import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContentItem;
+import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeEntry;
 import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
 import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder;
 import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
@@ -55,17 +59,15 @@ import lombok.extern.slf4j.Slf4j;
 @Slf4j
 public class DocumentGraphFactory {
-    public Document buildDocumentGraph(LayoutParsingType layoutParsingType, ClassificationDocument document) {
+    public DocumentWithVisualization buildDocumentGraph(LayoutParsingType layoutParsingType, ClassificationDocument document) {
         Document documentGraph = new Document();
-        documentGraph.setLayoutDebugLayer(document.getLayoutDebugLayer());
         Context context = new Context(documentGraph);
         document.getPages()
             .forEach(context::buildAndAddPageWithCounter);
-        addSectionsForToC(layoutParsingType, document, context, documentGraph);
+        addSections(layoutParsingType, document, context, documentGraph);
         addHeaderAndFooterToEachPage(document, context);
         documentGraph.setNumberOfPages(context.pages.size());
@@ -74,7 +76,7 @@
         documentGraph.setTextBlock(documentGraph.getTextBlock());
         addTextBlocksToPages(documentGraph);
-        return documentGraph;
+        return new DocumentWithVisualization(documentGraph, document.getLayoutDebugLayer());
     }
@@ -92,18 +94,18 @@
     }
-    private void addSectionsForToC(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument, Context context, Document document) {
+    private void addSections(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument, Context context, Document document) {
-        for (TableOfContentItem tocItem : classificationDocument.getTableOfContents()) {
+        for (SectionTreeEntry sectionTreeEntry : classificationDocument.getSectionTree()) {
-            GenericSemanticNode parent = tocItem.getParent() == null ? null : tocItem.getParent().getSection();
+            GenericSemanticNode parent = sectionTreeEntry.getParent() == null ? null : sectionTreeEntry.getParent().getSection();
             Optional<GenericSemanticNode> section = SectionNodeFactory.addSection(layoutParsingType,
                 parent,
-                tocItem.getChildren().isEmpty(),
+                sectionTreeEntry.getType(),
-                tocItem.getNonEmptySectionBlocks(),
+                sectionTreeEntry.getNonEmptySectionBlocks(),
-                tocItem.getImages(),
+                sectionTreeEntry.getImages(),
                 context,
                 document);
-            tocItem.setSection(section.orElse(null));
+            sectionTreeEntry.setSection(section.orElse(null));
         }
     }
@@ -121,6 +123,8 @@
             node = Headline.builder().documentTree(context.getDocumentTree()).build();
         } else if (originalTextBlock.isToDuplicate() && layoutParsingType.equals(LayoutParsingType.REDACT_MANAGER)) {
             node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree()).build();
+        } else if (Objects.equals(originalTextBlock.getClassification(), PageBlockType.TABLE_OF_CONTENTS_ITEM)) {
+            node = TableOfContentsItem.builder().documentTree(context.getDocumentTree()).build();
         } else {
             node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
         }
@@ -274,7 +278,14 @@
     public void buildAndAddPageWithCounter(ClassificationPage classificationPage) {
-        Page page = Page.fromClassificationPage(classificationPage);
+        Page page = Page.builder()
+            .height((int) classificationPage.getPageHeight())
+            .width((int) classificationPage.getPageWidth())
+            .number(classificationPage.getPageNumber())
+            .rotation(classificationPage.getRotation())
+            .textBlocksOnPage(new LinkedList<>())
+            .build();
         //this counter counts the TextBlocks per page
         //initial value is set to 1, because 0 is reserved for Header
         pages.put(page, 1);


@@ -4,7 +4,7 @@ import java.awt.geom.Rectangle2D;
 import java.util.Collections;
 import java.util.List;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
+import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
 import lombok.AccessLevel;
 import lombok.Builder;
@@ -31,6 +31,8 @@ public class SearchTextWithTextPositionDto {
             .lineBreaks(Collections.emptyList())
             .positions(Collections.emptyList())
             .stringIdxToPositionIdx(Collections.emptyList())
+            .boldTextBoundaries(Collections.emptyList())
+            .italicTextBoundaries(Collections.emptyList())
             .build();
     }


@@ -10,8 +10,8 @@ import java.util.List;
 import java.util.Locale;
 import java.util.Objects;
+import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
 import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
 import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
 import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;


@@ -10,14 +10,16 @@ import java.util.List;
 import java.util.Optional;
 import java.util.Set;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.AbstractSemanticNode;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.GenericSemanticNode;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.Section;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.SuperSection;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.TableOfContents;
 import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
 import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.AbstractSemanticNode;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
 import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
+import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeEntry;
 import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
 import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
 import com.knecon.fforesight.service.layoutparser.processor.utils.TableMergingUtility;

@@ -29,7 +31,7 @@ public class SectionNodeFactory {
     public Optional<GenericSemanticNode> addSection(LayoutParsingType layoutParsingType,
                                                     GenericSemanticNode parentNode,
-                                                    boolean isLeaf,
+                                                    SectionTreeEntry.Type type,
                                                     List<AbstractPageBlock> pageBlocks,
                                                     List<ClassifiedImage> images,
                                                     DocumentGraphFactory.Context context,

@@ -48,12 +50,11 @@ public class SectionNodeFactory {
             return Optional.empty();
         }
-        AbstractSemanticNode section;
-        if (isLeaf) {
-            section = Section.builder().documentTree(context.getDocumentTree()).build();
-        } else {
-            section = SuperSection.builder().documentTree(context.getDocumentTree()).build();
-        }
+        AbstractSemanticNode section = switch (type) {
+            case SECTION -> Section.builder().documentTree(context.getDocumentTree()).build();
+            case SUPER_SECTION -> SuperSection.builder().documentTree(context.getDocumentTree()).build();
+            case TOC_SECTION -> TableOfContents.builder().documentTree(context.getDocumentTree()).build();
+        };
         context.getSections().add(section);

@@ -64,13 +65,14 @@ public class SectionNodeFactory {
         if (containsTablesAndTextBlocks) {
             splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(layoutParsingType,
                                                                                                   section,
-                                                                                                  true,
+                                                                                                  SectionTreeEntry.Type.SECTION,
                                                                                                   subSectionPageBlocks,
                                                                                                   emptyList(),
                                                                                                   context,
                                                                                                   document));
-        } else if (!isLeaf) {
-            addSection(layoutParsingType, section, true, pageBlocks, emptyList(), context, document);
+        } else if (type.equals(SectionTreeEntry.Type.SUPER_SECTION)) {
+            // If a SuperSection contains more blocks than just a headline, we add a Section which contains the remaining textblocks.
+            addSection(layoutParsingType, section, SectionTreeEntry.Type.SECTION, pageBlocks, emptyList(), context, document);
         } else {
             addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, pageBlocks, context, section, document);
         }

View File

@@ -6,14 +6,15 @@ import java.util.Collection;
 import java.util.List;
 import java.util.stream.Collectors;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.GenericSemanticNode;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.Table;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.TableCell;
+import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
 import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
 import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
+import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeEntry;
 import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
 import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
 import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;

@@ -120,7 +121,7 @@ public class TableNodeFactory {
         } else if (firstTextBlockIsHeadline(cell)) {
             SectionNodeFactory.addSection(layoutParsingType,
                                           tableCell,
-                                          true,
+                                          SectionTreeEntry.Type.SECTION,
                                           cell.getTextBlocks()
                                               .stream()
                                               .map(tb -> (AbstractPageBlock) tb)

View File

@@ -2,10 +2,11 @@ package com.knecon.fforesight.service.layoutparser.processor.services.factory;
 import java.util.List;
+import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNode;
+import com.iqser.red.service.redaction.v1.server.model.document.textblock.AtomicTextBlock;
 import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
 import lombok.AccessLevel;
 import lombok.experimental.FieldDefaults;

@@ -31,29 +32,20 @@ public class TextBlockFactory {
         stringOffset += searchTextWithTextPositionDto.getSearchText().length();
         long idx = textBlockIdx;
         textBlockIdx++;
-        String orientation;
-        int textRotation;
-        if (sequences.isEmpty()) {
-            orientation = null;
-            textRotation = 0;
-        } else {
-            orientation = sequences.get(0).getDir().toString();
-            textRotation = sequences.get(0).getDir().getRotation();
-        }
-        var atb = AtomicTextBlock.fromSearchTextWithTextPosition(searchTextWithTextPositionDto.getSearchText(),
-                                                                 searchTextWithTextPositionDto.getLineBreaks(),
-                                                                 searchTextWithTextPositionDto.getBoldTextBoundaries(),
-                                                                 searchTextWithTextPositionDto.getItalicTextBoundaries(),
-                                                                 searchTextWithTextPositionDto.getPositions(),
-                                                                 searchTextWithTextPositionDto.getStringIdxToPositionIdx(),
-                                                                 idx,
-                                                                 parent,
-                                                                 numberOnPage,
-                                                                 page,
-                                                                 offset,
-                                                                 orientation,
-                                                                 textRotation);
-        return atb;
+        String searchText = searchTextWithTextPositionDto.getSearchText();
+        return AtomicTextBlock.builder()
+                              .id(idx)
+                              .parent(parent)
+                              .searchText(searchText)
+                              .numberOnPage(numberOnPage)
+                              .page(page)
+                              .lineBreaks(searchTextWithTextPositionDto.getLineBreaks())
+                              .positions(searchTextWithTextPositionDto.getPositions())
+                              .stringIdxToPositionIdx(searchTextWithTextPositionDto.getStringIdxToPositionIdx())
+                              .textRange(new TextRange(offset, offset + searchText.length()))
+                              .boldTextRanges(searchTextWithTextPositionDto.getBoldTextBoundaries())
+                              .italicTextRanges(searchTextWithTextPositionDto.getItalicTextBoundaries())
+                              .build();
     }

View File

@@ -1,182 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.services.mapper;
import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto.AllDocumentTextData;
import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto.DocumentTextData;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPageProto.AllDocumentPages;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPageProto.DocumentPage;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto.AllDocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto.DocumentPositionData.Position;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureProto.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureWrapper;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.EntryDataProto.EntryData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import lombok.experimental.UtilityClass;
@UtilityClass
public class DocumentDataMapper {
public DocumentData toDocumentData(Document document) {
List<DocumentTextData> documentTextData = document.streamTerminalTextBlocksInOrder()
.flatMap(textBlock -> textBlock.getAtomicTextBlocks()
.stream())
.distinct()
.map(DocumentDataMapper::toAtomicTextBlockData)
.toList();
AllDocumentTextData allDocumentTextData = AllDocumentTextData.newBuilder().addAllDocumentTextData(documentTextData).build();
List<DocumentPositionData> atomicPositionBlockData = document.streamTerminalTextBlocksInOrder()
.flatMap(textBlock -> textBlock.getAtomicTextBlocks()
.stream())
.distinct()
.map(DocumentDataMapper::toAtomicPositionBlockData)
.toList();
AllDocumentPositionData allDocumentPositionData = AllDocumentPositionData.newBuilder().addAllDocumentPositionData(atomicPositionBlockData).build();
Set<Long> nonEmptyTextBlocks = documentTextData.stream()
.mapToLong(DocumentTextData::getId).boxed()
.collect(Collectors.toSet());
List<DocumentPage> documentPageData = document.getPages()
.stream()
.map(DocumentDataMapper::toPageData)
.toList();
AllDocumentPages allDocumentPages = AllDocumentPages.newBuilder().addAllDocumentPages(documentPageData).build();
DocumentStructureWrapper tableOfContentsData = toDocumentTreeData(document.getDocumentTree());
return DocumentData.builder()
.documentTextData(allDocumentTextData)
.documentPositions(allDocumentPositionData)
.documentPages(allDocumentPages)
.documentStructureWrapper(tableOfContentsData)
.build();
}
private DocumentStructureWrapper toDocumentTreeData(DocumentTree documentTree) {
return new DocumentStructureWrapper(DocumentStructure.newBuilder().setRoot(toEntryData(documentTree.getRoot())).build());
}
private EntryData toEntryData(DocumentTree.Entry entry) {
List<Long> atomicTextBlocks;
if (entry.getNode().isLeaf()) {
atomicTextBlocks = toAtomicTextBlockIds(entry.getNode().getLeafTextBlock());
} else {
atomicTextBlocks = new ArrayList<>();
}
Map<String, String> properties = switch (entry.getType()) {
case TABLE -> PropertiesMapper.buildTableProperties((Table) entry.getNode());
case TABLE_CELL -> PropertiesMapper.buildTableCellProperties((TableCell) entry.getNode());
case IMAGE -> PropertiesMapper.buildImageProperties((Image) entry.getNode());
case PARAGRAPH ->
entry.getNode() instanceof DuplicatedParagraph duplicatedParagraph ? PropertiesMapper.buildDuplicateParagraphProperties(duplicatedParagraph) : new HashMap<>();
default -> new HashMap<>();
};
var documentBuilder = EntryData.newBuilder()
.addAllTreeId(entry.getTreeId())
.addAllChildren(entry.getChildren()
.stream()
.map(DocumentDataMapper::toEntryData)
.toList())
.setType(entry.getType())
.addAllAtomicBlockIds(atomicTextBlocks)
.addAllPageNumbers(entry.getNode().getPages()
.stream()
.map(Page::getNumber)
.map(Integer::longValue)
.toList())
.putAllProperties(properties);
if (entry.getNode() != null) {
documentBuilder.addAllEngines(entry.getNode().getEngines());
} else {
documentBuilder.addAllEngines(new HashSet<>(Set.of(LayoutEngine.ALGORITHM)));
}
return documentBuilder.build();
}
private List<Long> toAtomicTextBlockIds(TextBlock textBlock) {
return textBlock.getAtomicTextBlocks()
.stream()
.map(AtomicTextBlock::getId)
.toList();
}
private DocumentPage toPageData(Page p) {
return DocumentPage.newBuilder().setRotation(p.getRotation()).setHeight(p.getHeight()).setWidth(p.getWidth()).setNumber(p.getNumber()).build();
}
private DocumentTextData toAtomicTextBlockData(AtomicTextBlock atomicTextBlock) {
return DocumentTextData.newBuilder()
.setId(atomicTextBlock.getId())
.setPage(atomicTextBlock.getPage().getNumber().longValue())
.setSearchText(atomicTextBlock.getSearchText())
.setNumberOnPage(atomicTextBlock.getNumberOnPage())
.setStart(atomicTextBlock.getTextRange().start())
.setEnd(atomicTextBlock.getTextRange().end())
.addAllLineBreaks(atomicTextBlock.getLineBreaks())
.build();
}
private DocumentPositionData toAtomicPositionBlockData(AtomicTextBlock atomicTextBlock) {
return DocumentPositionData.newBuilder()
.setId(atomicTextBlock.getId())
.addAllPositions(toPositions(atomicTextBlock.getPositions()))
.addAllStringIdxToPositionIdx(atomicTextBlock.getStringIdxToPositionIdx())
.build();
}
private static List<Position> toPositions(List<Rectangle2D> rects) {
List<Position> positions = new ArrayList<>();
for (Rectangle2D rect : rects) {
positions.add(toPosition(rect));
}
return positions;
}
private static Position toPosition(Rectangle2D rect) {
return Position.newBuilder().addValue((float) rect.getMinX()).addValue((float) rect.getMinY()).addValue((float) rect.getWidth()).addValue((float) rect.getHeight()).build();
}
}

View File

@@ -1,229 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.services.mapper;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPageProto.DocumentPage;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto.AllDocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto.AllDocumentTextData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.EntryDataProto.EntryData;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Header;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
import lombok.experimental.UtilityClass;
@UtilityClass
public class DocumentGraphMapper {
public Document toDocumentGraph(DocumentData documentData) {
Document document = new Document();
DocumentTree documentTree = new DocumentTree(document);
Context context = new Context(documentData, documentTree);
context.pages.addAll(documentData.getDocumentPages().getDocumentPagesList()
.stream()
.map(DocumentGraphMapper::buildPage)
.toList());
context.documentTree.getRoot().getChildren().addAll(buildEntries(documentData.getDocumentStructure().getRoot().getChildrenList(), context));
document.setDocumentTree(context.documentTree);
document.setPages(new HashSet<>(context.pages));
document.setNumberOfPages(documentData.getDocumentPages().getDocumentPagesCount());
document.setTextBlock(document.getTextBlock());
return document;
}
private List<DocumentTree.Entry> buildEntries(List<EntryData> entries, Context context) {
List<DocumentTree.Entry> newEntries = new LinkedList<>();
for (EntryData entryData : entries) {
List<Page> pages = entryData.getPageNumbersList()
.stream()
.map(pageNumber -> getPage(pageNumber, context))
.toList();
SemanticNode node = switch (entryData.getType()) {
case SECTION -> buildSection(context);
case SUPER_SECTION -> buildSuperSection(context);
case PARAGRAPH -> buildParagraph(context, entryData.getProperties());
case HEADLINE -> buildHeadline(context);
case HEADER -> buildHeader(context);
case FOOTER -> buildFooter(context);
case TABLE -> buildTable(context, entryData.getProperties());
case TABLE_CELL -> buildTableCell(context, entryData.getProperties());
case IMAGE -> buildImage(context, entryData.getProperties(), entryData.getPageNumbersList());
default -> throw new UnsupportedOperationException("Not yet implemented for type " + entryData.getType());
};
if (entryData.getAtomicBlockIdsCount() > 0) {
TextBlock textBlock = toTextBlock(entryData.getAtomicBlockIdsList(), context, node);
node.setLeafTextBlock(textBlock);
switch (entryData.getType()) {
case HEADER -> pages.forEach(page -> page.setHeader((Header) node));
case FOOTER -> pages.forEach(page -> page.setFooter((Footer) node));
case IMAGE -> pages.forEach(page -> page.getImages().add((Image) node));
default -> textBlock.getAtomicTextBlocks()
.forEach(atb -> atb.getPage().getTextBlocksOnPage().add(atb));
}
}
List<Integer> treeId = entryData.getTreeIdList();
node.setTreeId(treeId);
newEntries.add(DocumentTree.Entry.builder().treeId(treeId).children(buildEntries(entryData.getChildrenList(), context)).node(node).build());
}
return newEntries;
}
private Headline buildHeadline(Context context) {
return Headline.builder().documentTree(context.documentTree).build();
}
private Image buildImage(Context context, Map<String, String> properties, List<Long> pageNumbers) {
assert pageNumbers.size() == 1;
Page page = getPage(pageNumbers.get(0), context);
var builder = Image.builder();
PropertiesMapper.parseImageProperties(properties, builder);
return builder.documentTree(context.documentTree).page(page).build();
}
private TableCell buildTableCell(Context context, Map<String, String> properties) {
TableCell.TableCellBuilder<?, ?> builder = TableCell.builder();
PropertiesMapper.parseTableCellProperties(properties, builder);
return builder.documentTree(context.documentTree).build();
}
private Table buildTable(Context context, Map<String, String> properties) {
Table.TableBuilder builder = Table.builder();
PropertiesMapper.parseTableProperties(properties, builder);
return builder.documentTree(context.documentTree).build();
}
private Footer buildFooter(Context context) {
return Footer.builder().documentTree(context.documentTree).build();
}
private Header buildHeader(Context context) {
return Header.builder().documentTree(context.documentTree).build();
}
private Section buildSection(Context context) {
return Section.builder().documentTree(context.documentTree).build();
}
private SuperSection buildSuperSection(Context context) {
return SuperSection.builder().documentTree(context.documentTree).build();
}
private Paragraph buildParagraph(Context context, Map<String, String> properties) {
if (PropertiesMapper.isDuplicateParagraph(properties)) {
DuplicatedParagraph duplicatedParagraph = DuplicatedParagraph.builder().documentTree(context.documentTree).build();
var unsortedTextblockIds = PropertiesMapper.getUnsortedTextblockIds(properties);
duplicatedParagraph.setUnsortedLeafTextBlock(toTextBlock(unsortedTextblockIds, context, duplicatedParagraph));
return duplicatedParagraph;
}
return Paragraph.builder().documentTree(context.documentTree).build();
}
private TextBlock toTextBlock(List<Long> atomicTextBlockIds, Context context, SemanticNode parent) {
return atomicTextBlockIds.stream()
.map(atomicTextBlockId -> getAtomicTextBlock(context, parent, atomicTextBlockId))
.collect(new TextBlockCollector());
}
private AtomicTextBlock getAtomicTextBlock(Context context, SemanticNode parent, Long atomicTextBlockId) {
return AtomicTextBlock.fromAtomicTextBlockData(context.documentTextDataBlockData.getDocumentTextDataList()
.get(Math.toIntExact(atomicTextBlockId)),
context.atomicPositionBlockData.getDocumentPositionDataList()
.get(Math.toIntExact(atomicTextBlockId)),
parent,
getPage(context.documentTextDataBlockData.getDocumentTextDataList()
.get(Math.toIntExact(atomicTextBlockId)).getPage(), context));
}
private Page buildPage(DocumentPage p) {
return Page.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).textBlocksOnPage(new LinkedList<>()).build();
}
private Page getPage(Long pageIndex, Context context) {
return context.pages.stream()
.filter(page -> page.getNumber() == Math.toIntExact(pageIndex))
.findFirst()
.orElseThrow(() -> new NoSuchElementException(String.format("ClassificationPage with number %d not found", pageIndex)));
}
static final class Context {
private final DocumentTree documentTree;
private final List<Page> pages;
private final AllDocumentTextData documentTextDataBlockData;
private final AllDocumentPositionData atomicPositionBlockData;
Context(DocumentData documentData, DocumentTree documentTree) {
this.documentTree = documentTree;
this.pages = new LinkedList<>();
this.documentTextDataBlockData = documentData.getDocumentTextData();
this.atomicPositionBlockData = documentData.getDocumentPositions();
}
}
}

View File

@@ -26,12 +26,12 @@ import org.commonmark.node.StrongEmphasis;
 import org.commonmark.node.Text;
 import org.commonmark.renderer.markdown.MarkdownRenderer;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.AbstractNodeVisitor;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
+import com.iqser.red.service.redaction.v1.server.model.document.AbstractNodeVisitor;
+import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.Headline;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNode;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.Table;
+import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
 import com.knecon.fforesight.service.layoutparser.processor.model.text.FontStyle;
 public class MarkdownMapper extends AbstractNodeVisitor {

@@ -39,7 +39,7 @@ public class MarkdownMapper extends AbstractNodeVisitor {
     Document markdownDocument = new Document();
-    public String toMarkdownContent(com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document document) {
+    public String toMarkdownContent(com.iqser.red.service.redaction.v1.server.model.document.nodes.Document document) {
         visit(document);

@@ -55,7 +55,7 @@ public class MarkdownMapper extends AbstractNodeVisitor {
     @Override
-    public void visit(com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph paragraph) {
+    public void visit(com.iqser.red.service.redaction.v1.server.model.document.nodes.Paragraph paragraph) {
         markdownDocument.appendChild(parseParagraph(paragraph));
     }

@@ -108,7 +108,7 @@ public class MarkdownMapper extends AbstractNodeVisitor {
     }
-    private Node createTableCell(com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell tc) {
+    private Node createTableCell(com.iqser.red.service.redaction.v1.server.model.document.nodes.TableCell tc) {
         var cell = new TableCell();
         List<SemanticNode> childNodes = tc.streamChildren()

@@ -122,9 +122,9 @@ public class MarkdownMapper extends AbstractNodeVisitor {
     }
-    private Paragraph parseParagraph(com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph paragraph) {
-        org.commonmark.node.Paragraph markdownParagraph = new org.commonmark.node.Paragraph();
+    private Paragraph parseParagraph(com.iqser.red.service.redaction.v1.server.model.document.nodes.Paragraph paragraph) {
+        Paragraph markdownParagraph = new org.commonmark.node.Paragraph();
         parseTextBlock(paragraph.getTextBlock(), true).forEach(markdownParagraph::appendChild);
         return markdownParagraph;
     }

@@ -230,12 +230,12 @@ public class MarkdownMapper extends AbstractNodeVisitor {
         int start = textBlock.getTextRange().start();
         int end = textBlock.getTextRange().end();
-        for (TextRange bold : textBlock.getBoldTextBoundaries()) {
+        for (TextRange bold : textBlock.getBoldTextRanges()) {
             styleChanges.computeIfAbsent(bold.start() + start, k -> new HashSet<>()).add(FontStyleChange.enter(FontStyle.BOLD));
             styleChanges.computeIfAbsent(bold.end() + start, k -> new HashSet<>()).add(FontStyleChange.leave(FontStyle.BOLD));
         }
-        for (TextRange italic : textBlock.getItalicTextBoundaries()) {
+        for (TextRange italic : textBlock.getItalicTextRanges()) {
             styleChanges.computeIfAbsent(italic.start() + start, k -> new HashSet<>()).add(FontStyleChange.enter(FontStyle.ITALIC));
             styleChanges.computeIfAbsent(italic.end() + start, k -> new HashSet<>()).add(FontStyleChange.leave(FontStyle.ITALIC));
         }

@@ -298,7 +298,6 @@ public class MarkdownMapper extends AbstractNodeVisitor {
     }
     record FontStyleChange(boolean enter, FontStyle style) {
         public static FontStyleChange enter(FontStyle style) {

@@ -323,4 +322,5 @@ public class MarkdownMapper extends AbstractNodeVisitor {
     record TextRangeWithTextType(TextRange textRange, FontStyle fontStyle) {
     }
 }

View File

@@ -7,12 +7,12 @@ import java.util.LinkedList;
 import java.util.Map;
 import java.util.Optional;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.Headline;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.Section;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNode;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.SuperSection;
 import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
 import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
 import com.knecon.fforesight.service.viewerdoc.model.Outline;

View File

@@ -1,150 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.services.mapper;
import java.awt.geom.Rectangle2D;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureWrapper;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
public class PropertiesMapper {
public static Map<String, String> buildImageProperties(Image image) {
Map<String, String> properties = new HashMap<>();
properties.put(DocumentStructureWrapper.ImageProperties.IMAGE_TYPE, image.getImageType().toString());
properties.put(DocumentStructureWrapper.ImageProperties.TRANSPARENT, String.valueOf(image.isTransparent()));
properties.put(DocumentStructureWrapper.ImageProperties.POSITION, toString(image.getPosition()));
properties.put(DocumentStructureWrapper.ImageProperties.ID, image.getId());
properties.put(DocumentStructureWrapper.ImageProperties.REPRESENTATION_HASH, image.getRepresentationHash());
return properties;
}
public static Map<String, String> buildTableCellProperties(TableCell tableCell) {
Map<String, String> properties = new HashMap<>();
properties.put(DocumentStructureWrapper.TableCellProperties.ROW, String.valueOf(tableCell.getRow()));
properties.put(DocumentStructureWrapper.TableCellProperties.COL, String.valueOf(tableCell.getCol()));
properties.put(DocumentStructureWrapper.TableCellProperties.HEADER, String.valueOf(tableCell.isHeader()));
if (tableCell.getPages().size() > 1 || tableCell.getBBox().keySet().size() > 1) {
throw new IllegalArgumentException("TableCell can only occur on a single page!");
}
String bBoxString = toString(tableCell.getBBox()
.get(tableCell.getPages()
.stream()
.findFirst()
.get()));
properties.put(DocumentStructureWrapper.TableCellProperties.B_BOX, bBoxString);
return properties;
}
public static Map<String, String> buildTableProperties(Table table) {
Map<String, String> properties = new HashMap<>();
properties.put(DocumentStructureWrapper.TableProperties.NUMBER_OF_ROWS, String.valueOf(table.getNumberOfRows()));
properties.put(DocumentStructureWrapper.TableProperties.NUMBER_OF_COLS, String.valueOf(table.getNumberOfCols()));
return properties;
}
public static void parseImageProperties(Map<String, String> properties, Image.ImageBuilder builder) {
builder.imageType(parseImageType(properties.get(DocumentStructureWrapper.ImageProperties.IMAGE_TYPE)));
builder.transparent(Boolean.parseBoolean(properties.get(DocumentStructureWrapper.ImageProperties.TRANSPARENT)));
builder.position(DocumentStructureWrapper.parseRectangle2D(properties.get(DocumentStructureWrapper.ImageProperties.POSITION)));
}
public static void parseTableCellProperties(Map<String, String> properties, TableCell.TableCellBuilder builder) {
builder.row(Integer.parseInt(properties.get(DocumentStructureWrapper.TableCellProperties.ROW)));
builder.col(Integer.parseInt(properties.get(DocumentStructureWrapper.TableCellProperties.COL)));
builder.header(Boolean.parseBoolean(properties.get(DocumentStructureWrapper.TableCellProperties.HEADER)));
builder.bBox(DocumentStructureWrapper.parseRectangle2D(properties.get(DocumentStructureWrapper.TableCellProperties.B_BOX)));
}
public static void parseTableProperties(Map<String, String> properties, Table.TableBuilder builder) {
builder.numberOfRows(Integer.parseInt(properties.get(DocumentStructureWrapper.TableProperties.NUMBER_OF_ROWS)));
builder.numberOfCols(Integer.parseInt(properties.get(DocumentStructureWrapper.TableProperties.NUMBER_OF_COLS)));
}
public static Map<String, String> buildDuplicateParagraphProperties(DuplicatedParagraph duplicatedParagraph) {
Map<String, String> properties = new HashMap<>();
properties.put(DocumentStructureWrapper.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID,
Arrays.toString(toAtomicTextBlockIds(duplicatedParagraph.getUnsortedLeafTextBlock())));
return properties;
}
public static boolean isDuplicateParagraph(Map<String, String> properties) {
return properties.containsKey(DocumentStructureWrapper.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID);
}
public static List<Long> getUnsortedTextblockIds(Map<String, String> properties) {
return toLongList(properties.get(DocumentStructureWrapper.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID));
}
public static List<Long> toLongList(String ids) {
return Arrays.stream(ids.substring(1, ids.length() - 1).trim().split(","))
.map(Long::valueOf)
.toList();
}
private static ImageType parseImageType(String imageType) {
return switch (imageType) {
case "LOGO" -> ImageType.LOGO;
case "FORMULA" -> ImageType.FORMULA;
case "SIGNATURE" -> ImageType.SIGNATURE;
case "OCR" -> ImageType.OCR;
default -> ImageType.OTHER;
};
}
public static String toString(Rectangle2D rectangle2D) {
return String.format(Locale.US,
"%f%s%f%s%f%s%f",
rectangle2D.getX(),
DocumentStructureWrapper.RECTANGLE_DELIMITER,
rectangle2D.getY(),
DocumentStructureWrapper.RECTANGLE_DELIMITER,
rectangle2D.getWidth(),
DocumentStructureWrapper.RECTANGLE_DELIMITER,
rectangle2D.getHeight());
}
private static Long[] toAtomicTextBlockIds(TextBlock textBlock) {
return textBlock.getAtomicTextBlocks()
.stream()
.map(AtomicTextBlock::getId)
.toArray(Long[]::new);
}
}

View File

@@ -6,20 +6,21 @@ import java.util.Locale;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.stream.IntStream;
-import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
+import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.NodeType;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNode;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.Table;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.TableCell;
+import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
 import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ParagraphData;
 import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.Range;
 import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData;
 import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.RowData;
 import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.StructureObject;
 import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.TableData;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
 public class TaasDocumentDataMapper {

@@ -44,22 +45,26 @@ public class TaasDocumentDataMapper {
     public static ParagraphData fromTextBlock(String classification, TextBlock textBlock) {
         return ParagraphData.builder()
-                            .boldTextBoundaries(textBlock.getBoldTextBoundaries().stream().map(b -> new Range(b.start(), b.end())).toList())
-                            .italicTextBoundaries(textBlock.getItalicTextBoundaries().stream().map(b -> new Range(b.start(), b.end())).toList())
+                            .boldTextBoundaries(textBlock.getBoldTextRanges()
+                                                         .stream()
+                                                         .map(b -> new Range(b.start(), b.end()))
+                                                         .toList())
+                            .italicTextBoundaries(textBlock.getItalicTextRanges()
+                                                           .stream()
+                                                           .map(b -> new Range(b.start(), b.end()))
+                                                           .toList())
                             .text(textBlock.getSearchText())
                             .linebreaks(textBlock.getLineBreaks())
                             .classification(classification)
-                            .orientation(textBlock.getOrientation())
-                            .textDirection(textBlock.getTextDirection())
                             .build();
     }
     public static TableData fromTable(Table table) {
-        List<RowData> rowData = IntStream.range(0, table.getNumberOfRows())
-                                         .boxed()
-                                         .map(rowIdx -> table.streamRow(rowIdx).toList())
+        List<RowData> rowData = IntStream.range(0, table.getNumberOfRows()).boxed()
+                                         .map(rowIdx -> table.streamRow(rowIdx)
+                                                             .toList())
                                          .map(TaasDocumentDataMapper::fromTableCells)
                                          .toList();
         return new TableData(rowData, table.getNumberOfCols(), table.getNumberOfRows());

@@ -71,14 +76,24 @@ public class TaasDocumentDataMapper {
         if (tableCells.isEmpty()) {
             throw new IllegalArgumentException("no table cells provided");
         }
-        boolean header = tableCells.stream().allMatch(TableCell::isHeader);
+        boolean header = tableCells.stream()
+                                   .allMatch(TableCell::isHeader);
         Page firstPage = tableCells.get(0).getFirstPage();
-        Rectangle2D bBox = tableCells.stream().map(TableCell::getBBox).reduce((map1, map2) -> {
-            map1.forEach((page, rectangle) -> map2.merge(page, rectangle, (rect1, rect2) -> rect1.createUnion(rect2).getBounds2D()));
-            return map2;
-        }).orElseThrow().get(firstPage);
-        List<TextBlock> textBlocks = tableCells.stream().map(TableCell::getTextBlock).toList();
-        return new RowData(header, textBlocks.stream().map(textBlock -> TaasDocumentDataMapper.fromTextBlock("table_cell", textBlock)).toList(), toFloatArray(bBox));
+        Rectangle2D bBox = tableCells.stream()
+                                     .map(TableCell::getBBox)
+                                     .reduce((map1, map2) -> {
+                                         map1.forEach((page, rectangle) -> map2.merge(page, rectangle, (rect1, rect2) -> rect1.createUnion(rect2).getBounds2D()));
+                                         return map2;
+                                     })
+                                     .orElseThrow().get(firstPage);
+        List<TextBlock> textBlocks = tableCells.stream()
+                                               .map(TableCell::getTextBlock)
+                                               .toList();
+        return new RowData(header,
+                           textBlocks.stream()
+                                     .map(textBlock -> TaasDocumentDataMapper.fromTextBlock("table_cell", textBlock))
+                                     .toList(),
+                           toFloatArray(bBox));
     }

@@ -90,7 +105,9 @@ public class TaasDocumentDataMapper {
     private static List<Range> toRange(List<TextRange> textRange) {
-        return textRange.stream().map(TaasDocumentDataMapper::toRange).toList();
+        return textRange.stream()
+                        .map(TaasDocumentDataMapper::toRange)
+                        .toList();
     }

@@ -102,7 +119,7 @@ public class TaasDocumentDataMapper {
                              .treeId(semanticNode.getTreeId())
                              .structureObjectNumber(structureObjectNumber)
                              .boundingBox(toFloatArray(bBox))
-                             .stringOffset(semanticNode.getBoundary().start())
+                             .stringOffset(semanticNode.getTextRange().start())
                              .page(page.getNumber())
                              .paragraph(TaasDocumentDataMapper.fromTextBlock(semanticNode.getType().toString().toLowerCase(Locale.ROOT), semanticNode.getTextBlock()))
                              .table(null)

@@ -118,7 +135,7 @@ public class TaasDocumentDataMapper {
                              .treeId(table.getTreeId())
                              .structureObjectNumber(structureObjectNumber)
                              .boundingBox(toFloatArray(bBox))
-                             .stringOffset(table.getBoundary().start())
+                             .stringOffset(table.getTextRange().start())
                              .page(page.getNumber())
                              .paragraph(null)
                              .table(TaasDocumentDataMapper.fromTable(table))

View File

@@ -5,11 +5,13 @@ import java.util.List;
 import org.springframework.stereotype.Service;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.Headline;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.Image;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.Paragraph;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.Table;
+import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
+import com.knecon.fforesight.service.layoutparser.processor.model.DocumentWithVisualization;
 import com.knecon.fforesight.service.layoutparser.processor.services.mapper.OutlineMapper;
 import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutGrid;
 import com.knecon.fforesight.service.viewerdoc.model.Outline;

@@ -31,35 +33,50 @@ public class LayoutGridService {
     @SneakyThrows
     @Observed(name = "ViewerDocumentService", contextualName = "create-viewer-document")
-    public void addLayoutGrid(File originFile, Document document, File destinationFile, boolean layerVisibilityDefaultValue) {
-        LayoutGrid layoutGrid = createLayoutGrid(document);
-        Outline outline = OutlineMapper.createOutline(document);
+    public void addLayoutGrid(File originFile,
+                              DocumentWithVisualization document,
+                              File destinationFile,
+                              LayoutParsingType layoutParsingType,
+                              String layoutParserVersion,
+                              boolean layerVisibilityDefaultValue) {
+        String layoutParsingTypeName = layoutParsingType.name();
+        LayoutGrid layoutGrid = createLayoutGrid(document.document(), layoutParserVersion, layoutParsingTypeName);
+        Outline outline = OutlineMapper.createOutline(document.document());
         layoutGrid.setVisibleByDefault(layerVisibilityDefaultValue);
-        document.getLayoutDebugLayer().addSentenceVisualization(document.getTextBlock());
-        if (document.getLayoutDebugLayer().isActive()) {
-            viewerDocumentService.addLayerGroups(originFile, destinationFile, List.of(layoutGrid, document.getLayoutDebugLayer()), outline);
+        document.layoutDebugLayer().addSentenceVisualization(document.document().getTextBlock());
+        document.layoutDebugLayer().addOutlineHeadlines(document.document());
+        if (document.layoutDebugLayer().isActive()) {
+            viewerDocumentService.addLayerGroups(originFile,
+                                                 destinationFile,
+                                                 List.of(layoutGrid, document.layoutDebugLayer()),
+                                                 layoutParserVersion,
+                                                 layoutParsingTypeName,
+                                                 outline);
         } else {
-            viewerDocumentService.addLayerGroups(originFile, destinationFile, List.of(layoutGrid), outline);
+            viewerDocumentService.addLayerGroups(originFile, destinationFile, List.of(layoutGrid), layoutParserVersion, layoutParsingTypeName, outline);
        }
     }
-    private LayoutGrid createLayoutGrid(Document document) {
+    private LayoutGrid createLayoutGrid(Document document, String layoutParserVersion, String layoutParsingType) {
         LayoutGrid layoutGrid = new LayoutGrid();
+        layoutGrid.addVersionAndLayoutParsingType(layoutParserVersion, layoutParsingType, document.getFirstPage());
         document.streamAllSubNodes()
                 .peek(layoutGrid::addTreeId)
                 .forEach(semanticNode -> {
                     switch (semanticNode.getType()) {
-                        case SECTION, SUPER_SECTION -> layoutGrid.addSection(semanticNode);
+                        case SECTION, SUPER_SECTION, TABLE_OF_CONTENTS -> layoutGrid.addSection(semanticNode);
                         case HEADLINE -> layoutGrid.addHeadline((Headline) semanticNode);
                         case PARAGRAPH -> layoutGrid.addParagraph((Paragraph) semanticNode);
                         case TABLE -> layoutGrid.addTable((Table) semanticNode);
                         case IMAGE -> layoutGrid.addImage((Image) semanticNode);
                         case HEADER, FOOTER -> layoutGrid.addHeaderOrFooter(semanticNode);
+                        case TABLE_OF_CONTENTS_ITEM -> layoutGrid.addTableOfContentsItem(semanticNode);
                     }
                 });
         return layoutGrid;

View File

@@ -7,7 +7,7 @@ import java.util.Map;
 import java.util.Set;
 import java.util.stream.Collectors;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page;
 import lombok.experimental.UtilityClass;

View File

@@ -9,7 +9,7 @@ import java.util.stream.Collectors;
 import com.google.common.hash.HashFunction;
 import com.google.common.hash.Hashing;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page;
 import lombok.experimental.UtilityClass;

@@ -21,14 +21,18 @@ public final class IdBuilder {
     public String buildId(Set<Page> pages, List<Rectangle2D> rectanglesPerLine) {
-        return buildId(pages.stream().map(Page::getNumber).collect(Collectors.toList()), rectanglesPerLine);
+        return buildId(pages.stream()
+                            .map(Page::getNumber)
+                            .collect(Collectors.toList()), rectanglesPerLine);
     }
     public String buildId(List<Integer> pageNumbers, List<Rectangle2D> rectanglesPerLine) {
         StringBuilder sb = new StringBuilder();
-        List<Integer> sortedPageNumbers = pageNumbers.stream().sorted(Comparator.comparingInt(Integer::intValue)).toList();
+        List<Integer> sortedPageNumbers = pageNumbers.stream()
+                                                     .sorted(Comparator.comparingInt(Integer::intValue))
+                                                     .toList();
         sortedPageNumbers.forEach(sb::append);
         rectanglesPerLine.forEach(rectangle2D -> sb.append(Math.round(rectangle2D.getX()))
                                                    .append(Math.round(rectangle2D.getY()))

View File

@@ -5,7 +5,7 @@ import java.awt.geom.Rectangle2D;
 import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.pdmodel.common.PDRectangle;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page;
 public record PageInformation(Rectangle2D mediabox, int number, int rotationDegrees) {

View File

@@ -13,13 +13,12 @@ import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.pdmodel.PDPageContentStream;
 import org.apache.pdfbox.pdmodel.font.PDType1Font;
 import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
-import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
+import com.iqser.red.service.redaction.v1.server.model.document.DocumentTree;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.NodeType;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page;
+import com.iqser.red.service.redaction.v1.server.model.document.textblock.AtomicTextBlock;
+import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
 import lombok.SneakyThrows;
 import lombok.experimental.UtilityClass;

@@ -111,13 +110,12 @@ public class PdfVisualisationUtility {
         return DrawingOptions.builder().stroke(true).strokeColor(switch (entry.getType()) {
             case DOCUMENT -> Color.LIGHT_GRAY;
             case HEADER, FOOTER -> Color.GREEN;
-            case PARAGRAPH -> Color.BLUE;
-            case SUPER_SECTION, SECTION -> Color.BLACK;
+            case PARAGRAPH, TABLE_OF_CONTENTS_ITEM -> Color.BLUE;
+            case SUPER_SECTION, SECTION, TABLE_OF_CONTENTS -> Color.BLACK;
             case HEADLINE -> Color.RED;
             case TABLE -> Color.ORANGE;
             case TABLE_CELL -> Color.GRAY;
             case IMAGE -> Color.MAGENTA;
-            case UNRECOGNIZED -> Color.PINK;
         }).build();
     }

View File

@@ -20,7 +20,7 @@ import org.apache.pdfbox.pdmodel.common.PDRectangle;
 import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
 import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
+import com.iqser.red.service.redaction.v1.server.model.document.textblock.AtomicTextBlock;
 import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
 import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;


@@ -17,17 +17,23 @@ import java.util.concurrent.atomic.AtomicInteger;
 import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
 import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
+import com.iqser.red.service.redaction.v1.server.data.LayoutEngineProto;
+import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.LayoutEngine;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.NodeType;
+import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
 import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
 import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
 import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
 import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
 import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
 import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
 import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
 import com.knecon.fforesight.service.layoutparser.processor.model.text.ListIdentifier;
+import com.knecon.fforesight.service.layoutparser.processor.model.text.TextBlockOnPage;
 import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
+import com.knecon.fforesight.service.layoutparser.processor.services.classification.NumberWord;
 import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
 import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
 import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
@@ -293,7 +299,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
 }
-public void addTocPages(List<Word> numbers, int page) {
+public void addTocPages(List<NumberWord> numbers, int page) {
     if (!active) {
         return;
@@ -302,13 +308,10 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
     VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, this.tocPages);
     visualizationsOnPage.getColoredRectangles()
         .addAll(numbers.stream()
+            .map(NumberWord::word)
             .map(BoundingBox::getBBoxPdf)
             .map(line -> new ColoredRectangle(line, LINES_COLOR, LINE_WIDTH))
             .toList());
-    visualizationsOnPage.getColoredRectangles()
-        .add(new ColoredRectangle(numbers.stream()
-            .map(BoundingBox::getBBoxPdf)
-            .collect(RectangleTransformations.collectBBox()), LINES_COLOR, LINE_WIDTH));
 }
@@ -332,8 +335,10 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
 private void addOutlineObject(OutlineObject outlineObject, PageInformation pageInformation) {
+    if (!active) {
+        return;
+    }
     int rectSize = 5;
     Point2D point2D;
     if (outlineObject.getPoint().isPresent()) {
         point2D = outlineObject.getPoint().get();
@@ -357,10 +362,40 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
 public void addListIdentifiers(List<ListIdentifier> listIdentifiers) {
+    if (!active) {
+        return;
+    }
     for (ListIdentifier listIdentifier : listIdentifiers) {
         getOrCreateVisualizationsOnPage(listIdentifier.getPage(), this.listIdentifiers).getColoredRectangles()
             .add(new ColoredRectangle(listIdentifier.getWord().getBBoxPdf(), WORDS_COLOR, LINE_WIDTH));
     }
 }
+
+public void addTocBlocks(Set<TextBlockOnPage> blocksWithNumberInCluster) {
+    if (!active) {
+        return;
+    }
+    for (TextBlockOnPage textBlockOnPage : blocksWithNumberInCluster) {
+        getOrCreateVisualizationsOnPage(textBlockOnPage.page().getPageNumber(), this.tocBlocks).getColoredRectangles()
+            .add(new ColoredRectangle(textBlockOnPage.textBlock().getBBoxPdf(), TOC_COLOR, LINE_WIDTH));
+    }
+}
+
+public void addOutlineHeadlines(Document document) {
+    if (!active) {
+        return;
+    }
+    document.streamAllSubNodes()
+        .filter(node -> node.getType().equals(NodeType.HEADLINE))
+        .filter(node -> node.getEngines().contains(LayoutEngine.OUTLINE))
+        .forEach(headline -> headline.getBBox()
+            .forEach((page, bbox) -> getOrCreateVisualizationsOnPage(page.getNumber(), this.outlineHeadlines).getColoredRectangles()
+                .add(new ColoredRectangle(bbox, HEADLINE_COLOR, LINE_WIDTH))));
+}
 }
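The new debug-layer methods lean on two small carrier types whose definitions are not part of this diff. The sketch below shows plausible shapes inferred only from how they are called here (NumberWord::word, textBlockOnPage.page(), textBlockOnPage.textBlock()); the Word, Page, and TextBlock stubs and all component names are assumptions, not the project's actual classes:

import java.awt.geom.Rectangle2D;

public class DebugLayerCarrierSketch {

    // Stub standing in for the layout parser's Word model (assumed to expose a PDF-space bbox).
    record Word(String text, Rectangle2D bBoxPdf) {
        Rectangle2D getBBoxPdf() { return bBoxPdf; }
    }

    // Stub standing in for the document Page node (assumed to expose its page number).
    record Page(int pageNumber) {
        int getPageNumber() { return pageNumber; }
    }

    // Stub standing in for a text block with a PDF-space bbox.
    record TextBlock(Rectangle2D bBoxPdf) {
        Rectangle2D getBBoxPdf() { return bBoxPdf; }
    }

    // Hypothetical shape of NumberWord: a word paired with the page number parsed from it,
    // matching the .map(NumberWord::word) call in addTocPages.
    record NumberWord(Word word, int number) { }

    // Hypothetical shape of TextBlockOnPage: a text block paired with the page it sits on,
    // matching the page()/textBlock() accessors used in addTocBlocks.
    record TextBlockOnPage(Page page, TextBlock textBlock) { }

    public static void main(String[] args) {
        NumberWord nw = new NumberWord(new Word("12", new Rectangle2D.Double(0, 0, 10, 10)), 12);
        System.out.println(nw.word().getBBoxPdf()); // bbox of the TOC page-number word
    }
}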


@@ -1,6 +1,7 @@
 package com.knecon.fforesight.service.layoutparser.processor.visualization;
 import java.awt.Color;
+import java.awt.geom.AffineTransform;
 import java.awt.geom.Line2D;
 import java.awt.geom.Point2D;
 import java.awt.geom.Rectangle2D;
@@ -15,17 +16,19 @@ import java.util.Optional;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
-import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
-import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.DuplicatedParagraph;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.Headline;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.Image;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.NodeType;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.Paragraph;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNode;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.SuperSection;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.Table;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.TableCell;
+import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
+import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
+import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
 import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
 import com.knecon.fforesight.service.viewerdoc.layers.LayoutGridLayerConfig;
 import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
@@ -72,10 +75,12 @@ public class LayoutGrid extends LayoutGridLayerConfig {
 public void addHeadline(Headline headline) {
-    addAsRectangle(headline, headlines, HEADLINE_COLOR);
-    if (headline.getEngines().contains(LayoutEngine.OUTLINE)) {
-        addAsRectangle(headline, outlineHeadlines, HEADLINE_COLOR);
+    if (headline.getParent().getType().equals(NodeType.TABLE_OF_CONTENTS)) {
+        addAsRectangle(headline, toc, HEADLINE_COLOR);
+    } else {
+        addAsRectangle(headline, headlines, HEADLINE_COLOR);
     }
 }
@@ -124,8 +129,8 @@
 public void addSection(SemanticNode section) {
     Map<Page, Rectangle2D> bBoxMap = section.getBBox();
-    List<SemanticNode> subSections = section.streamAllSubNodesOfType(NodeTypeProto.NodeType.SECTION)
+    Color color = section.getType().equals(NodeType.TABLE_OF_CONTENTS) ? TOC_COLOR : SECTION_COLOR;
+    List<SemanticNode> subSections = section.streamAllSubNodesOfType(NodeType.SECTION)
         .toList();
     Integer maxChildDepth = subSections.stream()
         .map(node -> node.getTreeId().size())
@@ -137,7 +142,7 @@
     String treeIdString = buildTreeIdString(section);
     if (bBoxMap.values().size() == 1) {
-        handleSinglePage(section, firstPage, bBoxMap.get(firstPage), treeIdString, maxChildDepth, ownDepth);
+        handleSinglePage(section, firstPage, bBoxMap.get(firstPage), treeIdString, maxChildDepth, ownDepth, color);
         return;
     }
     List<Page> pagesInOrder = bBoxMap.keySet()
@@ -145,12 +150,12 @@
         .sorted(Comparator.comparingInt(Page::getNumber))
         .collect(Collectors.toList());
     pagesInOrder.remove(0);
-    handleFirstPageOfSection(section, firstPage, bBoxMap.get(firstPage), treeIdString, maxChildDepth, ownDepth);
+    handleFirstPageOfSection(section, firstPage, bBoxMap.get(firstPage), treeIdString, maxChildDepth, ownDepth, color);
     for (Page middlePage : pagesInOrder.subList(0, pagesInOrder.size() - 1)) {
-        handleForMiddlePageOfSection(section, middlePage, bBoxMap.get(middlePage), treeIdString, maxChildDepth, ownDepth);
+        handleForMiddlePageOfSection(section, middlePage, bBoxMap.get(middlePage), treeIdString, maxChildDepth, ownDepth, color);
     }
     var lastPage = pagesInOrder.remove(pagesInOrder.size() - 1);
-    handleLastPageOfSection(section, lastPage, bBoxMap.get(lastPage), treeIdString, maxChildDepth, ownDepth);
+    handleLastPageOfSection(section, lastPage, bBoxMap.get(lastPage), treeIdString, maxChildDepth, ownDepth, color);
 }
@@ -232,33 +237,45 @@
 }
-private void handleSinglePage(SemanticNode semanticNode, Page page, Rectangle2D rectangle2D, String treeIdString, Integer maxChildDepth, Integer ownDepth) {
+private void handleSinglePage(SemanticNode semanticNode, Page page, Rectangle2D rectangle2D, String treeIdString, Integer maxChildDepth, Integer ownDepth, Color color) {
     RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, page, rectangle2D, treeIdString, maxChildDepth, ownDepth);
     // add string to top line
     var firstLine = result.pageLines().remove(0);
-    result.coloredLines().add(new ColoredLine(firstLine, SECTION_COLOR, LINE_WIDTH));
+    result.coloredLines().add(new ColoredLine(firstLine, color, LINE_WIDTH));
     for (Line2D line : result.pageLines()) {
-        result.coloredLines().add(new ColoredLine(line, SECTION_COLOR, LINE_WIDTH));
+        result.coloredLines().add(new ColoredLine(line, color, LINE_WIDTH));
     }
 }
-private void handleFirstPageOfSection(SemanticNode semanticNode, Page firstPage, Rectangle2D rectangle2D, String treeIdString, Integer maxChildDepth, Integer ownDepth) {
+private void handleFirstPageOfSection(SemanticNode semanticNode,
+                                      Page firstPage,
+                                      Rectangle2D rectangle2D,
+                                      String treeIdString,
+                                      Integer maxChildDepth,
+                                      Integer ownDepth,
+                                      Color color) {
     RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, firstPage, rectangle2D, treeIdString, maxChildDepth, ownDepth);
     // remove bottom line
     result.pageLines().remove(2);
     // add string to top line
     var firstLine = result.pageLines().remove(0);
-    result.coloredLines().add(new ColoredLine(firstLine, SECTION_COLOR, LINE_WIDTH));
+    result.coloredLines().add(new ColoredLine(firstLine, color, LINE_WIDTH));
     for (Line2D line : result.pageLines()) {
-        result.coloredLines().add(new ColoredLine(line, SECTION_COLOR, LINE_WIDTH));
+        result.coloredLines().add(new ColoredLine(line, color, LINE_WIDTH));
     }
 }
-private void handleForMiddlePageOfSection(SemanticNode semanticNode, Page middlePage, Rectangle2D rectangle2D, String treeIdString, Integer maxChildDepth, Integer ownDepth) {
+private void handleForMiddlePageOfSection(SemanticNode semanticNode,
+                                          Page middlePage,
+                                          Rectangle2D rectangle2D,
+                                          String treeIdString,
+                                          Integer maxChildDepth,
+                                          Integer ownDepth,
+                                          Color color) {
     RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, middlePage, rectangle2D, treeIdString, maxChildDepth, ownDepth);
     // remove top line
@@ -267,23 +284,29 @@
     result.pageLines().remove(1);
     // add string to left line
     var leftLine = result.pageLines().remove(1);
-    result.coloredLines().add(new ColoredLine(leftLine, SECTION_COLOR, LINE_WIDTH));
+    result.coloredLines().add(new ColoredLine(leftLine, color, LINE_WIDTH));
     for (Line2D line : result.pageLines()) {
-        result.coloredLines().add(new ColoredLine(line, SECTION_COLOR, LINE_WIDTH));
+        result.coloredLines().add(new ColoredLine(line, color, LINE_WIDTH));
     }
 }
-private void handleLastPageOfSection(SemanticNode semanticNode, Page lastPage, Rectangle2D rectangle2D, String treeIdString, Integer maxChildDepth, Integer ownDepth) {
+private void handleLastPageOfSection(SemanticNode semanticNode,
+                                     Page lastPage,
+                                     Rectangle2D rectangle2D,
+                                     String treeIdString,
+                                     Integer maxChildDepth,
+                                     Integer ownDepth,
+                                     Color color) {
     RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, lastPage, rectangle2D, treeIdString, maxChildDepth, ownDepth);
     // remove top line
     result.pageLines().remove(0);
     // add string to left line
     var leftLine = result.pageLines().remove(2);
-    result.coloredLines().add(new ColoredLine(leftLine, SECTION_COLOR, LINE_WIDTH));
+    result.coloredLines().add(new ColoredLine(leftLine, color, LINE_WIDTH));
     for (Line2D line : result.pageLines()) {
-        result.coloredLines().add(new ColoredLine(line, SECTION_COLOR, LINE_WIDTH));
+        result.coloredLines().add(new ColoredLine(line, color, LINE_WIDTH));
     }
 }
@@ -295,14 +318,15 @@
                                       Integer maxChildDepth,
                                       Integer ownDepth) {
-    List<ColoredLine> coloredLines = getOrCreateVisualizationsOnPage(page.getNumber(), sections).getColoredLines();
+    Visualizations visualizations = semanticNode.getType().equals(NodeType.TABLE_OF_CONTENTS) ? toc : sections;
+    List<ColoredLine> coloredLines = getOrCreateVisualizationsOnPage(page.getNumber(), visualizations).getColoredLines();
     int lineWidthModifier = maxChildDepth - ownDepth;
     Rectangle2D r = RectangleTransformations.pad(semanticNode.getBBox()
         .get(page), LINE_WIDTH * (1 + lineWidthModifier), LINE_WIDTH * (1 + lineWidthModifier));
     SemanticNode highestParent = semanticNode.getHighestParent();
     Rectangle2D highestParentRect = rectangleMap.get(new RectangleIdentifier(highestParent.getTreeId(), page.getNumber()));
-    addPlacedText(page, rectangle2D, highestParentRect, treeIdString, maxChildDepth, sections, SECTION_COLOR);
+    addPlacedText(page, rectangle2D, highestParentRect, treeIdString, maxChildDepth, visualizations, SECTION_COLOR);
     var lastPageLines = createLinesFromRectangle(r, page.getRotation());
     if (semanticNode instanceof SuperSection) {
@@ -321,7 +345,7 @@
     for (Page page : table.getPages()) {
         Optional<Integer> optionalFirstRowOnPage = table.streamCol(0)
-            .filter(tableCell -> tableCell.isOnPage(page.getNumber()))
+            .filter(tableCell -> tableCell.onPage(page.getNumber()))
             .map(TableCell::getRow)
             .findFirst();
         if (optionalFirstRowOnPage.isEmpty()) {
@@ -365,7 +389,7 @@
 private static Stream<Rectangle2D> streamBBoxOfCellsOnPage(Stream<TableCell> table, Page page) {
-    return table.filter(tableCell -> tableCell.isOnPage(page.getNumber()))
+    return table.filter(tableCell -> tableCell.onPage(page.getNumber()))
         .map(TableCell::getBBox)
         .map(bBoxMap -> bBoxMap.get(page));
 }
@@ -384,6 +408,27 @@
 }
+
+public void addTableOfContentsItem(SemanticNode semanticNode) {
+    addAsRectangle(semanticNode, toc, PARAGRAPH_COLOR);
+}
+
+public void addVersionAndLayoutParsingType(String version, String layoutParsingType, Page page) {
+    PageInformation pageInformation = PageInformation.fromPage(page);
+    double startHeight = pageInformation.heightRot() - 5;
+    Point2D point1 = new Point2D.Double(0, startHeight);
+    Point2D point2 = new Point2D.Double(0, startHeight - FONT_SIZE * 1.5);
+    AffineTransform affineTransform = CoordinateTransforms.calculatePageCoordsToInitialUserSpaceCoords(pageInformation);
+    affineTransform.transform(point1, point1);
+    affineTransform.transform(point2, point2);
+    getOrCreateVisualizationsOnPage(page.getNumber(), this.versionAndType).getPlacedTexts()
+        .addAll(List.of(PlacedText.textFacingUp(String.valueOf(version), point1, FONT_SIZE, Color.BLACK, FONT),
+            PlacedText.textFacingUp(String.valueOf(layoutParsingType), point2, FONT_SIZE, Color.BLACK, FONT)));
+}
+
 private record RectangleAndLinesResult(List<ColoredLine> coloredLines, Rectangle2D rectangle, List<Line2D> pageLines) {
 }
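Aside on the in-place point transform used by addVersionAndLayoutParsingType above: java.awt.geom.AffineTransform.transform(ptSrc, ptDst) writes the mapped coordinates into the destination point, so passing the same object twice mutates it in place. A self-contained sketch of that standard API; the transform values below are arbitrary illustrations and unrelated to the project's CoordinateTransforms helper:

import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;

public class InPlaceTransformSketch {

    public static void main(String[] args) {
        // Arbitrary example transform: flip the y axis of an 842 pt tall page
        // (a common PDF-style page-to-user-space mapping).
        AffineTransform pageToUserSpace = new AffineTransform(1, 0, 0, -1, 0, 842);

        Point2D point = new Point2D.Double(0, 837); // 5 pt below the top edge in page coordinates
        pageToUserSpace.transform(point, point);    // source and destination are the same object

        System.out.println(point); // Point2D.Double[0.0, 5.0]
    }
}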


@@ -30,7 +30,7 @@ dependencies {
     implementation(project(":layoutparser-service-internal-api"))
     implementation("com.iqser.red.commons:storage-commons:2.50.0")
-    implementation("com.knecon.fforesight:tenant-commons:0.30.0")
+    implementation("com.knecon.fforesight:tenant-commons:0.31.0")
     implementation("com.knecon.fforesight:tracing-commons:0.5.0")
     implementation("com.knecon.fforesight:lifecycle-commons:0.6.0")
     implementation("org.springframework.boot:spring-boot-starter-actuator:${springBootStarterVersion}")
@@ -39,13 +39,15 @@ dependencies {
     implementation("org.springframework.cloud:spring-cloud-starter-openfeign:4.0.4")
     implementation("net.logstash.logback:logstash-logback-encoder:7.4")
     implementation("ch.qos.logback:logback-classic")
+    api("com.iqser.red.commons:metric-commons:2.3.0")
     implementation("com.pdftron:PDFNet:10.11.0")
     // for integration testing only
     testImplementation(project(":viewer-doc-processor"))
     testImplementation(project(":layoutparser-service-internal-api"))
-    testImplementation("com.google.protobuf:protobuf-java-util:4.27.1")
+    testImplementation("com.knecon.fforesight:document:${rootProject.extra.get("documentVersion")}")
     testImplementation("org.springframework.boot:spring-boot-starter-amqp:${springBootStarterVersion}")
     testImplementation("org.springframework.boot:spring-boot-starter-test:${springBootStarterVersion}")
