RED-9123: Improve performance of re-analysis (Spike)

This commit is contained in:
Maverick Studer 2024-10-07 12:28:10 +02:00
parent 31de229fa5
commit fe2ed1807e
54 changed files with 12164 additions and 192 deletions

View File

@ -7,4 +7,5 @@ description = "layoutparser-service-internal-api"
dependencies {
implementation("io.swagger.core.v3:swagger-annotations:2.2.15")
implementation("com.google.protobuf:protobuf-java-util:4.27.1")
}

View File

@ -2,6 +2,11 @@ package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
import java.io.Serializable;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPageProto.AllDocumentPages;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto.AllDocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureProto.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto.AllDocumentTextData;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
@ -13,16 +18,26 @@ import lombok.experimental.FieldDefaults;
@Builder
@AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
@Schema(description = "Object containing the complete document layout parsing information. It is split into 4 categories, structure, text, positions and pages: " + "The document tree structure of SemanticNodes such as Section, Paragraph, Headline, etc. " + "The text, which is stored as separate blocks of data. " + "The text positions, which are also stored as separate blocks. The Blocks are equal to the text blocks in length and order. " + "The page information.")
@Schema(description = "Object containing the complete document layout parsing information. It is split into 4 categories, structure, text, positions and pages: "
+ "The document tree structure of SemanticNodes such as Section, Paragraph, Headline, etc. "
+ "The text, which is stored as separate blocks of data. "
+ "The text positions, which are also stored as separate blocks. The Blocks are equal to the text blocks in length and order. "
+ "The page information.")
public class DocumentData implements Serializable {
@Schema(description = "Contains information about the document's pages.")
DocumentPage[] documentPages;
AllDocumentPages documentPages;
@Schema(description = "Contains information about the document's text.")
DocumentTextData[] documentTextData;
AllDocumentTextData documentTextData;
@Schema(description = "Contains information about the document's text positions.")
DocumentPositionData[] documentPositions;
AllDocumentPositionData documentPositions;
@Schema(description = "Contains information about the document's semantic structure.")
DocumentStructure documentStructure;
DocumentStructureWrapper documentStructureWrapper;
/**
 * Convenience accessor that unwraps the protobuf document structure held by
 * {@code documentStructureWrapper}.
 *
 * @return whatever {@code documentStructureWrapper.getDocumentStructure()} returns
 */
public DocumentStructure getDocumentStructure() {
    final DocumentStructureWrapper wrapper = documentStructureWrapper;
    return wrapper.getDocumentStructure();
}
}

View File

@ -10,6 +10,7 @@ import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
@Deprecated
@Data
@Builder
@NoArgsConstructor

View File

@ -10,6 +10,7 @@ import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
@Deprecated
@Data
@Builder
@NoArgsConstructor

View File

@ -16,6 +16,7 @@ import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
@Deprecated
@Data
@Builder
@NoArgsConstructor

View File

@ -0,0 +1,799 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
// Generated by the protocol buffer compiler. DO NOT EDIT!
// NO CHECKED-IN PROTOBUF GENCODE
// source: DocumentStructure.proto
// Protobuf Java Version: 4.27.1
@SuppressWarnings("all")
public final class DocumentStructureProto {
private DocumentStructureProto() {}
static {
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
/* major= */ 4,
/* minor= */ 27,
/* patch= */ 1,
/* suffix= */ "", DocumentStructureProto.class.getName());
}
public static void registerAllExtensions(com.google.protobuf.ExtensionRegistryLite registry) {
}
public static void registerAllExtensions(com.google.protobuf.ExtensionRegistry registry) {
registerAllExtensions((com.google.protobuf.ExtensionRegistryLite) registry);
}
public interface DocumentStructureOrBuilder extends
// @@protoc_insertion_point(interface_extends:DocumentStructure)
com.google.protobuf.MessageOrBuilder {
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
*
* @return Whether the root field is set.
*/
boolean hasRoot();
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
*
* @return The root.
*/
EntryDataProto.EntryData getRoot();
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
*/
EntryDataProto.EntryDataOrBuilder getRootOrBuilder();
}
/**
* Protobuf type {@code DocumentStructure}
*/
public static final class DocumentStructure extends com.google.protobuf.GeneratedMessage implements
// @@protoc_insertion_point(message_implements:DocumentStructure)
DocumentStructureOrBuilder {
private static final long serialVersionUID = 0L;
static {
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
/* major= */ 4,
/* minor= */ 27,
/* patch= */ 1,
/* suffix= */ "", DocumentStructure.class.getName());
}
// Use DocumentStructure.newBuilder() to construct.
private DocumentStructure(com.google.protobuf.GeneratedMessage.Builder<?> builder) {
super(builder);
}
private DocumentStructure() {
}
public static final com.google.protobuf.Descriptors.Descriptor getDescriptor() {
return DocumentStructureProto.internal_static_DocumentStructure_descriptor;
}
@Override
protected FieldAccessorTable internalGetFieldAccessorTable() {
return DocumentStructureProto.internal_static_DocumentStructure_fieldAccessorTable.ensureFieldAccessorsInitialized(DocumentStructure.class, Builder.class);
}
private int bitField0_;
public static final int ROOT_FIELD_NUMBER = 1;
private EntryDataProto.EntryData root_;
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
*
* @return Whether the root field is set.
*/
@Override
public boolean hasRoot() {
return ((bitField0_ & 0x00000001) != 0);
}
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
*
* @return The root.
*/
@Override
public EntryDataProto.EntryData getRoot() {
return root_ == null ? EntryDataProto.EntryData.getDefaultInstance() : root_;
}
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
*/
@Override
public EntryDataProto.EntryDataOrBuilder getRootOrBuilder() {
return root_ == null ? EntryDataProto.EntryData.getDefaultInstance() : root_;
}
// Cached result of isInitialized(): -1 = not computed yet, 1 = initialized, 0 = not initialized.
private byte memoizedIsInitialized = -1;
/**
 * Reports whether this message is initialized, caching the answer in
 * {@code memoizedIsInitialized}.
 *
 * <p>This method performs no field checks of its own, so the first call always records and
 * returns {@code true}.
 *
 * @return the cached answer, or {@code true} on first use
 */
@Override
public final boolean isInitialized() {
byte isInitialized = memoizedIsInitialized;
if (isInitialized == 1) {
return true;
}
if (isInitialized == 0) {
return false;
}
memoizedIsInitialized = 1;
return true;
}
/**
 * Serializes this message to {@code output}: the {@code root} sub-message as field 1 when its
 * presence bit is set, followed by the unknown fields kept on this message.
 *
 * @param output stream to write to
 * @throws java.io.IOException propagated from the write calls on {@code output}
 */
@Override
public void writeTo(com.google.protobuf.CodedOutputStream output) throws java.io.IOException {
// Bit 0 of bitField0_ tracks presence of root; an unset root is not written at all.
if (((bitField0_ & 0x00000001) != 0)) {
output.writeMessage(1, getRoot());
}
getUnknownFields().writeTo(output);
}
/**
 * Returns the serialized size of this message, computing it once and caching it in
 * {@code memoizedSize} (a value of -1 means "not computed yet").
 *
 * <p>The size is the size of the {@code root} field (only when it is present) plus the size of
 * the unknown fields.
 *
 * @return the cached or freshly computed serialized size
 */
@Override
public int getSerializedSize() {
int size = memoizedSize;
if (size != -1) {
return size;
}
size = 0;
// Mirrors writeTo(): root contributes only when its presence bit is set.
if (((bitField0_ & 0x00000001) != 0)) {
size += com.google.protobuf.CodedOutputStream.computeMessageSize(1, getRoot());
}
size += getUnknownFields().getSerializedSize();
memoizedSize = size;
return size;
}
/**
 * Compares this message with {@code obj}.
 *
 * <p>Two {@code DocumentStructure} instances are equal when they agree on whether {@code root}
 * is set, their roots are equal (if set), and their unknown fields are equal. Any object that
 * is not a {@code DocumentStructure} is handed to {@code super.equals}.
 *
 * @param obj object to compare with
 * @return {@code true} if both messages are considered equal
 */
@Override
public boolean equals(final Object obj) {
if (obj == this) {
return true;
}
if (!(obj instanceof DocumentStructure)) {
return super.equals(obj);
}
DocumentStructure other = (DocumentStructure) obj;
// Presence must match before the values are compared.
if (hasRoot() != other.hasRoot()) {
return false;
}
if (hasRoot()) {
if (!getRoot().equals(other.getRoot())) {
return false;
}
}
if (!getUnknownFields().equals(other.getUnknownFields())) {
return false;
}
return true;
}
/**
 * Returns a hash code built from the message descriptor, the {@code root} field (only when it
 * is set) and the unknown fields.
 *
 * <p>The result is cached in {@code memoizedHashCode}; 0 is treated as "not computed yet", so
 * a hash that happens to be 0 is recomputed on every call.
 *
 * @return the cached or freshly computed hash code
 */
@Override
public int hashCode() {
if (memoizedHashCode != 0) {
return memoizedHashCode;
}
int hash = 41;
hash = (19 * hash) + getDescriptor().hashCode();
if (hasRoot()) {
hash = (37 * hash) + ROOT_FIELD_NUMBER;
hash = (53 * hash) + getRoot().hashCode();
}
hash = (29 * hash) + getUnknownFields().hashCode();
memoizedHashCode = hash;
return hash;
}
public static DocumentStructure parseFrom(java.nio.ByteBuffer data) throws com.google.protobuf.InvalidProtocolBufferException {
return PARSER.parseFrom(data);
}
public static DocumentStructure parseFrom(java.nio.ByteBuffer data,
com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws com.google.protobuf.InvalidProtocolBufferException {
return PARSER.parseFrom(data, extensionRegistry);
}
public static DocumentStructure parseFrom(com.google.protobuf.ByteString data) throws com.google.protobuf.InvalidProtocolBufferException {
return PARSER.parseFrom(data);
}
public static DocumentStructure parseFrom(com.google.protobuf.ByteString data,
com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws com.google.protobuf.InvalidProtocolBufferException {
return PARSER.parseFrom(data, extensionRegistry);
}
public static DocumentStructure parseFrom(byte[] data) throws com.google.protobuf.InvalidProtocolBufferException {
return PARSER.parseFrom(data);
}
public static DocumentStructure parseFrom(byte[] data,
com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws com.google.protobuf.InvalidProtocolBufferException {
return PARSER.parseFrom(data, extensionRegistry);
}
public static DocumentStructure parseFrom(java.io.InputStream input) throws java.io.IOException {
return com.google.protobuf.GeneratedMessage.parseWithIOException(PARSER, input);
}
public static DocumentStructure parseFrom(java.io.InputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws java.io.IOException {
return com.google.protobuf.GeneratedMessage.parseWithIOException(PARSER, input, extensionRegistry);
}
public static DocumentStructure parseDelimitedFrom(java.io.InputStream input) throws java.io.IOException {
return com.google.protobuf.GeneratedMessage.parseDelimitedWithIOException(PARSER, input);
}
public static DocumentStructure parseDelimitedFrom(java.io.InputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws java.io.IOException {
return com.google.protobuf.GeneratedMessage.parseDelimitedWithIOException(PARSER, input, extensionRegistry);
}
public static DocumentStructure parseFrom(com.google.protobuf.CodedInputStream input) throws java.io.IOException {
return com.google.protobuf.GeneratedMessage.parseWithIOException(PARSER, input);
}
public static DocumentStructure parseFrom(com.google.protobuf.CodedInputStream input,
com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws java.io.IOException {
return com.google.protobuf.GeneratedMessage.parseWithIOException(PARSER, input, extensionRegistry);
}
@Override
public Builder newBuilderForType() {return newBuilder();}
public static Builder newBuilder() {
return DEFAULT_INSTANCE.toBuilder();
}
public static Builder newBuilder(DocumentStructure prototype) {
return DEFAULT_INSTANCE.toBuilder().mergeFrom(prototype);
}
@Override
public Builder toBuilder() {
return this == DEFAULT_INSTANCE ? new Builder() : new Builder().mergeFrom(this);
}
@Override
protected Builder newBuilderForType(BuilderParent parent) {
Builder builder = new Builder(parent);
return builder;
}
/**
* Protobuf type {@code DocumentStructure}
*/
public static final class Builder extends com.google.protobuf.GeneratedMessage.Builder<Builder> implements
// @@protoc_insertion_point(builder_implements:DocumentStructure)
DocumentStructureOrBuilder {
public static final com.google.protobuf.Descriptors.Descriptor getDescriptor() {
return DocumentStructureProto.internal_static_DocumentStructure_descriptor;
}
@Override
protected FieldAccessorTable internalGetFieldAccessorTable() {
return DocumentStructureProto.internal_static_DocumentStructure_fieldAccessorTable.ensureFieldAccessorsInitialized(DocumentStructure.class, Builder.class);
}
// Construct using DocumentStructureOuterClass.DocumentStructure.newBuilder()
private Builder() {
maybeForceBuilderInitialization();
}
private Builder(BuilderParent parent) {
super(parent);
maybeForceBuilderInitialization();
}
private void maybeForceBuilderInitialization() {
if (com.google.protobuf.GeneratedMessage.alwaysUseFieldBuilders) {
getRootFieldBuilder();
}
}
@Override
public Builder clear() {
super.clear();
bitField0_ = 0;
root_ = null;
if (rootBuilder_ != null) {
rootBuilder_.dispose();
rootBuilder_ = null;
}
return this;
}
@Override
public com.google.protobuf.Descriptors.Descriptor getDescriptorForType() {
return DocumentStructureProto.internal_static_DocumentStructure_descriptor;
}
@Override
public DocumentStructure getDefaultInstanceForType() {
return DocumentStructure.getDefaultInstance();
}
@Override
public DocumentStructure build() {
DocumentStructure result = buildPartial();
if (!result.isInitialized()) {
throw newUninitializedMessageException(result);
}
return result;
}
@Override
public DocumentStructure buildPartial() {
DocumentStructure result = new DocumentStructure(this);
if (bitField0_ != 0) {
buildPartial0(result);
}
onBuilt();
return result;
}
private void buildPartial0(DocumentStructure result) {
int from_bitField0_ = bitField0_;
int to_bitField0_ = 0;
if (((from_bitField0_ & 0x00000001) != 0)) {
result.root_ = rootBuilder_ == null ? root_ : rootBuilder_.build();
to_bitField0_ |= 0x00000001;
}
result.bitField0_ |= to_bitField0_;
}
@Override
public Builder mergeFrom(com.google.protobuf.Message other) {
if (other instanceof DocumentStructure) {
return mergeFrom((DocumentStructure) other);
} else {
super.mergeFrom(other);
return this;
}
}
public Builder mergeFrom(DocumentStructure other) {
if (other == DocumentStructure.getDefaultInstance()) {
return this;
}
if (other.hasRoot()) {
mergeRoot(other.getRoot());
}
this.mergeUnknownFields(other.getUnknownFields());
onChanged();
return this;
}
@Override
public final boolean isInitialized() {
return true;
}
/**
 * Reads fields from {@code input} and merges them into this builder until tag 0 is read or the
 * unknown-field handler reports an end-group tag.
 *
 * @param input stream positioned at the start of a serialized message
 * @param extensionRegistry registry passed on to nested parsing; must not be {@code null}
 * @return this builder
 * @throws NullPointerException if {@code extensionRegistry} is {@code null}
 * @throws java.io.IOException if reading fails; an {@code InvalidProtocolBufferException} is
 *         rethrown via {@code unwrapIOException()}
 */
@Override
public Builder mergeFrom(com.google.protobuf.CodedInputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws java.io.IOException {
if (extensionRegistry == null) {
throw new NullPointerException();
}
try {
boolean done = false;
while (!done) {
int tag = input.readTag();
switch (tag) {
// Tag 0 ends the loop.
case 0:
done = true;
break;
// Tag 10 = (field number 1 << 3) | wire type 2 (length-delimited): the root sub-message.
case 10: {
input.readMessage(getRootFieldBuilder().getBuilder(), extensionRegistry);
bitField0_ |= 0x00000001;
break;
} // case 10
// Every other tag is delegated to the superclass's unknown-field handling.
default: {
if (!super.parseUnknownField(input, extensionRegistry, tag)) {
done = true; // was an endgroup tag
}
break;
} // default:
} // switch (tag)
} // while (!done)
} catch (com.google.protobuf.InvalidProtocolBufferException e) {
throw e.unwrapIOException();
} finally {
// Runs on success and on failure alike.
onChanged();
} // finally
return this;
}
private int bitField0_;
private EntryDataProto.EntryData root_;
private com.google.protobuf.SingleFieldBuilder<EntryDataProto.EntryData, EntryDataProto.EntryData.Builder, EntryDataProto.EntryDataOrBuilder> rootBuilder_;
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
*
* @return Whether the root field is set.
*/
public boolean hasRoot() {
return ((bitField0_ & 0x00000001) != 0);
}
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
*
* @return The root.
*/
public EntryDataProto.EntryData getRoot() {
if (rootBuilder_ == null) {
return root_ == null ? EntryDataProto.EntryData.getDefaultInstance() : root_;
} else {
return rootBuilder_.getMessage();
}
}
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
*/
public Builder setRoot(EntryDataProto.EntryData value) {
if (rootBuilder_ == null) {
if (value == null) {
throw new NullPointerException();
}
root_ = value;
} else {
rootBuilder_.setMessage(value);
}
bitField0_ |= 0x00000001;
onChanged();
return this;
}
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
*/
public Builder setRoot(EntryDataProto.EntryData.Builder builderForValue) {
if (rootBuilder_ == null) {
root_ = builderForValue.build();
} else {
rootBuilder_.setMessage(builderForValue.build());
}
bitField0_ |= 0x00000001;
onChanged();
return this;
}
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
*/
public Builder mergeRoot(EntryDataProto.EntryData value) {
if (rootBuilder_ == null) {
if (((bitField0_ & 0x00000001) != 0) && root_ != null && root_ != EntryDataProto.EntryData.getDefaultInstance()) {
getRootBuilder().mergeFrom(value);
} else {
root_ = value;
}
} else {
rootBuilder_.mergeFrom(value);
}
if (root_ != null) {
bitField0_ |= 0x00000001;
onChanged();
}
return this;
}
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
*/
public Builder clearRoot() {
bitField0_ = (bitField0_ & ~0x00000001);
root_ = null;
if (rootBuilder_ != null) {
rootBuilder_.dispose();
rootBuilder_ = null;
}
onChanged();
return this;
}
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
*/
public EntryDataProto.EntryData.Builder getRootBuilder() {
bitField0_ |= 0x00000001;
onChanged();
return getRootFieldBuilder().getBuilder();
}
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
*/
public EntryDataProto.EntryDataOrBuilder getRootOrBuilder() {
if (rootBuilder_ != null) {
return rootBuilder_.getMessageOrBuilder();
} else {
return root_ == null ? EntryDataProto.EntryData.getDefaultInstance() : root_;
}
}
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
*/
private com.google.protobuf.SingleFieldBuilder<EntryDataProto.EntryData, EntryDataProto.EntryData.Builder, EntryDataProto.EntryDataOrBuilder> getRootFieldBuilder() {
if (rootBuilder_ == null) {
rootBuilder_ = new com.google.protobuf.SingleFieldBuilder<EntryDataProto.EntryData, EntryDataProto.EntryData.Builder, EntryDataProto.EntryDataOrBuilder>(getRoot(),
getParentForChildren(),
isClean());
root_ = null;
}
return rootBuilder_;
}
// @@protoc_insertion_point(builder_scope:DocumentStructure)
}
// @@protoc_insertion_point(class_scope:DocumentStructure)
private static final DocumentStructure DEFAULT_INSTANCE;
static {
DEFAULT_INSTANCE = new DocumentStructure();
}
public static DocumentStructure getDefaultInstance() {
return DEFAULT_INSTANCE;
}
private static final com.google.protobuf.Parser<DocumentStructure> PARSER = new com.google.protobuf.AbstractParser<DocumentStructure>() {
@Override
public DocumentStructure parsePartialFrom(com.google.protobuf.CodedInputStream input,
com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws com.google.protobuf.InvalidProtocolBufferException {
Builder builder = newBuilder();
try {
builder.mergeFrom(input, extensionRegistry);
} catch (com.google.protobuf.InvalidProtocolBufferException e) {
throw e.setUnfinishedMessage(builder.buildPartial());
} catch (com.google.protobuf.UninitializedMessageException e) {
throw e.asInvalidProtocolBufferException().setUnfinishedMessage(builder.buildPartial());
} catch (java.io.IOException e) {
throw new com.google.protobuf.InvalidProtocolBufferException(e).setUnfinishedMessage(builder.buildPartial());
}
return builder.buildPartial();
}
};
public static com.google.protobuf.Parser<DocumentStructure> parser() {
return PARSER;
}
@Override
public com.google.protobuf.Parser<DocumentStructure> getParserForType() {
return PARSER;
}
@Override
public DocumentStructure getDefaultInstanceForType() {
return DEFAULT_INSTANCE;
}
}
private static final com.google.protobuf.Descriptors.Descriptor internal_static_DocumentStructure_descriptor;
private static final com.google.protobuf.GeneratedMessage.FieldAccessorTable internal_static_DocumentStructure_fieldAccessorTable;
public static com.google.protobuf.Descriptors.FileDescriptor getDescriptor() {
return descriptor;
}
private static com.google.protobuf.Descriptors.FileDescriptor descriptor;
static {
String[] descriptorData = {"\n\027DocumentStructure.proto\032\017EntryData.pro"
+ "to\"-\n\021DocumentStructure\022\030\n\004root\030\001 \001(\0132\n."
+ "EntryDatab\006proto3"};
descriptor = com.google.protobuf.Descriptors.FileDescriptor.internalBuildGeneratedFileFrom(descriptorData,
new com.google.protobuf.Descriptors.FileDescriptor[]{EntryDataProto.getDescriptor(),});
internal_static_DocumentStructure_descriptor = getDescriptor().getMessageTypes()
.get(0);
internal_static_DocumentStructure_fieldAccessorTable = new com.google.protobuf.GeneratedMessage.FieldAccessorTable(internal_static_DocumentStructure_descriptor,
new String[]{"Root",});
descriptor.resolveAllFeaturesImmutable();
EntryDataProto.getDescriptor();
}
// @@protoc_insertion_point(outer_class_scope)
}

View File

@ -0,0 +1,126 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureProto.DocumentStructure;
import java.awt.geom.Rectangle2D;
import java.io.ObjectStreamException;
import java.io.Serializable;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.EntryDataProto.EntryData;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.AllArgsConstructor;
import lombok.Getter;
@Getter
@AllArgsConstructor
public class DocumentStructureWrapper implements Serializable {
private final DocumentStructure documentStructure;
/**
 * Names of the extra entries a table carries in its properties field (see the schema
 * description below). The class only holds constants.
 */
@Schema(description = "Object containing the extra field names, a table has in its properties field.")
public static class TableProperties implements Serializable {
// Property key; presumably the table's row count — value format is not visible here.
public static final String NUMBER_OF_ROWS = "numberOfRows";
// Property key; presumably the table's column count — value format is not visible here.
public static final String NUMBER_OF_COLS = "numberOfCols";
}
/**
 * Names of the extra entries an image carries in its properties field (see the schema
 * description below). The class only holds constants; the value formats are not visible here.
 */
@Schema(description = "Object containing the extra field names, an Image has in its properties field.")
public static class ImageProperties implements Serializable {
public static final String TRANSPARENT = "transparent";
public static final String IMAGE_TYPE = "imageType";
// NOTE(review): presumably a rectangle string readable by parseRectangle2D — confirm against the writer.
public static final String POSITION = "position";
public static final String ID = "id";
// NOTE(review): presumably the input of parseRepresentationVector — confirm against the writer.
public static final String REPRESENTATION_HASH = "representationHash";
}
/**
 * Names of the extra entries a table cell carries in its properties field (see the schema
 * description below). The class only holds constants; the value formats are not visible here.
 */
@Schema(description = "Object containing the extra field names, a table cell has in its properties field.")
public static class TableCellProperties implements Serializable {
// NOTE(review): presumably a rectangle string readable by parseRectangle2D — confirm against the writer.
public static final String B_BOX = "bBox";
public static final String ROW = "row";
public static final String COL = "col";
// NOTE(review): presumably marks header cells — confirm the stored value type.
public static final String HEADER = "header";
}
/**
 * Names of the extra entries a duplicate paragraph carries in its properties field (see the
 * schema description below). The class only holds constants.
 */
@Schema(description = "Object containing the extra field names, a duplicate paragraph has in its properties field.")
public static class DuplicateParagraphProperties implements Serializable {
// Stored under the abbreviated key "utbid" (unsorted text block id).
public static final String UNSORTED_TEXTBLOCK_ID = "utbid";
}
/** Separator between the numeric components of a serialized rectangle. */
public static final String RECTANGLE_DELIMITER = ";";

/**
 * Parses a rectangle serialized as four {@link #RECTANGLE_DELIMITER}-separated floats, which
 * are passed in order to {@code Rectangle2D.Float(x, y, width, height)}.
 *
 * <p>Every component is parsed, so a malformed number is reported even when it appears after
 * the fourth position; components beyond the fourth are otherwise ignored.
 *
 * @param bBox serialized rectangle, e.g. {@code "10.0;20.0;100.0;50.0"}
 * @return the parsed rectangle
 * @throws NumberFormatException if any component is not a valid float
 * @throws IllegalArgumentException if fewer than four components are present
 * @throws NullPointerException if {@code bBox} is {@code null}
 */
public static Rectangle2D parseRectangle2D(String bBox) {
    String[] parts = bBox.split(RECTANGLE_DELIMITER);
    float[] values = new float[parts.length];
    for (int i = 0; i < parts.length; i++) {
        values[i] = Float.parseFloat(parts[i]);
    }
    // Fail with a descriptive message instead of an index error on truncated input.
    if (values.length < 4) {
        throw new IllegalArgumentException("Expected at least 4 '" + RECTANGLE_DELIMITER
                + "'-separated values (x, y, width, height) but found " + values.length + " in: " + bBox);
    }
    return new Rectangle2D.Float(values[0], values[1], values[2], values[3]);
}
/**
 * Parses a list of doubles separated by commas and/or whitespace.
 *
 * <p>Empty tokens are skipped, so leading separators or leading whitespace no longer cause a
 * {@link NumberFormatException}, and a blank string yields an empty array.
 *
 * @param representationHash the serialized vector, e.g. {@code "0.1, 0.2 0.3"}
 * @return the parsed values in input order; empty if the input contains no numbers
 * @throws NumberFormatException if a non-empty token is not a valid double
 * @throws NullPointerException if {@code representationHash} is {@code null}
 */
public static double[] parseRepresentationVector(String representationHash) {
    return Arrays.stream(representationHash.split("[,\\s]+"))
            // split() produces an empty first token when the input starts with a separator.
            .filter(token -> !token.isEmpty())
            .mapToDouble(Double::parseDouble)
            .toArray();
}
/**
 * Resolves the entry addressed by {@code tocId}, read as a path of child indices that is
 * followed from the root entry downwards.
 *
 * @param tocId child index to take at each level; an empty list addresses the root itself
 * @return the entry found at the end of the path
 * @throws IndexOutOfBoundsException if an index does not exist in the child list at its level
 */
public EntryData get(List<Integer> tocId) {
    EntryData current = documentStructure.getRoot();
    for (int childIndex : tocId) {
        current = current.getChildrenList().get(childIndex);
    }
    return current;
}
/**
 * Streams every entry of the document tree exactly once, depth-first with each parent ahead of
 * its children, starting at the root.
 *
 * <p>The previous implementation concatenated the root with the root's direct children and
 * flattened all of them. Flattening the root already yields the whole tree, so every non-root
 * entry was emitted twice.
 *
 * @return a stream over the root and all of its descendants
 */
public Stream<EntryData> streamAllEntries() {
    return flatten(documentStructure.getRoot());
}
/**
 * Renders the structure as the {@code toString()} output of every entry returned by
 * {@link #streamAllEntries()}, one entry per line.
 *
 * @return the newline-joined entry representations
 */
@Override
public String toString() {
    List<String> renderedEntries = streamAllEntries()
            .map(EntryData::toString)
            .toList();
    return String.join("\n", renderedEntries);
}
/**
 * Recursively flattens the subtree rooted at {@code entry}: the entry itself comes first,
 * followed by the flattened subtree of each child in child-list order.
 *
 * @param entry root of the subtree to flatten
 * @return a stream over {@code entry} and all of its descendants
 */
private static Stream<EntryData> flatten(EntryData entry) {
    Stream<EntryData> descendants = entry.getChildrenList()
            .stream()
            .flatMap(DocumentStructureWrapper::flatten);
    return Stream.concat(Stream.of(entry), descendants);
}
}

View File

@ -10,6 +10,7 @@ import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
@Deprecated
@Data
@Builder
@NoArgsConstructor

View File

@ -1,5 +1,6 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
@Deprecated
public enum LayoutEngine {
ALGORITHM,
AI,

View File

@ -0,0 +1,193 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
// Generated by the protocol buffer compiler. DO NOT EDIT!
// NO CHECKED-IN PROTOBUF GENCODE
// source: LayoutEngine.proto
// Protobuf Java Version: 4.27.1
@SuppressWarnings("all")
public final class LayoutEngineProto {
private LayoutEngineProto() {}
static {
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
/* major= */ 4,
/* minor= */ 27,
/* patch= */ 1,
/* suffix= */ "", LayoutEngineProto.class.getName());
}
public static void registerAllExtensions(com.google.protobuf.ExtensionRegistryLite registry) {
}
public static void registerAllExtensions(com.google.protobuf.ExtensionRegistry registry) {
registerAllExtensions((com.google.protobuf.ExtensionRegistryLite) registry);
}
/**
* Protobuf enum {@code LayoutEngine}
*/
public enum LayoutEngine implements com.google.protobuf.ProtocolMessageEnum {
/**
* <code>ALGORITHM = 0;</code>
*/
ALGORITHM(0),
/**
* <code>AI = 1;</code>
*/
AI(1),
/**
* <code>OUTLINE = 2;</code>
*/
OUTLINE(2),
UNRECOGNIZED(-1),
;
static {
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
/* major= */ 4,
/* minor= */ 27,
/* patch= */ 1,
/* suffix= */ "", LayoutEngine.class.getName());
}
/**
* <code>ALGORITHM = 0;</code>
*/
public static final int ALGORITHM_VALUE = 0;
/**
* <code>AI = 1;</code>
*/
public static final int AI_VALUE = 1;
/**
* <code>OUTLINE = 2;</code>
*/
public static final int OUTLINE_VALUE = 2;
/**
 * Returns the numeric value this constant was constructed with.
 *
 * @return the value stored in {@code value}
 * @throws IllegalArgumentException if called on {@code UNRECOGNIZED}
 */
public final int getNumber() {
if (this == UNRECOGNIZED) {
throw new IllegalArgumentException("Can't get the number of an unknown enum value.");
}
return value;
}
/**
* @param value The numeric wire value of the corresponding enum entry.
* @return The enum associated with the given numeric wire value.
* @deprecated Use {@link #forNumber(int)} instead.
*/
@Deprecated
public static LayoutEngine valueOf(int value) {
return forNumber(value);
}
/**
 * Maps a numeric wire value to its enum constant: 0 = ALGORITHM, 1 = AI, 2 = OUTLINE.
 *
 * @param value The numeric wire value of the corresponding enum entry.
 * @return The enum associated with the given numeric wire value, or {@code null} when the
 *         value matches none of the constants (this method never returns {@code UNRECOGNIZED}).
 */
public static LayoutEngine forNumber(int value) {
switch (value) {
case 0:
return ALGORITHM;
case 1:
return AI;
case 2:
return OUTLINE;
default:
return null;
}
}
public static com.google.protobuf.Internal.EnumLiteMap<LayoutEngine> internalGetValueMap() {
return internalValueMap;
}
private static final com.google.protobuf.Internal.EnumLiteMap<LayoutEngine> internalValueMap = new com.google.protobuf.Internal.EnumLiteMap<LayoutEngine>() {
public LayoutEngine findValueByNumber(int number) {
return LayoutEngine.forNumber(number);
}
};
/**
 * Returns the descriptor of this enum value.
 *
 * <p>The lookup indexes the descriptor's value list with {@code ordinal()}, so it depends on
 * the Java declaration order matching the order of the values in the descriptor.
 *
 * @return the descriptor at position {@code ordinal()} in {@code getDescriptor().getValues()}
 * @throws IllegalStateException if called on {@code UNRECOGNIZED}
 */
public final com.google.protobuf.Descriptors.EnumValueDescriptor getValueDescriptor() {
if (this == UNRECOGNIZED) {
throw new IllegalStateException("Can't get the descriptor of an unrecognized enum value.");
}
return getDescriptor().getValues()
.get(ordinal());
}
public final com.google.protobuf.Descriptors.EnumDescriptor getDescriptorForType() {
return getDescriptor();
}
public static final com.google.protobuf.Descriptors.EnumDescriptor getDescriptor() {
return LayoutEngineProto.getDescriptor().getEnumTypes()
.get(0);
}
// Snapshot of values() taken once, so the lookup below does not call values() each time.
private static final LayoutEngine[] VALUES = values();
/**
 * Resolves the enum constant that belongs to an enum value descriptor.
 *
 * @param desc descriptor of a value of this enum type
 * @return {@code UNRECOGNIZED} when the descriptor's index is -1, otherwise the constant at
 *         that index in {@code VALUES}
 * @throws IllegalArgumentException if {@code desc} belongs to a different enum type
 */
public static LayoutEngine valueOf(com.google.protobuf.Descriptors.EnumValueDescriptor desc) {
if (desc.getType() != getDescriptor()) {
throw new IllegalArgumentException("EnumValueDescriptor is not for this type.");
}
if (desc.getIndex() == -1) {
return UNRECOGNIZED;
}
return VALUES[desc.getIndex()];
}
private final int value;
private LayoutEngine(int value) {
this.value = value;
}
// @@protoc_insertion_point(enum_scope:LayoutEngine)
}
public static com.google.protobuf.Descriptors.FileDescriptor getDescriptor() {
return descriptor;
}
private static com.google.protobuf.Descriptors.FileDescriptor descriptor;
static {
String[] descriptorData = {"\n\022LayoutEngine.proto*2\n\014LayoutEngine\022\r\n\t" + "ALGORITHM\020\000\022\006\n\002AI\020\001\022\013\n\007OUTLINE\020\002b\006proto3"};
descriptor = com.google.protobuf.Descriptors.FileDescriptor.internalBuildGeneratedFileFrom(descriptorData, new com.google.protobuf.Descriptors.FileDescriptor[]{});
descriptor.resolveAllFeaturesImmutable();
}
// @@protoc_insertion_point(outer_class_scope)
}

View File

@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
import java.io.Serializable;
import java.util.Locale;
@Deprecated
public enum NodeType implements Serializable {
DOCUMENT,
SECTION,

View File

@ -0,0 +1,274 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
import java.util.Locale;
// Generated by the protocol buffer compiler. DO NOT EDIT!
// NO CHECKED-IN PROTOBUF GENCODE
// source: NodeType.proto
// Protobuf Java Version: 4.27.1
@SuppressWarnings("all")
public final class NodeTypeProto {
private NodeTypeProto() {}
static {
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
/* major= */ 4,
/* minor= */ 27,
/* patch= */ 1,
/* suffix= */ "", NodeTypeProto.class.getName());
}
public static void registerAllExtensions(com.google.protobuf.ExtensionRegistryLite registry) {
}
public static void registerAllExtensions(com.google.protobuf.ExtensionRegistry registry) {
registerAllExtensions((com.google.protobuf.ExtensionRegistryLite) registry);
}
/**
* Protobuf enum {@code NodeType}
*/
public enum NodeType implements com.google.protobuf.ProtocolMessageEnum {
/**
* <code>DOCUMENT = 0;</code>
*/
DOCUMENT(0),
/**
* <code>SECTION = 1;</code>
*/
SECTION(1),
/**
* <code>SUPER_SECTION = 2;</code>
*/
SUPER_SECTION(2),
/**
* <code>HEADLINE = 3;</code>
*/
HEADLINE(3),
/**
* <code>PARAGRAPH = 4;</code>
*/
PARAGRAPH(4),
/**
* <code>TABLE = 5;</code>
*/
TABLE(5),
/**
* <code>TABLE_CELL = 6;</code>
*/
TABLE_CELL(6),
/**
* <code>IMAGE = 7;</code>
*/
IMAGE(7),
/**
* <code>HEADER = 8;</code>
*/
HEADER(8),
/**
* <code>FOOTER = 9;</code>
*/
FOOTER(9),
UNRECOGNIZED(-1),
;
/**
 * Renders the constant name with its first character unchanged and the rest lower-cased using
 * {@link Locale#ROOT}; underscores are kept, e.g. {@code SUPER_SECTION} becomes
 * {@code "Super_section"}.
 *
 * <p>NOTE(review): this looks like a manual addition to a file marked "DO NOT EDIT" and would
 * presumably be lost when the class is regenerated — confirm.
 *
 * @return the capitalised form of {@code name()}
 */
@Override
public String toString() {
    final String constantName = name();
    final String loweredTail = constantName.substring(1).toLowerCase(Locale.ROOT);
    return constantName.charAt(0) + loweredTail;
}
static {
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
/* major= */ 4,
/* minor= */ 27,
/* patch= */ 1,
/* suffix= */ "", NodeType.class.getName());
}
/**
* <code>DOCUMENT = 0;</code>
*/
public static final int DOCUMENT_VALUE = 0;
/**
* <code>SECTION = 1;</code>
*/
public static final int SECTION_VALUE = 1;
/**
* <code>SUPER_SECTION = 2;</code>
*/
public static final int SUPER_SECTION_VALUE = 2;
/**
* <code>HEADLINE = 3;</code>
*/
public static final int HEADLINE_VALUE = 3;
/**
* <code>PARAGRAPH = 4;</code>
*/
public static final int PARAGRAPH_VALUE = 4;
/**
* <code>TABLE = 5;</code>
*/
public static final int TABLE_VALUE = 5;
/**
* <code>TABLE_CELL = 6;</code>
*/
public static final int TABLE_CELL_VALUE = 6;
/**
* <code>IMAGE = 7;</code>
*/
public static final int IMAGE_VALUE = 7;
/**
* <code>HEADER = 8;</code>
*/
public static final int HEADER_VALUE = 8;
/**
* <code>FOOTER = 9;</code>
*/
public static final int FOOTER_VALUE = 9;
public final int getNumber() {
if (this == UNRECOGNIZED) {
throw new IllegalArgumentException("Can't get the number of an unknown enum value.");
}
return value;
}
/**
* @param value The numeric wire value of the corresponding enum entry.
* @return The enum associated with the given numeric wire value.
* @deprecated Use {@link #forNumber(int)} instead.
*/
@Deprecated
public static NodeType valueOf(int value) {
return forNumber(value);
}
/**
* @param value The numeric wire value of the corresponding enum entry.
* @return The enum associated with the given numeric wire value.
*/
public static NodeType forNumber(int value) {
switch (value) {
case 0:
return DOCUMENT;
case 1:
return SECTION;
case 2:
return SUPER_SECTION;
case 3:
return HEADLINE;
case 4:
return PARAGRAPH;
case 5:
return TABLE;
case 6:
return TABLE_CELL;
case 7:
return IMAGE;
case 8:
return HEADER;
case 9:
return FOOTER;
default:
return null;
}
}
public static com.google.protobuf.Internal.EnumLiteMap<NodeType> internalGetValueMap() {
return internalValueMap;
}
private static final com.google.protobuf.Internal.EnumLiteMap<NodeType> internalValueMap = new com.google.protobuf.Internal.EnumLiteMap<NodeType>() {
public NodeType findValueByNumber(int number) {
return NodeType.forNumber(number);
}
};
public final com.google.protobuf.Descriptors.EnumValueDescriptor getValueDescriptor() {
if (this == UNRECOGNIZED) {
throw new IllegalStateException("Can't get the descriptor of an unrecognized enum value.");
}
return getDescriptor().getValues()
.get(ordinal());
}
public final com.google.protobuf.Descriptors.EnumDescriptor getDescriptorForType() {
return getDescriptor();
}
public static final com.google.protobuf.Descriptors.EnumDescriptor getDescriptor() {
return NodeTypeProto.getDescriptor().getEnumTypes()
.get(0);
}
private static final NodeType[] VALUES = values();
public static NodeType valueOf(com.google.protobuf.Descriptors.EnumValueDescriptor desc) {
if (desc.getType() != getDescriptor()) {
throw new IllegalArgumentException("EnumValueDescriptor is not for this type.");
}
if (desc.getIndex() == -1) {
return UNRECOGNIZED;
}
return VALUES[desc.getIndex()];
}
private final int value;
private NodeType(int value) {
this.value = value;
}
// @@protoc_insertion_point(enum_scope:NodeType)
}
public static com.google.protobuf.Descriptors.FileDescriptor getDescriptor() {
return descriptor;
}
private static com.google.protobuf.Descriptors.FileDescriptor descriptor;
static {
String[] descriptorData = {"\n\016NodeType.proto*\223\001\n\010NodeType\022\014\n\010DOCUMEN"
+ "T\020\000\022\013\n\007SECTION\020\001\022\021\n\rSUPER_SECTION\020\002\022\014\n\010H"
+ "EADLINE\020\003\022\r\n\tPARAGRAPH\020\004\022\t\n\005TABLE\020\005\022\016\n\nT"
+ "ABLE_CELL\020\006\022\t\n\005IMAGE\020\007\022\n\n\006HEADER\020\010\022\n\n\006FO"
+ "OTER\020\tb\006proto3"};
descriptor = com.google.protobuf.Descriptors.FileDescriptor.internalBuildGeneratedFileFrom(descriptorData, new com.google.protobuf.Descriptors.FileDescriptor[]{});
descriptor.resolveAllFeaturesImmutable();
}
// @@protoc_insertion_point(outer_class_scope)
}

View File

@ -0,0 +1,21 @@
syntax = "proto3";
// Top-level wrapper message that carries the list of DocumentPage entries.
message AllDocumentPages {
// The DocumentPage entries of the document.
repeated DocumentPage documentPages = 1;
}
// Describes a single page: its number, size and rotation.
message DocumentPage {
// The page number, starting with 1.
int32 number = 1;
// The page height in PDF user units.
int32 height = 2;
// The page width in PDF user units.
int32 width = 3;
// The page rotation as specified by the PDF.
int32 rotation = 4;
}

View File

@ -0,0 +1,25 @@
syntax = "proto3";
// Top-level wrapper message that carries the list of DocumentPositionData entries.
message AllDocumentPositionData {
// The position data entries; each entry refers to a text block through its id.
repeated DocumentPositionData documentPositionData = 1;
}
// Glyph position information belonging to one text block.
message DocumentPositionData {
// Identifier of the text block.
int64 id = 1;
// For each string coordinate in the search text of the text block, the array contains an entry relating the string coordinate to the position coordinate.
// This is required because the text and position coordinates are not equal.
repeated int32 stringIdxToPositionIdx = 2;
// The bounding box for each glyph as a rectangle: one Position entry per glyph in the text block.
// Each Position specifies the rectangle with the values x, y, width, height, with x, y specifying the lower left corner.
// In order to access this information, the stringIdxToPositionIdx array must be used to transform the coordinates.
repeated Position positions = 3;
// Definition of a BoundingBox that contains x, y, width, and height.
message Position {
// The rectangle values in the order x, y, width, height.
repeated float value = 1;
}
}

View File

@ -0,0 +1,8 @@
syntax = "proto3";
import "EntryData.proto";
// The document tree structure, stored as a single root EntryData with nested children.
message DocumentStructure {
// The root EntryData represents the Document.
EntryData root = 1;
}

View File

@ -0,0 +1,29 @@
syntax = "proto3";
// Top-level wrapper message that carries the list of DocumentTextData entries.
message AllDocumentTextData {
// The text block entries of the document.
repeated DocumentTextData documentTextData = 1;
}
// The text of one text block together with its location in the document text.
message DocumentTextData {
// Identifier of the text block.
int64 id = 1;
// The page the text block occurs on.
int64 page = 2;
// The text of the text block.
string searchText = 3;
// Each text block is assigned a number on a page, starting from 0.
int32 numberOnPage = 4;
// The text blocks are ordered; this number represents the start of the text block as a string offset.
int32 start = 5;
// The text blocks are ordered; this number represents the end of the text block as a string offset.
int32 end = 6;
// The line breaks in the text of this semantic node, given as string offsets. Each offset is an exclusive end.
// At the end of each semantic node there is an implicit line break.
repeated int32 lineBreaks = 7;
}

View File

@ -0,0 +1,27 @@
syntax = "proto3";
import "LayoutEngine.proto";
import "NodeType.proto";
// One semantic node of the document tree; the tree is formed by nesting EntryData in `children`.
message EntryData {
// Type of the semantic node.
NodeType type = 1;
// Specifies the position in the parsed tree structure.
repeated int32 treeId = 2;
// Specifies the text block IDs associated with this semantic node.
repeated int64 atomicBlockIds = 3;
// Specifies the pages this semantic node appears on.
repeated int64 pageNumbers = 4;
// Some semantic nodes have additional information; this information is stored in this map.
map<string, string> properties = 5;
// All child entries of this entry.
repeated EntryData children = 6;
// Describes the origin of the semantic node.
repeated LayoutEngine engines = 7;
}

View File

@ -0,0 +1,7 @@
syntax = "proto3";
// Origin of a semantic node (used by EntryData.engines).
// ALGORITHM is the zero value and therefore the proto3 default.
enum LayoutEngine {
ALGORITHM = 0;
AI = 1;
OUTLINE = 2;
}

View File

@ -0,0 +1,14 @@
syntax = "proto3";
// Type of a semantic node (used by EntryData.type).
// DOCUMENT is the zero value and therefore the proto3 default.
// The numbers are part of the serialized format and are mirrored in the generated NodeTypeProto class.
enum NodeType {
DOCUMENT = 0;
SECTION = 1;
SUPER_SECTION = 2;
HEADLINE = 3;
PARAGRAPH = 4;
TABLE = 5;
TABLE_CELL = 6;
IMAGE = 7;
HEADER = 8;
FOOTER = 9;
}

View File

@ -16,8 +16,10 @@ dependencies {
exclude("org.springframework.boot", "spring-boot-starter-security")
exclude("org.springframework.boot", "spring-boot-starter-validation")
}
implementation("com.knecon.fforesight:tenant-commons:0.30.0")
implementation("com.iqser.red.commons:storage-commons:2.45.0")
implementation("com.knecon.fforesight:tenant-commons:0.30.0") {
exclude("com.iqser.red.commons", "storage-commons")
}
implementation("com.iqser.red.commons:storage-commons:2.50.0")
implementation("org.apache.pdfbox:pdfbox:${pdfBoxVersion}")
implementation("org.apache.pdfbox:pdfbox-tools:${pdfBoxVersion}")
@ -34,4 +36,5 @@ dependencies {
implementation("com.pdftron:PDFNet:10.11.0")
implementation("org.apache.commons:commons-text:1.12.0")
implementation("com.google.protobuf:protobuf-java-util:4.27.1")
}

View File

@ -20,7 +20,7 @@ import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;

View File

@ -11,7 +11,9 @@ import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.Optional;
import java.util.concurrent.CompletableFuture;
import org.springframework.core.task.TaskExecutor;
import org.springframework.stereotype.Service;
import com.fasterxml.jackson.databind.ObjectMapper;
@ -39,6 +41,8 @@ public class LayoutParsingStorageService {
private final StorageService storageService;
private final ObjectMapper objectMapper;
private final TaskExecutor taskExecutor;
@Observed(name = "LayoutParsingStorageService", contextualName = "get-origin-file")
public File getOriginFile(String storageId) throws IOException {
@ -100,13 +104,35 @@ public class LayoutParsingStorageService {
}
/**
 * Stores the four parts of the parsed document (structure, text blocks, position blocks, pages)
 * under the storage ids given by the request.
 * <p>
 * The proto objects are written concurrently on {@code taskExecutor}; this method blocks until all
 * four writes have finished. If any of them fails, {@code join()} rethrows the failure wrapped in a
 * {@link java.util.concurrent.CompletionException}.
 *
 * @param layoutParsingRequest supplies the storage id for each of the four parts.
 * @param documentData         the parsed document whose parts are stored.
 */
@SneakyThrows
@Observed(name = "LayoutParsingStorageService", contextualName = "store-document-data")
public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, DocumentData documentData) {

    // Resolve the tenant once, on the calling thread. The async tasks below run on taskExecutor
    // threads, where a thread-bound tenant context would not be visible.
    // NOTE(review): TenantContext is presumably ThreadLocal-backed; capturing the id here is
    // correct either way.
    var tenantId = TenantContext.getTenantId();

    storageService.storeJSONObject(tenantId, layoutParsingRequest.structureFileStorageId(), documentData.getDocumentStructure());
    storageService.storeJSONObject(tenantId, layoutParsingRequest.textBlockFileStorageId(), documentData.getDocumentTextData());
    storageService.storeJSONObject(tenantId, layoutParsingRequest.positionBlockFileStorageId(), documentData.getDocumentPositions());
    storageService.storeJSONObject(tenantId, layoutParsingRequest.pageFileStorageId(), documentData.getDocumentPages());

    Runnable storeDocumentStructureRunnable = () -> storageService.storeProtoObject(tenantId,
            layoutParsingRequest.structureFileStorageId(),
            documentData.getDocumentStructure());
    CompletableFuture<Void> storeDocumentStructureFuture = CompletableFuture.runAsync(storeDocumentStructureRunnable, taskExecutor);

    Runnable storeDocumentTextDataRunnable = () -> storageService.storeProtoObject(tenantId,
            layoutParsingRequest.textBlockFileStorageId(),
            documentData.getDocumentTextData());
    CompletableFuture<Void> storeDocumentTextDataFuture = CompletableFuture.runAsync(storeDocumentTextDataRunnable, taskExecutor);

    Runnable storeDocumentPositionsRunnable = () -> storageService.storeProtoObject(tenantId,
            layoutParsingRequest.positionBlockFileStorageId(),
            documentData.getDocumentPositions());
    CompletableFuture<Void> storeDocumentPositionsFuture = CompletableFuture.runAsync(storeDocumentPositionsRunnable, taskExecutor);

    Runnable storeDocumentPagesRunnable = () -> storageService.storeProtoObject(tenantId,
            layoutParsingRequest.pageFileStorageId(),
            documentData.getDocumentPages());
    CompletableFuture<Void> storeDocumentPagesFuture = CompletableFuture.runAsync(storeDocumentPagesRunnable, taskExecutor);

    // Wait for all four writes so callers only continue once the data is stored.
    CompletableFuture.allOf(storeDocumentStructureFuture, storeDocumentTextDataFuture, storeDocumentPositionsFuture, storeDocumentPagesFuture).join();
}

View File

@ -4,7 +4,7 @@ import java.util.HashSet;
import java.util.Set;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;

View File

@ -8,7 +8,7 @@ import java.util.LinkedList;
import java.util.List;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;

View File

@ -6,7 +6,7 @@ import java.util.List;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.TextEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;

View File

@ -9,7 +9,7 @@ import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;

View File

@ -1,6 +1,6 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;

View File

@ -1,6 +1,6 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;

View File

@ -1,6 +1,6 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;

View File

@ -6,7 +6,7 @@ import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;

View File

@ -1,6 +1,6 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;

View File

@ -1,6 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
import lombok.AccessLevel;

View File

@ -12,8 +12,8 @@ import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.EntityType;

View File

@ -1,6 +1,6 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
import lombok.AccessLevel;

View File

@ -12,8 +12,8 @@ import java.util.Set;
import java.util.stream.IntStream;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.TextEntity;

View File

@ -4,7 +4,7 @@ import java.awt.geom.Rectangle2D;
import java.util.HashMap;
import java.util.Map;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;

View File

@ -1,5 +1,6 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock;
import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto.DocumentTextData;
import static java.lang.String.format;
import java.awt.geom.Rectangle2D;
@ -13,8 +14,8 @@ import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto.DocumentPositionData.Position;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
@ -153,11 +154,9 @@ public class AtomicTextBlock implements TextBlock {
.page(page)
.textRange(new TextRange(documentTextData.getStart(), documentTextData.getEnd()))
.searchText(documentTextData.getSearchText())
.lineBreaks(Arrays.stream(documentTextData.getLineBreaks()).boxed()
.toList())
.stringIdxToPositionIdx(Arrays.stream(documentPositionData.getStringIdxToPositionIdx()).boxed()
.toList())
.positions(toRectangle2DList(documentPositionData.getPositions()))
.lineBreaks(documentTextData.getLineBreaksList())
.stringIdxToPositionIdx(documentPositionData.getStringIdxToPositionIdxList())
.positions(toRectangle2DList(documentPositionData.getPositionsList()))
.parent(parent)
.build();
}
@ -171,6 +170,14 @@ public class AtomicTextBlock implements TextBlock {
}
/**
 * Converts serialized positions into rectangles.
 * <p>
 * Values 0..3 of each {@code Position} are read as x, y, width and height of a
 * {@link Rectangle2D.Float}; a position with fewer than four values makes {@code getValue} fail.
 *
 * @param positions the serialized positions to convert.
 * @return the rectangles in the same order, as returned by {@code Stream.toList()}.
 */
private static List<Rectangle2D> toRectangle2DList(List<Position> positions) {

    // The explicit type witness widens the element type to Rectangle2D without a cast.
    return positions.stream()
            .<Rectangle2D>map(position -> new Rectangle2D.Float(position.getValue(0), position.getValue(1), position.getValue(2), position.getValue(3)))
            .toList();
}
public CharSequence getLine(int lineNumber) {
if (lineNumber >= numberOfLines() || lineNumber < 0) {

View File

@ -11,7 +11,7 @@ import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.similarity.LevenshteinDistance;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentifier;

View File

@ -1,6 +1,10 @@
package com.knecon.fforesight.service.layoutparser.processor.services.mapper;
import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto.AllDocumentTextData;
import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto.DocumentTextData;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
@ -9,11 +13,15 @@ import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPageProto.AllDocumentPages;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPageProto.DocumentPage;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto.AllDocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto.DocumentPositionData.Position;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureProto.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureWrapper;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.EntryDataProto.EntryData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
@ -32,44 +40,57 @@ public class DocumentDataMapper {
/**
 * Maps a {@code Document} graph to its serializable {@code DocumentData} form.
 * <p>
 * The distinct atomic text blocks of all terminal text blocks are collected in order and mapped
 * twice: once to text data and once to position data. The pages and the document tree are mapped
 * as well, and the results are handed to the {@code DocumentData} builder.
 *
 * @param document the document graph to convert.
 * @return the built {@code DocumentData}.
 */
public DocumentData toDocumentData(Document document) {
// Text data: one entry per distinct atomic text block, in streaming order.
List<DocumentTextData> documentTextData = document.streamTerminalTextBlocksInOrder()
.flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream())
.flatMap(textBlock -> textBlock.getAtomicTextBlocks()
.stream())
.distinct()
.map(DocumentDataMapper::toAtomicTextBlockData)
.toList();
AllDocumentTextData allDocumentTextData = AllDocumentTextData.newBuilder().addAllDocumentTextData(documentTextData).build();
// Position data: built from the same stream, so it matches the text data in order.
List<DocumentPositionData> atomicPositionBlockData = document.streamTerminalTextBlocksInOrder()
.flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream())
.flatMap(textBlock -> textBlock.getAtomicTextBlocks()
.stream())
.distinct()
.map(DocumentDataMapper::toAtomicPositionBlockData)
.toList();
Set<Long> nonEmptyTextBlocks = documentTextData.stream().mapToLong(DocumentTextData::getId).boxed().collect(Collectors.toSet());
AllDocumentPositionData allDocumentPositionData = AllDocumentPositionData.newBuilder().addAllDocumentPositionData(atomicPositionBlockData).build();
List<DocumentPage> documentPageData = document.getPages().stream().map(DocumentDataMapper::toPageData).toList();
DocumentStructure tableOfContentsData = toDocumentTreeData(document.getDocumentTree());
// NOTE(review): nonEmptyTextBlocks is not read anywhere in the rest of this method body.
Set<Long> nonEmptyTextBlocks = documentTextData.stream()
.mapToLong(DocumentTextData::getId).boxed()
.collect(Collectors.toSet());
List<DocumentPage> documentPageData = document.getPages()
.stream()
.map(DocumentDataMapper::toPageData)
.toList();
AllDocumentPages allDocumentPages = AllDocumentPages.newBuilder().addAllDocumentPages(documentPageData).build();
DocumentStructureWrapper tableOfContentsData = toDocumentTreeData(document.getDocumentTree());
return DocumentData.builder()
.documentTextData(documentTextData.toArray(new DocumentTextData[0]))
.documentPositions(atomicPositionBlockData.toArray(new DocumentPositionData[0]))
.documentPages(documentPageData.toArray(new DocumentPage[0]))
.documentStructure(tableOfContentsData)
.documentTextData(allDocumentTextData)
.documentPositions(allDocumentPositionData)
.documentPages(allDocumentPages)
.documentStructureWrapper(tableOfContentsData)
.build();
}
private DocumentStructure toDocumentTreeData(DocumentTree documentTree) {
private DocumentStructureWrapper toDocumentTreeData(DocumentTree documentTree) {
return new DocumentStructure(toEntryData(documentTree.getRoot()));
return new DocumentStructureWrapper(DocumentStructure.newBuilder().setRoot(toEntryData(documentTree.getRoot())).build());
}
private DocumentStructure.EntryData toEntryData(DocumentTree.Entry entry) {
private EntryData toEntryData(DocumentTree.Entry entry) {
Long[] atomicTextBlocks;
List<Long> atomicTextBlocks;
if (entry.getNode().isLeaf()) {
atomicTextBlocks = toAtomicTextBlockIds(entry.getNode().getLeafTextBlock());
} else {
atomicTextBlocks = new Long[]{};
atomicTextBlocks = new ArrayList<>();
}
Map<String, String> properties = switch (entry.getType()) {
@ -81,77 +102,81 @@ public class DocumentDataMapper {
default -> new HashMap<>();
};
DocumentStructure.EntryData.EntryDataBuilder documentBuilder = DocumentStructure.EntryData.builder()
.treeId(toPrimitiveIntArray(entry.getTreeId()))
.children(entry.getChildren().stream().map(DocumentDataMapper::toEntryData).toList())
.type(entry.getType())
.atomicBlockIds(atomicTextBlocks)
.pageNumbers(entry.getNode().getPages().stream().map(Page::getNumber).map(Integer::longValue).toArray(Long[]::new))
.properties(properties);
var documentBuilder = EntryData.newBuilder()
.addAllTreeId(entry.getTreeId())
.addAllChildren(entry.getChildren()
.stream()
.map(DocumentDataMapper::toEntryData)
.toList())
.setType(entry.getType())
.addAllAtomicBlockIds(atomicTextBlocks)
.addAllPageNumbers(entry.getNode().getPages()
.stream()
.map(Page::getNumber)
.map(Integer::longValue)
.toList())
.putAllProperties(properties);
if (entry.getNode() != null) {
documentBuilder.engines(entry.getNode().getEngines());
documentBuilder.addAllEngines(entry.getNode().getEngines());
} else {
documentBuilder.engines(new HashSet<>(Set.of(LayoutEngine.ALGORITHM)));
documentBuilder.addAllEngines(new HashSet<>(Set.of(LayoutEngine.ALGORITHM)));
}
return documentBuilder.build();
}
private Long[] toAtomicTextBlockIds(TextBlock textBlock) {
private List<Long> toAtomicTextBlockIds(TextBlock textBlock) {
return textBlock.getAtomicTextBlocks().stream().map(AtomicTextBlock::getId).toArray(Long[]::new);
return textBlock.getAtomicTextBlocks()
.stream()
.map(AtomicTextBlock::getId)
.toList();
}
/**
 * Maps a {@code Page} to a {@code DocumentPage}, copying rotation, height, width and number.
 *
 * @param p the page to convert.
 * @return the built {@code DocumentPage}.
 */
private DocumentPage toPageData(Page p) {
return DocumentPage.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).build();
return DocumentPage.newBuilder().setRotation(p.getRotation()).setHeight(p.getHeight()).setWidth(p.getWidth()).setNumber(p.getNumber()).build();
}
/**
 * Maps an {@code AtomicTextBlock} to its {@code DocumentTextData} form: id, page number (as long),
 * search text, number on page, start and end of the text range, and the line breaks.
 *
 * @param atomicTextBlock the text block to convert.
 * @return the built {@code DocumentTextData}.
 */
private DocumentTextData toAtomicTextBlockData(AtomicTextBlock atomicTextBlock) {
return DocumentTextData.builder()
.id(atomicTextBlock.getId())
.page(atomicTextBlock.getPage().getNumber().longValue())
.searchText(atomicTextBlock.getSearchText())
.numberOnPage(atomicTextBlock.getNumberOnPage())
.start(atomicTextBlock.getTextRange().start())
.end(atomicTextBlock.getTextRange().end())
.lineBreaks(toPrimitiveIntArray(atomicTextBlock.getLineBreaks()))
return DocumentTextData.newBuilder()
.setId(atomicTextBlock.getId())
.setPage(atomicTextBlock.getPage().getNumber().longValue())
.setSearchText(atomicTextBlock.getSearchText())
.setNumberOnPage(atomicTextBlock.getNumberOnPage())
.setStart(atomicTextBlock.getTextRange().start())
.setEnd(atomicTextBlock.getTextRange().end())
.addAllLineBreaks(atomicTextBlock.getLineBreaks())
.build();
}
/**
 * Maps an {@code AtomicTextBlock} to its {@code DocumentPositionData} form: the block id, the
 * converted glyph positions and the string-index-to-position-index mapping.
 *
 * @param atomicTextBlock the text block whose position information is converted.
 * @return the built {@code DocumentPositionData}.
 */
private DocumentPositionData toAtomicPositionBlockData(AtomicTextBlock atomicTextBlock) {
return DocumentPositionData.builder()
.id(atomicTextBlock.getId())
.positions(toPrimitiveFloatMatrix(atomicTextBlock.getPositions()))
.stringIdxToPositionIdx(toPrimitiveIntArray(atomicTextBlock.getStringIdxToPositionIdx()))
return DocumentPositionData.newBuilder()
.setId(atomicTextBlock.getId())
.addAllPositions(toPositions(atomicTextBlock.getPositions()))
.addAllStringIdxToPositionIdx(atomicTextBlock.getStringIdxToPositionIdx())
.build();
}
private static float[][] toPrimitiveFloatMatrix(List<Rectangle2D> positions) {
private static List<Position> toPositions(List<Rectangle2D> rects) {
float[][] positionMatrix = new float[positions.size()][];
for (int i = 0; i < positions.size(); i++) {
positionMatrix[i] = toArray(positions.get(i));
List<Position> positions = new ArrayList<>();
for (Rectangle2D rect : rects) {
positions.add(toPosition(rect));
}
return positionMatrix;
return positions;
}
private static float[] toArray(Rectangle2D positions) {
private static Position toPosition(Rectangle2D rect) {
return new float[]{(float) positions.getMinX(), (float) positions.getMinY(), (float) positions.getWidth(), (float) positions.getHeight()};
}
private int[] toPrimitiveIntArray(List<Integer> list) {
return list.stream().mapToInt(Integer::intValue).toArray();
return Position.newBuilder().addValue((float) rect.getMinX()).addValue((float) rect.getMinY()).addValue((float) rect.getWidth()).addValue((float) rect.getHeight()).build();
}
}

View File

@ -8,10 +8,10 @@ import java.util.Map;
import java.util.NoSuchElementException;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPageProto.DocumentPage;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto.AllDocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto.AllDocumentTextData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.EntryDataProto.EntryData;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
@ -41,27 +41,29 @@ public class DocumentGraphMapper {
DocumentTree documentTree = new DocumentTree(document);
Context context = new Context(documentData, documentTree);
context.pages.addAll(Arrays.stream(documentData.getDocumentPages())
context.pages.addAll(documentData.getDocumentPages().getDocumentPagesList()
.stream()
.map(DocumentGraphMapper::buildPage)
.toList());
context.documentTree.getRoot().getChildren().addAll(buildEntries(documentData.getDocumentStructure().getRoot().getChildren(), context));
context.documentTree.getRoot().getChildren().addAll(buildEntries(documentData.getDocumentStructure().getRoot().getChildrenList(), context));
document.setDocumentTree(context.documentTree);
document.setPages(new HashSet<>(context.pages));
document.setNumberOfPages(documentData.getDocumentPages().length);
document.setNumberOfPages(documentData.getDocumentPages().getDocumentPagesCount());
document.setTextBlock(document.getTextBlock());
return document;
}
private List<DocumentTree.Entry> buildEntries(List<DocumentStructure.EntryData> entries, Context context) {
private List<DocumentTree.Entry> buildEntries(List<EntryData> entries, Context context) {
List<DocumentTree.Entry> newEntries = new LinkedList<>();
for (DocumentStructure.EntryData entryData : entries) {
for (EntryData entryData : entries) {
List<Page> pages = Arrays.stream(entryData.getPageNumbers())
List<Page> pages = entryData.getPageNumbersList()
.stream()
.map(pageNumber -> getPage(pageNumber, context))
.toList();
@ -74,12 +76,12 @@ public class DocumentGraphMapper {
case FOOTER -> buildFooter(context);
case TABLE -> buildTable(context, entryData.getProperties());
case TABLE_CELL -> buildTableCell(context, entryData.getProperties());
case IMAGE -> buildImage(context, entryData.getProperties(), entryData.getPageNumbers());
case IMAGE -> buildImage(context, entryData.getProperties(), entryData.getPageNumbersList());
default -> throw new UnsupportedOperationException("Not yet implemented for type " + entryData.getType());
};
if (entryData.getAtomicBlockIds().length > 0) {
TextBlock textBlock = toTextBlock(entryData.getAtomicBlockIds(), context, node);
if (entryData.getAtomicBlockIdsCount() > 0) {
TextBlock textBlock = toTextBlock(entryData.getAtomicBlockIdsList(), context, node);
node.setLeafTextBlock(textBlock);
switch (entryData.getType()) {
case HEADER -> pages.forEach(page -> page.setHeader((Header) node));
@ -89,11 +91,10 @@ public class DocumentGraphMapper {
.forEach(atb -> atb.getPage().getTextBlocksOnPage().add(atb));
}
}
List<Integer> treeId = Arrays.stream(entryData.getTreeId()).boxed()
.toList();
List<Integer> treeId = entryData.getTreeIdList();
node.setTreeId(treeId);
newEntries.add(DocumentTree.Entry.builder().treeId(treeId).children(buildEntries(entryData.getChildren(), context)).node(node).build());
newEntries.add(DocumentTree.Entry.builder().treeId(treeId).children(buildEntries(entryData.getChildrenList(), context)).node(node).build());
}
return newEntries;
}
@ -105,10 +106,10 @@ public class DocumentGraphMapper {
}
private Image buildImage(Context context, Map<String, String> properties, Long[] pageNumbers) {
private Image buildImage(Context context, Map<String, String> properties, List<Long> pageNumbers) {
assert pageNumbers.length == 1;
Page page = getPage(pageNumbers[0], context);
assert pageNumbers.size() == 1;
Page page = getPage(pageNumbers.get(0), context);
var builder = Image.builder();
PropertiesMapper.parseImageProperties(properties, builder);
return builder.documentTree(context.documentTree).page(page).build();
@ -161,7 +162,7 @@ public class DocumentGraphMapper {
DuplicatedParagraph duplicatedParagraph = DuplicatedParagraph.builder().documentTree(context.documentTree).build();
Long[] unsortedTextblockIds = PropertiesMapper.getUnsortedTextblockIds(properties);
var unsortedTextblockIds = PropertiesMapper.getUnsortedTextblockIds(properties);
duplicatedParagraph.setUnsortedLeafTextBlock(toTextBlock(unsortedTextblockIds, context, duplicatedParagraph));
return duplicatedParagraph;
@ -171,9 +172,9 @@ public class DocumentGraphMapper {
}
private TextBlock toTextBlock(Long[] atomicTextBlockIds, Context context, SemanticNode parent) {
private TextBlock toTextBlock(List<Long> atomicTextBlockIds, Context context, SemanticNode parent) {
return Arrays.stream(atomicTextBlockIds)
return atomicTextBlockIds.stream()
.map(atomicTextBlockId -> getAtomicTextBlock(context, parent, atomicTextBlockId))
.collect(new TextBlockCollector());
}
@ -181,10 +182,13 @@ public class DocumentGraphMapper {
private AtomicTextBlock getAtomicTextBlock(Context context, SemanticNode parent, Long atomicTextBlockId) {
return AtomicTextBlock.fromAtomicTextBlockData(context.documentTextDataBlockData.get(Math.toIntExact(atomicTextBlockId)),
context.atomicPositionBlockData.get(Math.toIntExact(atomicTextBlockId)),
return AtomicTextBlock.fromAtomicTextBlockData(context.documentTextDataBlockData.getDocumentTextDataList()
.get(Math.toIntExact(atomicTextBlockId)),
context.atomicPositionBlockData.getDocumentPositionDataList()
.get(Math.toIntExact(atomicTextBlockId)),
parent,
getPage(context.documentTextDataBlockData.get(Math.toIntExact(atomicTextBlockId)).getPage(), context));
getPage(context.documentTextDataBlockData.getDocumentTextDataList()
.get(Math.toIntExact(atomicTextBlockId)).getPage(), context));
}
@ -207,18 +211,16 @@ public class DocumentGraphMapper {
private final DocumentTree documentTree;
private final List<Page> pages;
private final List<DocumentTextData> documentTextDataBlockData;
private final List<DocumentPositionData> atomicPositionBlockData;
private final AllDocumentTextData documentTextDataBlockData;
private final AllDocumentPositionData atomicPositionBlockData;
Context(DocumentData documentData, DocumentTree documentTree) {
this.documentTree = documentTree;
this.pages = new LinkedList<>();
this.documentTextDataBlockData = Arrays.stream(documentData.getDocumentTextData())
.toList();
this.atomicPositionBlockData = Arrays.stream(documentData.getDocumentPositions())
.toList();
this.documentTextDataBlockData = documentData.getDocumentTextData();
this.atomicPositionBlockData = documentData.getDocumentPositions();
}

View File

@ -3,10 +3,11 @@ package com.knecon.fforesight.service.layoutparser.processor.services.mapper;
import java.awt.geom.Rectangle2D;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureWrapper;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
@ -20,11 +21,11 @@ public class PropertiesMapper {
public static Map<String, String> buildImageProperties(Image image) {
Map<String, String> properties = new HashMap<>();
properties.put(DocumentStructure.ImageProperties.IMAGE_TYPE, image.getImageType().toString());
properties.put(DocumentStructure.ImageProperties.TRANSPARENT, String.valueOf(image.isTransparent()));
properties.put(DocumentStructure.ImageProperties.POSITION, toString(image.getPosition()));
properties.put(DocumentStructure.ImageProperties.ID, image.getId());
properties.put(DocumentStructure.ImageProperties.REPRESENTATION_HASH, image.getRepresentationHash());
properties.put(DocumentStructureWrapper.ImageProperties.IMAGE_TYPE, image.getImageType().toString());
properties.put(DocumentStructureWrapper.ImageProperties.TRANSPARENT, String.valueOf(image.isTransparent()));
properties.put(DocumentStructureWrapper.ImageProperties.POSITION, toString(image.getPosition()));
properties.put(DocumentStructureWrapper.ImageProperties.ID, image.getId());
properties.put(DocumentStructureWrapper.ImageProperties.REPRESENTATION_HASH, image.getRepresentationHash());
return properties;
}
@ -32,15 +33,19 @@ public class PropertiesMapper {
public static Map<String, String> buildTableCellProperties(TableCell tableCell) {
Map<String, String> properties = new HashMap<>();
properties.put(DocumentStructure.TableCellProperties.ROW, String.valueOf(tableCell.getRow()));
properties.put(DocumentStructure.TableCellProperties.COL, String.valueOf(tableCell.getCol()));
properties.put(DocumentStructure.TableCellProperties.HEADER, String.valueOf(tableCell.isHeader()));
properties.put(DocumentStructureWrapper.TableCellProperties.ROW, String.valueOf(tableCell.getRow()));
properties.put(DocumentStructureWrapper.TableCellProperties.COL, String.valueOf(tableCell.getCol()));
properties.put(DocumentStructureWrapper.TableCellProperties.HEADER, String.valueOf(tableCell.isHeader()));
if (tableCell.getPages().size() > 1 || tableCell.getBBox().keySet().size() > 1) {
throw new IllegalArgumentException("TableCell can only occur on a single page!");
}
String bBoxString = toString(tableCell.getBBox().get(tableCell.getPages().stream().findFirst().get()));
properties.put(DocumentStructure.TableCellProperties.B_BOX, bBoxString);
String bBoxString = toString(tableCell.getBBox()
.get(tableCell.getPages()
.stream()
.findFirst()
.get()));
properties.put(DocumentStructureWrapper.TableCellProperties.B_BOX, bBoxString);
return properties;
}
@ -49,59 +54,62 @@ public class PropertiesMapper {
public static Map<String, String> buildTableProperties(Table table) {
Map<String, String> properties = new HashMap<>();
properties.put(DocumentStructure.TableProperties.NUMBER_OF_ROWS, String.valueOf(table.getNumberOfRows()));
properties.put(DocumentStructure.TableProperties.NUMBER_OF_COLS, String.valueOf(table.getNumberOfCols()));
properties.put(DocumentStructureWrapper.TableProperties.NUMBER_OF_ROWS, String.valueOf(table.getNumberOfRows()));
properties.put(DocumentStructureWrapper.TableProperties.NUMBER_OF_COLS, String.valueOf(table.getNumberOfCols()));
return properties;
}
public static void parseImageProperties(Map<String, String> properties, Image.ImageBuilder builder) {
builder.imageType(parseImageType(properties.get(DocumentStructure.ImageProperties.IMAGE_TYPE)));
builder.transparent(Boolean.parseBoolean(properties.get(DocumentStructure.ImageProperties.TRANSPARENT)));
builder.position(DocumentStructure.parseRectangle2D(properties.get(DocumentStructure.ImageProperties.POSITION)));
builder.imageType(parseImageType(properties.get(DocumentStructureWrapper.ImageProperties.IMAGE_TYPE)));
builder.transparent(Boolean.parseBoolean(properties.get(DocumentStructureWrapper.ImageProperties.TRANSPARENT)));
builder.position(DocumentStructureWrapper.parseRectangle2D(properties.get(DocumentStructureWrapper.ImageProperties.POSITION)));
}
public static void parseTableCellProperties(Map<String, String> properties, TableCell.TableCellBuilder builder) {
builder.row(Integer.parseInt(properties.get(DocumentStructure.TableCellProperties.ROW)));
builder.col(Integer.parseInt(properties.get(DocumentStructure.TableCellProperties.COL)));
builder.header(Boolean.parseBoolean(properties.get(DocumentStructure.TableCellProperties.HEADER)));
builder.bBox(DocumentStructure.parseRectangle2D(properties.get(DocumentStructure.TableCellProperties.B_BOX)));
builder.row(Integer.parseInt(properties.get(DocumentStructureWrapper.TableCellProperties.ROW)));
builder.col(Integer.parseInt(properties.get(DocumentStructureWrapper.TableCellProperties.COL)));
builder.header(Boolean.parseBoolean(properties.get(DocumentStructureWrapper.TableCellProperties.HEADER)));
builder.bBox(DocumentStructureWrapper.parseRectangle2D(properties.get(DocumentStructureWrapper.TableCellProperties.B_BOX)));
}
public static void parseTableProperties(Map<String, String> properties, Table.TableBuilder builder) {
builder.numberOfRows(Integer.parseInt(properties.get(DocumentStructure.TableProperties.NUMBER_OF_ROWS)));
builder.numberOfCols(Integer.parseInt(properties.get(DocumentStructure.TableProperties.NUMBER_OF_COLS)));
builder.numberOfRows(Integer.parseInt(properties.get(DocumentStructureWrapper.TableProperties.NUMBER_OF_ROWS)));
builder.numberOfCols(Integer.parseInt(properties.get(DocumentStructureWrapper.TableProperties.NUMBER_OF_COLS)));
}
public static Map<String, String> buildDuplicateParagraphProperties(DuplicatedParagraph duplicatedParagraph) {
Map<String, String> properties = new HashMap<>();
properties.put(DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID, Arrays.toString(toAtomicTextBlockIds(duplicatedParagraph.getUnsortedLeafTextBlock())));
properties.put(DocumentStructureWrapper.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID,
Arrays.toString(toAtomicTextBlockIds(duplicatedParagraph.getUnsortedLeafTextBlock())));
return properties;
}
public static boolean isDuplicateParagraph(Map<String, String> properties) {
return properties.containsKey(DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID);
return properties.containsKey(DocumentStructureWrapper.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID);
}
public static Long[] getUnsortedTextblockIds(Map<String, String> properties) {
public static List<Long> getUnsortedTextblockIds(Map<String, String> properties) {
return toLongArray(properties.get(DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID));
return toLongList(properties.get(DocumentStructureWrapper.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID));
}
public static Long[] toLongArray(String ids) {
public static List<Long> toLongList(String ids) {
return Arrays.stream(ids.substring(1, ids.length() - 1).trim().split(",")).map(Long::valueOf).toArray(Long[]::new);
return Arrays.stream(ids.substring(1, ids.length() - 1).trim().split(","))
.map(Long::valueOf)
.toList();
}
@ -120,20 +128,23 @@ public class PropertiesMapper {
public static String toString(Rectangle2D rectangle2D) {
return String.format(Locale.US,
"%f%s%f%s%f%s%f",
rectangle2D.getX(),
DocumentStructure.RECTANGLE_DELIMITER,
rectangle2D.getY(),
DocumentStructure.RECTANGLE_DELIMITER,
rectangle2D.getWidth(),
DocumentStructure.RECTANGLE_DELIMITER,
rectangle2D.getHeight());
"%f%s%f%s%f%s%f",
rectangle2D.getX(),
DocumentStructureWrapper.RECTANGLE_DELIMITER,
rectangle2D.getY(),
DocumentStructureWrapper.RECTANGLE_DELIMITER,
rectangle2D.getWidth(),
DocumentStructureWrapper.RECTANGLE_DELIMITER,
rectangle2D.getHeight());
}
private static Long[] toAtomicTextBlockIds(TextBlock textBlock) {
return textBlock.getAtomicTextBlocks().stream().map(AtomicTextBlock::getId).toArray(Long[]::new);
return textBlock.getAtomicTextBlocks()
.stream()
.map(AtomicTextBlock::getId)
.toArray(Long[]::new);
}
}

View File

@ -6,7 +6,7 @@ import java.util.Locale;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.IntStream;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ParagraphData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.Range;
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData;

View File

@ -14,7 +14,7 @@ import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
@ -117,6 +117,7 @@ public class PdfVisualisationUtility {
case TABLE -> Color.ORANGE;
case TABLE_CELL -> Color.GRAY;
case IMAGE -> Color.MAGENTA;
case UNRECOGNIZED -> Color.PINK;
}).build();
}

View File

@ -0,0 +1,42 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.zip.GZIPOutputStream;
import com.google.protobuf.Message;
import com.google.protobuf.MessageOrBuilder;
import com.google.protobuf.Struct;
import com.google.protobuf.util.JsonFormat;
import lombok.SneakyThrows;
import lombok.experimental.UtilityClass;
/**
 * Static helpers for converting protobuf messages to and from JSON and for
 * writing a message to a gzip-compressed temporary file.
 */
@UtilityClass
public class ProtobufUtil {

    /**
     * Prints a protobuf message (or builder) as JSON using {@link JsonFormat#printer()}.
     *
     * @param messageOrBuilder the message or builder to print
     * @return the JSON text produced by the printer
     * @throws IOException if the printer fails to render the message
     */
    public static String toJson(MessageOrBuilder messageOrBuilder) throws IOException {
        return JsonFormat.printer().print(messageOrBuilder);
    }


    /**
     * Parses JSON text into a generic {@link Struct} message. The JSON is merged
     * into a {@code Struct} builder, so the result is not a typed, schema-specific message.
     *
     * @param json the JSON text to parse
     * @return the built {@code Struct}, returned as a {@link Message}
     * @throws IOException if the parser rejects the JSON
     */
    public static Message fromJson(String json) throws IOException {
        Message.Builder structBuilder = Struct.newBuilder();
        JsonFormat.parser().ignoringUnknownFields().merge(json, structBuilder);
        return structBuilder.build();
    }


    /**
     * Writes the given message, gzip-compressed, into a newly created temporary file.
     * The caller owns the returned file and is responsible for deleting it.
     *
     * @param any the message to serialize
     * @param <T> the concrete message type
     * @return the temporary file holding the compressed bytes
     */
    @SneakyThrows
    public <T extends Message> File serializeToTempFile(T any) {
        var tempFile = File.createTempFile("storage-protobuf", ".data");
        try (var fos = new GZIPOutputStream(new BufferedOutputStream(new FileOutputStream(tempFile)))) {
            any.writeTo(fos);
        } catch (IOException | RuntimeException e) {
            // The stream is already closed here. Remove the partial file (best effort)
            // so failed serializations do not pile up in the temp directory.
            tempFile.delete();
            throw e;
        }
        return tempFile;
    }
}

View File

@ -16,7 +16,7 @@ import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
@ -125,7 +125,7 @@ public class LayoutGrid extends LayoutGridLayerConfig {
Map<Page, Rectangle2D> bBoxMap = section.getBBox();
List<SemanticNode> subSections = section.streamAllSubNodesOfType(NodeType.SECTION)
List<SemanticNode> subSections = section.streamAllSubNodesOfType(NodeTypeProto.NodeType.SECTION)
.toList();
Integer maxChildDepth = subSections.stream()
.map(node -> node.getTreeId().size())

View File

@ -29,7 +29,7 @@ dependencies {
implementation(project(":layoutparser-service-processor"))
implementation(project(":layoutparser-service-internal-api"))
implementation("com.iqser.red.commons:storage-commons:2.45.0")
implementation("com.iqser.red.commons:storage-commons:2.50.0")
implementation("com.knecon.fforesight:tenant-commons:0.30.0")
implementation("com.knecon.fforesight:tracing-commons:0.5.0")
implementation("com.knecon.fforesight:lifecycle-commons:0.6.0")
@ -38,11 +38,14 @@ dependencies {
implementation("com.amazonaws:aws-java-sdk-s3:1.12.536")
implementation("org.springframework.cloud:spring-cloud-starter-openfeign:4.0.4")
implementation("net.logstash.logback:logstash-logback-encoder:7.4")
implementation("ch.qos.logback:logback-classic")
implementation("com.pdftron:PDFNet:10.11.0")
// for integration testing only
testImplementation(project(":viewer-doc-processor"))
testImplementation(project(":layoutparser-service-internal-api"))
testImplementation("com.google.protobuf:protobuf-java-util:4.27.1")
testImplementation("org.springframework.boot:spring-boot-starter-amqp:${springBootStarterVersion}")
testImplementation("org.springframework.boot:spring-boot-starter-test:${springBootStarterVersion}")

View File

@ -70,7 +70,7 @@ public class DocumentGraphJsonWritingTest extends BuildDocumentTest {
var stem = Path.of("/tmp/DocumentGraphJsonWritingTest");
stem.toFile().mkdirs();
var tmpFilePath = stem.resolve(filename.getFileName());
mapper.writeValue(new FileOutputStream(new File(tmpFilePath + "_structure" + ".json")), documentData.getDocumentStructure());
mapper.writeValue(new FileOutputStream(new File(tmpFilePath + "_structure" + ".json")), documentData.getDocumentStructureWrapper());
mapper.writeValue(new FileOutputStream(new File(tmpFilePath + "_text" + ".json")), documentData.getDocumentTextData());
mapper.writeValue(new FileOutputStream(new File(tmpFilePath + "_positions" + ".json")), documentData.getDocumentPositions());
mapper.writeValue(new FileOutputStream(new File(tmpFilePath + "_pages" + ".json")), documentData.getDocumentPages());

View File

@ -6,11 +6,13 @@ import static org.junit.jupiter.api.Assertions.assertTrue;
import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPageProto.AllDocumentPages;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto.AllDocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureProto.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureWrapper;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto.AllDocumentTextData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.EntryDataProto.EntryData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
@ -32,21 +34,21 @@ public class DocumentGraphMappingTest extends BuildDocumentTest {
Document document = buildGraph(filename);
DocumentData documentData = DocumentDataMapper.toDocumentData(document);
storageService.storeJSONObject(TenantContext.getTenantId(), filename + "_PAGES" + ".json", documentData.getDocumentPages());
storageService.storeJSONObject(TenantContext.getTenantId(), filename + "_TEXT" + ".json", documentData.getDocumentTextData());
storageService.storeJSONObject(TenantContext.getTenantId(), filename + "_POSITIONS" + ".json", documentData.getDocumentPositions());
storageService.storeJSONObject(TenantContext.getTenantId(), filename + "_STRUCTURE" + ".json", documentData.getDocumentStructure());
storageService.storeProtoObject(TenantContext.getTenantId(), filename + "_PAGES" + ".proto", documentData.getDocumentPages());
storageService.storeProtoObject(TenantContext.getTenantId(), filename + "_TEXT" + ".proto", documentData.getDocumentTextData());
storageService.storeProtoObject(TenantContext.getTenantId(), filename + "_POSITIONS" + ".proto", documentData.getDocumentPositions());
storageService.storeProtoObject(TenantContext.getTenantId(), filename + "_STRUCTURE" + ".proto", documentData.getDocumentStructure());
DocumentPage[] pageData = storageService.readJSONObject(TenantContext.getTenantId(), filename + "_PAGES" + ".json", DocumentPage[].class);
DocumentTextData[] atomicTextBlockData = storageService.readJSONObject(TenantContext.getTenantId(), filename + "_TEXT" + ".json", DocumentTextData[].class);
DocumentPositionData[] atomicPositionBlockData = storageService.readJSONObject(TenantContext.getTenantId(),
filename + "_POSITIONS" + ".json",
DocumentPositionData[].class);
DocumentStructure documentTreeData = storageService.readJSONObject(TenantContext.getTenantId(), filename + "_STRUCTURE" + ".json", DocumentStructure.class);
AllDocumentPages pageData = storageService.readProtoObject(TenantContext.getTenantId(), filename + "_PAGES" + ".proto", AllDocumentPages.parser());
AllDocumentTextData atomicTextBlockData = storageService.readProtoObject(TenantContext.getTenantId(), filename + "_TEXT" + ".proto", AllDocumentTextData.parser());
AllDocumentPositionData atomicPositionBlockData = storageService.readProtoObject(TenantContext.getTenantId(),
filename + "_POSITIONS" + ".proto",
AllDocumentPositionData.parser());
DocumentStructure documentTreeData = storageService.readProtoObject(TenantContext.getTenantId(), filename + "_STRUCTURE" + ".proto", DocumentStructure.parser());
DocumentData documentData2 = DocumentData.builder()
.documentPages(pageData)
.documentStructure(documentTreeData)
.documentStructureWrapper(new DocumentStructureWrapper(documentTreeData))
.documentTextData(atomicTextBlockData)
.documentPositions(atomicPositionBlockData)
.build();
@ -73,10 +75,9 @@ public class DocumentGraphMappingTest extends BuildDocumentTest {
private static boolean allTablesHavePositiveNumberOfRowsAndColumns(DocumentData documentData) {
return documentData.getDocumentStructure()
.streamAllEntries()
return documentData.getDocumentStructureWrapper().streamAllEntries()
.filter(entryData -> entryData.getType().equals(NodeType.TABLE))
.map(DocumentStructure.EntryData::getProperties)
.map(EntryData::getPropertiesMap)
.map(properties -> {
var builder = Table.builder();
PropertiesMapper.parseTableProperties(properties, builder);

View File

@ -17,8 +17,9 @@ import org.junit.jupiter.api.Test;
import org.springframework.core.io.ClassPathResource;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureWrapper;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.EntryDataProto.EntryData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
@ -119,7 +120,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
Map.of("file",filename.toFile().toString())));
DocumentData documentDataBefore = DocumentDataMapper.toDocumentData(documentGraphBefore);
DocumentData documentDataAfter = DocumentDataMapper.toDocumentData(documentGraphAfter);
if (!compareStructures(documentDataBefore.getDocumentStructure(), documentDataAfter.getDocumentStructure())) {
if (!compareStructures(documentDataBefore.getDocumentStructureWrapper(), documentDataAfter.getDocumentStructureWrapper())) {
String tmpFileNameBefore = "/tmp/before." + filename.getFileName().toString();
try (PDDocument pdDocument = Loader.loadPDF(filename.toFile())) {
PdfDraw.drawDocumentGraph(pdDocument, documentGraphBefore);
@ -136,11 +137,11 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
@SneakyThrows
private boolean compareStructures(DocumentStructure structure1, DocumentStructure structure2) {
private boolean compareStructures(DocumentStructureWrapper structure1, DocumentStructureWrapper structure2) {
List<Table> listStructure1 = structure1.streamAllEntries()
.filter(entryData -> entryData.getType().equals(NodeType.TABLE))
.map(DocumentStructure.EntryData::getProperties)
.map(EntryData::getPropertiesMap)
.map(properties -> {
var builder = Table.builder();
PropertiesMapper.parseTableProperties(properties, builder);
@ -150,7 +151,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
List<Table> listStructure2 = structure2.streamAllEntries()
.filter(entryData -> entryData.getType().equals(NodeType.TABLE))
.map(DocumentStructure.EntryData::getProperties)
.map(EntryData::getPropertiesMap)
.map(properties -> {
var builder = Table.builder();
PropertiesMapper.parseTableProperties(properties, builder);

View File

@ -17,7 +17,7 @@ import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
import org.apache.pdfbox.util.Matrix;
import org.springframework.core.io.ClassPathResource;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
@ -233,6 +233,7 @@ public class PdfDraw {
case TABLE -> Color.ORANGE;
case TABLE_CELL -> Color.GRAY;
case IMAGE -> Color.MAGENTA;
case UNRECOGNIZED -> Color.PINK;
}).build();
}