RED-9123: Improve performance of re-analysis (Spike)
This commit is contained in:
parent
31de229fa5
commit
fe2ed1807e
@ -7,4 +7,5 @@ description = "layoutparser-service-internal-api"
|
||||
|
||||
// Libraries required by the internal API module.
dependencies {
|
||||
// Provides the @Schema annotations used on the API data classes.
implementation("io.swagger.core.v3:swagger-annotations:2.2.15")
|
||||
// The generated *Proto classes in this module are stamped "Protobuf Java Version: 4.27.1" and call
// RuntimeVersion.validateProtobufGencodeVersion(..., 4, 27, 1, ...) from their static initializers.
// NOTE(review): presumably this version must stay compatible with the protoc version used for generation - confirm when upgrading.
implementation("com.google.protobuf:protobuf-java-util:4.27.1")
|
||||
}
|
||||
|
||||
@ -2,6 +2,11 @@ package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPageProto.AllDocumentPages;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto.AllDocumentPositionData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureProto.DocumentStructure;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto.AllDocumentTextData;
|
||||
|
||||
import io.swagger.v3.oas.annotations.media.Schema;
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
@ -13,16 +18,26 @@ import lombok.experimental.FieldDefaults;
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
@Schema(description = "Object containing the complete document layout parsing information. It is split into 4 categories, structure, text, positions and pages: " + "The document tree structure of SemanticNodes such as Section, Paragraph, Headline, etc. " + "The text, which is stored as separate blocks of data. " + "The text positions, which are also stored as separate blocks. The Blocks are equal to the text blocks in length and order. " + "The page information.")
|
||||
@Schema(description = "Object containing the complete document layout parsing information. It is split into 4 categories, structure, text, positions and pages: "
|
||||
+ "The document tree structure of SemanticNodes such as Section, Paragraph, Headline, etc. "
|
||||
+ "The text, which is stored as separate blocks of data. "
|
||||
+ "The text positions, which are also stored as separate blocks. The Blocks are equal to the text blocks in length and order. "
|
||||
+ "The page information.")
|
||||
public class DocumentData implements Serializable {
|
||||
|
||||
@Schema(description = "Contains information about the document's pages.")
|
||||
DocumentPage[] documentPages;
|
||||
AllDocumentPages documentPages;
|
||||
@Schema(description = "Contains information about the document's text.")
|
||||
DocumentTextData[] documentTextData;
|
||||
AllDocumentTextData documentTextData;
|
||||
@Schema(description = "Contains information about the document's text positions.")
|
||||
DocumentPositionData[] documentPositions;
|
||||
AllDocumentPositionData documentPositions;
|
||||
@Schema(description = "Contains information about the document's semantic structure.")
|
||||
DocumentStructure documentStructure;
|
||||
DocumentStructureWrapper documentStructureWrapper;
|
||||
|
||||
|
||||
public DocumentStructure getDocumentStructure() {
|
||||
|
||||
return documentStructureWrapper.getDocumentStructure();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -10,6 +10,7 @@ import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Deprecated
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -10,6 +10,7 @@ import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Deprecated
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -16,6 +16,7 @@ import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Deprecated
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
|
||||
@ -0,0 +1,799 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||
|
||||
// Generated by the protocol buffer compiler. DO NOT EDIT!
|
||||
// NO CHECKED-IN PROTOBUF GENCODE
|
||||
// source: DocumentStructure.proto
|
||||
// Protobuf Java Version: 4.27.1
|
||||
@SuppressWarnings("all")
|
||||
public final class DocumentStructureProto {
|
||||
|
||||
private DocumentStructureProto() {}
|
||||
|
||||
|
||||
static {
|
||||
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
|
||||
/* major= */ 4,
|
||||
/* minor= */ 27,
|
||||
/* patch= */ 1,
|
||||
/* suffix= */ "", DocumentStructureProto.class.getName());
|
||||
}
|
||||
|
||||
public static void registerAllExtensions(com.google.protobuf.ExtensionRegistryLite registry) {
|
||||
|
||||
}
|
||||
|
||||
|
||||
public static void registerAllExtensions(com.google.protobuf.ExtensionRegistry registry) {
|
||||
|
||||
registerAllExtensions((com.google.protobuf.ExtensionRegistryLite) registry);
|
||||
}
|
||||
|
||||
|
||||
public interface DocumentStructureOrBuilder extends
|
||||
// @@protoc_insertion_point(interface_extends:DocumentStructure)
|
||||
com.google.protobuf.MessageOrBuilder {
|
||||
|
||||
/**
|
||||
* <pre>
|
||||
* The root EntryData represents the Document.
|
||||
* </pre>
|
||||
*
|
||||
* <code>.EntryData root = 1;</code>
|
||||
*
|
||||
* @return Whether the root field is set.
|
||||
*/
|
||||
boolean hasRoot();
|
||||
|
||||
|
||||
/**
|
||||
* <pre>
|
||||
* The root EntryData represents the Document.
|
||||
* </pre>
|
||||
*
|
||||
* <code>.EntryData root = 1;</code>
|
||||
*
|
||||
* @return The root.
|
||||
*/
|
||||
EntryDataProto.EntryData getRoot();
|
||||
|
||||
|
||||
/**
|
||||
* <pre>
|
||||
* The root EntryData represents the Document.
|
||||
* </pre>
|
||||
*
|
||||
* <code>.EntryData root = 1;</code>
|
||||
*/
|
||||
EntryDataProto.EntryDataOrBuilder getRootOrBuilder();
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Protobuf type {@code DocumentStructure}
|
||||
*/
|
||||
public static final class DocumentStructure extends com.google.protobuf.GeneratedMessage implements
|
||||
// @@protoc_insertion_point(message_implements:DocumentStructure)
|
||||
DocumentStructureOrBuilder {
|
||||
|
||||
private static final long serialVersionUID = 0L;
|
||||
|
||||
static {
|
||||
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
|
||||
/* major= */ 4,
|
||||
/* minor= */ 27,
|
||||
/* patch= */ 1,
|
||||
/* suffix= */ "", DocumentStructure.class.getName());
|
||||
}
|
||||
|
||||
// Use DocumentStructure.newBuilder() to construct.
|
||||
private DocumentStructure(com.google.protobuf.GeneratedMessage.Builder<?> builder) {
|
||||
|
||||
super(builder);
|
||||
}
|
||||
|
||||
|
||||
private DocumentStructure() {
|
||||
|
||||
}
|
||||
|
||||
|
||||
public static final com.google.protobuf.Descriptors.Descriptor getDescriptor() {
|
||||
|
||||
return DocumentStructureProto.internal_static_DocumentStructure_descriptor;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
protected FieldAccessorTable internalGetFieldAccessorTable() {
|
||||
|
||||
return DocumentStructureProto.internal_static_DocumentStructure_fieldAccessorTable.ensureFieldAccessorsInitialized(DocumentStructure.class, Builder.class);
|
||||
}
|
||||
|
||||
|
||||
private int bitField0_;
|
||||
public static final int ROOT_FIELD_NUMBER = 1;
|
||||
private EntryDataProto.EntryData root_;
|
||||
|
||||
|
||||
/**
|
||||
* <pre>
|
||||
* The root EntryData represents the Document.
|
||||
* </pre>
|
||||
*
|
||||
* <code>.EntryData root = 1;</code>
|
||||
*
|
||||
* @return Whether the root field is set.
|
||||
*/
|
||||
@Override
|
||||
public boolean hasRoot() {
|
||||
|
||||
return ((bitField0_ & 0x00000001) != 0);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* <pre>
|
||||
* The root EntryData represents the Document.
|
||||
* </pre>
|
||||
*
|
||||
* <code>.EntryData root = 1;</code>
|
||||
*
|
||||
* @return The root.
|
||||
*/
|
||||
@Override
|
||||
public EntryDataProto.EntryData getRoot() {
|
||||
|
||||
return root_ == null ? EntryDataProto.EntryData.getDefaultInstance() : root_;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* <pre>
|
||||
* The root EntryData represents the Document.
|
||||
* </pre>
|
||||
*
|
||||
* <code>.EntryData root = 1;</code>
|
||||
*/
|
||||
@Override
|
||||
public EntryDataProto.EntryDataOrBuilder getRootOrBuilder() {
|
||||
|
||||
return root_ == null ? EntryDataProto.EntryData.getDefaultInstance() : root_;
|
||||
}
|
||||
|
||||
|
||||
private byte memoizedIsInitialized = -1;
|
||||
|
||||
|
||||
@Override
|
||||
public final boolean isInitialized() {
|
||||
|
||||
byte isInitialized = memoizedIsInitialized;
|
||||
if (isInitialized == 1) {
|
||||
return true;
|
||||
}
|
||||
if (isInitialized == 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
memoizedIsInitialized = 1;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void writeTo(com.google.protobuf.CodedOutputStream output) throws java.io.IOException {
|
||||
|
||||
if (((bitField0_ & 0x00000001) != 0)) {
|
||||
output.writeMessage(1, getRoot());
|
||||
}
|
||||
getUnknownFields().writeTo(output);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int getSerializedSize() {
|
||||
|
||||
int size = memoizedSize;
|
||||
if (size != -1) {
|
||||
return size;
|
||||
}
|
||||
|
||||
size = 0;
|
||||
if (((bitField0_ & 0x00000001) != 0)) {
|
||||
size += com.google.protobuf.CodedOutputStream.computeMessageSize(1, getRoot());
|
||||
}
|
||||
size += getUnknownFields().getSerializedSize();
|
||||
memoizedSize = size;
|
||||
return size;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean equals(final Object obj) {
|
||||
|
||||
if (obj == this) {
|
||||
return true;
|
||||
}
|
||||
if (!(obj instanceof DocumentStructure)) {
|
||||
return super.equals(obj);
|
||||
}
|
||||
DocumentStructure other = (DocumentStructure) obj;
|
||||
|
||||
if (hasRoot() != other.hasRoot()) {
|
||||
return false;
|
||||
}
|
||||
if (hasRoot()) {
|
||||
if (!getRoot().equals(other.getRoot())) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (!getUnknownFields().equals(other.getUnknownFields())) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
|
||||
if (memoizedHashCode != 0) {
|
||||
return memoizedHashCode;
|
||||
}
|
||||
int hash = 41;
|
||||
hash = (19 * hash) + getDescriptor().hashCode();
|
||||
if (hasRoot()) {
|
||||
hash = (37 * hash) + ROOT_FIELD_NUMBER;
|
||||
hash = (53 * hash) + getRoot().hashCode();
|
||||
}
|
||||
hash = (29 * hash) + getUnknownFields().hashCode();
|
||||
memoizedHashCode = hash;
|
||||
return hash;
|
||||
}
|
||||
|
||||
|
||||
public static DocumentStructure parseFrom(java.nio.ByteBuffer data) throws com.google.protobuf.InvalidProtocolBufferException {
|
||||
|
||||
return PARSER.parseFrom(data);
|
||||
}
|
||||
|
||||
|
||||
public static DocumentStructure parseFrom(java.nio.ByteBuffer data,
|
||||
com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws com.google.protobuf.InvalidProtocolBufferException {
|
||||
|
||||
return PARSER.parseFrom(data, extensionRegistry);
|
||||
}
|
||||
|
||||
|
||||
public static DocumentStructure parseFrom(com.google.protobuf.ByteString data) throws com.google.protobuf.InvalidProtocolBufferException {
|
||||
|
||||
return PARSER.parseFrom(data);
|
||||
}
|
||||
|
||||
|
||||
public static DocumentStructure parseFrom(com.google.protobuf.ByteString data,
|
||||
com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws com.google.protobuf.InvalidProtocolBufferException {
|
||||
|
||||
return PARSER.parseFrom(data, extensionRegistry);
|
||||
}
|
||||
|
||||
|
||||
public static DocumentStructure parseFrom(byte[] data) throws com.google.protobuf.InvalidProtocolBufferException {
|
||||
|
||||
return PARSER.parseFrom(data);
|
||||
}
|
||||
|
||||
|
||||
public static DocumentStructure parseFrom(byte[] data,
|
||||
com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws com.google.protobuf.InvalidProtocolBufferException {
|
||||
|
||||
return PARSER.parseFrom(data, extensionRegistry);
|
||||
}
|
||||
|
||||
|
||||
public static DocumentStructure parseFrom(java.io.InputStream input) throws java.io.IOException {
|
||||
|
||||
return com.google.protobuf.GeneratedMessage.parseWithIOException(PARSER, input);
|
||||
}
|
||||
|
||||
|
||||
public static DocumentStructure parseFrom(java.io.InputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws java.io.IOException {
|
||||
|
||||
return com.google.protobuf.GeneratedMessage.parseWithIOException(PARSER, input, extensionRegistry);
|
||||
}
|
||||
|
||||
|
||||
public static DocumentStructure parseDelimitedFrom(java.io.InputStream input) throws java.io.IOException {
|
||||
|
||||
return com.google.protobuf.GeneratedMessage.parseDelimitedWithIOException(PARSER, input);
|
||||
}
|
||||
|
||||
|
||||
public static DocumentStructure parseDelimitedFrom(java.io.InputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws java.io.IOException {
|
||||
|
||||
return com.google.protobuf.GeneratedMessage.parseDelimitedWithIOException(PARSER, input, extensionRegistry);
|
||||
}
|
||||
|
||||
|
||||
public static DocumentStructure parseFrom(com.google.protobuf.CodedInputStream input) throws java.io.IOException {
|
||||
|
||||
return com.google.protobuf.GeneratedMessage.parseWithIOException(PARSER, input);
|
||||
}
|
||||
|
||||
|
||||
public static DocumentStructure parseFrom(com.google.protobuf.CodedInputStream input,
|
||||
com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws java.io.IOException {
|
||||
|
||||
return com.google.protobuf.GeneratedMessage.parseWithIOException(PARSER, input, extensionRegistry);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Builder newBuilderForType() {return newBuilder();}
|
||||
|
||||
|
||||
public static Builder newBuilder() {
|
||||
|
||||
return DEFAULT_INSTANCE.toBuilder();
|
||||
}
|
||||
|
||||
|
||||
public static Builder newBuilder(DocumentStructure prototype) {
|
||||
|
||||
return DEFAULT_INSTANCE.toBuilder().mergeFrom(prototype);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Builder toBuilder() {
|
||||
|
||||
return this == DEFAULT_INSTANCE ? new Builder() : new Builder().mergeFrom(this);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
protected Builder newBuilderForType(BuilderParent parent) {
|
||||
|
||||
Builder builder = new Builder(parent);
|
||||
return builder;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Protobuf type {@code DocumentStructure}
|
||||
*/
|
||||
public static final class Builder extends com.google.protobuf.GeneratedMessage.Builder<Builder> implements
|
||||
// @@protoc_insertion_point(builder_implements:DocumentStructure)
|
||||
DocumentStructureOrBuilder {
|
||||
|
||||
public static final com.google.protobuf.Descriptors.Descriptor getDescriptor() {
|
||||
|
||||
return DocumentStructureProto.internal_static_DocumentStructure_descriptor;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
protected FieldAccessorTable internalGetFieldAccessorTable() {
|
||||
|
||||
return DocumentStructureProto.internal_static_DocumentStructure_fieldAccessorTable.ensureFieldAccessorsInitialized(DocumentStructure.class, Builder.class);
|
||||
}
|
||||
|
||||
|
||||
// Construct using DocumentStructureOuterClass.DocumentStructure.newBuilder()
|
||||
private Builder() {
|
||||
|
||||
maybeForceBuilderInitialization();
|
||||
}
|
||||
|
||||
|
||||
private Builder(BuilderParent parent) {
|
||||
|
||||
super(parent);
|
||||
maybeForceBuilderInitialization();
|
||||
}
|
||||
|
||||
|
||||
private void maybeForceBuilderInitialization() {
|
||||
|
||||
if (com.google.protobuf.GeneratedMessage.alwaysUseFieldBuilders) {
|
||||
getRootFieldBuilder();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Builder clear() {
|
||||
|
||||
super.clear();
|
||||
bitField0_ = 0;
|
||||
root_ = null;
|
||||
if (rootBuilder_ != null) {
|
||||
rootBuilder_.dispose();
|
||||
rootBuilder_ = null;
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public com.google.protobuf.Descriptors.Descriptor getDescriptorForType() {
|
||||
|
||||
return DocumentStructureProto.internal_static_DocumentStructure_descriptor;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public DocumentStructure getDefaultInstanceForType() {
|
||||
|
||||
return DocumentStructure.getDefaultInstance();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public DocumentStructure build() {
|
||||
|
||||
DocumentStructure result = buildPartial();
|
||||
if (!result.isInitialized()) {
|
||||
throw newUninitializedMessageException(result);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public DocumentStructure buildPartial() {
|
||||
|
||||
DocumentStructure result = new DocumentStructure(this);
|
||||
if (bitField0_ != 0) {
|
||||
buildPartial0(result);
|
||||
}
|
||||
onBuilt();
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
private void buildPartial0(DocumentStructure result) {
|
||||
|
||||
int from_bitField0_ = bitField0_;
|
||||
int to_bitField0_ = 0;
|
||||
if (((from_bitField0_ & 0x00000001) != 0)) {
|
||||
result.root_ = rootBuilder_ == null ? root_ : rootBuilder_.build();
|
||||
to_bitField0_ |= 0x00000001;
|
||||
}
|
||||
result.bitField0_ |= to_bitField0_;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Builder mergeFrom(com.google.protobuf.Message other) {
|
||||
|
||||
if (other instanceof DocumentStructure) {
|
||||
return mergeFrom((DocumentStructure) other);
|
||||
} else {
|
||||
super.mergeFrom(other);
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public Builder mergeFrom(DocumentStructure other) {
|
||||
|
||||
if (other == DocumentStructure.getDefaultInstance()) {
|
||||
return this;
|
||||
}
|
||||
if (other.hasRoot()) {
|
||||
mergeRoot(other.getRoot());
|
||||
}
|
||||
this.mergeUnknownFields(other.getUnknownFields());
|
||||
onChanged();
|
||||
return this;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public final boolean isInitialized() {
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Builder mergeFrom(com.google.protobuf.CodedInputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws java.io.IOException {
|
||||
|
||||
if (extensionRegistry == null) {
|
||||
throw new NullPointerException();
|
||||
}
|
||||
try {
|
||||
boolean done = false;
|
||||
while (!done) {
|
||||
int tag = input.readTag();
|
||||
switch (tag) {
|
||||
case 0:
|
||||
done = true;
|
||||
break;
|
||||
case 10: {
|
||||
input.readMessage(getRootFieldBuilder().getBuilder(), extensionRegistry);
|
||||
bitField0_ |= 0x00000001;
|
||||
break;
|
||||
} // case 10
|
||||
default: {
|
||||
if (!super.parseUnknownField(input, extensionRegistry, tag)) {
|
||||
done = true; // was an endgroup tag
|
||||
}
|
||||
break;
|
||||
} // default:
|
||||
} // switch (tag)
|
||||
} // while (!done)
|
||||
} catch (com.google.protobuf.InvalidProtocolBufferException e) {
|
||||
throw e.unwrapIOException();
|
||||
} finally {
|
||||
onChanged();
|
||||
} // finally
|
||||
return this;
|
||||
}
|
||||
|
||||
|
||||
private int bitField0_;
|
||||
|
||||
private EntryDataProto.EntryData root_;
|
||||
private com.google.protobuf.SingleFieldBuilder<EntryDataProto.EntryData, EntryDataProto.EntryData.Builder, EntryDataProto.EntryDataOrBuilder> rootBuilder_;
|
||||
|
||||
|
||||
/**
|
||||
* <pre>
|
||||
* The root EntryData represents the Document.
|
||||
* </pre>
|
||||
*
|
||||
* <code>.EntryData root = 1;</code>
|
||||
*
|
||||
* @return Whether the root field is set.
|
||||
*/
|
||||
public boolean hasRoot() {
|
||||
|
||||
return ((bitField0_ & 0x00000001) != 0);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* <pre>
|
||||
* The root EntryData represents the Document.
|
||||
* </pre>
|
||||
*
|
||||
* <code>.EntryData root = 1;</code>
|
||||
*
|
||||
* @return The root.
|
||||
*/
|
||||
public EntryDataProto.EntryData getRoot() {
|
||||
|
||||
if (rootBuilder_ == null) {
|
||||
return root_ == null ? EntryDataProto.EntryData.getDefaultInstance() : root_;
|
||||
} else {
|
||||
return rootBuilder_.getMessage();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* <pre>
|
||||
* The root EntryData represents the Document.
|
||||
* </pre>
|
||||
*
|
||||
* <code>.EntryData root = 1;</code>
|
||||
*/
|
||||
public Builder setRoot(EntryDataProto.EntryData value) {
|
||||
|
||||
if (rootBuilder_ == null) {
|
||||
if (value == null) {
|
||||
throw new NullPointerException();
|
||||
}
|
||||
root_ = value;
|
||||
} else {
|
||||
rootBuilder_.setMessage(value);
|
||||
}
|
||||
bitField0_ |= 0x00000001;
|
||||
onChanged();
|
||||
return this;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* <pre>
|
||||
* The root EntryData represents the Document.
|
||||
* </pre>
|
||||
*
|
||||
* <code>.EntryData root = 1;</code>
|
||||
*/
|
||||
public Builder setRoot(EntryDataProto.EntryData.Builder builderForValue) {
|
||||
|
||||
if (rootBuilder_ == null) {
|
||||
root_ = builderForValue.build();
|
||||
} else {
|
||||
rootBuilder_.setMessage(builderForValue.build());
|
||||
}
|
||||
bitField0_ |= 0x00000001;
|
||||
onChanged();
|
||||
return this;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* <pre>
|
||||
* The root EntryData represents the Document.
|
||||
* </pre>
|
||||
*
|
||||
* <code>.EntryData root = 1;</code>
|
||||
*/
|
||||
public Builder mergeRoot(EntryDataProto.EntryData value) {
|
||||
|
||||
if (rootBuilder_ == null) {
|
||||
if (((bitField0_ & 0x00000001) != 0) && root_ != null && root_ != EntryDataProto.EntryData.getDefaultInstance()) {
|
||||
getRootBuilder().mergeFrom(value);
|
||||
} else {
|
||||
root_ = value;
|
||||
}
|
||||
} else {
|
||||
rootBuilder_.mergeFrom(value);
|
||||
}
|
||||
if (root_ != null) {
|
||||
bitField0_ |= 0x00000001;
|
||||
onChanged();
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* <pre>
|
||||
* The root EntryData represents the Document.
|
||||
* </pre>
|
||||
*
|
||||
* <code>.EntryData root = 1;</code>
|
||||
*/
|
||||
public Builder clearRoot() {
|
||||
|
||||
bitField0_ = (bitField0_ & ~0x00000001);
|
||||
root_ = null;
|
||||
if (rootBuilder_ != null) {
|
||||
rootBuilder_.dispose();
|
||||
rootBuilder_ = null;
|
||||
}
|
||||
onChanged();
|
||||
return this;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* <pre>
|
||||
* The root EntryData represents the Document.
|
||||
* </pre>
|
||||
*
|
||||
* <code>.EntryData root = 1;</code>
|
||||
*/
|
||||
public EntryDataProto.EntryData.Builder getRootBuilder() {
|
||||
|
||||
bitField0_ |= 0x00000001;
|
||||
onChanged();
|
||||
return getRootFieldBuilder().getBuilder();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* <pre>
|
||||
* The root EntryData represents the Document.
|
||||
* </pre>
|
||||
*
|
||||
* <code>.EntryData root = 1;</code>
|
||||
*/
|
||||
public EntryDataProto.EntryDataOrBuilder getRootOrBuilder() {
|
||||
|
||||
if (rootBuilder_ != null) {
|
||||
return rootBuilder_.getMessageOrBuilder();
|
||||
} else {
|
||||
return root_ == null ? EntryDataProto.EntryData.getDefaultInstance() : root_;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* <pre>
|
||||
* The root EntryData represents the Document.
|
||||
* </pre>
|
||||
*
|
||||
* <code>.EntryData root = 1;</code>
|
||||
*/
|
||||
private com.google.protobuf.SingleFieldBuilder<EntryDataProto.EntryData, EntryDataProto.EntryData.Builder, EntryDataProto.EntryDataOrBuilder> getRootFieldBuilder() {
|
||||
|
||||
if (rootBuilder_ == null) {
|
||||
rootBuilder_ = new com.google.protobuf.SingleFieldBuilder<EntryDataProto.EntryData, EntryDataProto.EntryData.Builder, EntryDataProto.EntryDataOrBuilder>(getRoot(),
|
||||
getParentForChildren(),
|
||||
isClean());
|
||||
root_ = null;
|
||||
}
|
||||
return rootBuilder_;
|
||||
}
|
||||
|
||||
// @@protoc_insertion_point(builder_scope:DocumentStructure)
|
||||
}
|
||||
|
||||
// @@protoc_insertion_point(class_scope:DocumentStructure)
|
||||
private static final DocumentStructure DEFAULT_INSTANCE;
|
||||
|
||||
static {
|
||||
DEFAULT_INSTANCE = new DocumentStructure();
|
||||
}
|
||||
|
||||
public static DocumentStructure getDefaultInstance() {
|
||||
|
||||
return DEFAULT_INSTANCE;
|
||||
}
|
||||
|
||||
|
||||
private static final com.google.protobuf.Parser<DocumentStructure> PARSER = new com.google.protobuf.AbstractParser<DocumentStructure>() {
|
||||
@Override
|
||||
public DocumentStructure parsePartialFrom(com.google.protobuf.CodedInputStream input,
|
||||
com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws com.google.protobuf.InvalidProtocolBufferException {
|
||||
|
||||
Builder builder = newBuilder();
|
||||
try {
|
||||
builder.mergeFrom(input, extensionRegistry);
|
||||
} catch (com.google.protobuf.InvalidProtocolBufferException e) {
|
||||
throw e.setUnfinishedMessage(builder.buildPartial());
|
||||
} catch (com.google.protobuf.UninitializedMessageException e) {
|
||||
throw e.asInvalidProtocolBufferException().setUnfinishedMessage(builder.buildPartial());
|
||||
} catch (java.io.IOException e) {
|
||||
throw new com.google.protobuf.InvalidProtocolBufferException(e).setUnfinishedMessage(builder.buildPartial());
|
||||
}
|
||||
return builder.buildPartial();
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
public static com.google.protobuf.Parser<DocumentStructure> parser() {
|
||||
|
||||
return PARSER;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public com.google.protobuf.Parser<DocumentStructure> getParserForType() {
|
||||
|
||||
return PARSER;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public DocumentStructure getDefaultInstanceForType() {
|
||||
|
||||
return DEFAULT_INSTANCE;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static final com.google.protobuf.Descriptors.Descriptor internal_static_DocumentStructure_descriptor;
|
||||
private static final com.google.protobuf.GeneratedMessage.FieldAccessorTable internal_static_DocumentStructure_fieldAccessorTable;
|
||||
|
||||
|
||||
public static com.google.protobuf.Descriptors.FileDescriptor getDescriptor() {
|
||||
|
||||
return descriptor;
|
||||
}
|
||||
|
||||
|
||||
private static com.google.protobuf.Descriptors.FileDescriptor descriptor;
|
||||
|
||||
static {
|
||||
String[] descriptorData = {"\n\027DocumentStructure.proto\032\017EntryData.pro"
|
||||
+ "to\"-\n\021DocumentStructure\022\030\n\004root\030\001 \001(\0132\n."
|
||||
+ "EntryDatab\006proto3"};
|
||||
descriptor = com.google.protobuf.Descriptors.FileDescriptor.internalBuildGeneratedFileFrom(descriptorData,
|
||||
new com.google.protobuf.Descriptors.FileDescriptor[]{EntryDataProto.getDescriptor(),});
|
||||
internal_static_DocumentStructure_descriptor = getDescriptor().getMessageTypes()
|
||||
.get(0);
|
||||
internal_static_DocumentStructure_fieldAccessorTable = new com.google.protobuf.GeneratedMessage.FieldAccessorTable(internal_static_DocumentStructure_descriptor,
|
||||
new String[]{"Root",});
|
||||
descriptor.resolveAllFeaturesImmutable();
|
||||
EntryDataProto.getDescriptor();
|
||||
}
|
||||
|
||||
// @@protoc_insertion_point(outer_class_scope)
|
||||
}
|
||||
@ -0,0 +1,126 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||
|
||||
import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureProto.DocumentStructure;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.ObjectStreamException;
|
||||
import java.io.Serializable;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.EntryDataProto.EntryData;
|
||||
|
||||
import io.swagger.v3.oas.annotations.media.Schema;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
|
||||
@Getter
|
||||
@AllArgsConstructor
|
||||
public class DocumentStructureWrapper implements Serializable {
|
||||
|
||||
private final DocumentStructure documentStructure;
|
||||
|
||||
|
||||
/**
 * Holder for the names of the additional entries a table carries in its properties field.
 * The class declares only string constants and no instance state.
 */
@Schema(description = "Object containing the extra field names, a table has in its properties field.")
|
||||
public static class TableProperties implements Serializable {
|
||||
|
||||
// Property name "numberOfRows" (presumably the table's row count - confirm against the producer).
public static final String NUMBER_OF_ROWS = "numberOfRows";
|
||||
// Property name "numberOfCols" (presumably the table's column count - confirm against the producer).
public static final String NUMBER_OF_COLS = "numberOfCols";
|
||||
|
||||
}
|
||||
|
||||
/**
 * Holder for the names of the additional entries an image carries in its properties field.
 * The class declares only string constants and no instance state.
 */
@Schema(description = "Object containing the extra field names, an Image has in its properties field.")
|
||||
public static class ImageProperties implements Serializable {
|
||||
|
||||
// Property name "transparent".
public static final String TRANSPARENT = "transparent";
|
||||
// Property name "imageType".
public static final String IMAGE_TYPE = "imageType";
|
||||
// Property name "position".
public static final String POSITION = "position";
|
||||
// Property name "id".
public static final String ID = "id";
|
||||
|
||||
// Property name "representationHash".
// NOTE(review): the value is presumably the string consumed by parseRepresentationVector - confirm.
public static final String REPRESENTATION_HASH = "representationHash";
|
||||
|
||||
}
|
||||
|
||||
/**
 * Holder for the names of the additional entries a table cell carries in its properties field.
 * The class declares only string constants and no instance state.
 */
@Schema(description = "Object containing the extra field names, a table cell has in its properties field.")
|
||||
public static class TableCellProperties implements Serializable {
|
||||
|
||||
// Property name "bBox".
// NOTE(review): the value is presumably the delimiter-separated string consumed by parseRectangle2D - confirm.
public static final String B_BOX = "bBox";
|
||||
// Property name "row".
public static final String ROW = "row";
|
||||
// Property name "col".
public static final String COL = "col";
|
||||
// Property name "header".
public static final String HEADER = "header";
|
||||
|
||||
}
|
||||
|
||||
/**
 * Holder for the names of the additional entries a duplicate paragraph carries in its properties field.
 * The class declares only string constants and no instance state.
 */
@Schema(description = "Object containing the extra field names, a duplicate paragraph has in its properties field.")
|
||||
public static class DuplicateParagraphProperties implements Serializable {
|
||||
|
||||
// Property name "utbid" (abbreviation of the constant's name, "unsorted textblock id").
public static final String UNSORTED_TEXTBLOCK_ID = "utbid";
|
||||
|
||||
}
|
||||
|
||||
public static final String RECTANGLE_DELIMITER = ";";
|
||||
|
||||
|
||||
public static Rectangle2D parseRectangle2D(String bBox) {
|
||||
|
||||
List<Float> floats = Arrays.stream(bBox.split(RECTANGLE_DELIMITER))
|
||||
.map(Float::parseFloat)
|
||||
.toList();
|
||||
return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
|
||||
}
|
||||
|
||||
|
||||
public static double[] parseRepresentationVector(String representationHash) {
|
||||
|
||||
String[] stringArray = representationHash.split("[,\\s]+");
|
||||
double[] doubleArray = new double[stringArray.length];
|
||||
for (int i = 0; i < stringArray.length; i++) {
|
||||
doubleArray[i] = Double.parseDouble(stringArray[i]);
|
||||
}
|
||||
|
||||
return doubleArray;
|
||||
}
|
||||
|
||||
|
||||
public EntryData get(List<Integer> tocId) {
|
||||
|
||||
if (tocId.isEmpty()) {
|
||||
return documentStructure.getRoot();
|
||||
}
|
||||
EntryData entry = documentStructure.getRoot().getChildrenList()
|
||||
.get(tocId.get(0));
|
||||
for (int id : tocId.subList(1, tocId.size())) {
|
||||
entry = entry.getChildrenList()
|
||||
.get(id);
|
||||
}
|
||||
return entry;
|
||||
}
|
||||
|
||||
|
||||
public Stream<EntryData> streamAllEntries() {
|
||||
|
||||
return Stream.concat(Stream.of(documentStructure.getRoot()),
|
||||
documentStructure.getRoot().getChildrenList()
|
||||
.stream())
|
||||
.flatMap(DocumentStructureWrapper::flatten);
|
||||
}
|
||||
|
||||
|
||||
public String toString() {
|
||||
|
||||
return String.join("\n",
|
||||
streamAllEntries().map(EntryData::toString)
|
||||
.toList());
|
||||
}
|
||||
|
||||
|
||||
private static Stream<EntryData> flatten(EntryData entry) {
|
||||
|
||||
return Stream.concat(Stream.of(entry),
|
||||
entry.getChildrenList()
|
||||
.stream()
|
||||
.flatMap(DocumentStructureWrapper::flatten));
|
||||
}
|
||||
|
||||
}
|
||||
@ -10,6 +10,7 @@ import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Deprecated
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,5 +1,6 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||
|
||||
@Deprecated
|
||||
public enum LayoutEngine {
|
||||
ALGORITHM,
|
||||
AI,
|
||||
|
||||
@ -0,0 +1,193 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||
|
||||
// Generated by the protocol buffer compiler. DO NOT EDIT!
|
||||
// NO CHECKED-IN PROTOBUF GENCODE
|
||||
// source: LayoutEngine.proto
|
||||
// Protobuf Java Version: 4.27.1
|
||||
@SuppressWarnings("all")
|
||||
public final class LayoutEngineProto {
|
||||
|
||||
private LayoutEngineProto() {}
|
||||
|
||||
|
||||
static {
|
||||
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
|
||||
/* major= */ 4,
|
||||
/* minor= */ 27,
|
||||
/* patch= */ 1,
|
||||
/* suffix= */ "", LayoutEngineProto.class.getName());
|
||||
}
|
||||
|
||||
public static void registerAllExtensions(com.google.protobuf.ExtensionRegistryLite registry) {
|
||||
|
||||
}
|
||||
|
||||
|
||||
public static void registerAllExtensions(com.google.protobuf.ExtensionRegistry registry) {
|
||||
|
||||
registerAllExtensions((com.google.protobuf.ExtensionRegistryLite) registry);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Protobuf enum {@code LayoutEngine}
|
||||
*/
|
||||
public enum LayoutEngine implements com.google.protobuf.ProtocolMessageEnum {
|
||||
/**
|
||||
* <code>ALGORITHM = 0;</code>
|
||||
*/
|
||||
ALGORITHM(0),
|
||||
/**
|
||||
* <code>AI = 1;</code>
|
||||
*/
|
||||
AI(1),
|
||||
/**
|
||||
* <code>OUTLINE = 2;</code>
|
||||
*/
|
||||
OUTLINE(2),
|
||||
UNRECOGNIZED(-1),
|
||||
;
|
||||
|
||||
|
||||
static {
|
||||
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
|
||||
/* major= */ 4,
|
||||
/* minor= */ 27,
|
||||
/* patch= */ 1,
|
||||
/* suffix= */ "", LayoutEngine.class.getName());
|
||||
}
|
||||
|
||||
/**
|
||||
* <code>ALGORITHM = 0;</code>
|
||||
*/
|
||||
public static final int ALGORITHM_VALUE = 0;
|
||||
/**
|
||||
* <code>AI = 1;</code>
|
||||
*/
|
||||
public static final int AI_VALUE = 1;
|
||||
/**
|
||||
* <code>OUTLINE = 2;</code>
|
||||
*/
|
||||
public static final int OUTLINE_VALUE = 2;
|
||||
|
||||
|
||||
public final int getNumber() {
|
||||
|
||||
if (this == UNRECOGNIZED) {
|
||||
throw new IllegalArgumentException("Can't get the number of an unknown enum value.");
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @param value The numeric wire value of the corresponding enum entry.
|
||||
* @return The enum associated with the given numeric wire value.
|
||||
* @deprecated Use {@link #forNumber(int)} instead.
|
||||
*/
|
||||
@Deprecated
|
||||
public static LayoutEngine valueOf(int value) {
|
||||
|
||||
return forNumber(value);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @param value The numeric wire value of the corresponding enum entry.
|
||||
* @return The enum associated with the given numeric wire value.
|
||||
*/
|
||||
public static LayoutEngine forNumber(int value) {
|
||||
|
||||
switch (value) {
|
||||
case 0:
|
||||
return ALGORITHM;
|
||||
case 1:
|
||||
return AI;
|
||||
case 2:
|
||||
return OUTLINE;
|
||||
default:
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static com.google.protobuf.Internal.EnumLiteMap<LayoutEngine> internalGetValueMap() {
|
||||
|
||||
return internalValueMap;
|
||||
}
|
||||
|
||||
|
||||
private static final com.google.protobuf.Internal.EnumLiteMap<LayoutEngine> internalValueMap = new com.google.protobuf.Internal.EnumLiteMap<LayoutEngine>() {
|
||||
public LayoutEngine findValueByNumber(int number) {
|
||||
|
||||
return LayoutEngine.forNumber(number);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
public final com.google.protobuf.Descriptors.EnumValueDescriptor getValueDescriptor() {
|
||||
|
||||
if (this == UNRECOGNIZED) {
|
||||
throw new IllegalStateException("Can't get the descriptor of an unrecognized enum value.");
|
||||
}
|
||||
return getDescriptor().getValues()
|
||||
.get(ordinal());
|
||||
}
|
||||
|
||||
|
||||
public final com.google.protobuf.Descriptors.EnumDescriptor getDescriptorForType() {
|
||||
|
||||
return getDescriptor();
|
||||
}
|
||||
|
||||
|
||||
public static final com.google.protobuf.Descriptors.EnumDescriptor getDescriptor() {
|
||||
|
||||
return LayoutEngineProto.getDescriptor().getEnumTypes()
|
||||
.get(0);
|
||||
}
|
||||
|
||||
|
||||
private static final LayoutEngine[] VALUES = values();
|
||||
|
||||
|
||||
public static LayoutEngine valueOf(com.google.protobuf.Descriptors.EnumValueDescriptor desc) {
|
||||
|
||||
if (desc.getType() != getDescriptor()) {
|
||||
throw new IllegalArgumentException("EnumValueDescriptor is not for this type.");
|
||||
}
|
||||
if (desc.getIndex() == -1) {
|
||||
return UNRECOGNIZED;
|
||||
}
|
||||
return VALUES[desc.getIndex()];
|
||||
}
|
||||
|
||||
|
||||
private final int value;
|
||||
|
||||
|
||||
private LayoutEngine(int value) {
|
||||
|
||||
this.value = value;
|
||||
}
|
||||
|
||||
// @@protoc_insertion_point(enum_scope:LayoutEngine)
|
||||
}
|
||||
|
||||
|
||||
public static com.google.protobuf.Descriptors.FileDescriptor getDescriptor() {
|
||||
|
||||
return descriptor;
|
||||
}
|
||||
|
||||
|
||||
private static com.google.protobuf.Descriptors.FileDescriptor descriptor;
|
||||
|
||||
static {
|
||||
String[] descriptorData = {"\n\022LayoutEngine.proto*2\n\014LayoutEngine\022\r\n\t" + "ALGORITHM\020\000\022\006\n\002AI\020\001\022\013\n\007OUTLINE\020\002b\006proto3"};
|
||||
descriptor = com.google.protobuf.Descriptors.FileDescriptor.internalBuildGeneratedFileFrom(descriptorData, new com.google.protobuf.Descriptors.FileDescriptor[]{});
|
||||
descriptor.resolveAllFeaturesImmutable();
|
||||
}
|
||||
|
||||
// @@protoc_insertion_point(outer_class_scope)
|
||||
}
|
||||
@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||
import java.io.Serializable;
|
||||
import java.util.Locale;
|
||||
|
||||
@Deprecated
|
||||
public enum NodeType implements Serializable {
|
||||
DOCUMENT,
|
||||
SECTION,
|
||||
|
||||
@ -0,0 +1,274 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||
|
||||
import java.util.Locale;
|
||||
// Generated by the protocol buffer compiler. DO NOT EDIT!
|
||||
// NO CHECKED-IN PROTOBUF GENCODE
|
||||
// source: NodeType.proto
|
||||
// Protobuf Java Version: 4.27.1
|
||||
@SuppressWarnings("all")
|
||||
public final class NodeTypeProto {
|
||||
|
||||
private NodeTypeProto() {}
|
||||
|
||||
|
||||
static {
|
||||
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
|
||||
/* major= */ 4,
|
||||
/* minor= */ 27,
|
||||
/* patch= */ 1,
|
||||
/* suffix= */ "", NodeTypeProto.class.getName());
|
||||
}
|
||||
|
||||
public static void registerAllExtensions(com.google.protobuf.ExtensionRegistryLite registry) {
|
||||
|
||||
}
|
||||
|
||||
|
||||
public static void registerAllExtensions(com.google.protobuf.ExtensionRegistry registry) {
|
||||
|
||||
registerAllExtensions((com.google.protobuf.ExtensionRegistryLite) registry);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Protobuf enum {@code NodeType}
|
||||
*/
|
||||
public enum NodeType implements com.google.protobuf.ProtocolMessageEnum {
|
||||
/**
|
||||
* <code>DOCUMENT = 0;</code>
|
||||
*/
|
||||
DOCUMENT(0),
|
||||
/**
|
||||
* <code>SECTION = 1;</code>
|
||||
*/
|
||||
SECTION(1),
|
||||
/**
|
||||
* <code>SUPER_SECTION = 2;</code>
|
||||
*/
|
||||
SUPER_SECTION(2),
|
||||
/**
|
||||
* <code>HEADLINE = 3;</code>
|
||||
*/
|
||||
HEADLINE(3),
|
||||
/**
|
||||
* <code>PARAGRAPH = 4;</code>
|
||||
*/
|
||||
PARAGRAPH(4),
|
||||
/**
|
||||
* <code>TABLE = 5;</code>
|
||||
*/
|
||||
TABLE(5),
|
||||
/**
|
||||
* <code>TABLE_CELL = 6;</code>
|
||||
*/
|
||||
TABLE_CELL(6),
|
||||
/**
|
||||
* <code>IMAGE = 7;</code>
|
||||
*/
|
||||
IMAGE(7),
|
||||
/**
|
||||
* <code>HEADER = 8;</code>
|
||||
*/
|
||||
HEADER(8),
|
||||
/**
|
||||
* <code>FOOTER = 9;</code>
|
||||
*/
|
||||
FOOTER(9),
|
||||
UNRECOGNIZED(-1),
|
||||
;
|
||||
|
||||
|
||||
public String toString() {
|
||||
|
||||
return this.name().charAt(0) + this.name().substring(1).toLowerCase(Locale.ROOT);
|
||||
}
|
||||
|
||||
|
||||
static {
|
||||
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
|
||||
/* major= */ 4,
|
||||
/* minor= */ 27,
|
||||
/* patch= */ 1,
|
||||
/* suffix= */ "", NodeType.class.getName());
|
||||
}
|
||||
|
||||
/**
|
||||
* <code>DOCUMENT = 0;</code>
|
||||
*/
|
||||
public static final int DOCUMENT_VALUE = 0;
|
||||
/**
|
||||
* <code>SECTION = 1;</code>
|
||||
*/
|
||||
public static final int SECTION_VALUE = 1;
|
||||
/**
|
||||
* <code>SUPER_SECTION = 2;</code>
|
||||
*/
|
||||
public static final int SUPER_SECTION_VALUE = 2;
|
||||
/**
|
||||
* <code>HEADLINE = 3;</code>
|
||||
*/
|
||||
public static final int HEADLINE_VALUE = 3;
|
||||
/**
|
||||
* <code>PARAGRAPH = 4;</code>
|
||||
*/
|
||||
public static final int PARAGRAPH_VALUE = 4;
|
||||
/**
|
||||
* <code>TABLE = 5;</code>
|
||||
*/
|
||||
public static final int TABLE_VALUE = 5;
|
||||
/**
|
||||
* <code>TABLE_CELL = 6;</code>
|
||||
*/
|
||||
public static final int TABLE_CELL_VALUE = 6;
|
||||
/**
|
||||
* <code>IMAGE = 7;</code>
|
||||
*/
|
||||
public static final int IMAGE_VALUE = 7;
|
||||
/**
|
||||
* <code>HEADER = 8;</code>
|
||||
*/
|
||||
public static final int HEADER_VALUE = 8;
|
||||
/**
|
||||
* <code>FOOTER = 9;</code>
|
||||
*/
|
||||
public static final int FOOTER_VALUE = 9;
|
||||
|
||||
|
||||
public final int getNumber() {
|
||||
|
||||
if (this == UNRECOGNIZED) {
|
||||
throw new IllegalArgumentException("Can't get the number of an unknown enum value.");
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @param value The numeric wire value of the corresponding enum entry.
|
||||
* @return The enum associated with the given numeric wire value.
|
||||
* @deprecated Use {@link #forNumber(int)} instead.
|
||||
*/
|
||||
@Deprecated
|
||||
public static NodeType valueOf(int value) {
|
||||
|
||||
return forNumber(value);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @param value The numeric wire value of the corresponding enum entry.
|
||||
* @return The enum associated with the given numeric wire value.
|
||||
*/
|
||||
public static NodeType forNumber(int value) {
|
||||
|
||||
switch (value) {
|
||||
case 0:
|
||||
return DOCUMENT;
|
||||
case 1:
|
||||
return SECTION;
|
||||
case 2:
|
||||
return SUPER_SECTION;
|
||||
case 3:
|
||||
return HEADLINE;
|
||||
case 4:
|
||||
return PARAGRAPH;
|
||||
case 5:
|
||||
return TABLE;
|
||||
case 6:
|
||||
return TABLE_CELL;
|
||||
case 7:
|
||||
return IMAGE;
|
||||
case 8:
|
||||
return HEADER;
|
||||
case 9:
|
||||
return FOOTER;
|
||||
default:
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static com.google.protobuf.Internal.EnumLiteMap<NodeType> internalGetValueMap() {
|
||||
|
||||
return internalValueMap;
|
||||
}
|
||||
|
||||
|
||||
private static final com.google.protobuf.Internal.EnumLiteMap<NodeType> internalValueMap = new com.google.protobuf.Internal.EnumLiteMap<NodeType>() {
|
||||
public NodeType findValueByNumber(int number) {
|
||||
|
||||
return NodeType.forNumber(number);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
public final com.google.protobuf.Descriptors.EnumValueDescriptor getValueDescriptor() {
|
||||
|
||||
if (this == UNRECOGNIZED) {
|
||||
throw new IllegalStateException("Can't get the descriptor of an unrecognized enum value.");
|
||||
}
|
||||
return getDescriptor().getValues()
|
||||
.get(ordinal());
|
||||
}
|
||||
|
||||
|
||||
public final com.google.protobuf.Descriptors.EnumDescriptor getDescriptorForType() {
|
||||
|
||||
return getDescriptor();
|
||||
}
|
||||
|
||||
|
||||
public static final com.google.protobuf.Descriptors.EnumDescriptor getDescriptor() {
|
||||
|
||||
return NodeTypeProto.getDescriptor().getEnumTypes()
|
||||
.get(0);
|
||||
}
|
||||
|
||||
|
||||
private static final NodeType[] VALUES = values();
|
||||
|
||||
|
||||
public static NodeType valueOf(com.google.protobuf.Descriptors.EnumValueDescriptor desc) {
|
||||
|
||||
if (desc.getType() != getDescriptor()) {
|
||||
throw new IllegalArgumentException("EnumValueDescriptor is not for this type.");
|
||||
}
|
||||
if (desc.getIndex() == -1) {
|
||||
return UNRECOGNIZED;
|
||||
}
|
||||
return VALUES[desc.getIndex()];
|
||||
}
|
||||
|
||||
|
||||
private final int value;
|
||||
|
||||
|
||||
private NodeType(int value) {
|
||||
|
||||
this.value = value;
|
||||
}
|
||||
|
||||
// @@protoc_insertion_point(enum_scope:NodeType)
|
||||
}
|
||||
|
||||
|
||||
public static com.google.protobuf.Descriptors.FileDescriptor getDescriptor() {
|
||||
|
||||
return descriptor;
|
||||
}
|
||||
|
||||
|
||||
private static com.google.protobuf.Descriptors.FileDescriptor descriptor;
|
||||
|
||||
static {
|
||||
String[] descriptorData = {"\n\016NodeType.proto*\223\001\n\010NodeType\022\014\n\010DOCUMEN"
|
||||
+ "T\020\000\022\013\n\007SECTION\020\001\022\021\n\rSUPER_SECTION\020\002\022\014\n\010H"
|
||||
+ "EADLINE\020\003\022\r\n\tPARAGRAPH\020\004\022\t\n\005TABLE\020\005\022\016\n\nT"
|
||||
+ "ABLE_CELL\020\006\022\t\n\005IMAGE\020\007\022\n\n\006HEADER\020\010\022\n\n\006FO"
|
||||
+ "OTER\020\tb\006proto3"};
|
||||
descriptor = com.google.protobuf.Descriptors.FileDescriptor.internalBuildGeneratedFileFrom(descriptorData, new com.google.protobuf.Descriptors.FileDescriptor[]{});
|
||||
descriptor.resolveAllFeaturesImmutable();
|
||||
}
|
||||
|
||||
// @@protoc_insertion_point(outer_class_scope)
|
||||
}
|
||||
@ -0,0 +1,21 @@
|
||||
syntax = "proto3";
|
||||
|
||||
message AllDocumentPages {
|
||||
|
||||
repeated DocumentPage documentPages = 1;
|
||||
}
|
||||
|
||||
message DocumentPage {
|
||||
// The page number, starting with 1.
|
||||
int32 number = 1;
|
||||
|
||||
// The page height in PDF user units.
|
||||
int32 height = 2;
|
||||
|
||||
// The page width in PDF user units.
|
||||
int32 width = 3;
|
||||
|
||||
// The page rotation as specified by the PDF.
|
||||
int32 rotation = 4;
|
||||
}
|
||||
|
||||
@ -0,0 +1,25 @@
|
||||
syntax = "proto3";
|
||||
|
||||
message AllDocumentPositionData {
|
||||
|
||||
repeated DocumentPositionData documentPositionData = 1;
|
||||
}
|
||||
|
||||
message DocumentPositionData {
|
||||
// Identifier of the text block.
|
||||
int64 id = 1;
|
||||
|
||||
// For each string coordinate in the search text of the text block, the array contains an entry relating the string coordinate to the position coordinate.
|
||||
// This is required due to the text and position coordinates not being equal.
|
||||
repeated int32 stringIdxToPositionIdx = 2;
|
||||
|
||||
// The bounding box for each glyph as a rectangle. This matrix is of size (n,4), where n is the number of glyphs in the text block.
|
||||
// The second dimension specifies the rectangle with the value x, y, width, height, with x, y specifying the lower left corner.
|
||||
// In order to access this information, the stringIdxToPositionIdx array must be used to transform the coordinates.
|
||||
repeated Position positions = 3;
|
||||
|
||||
// Definition of a BoundingBox that contains x, y, width, and height.
|
||||
message Position {
|
||||
repeated float value = 1;
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,8 @@
|
||||
syntax = "proto3";
|
||||
|
||||
import "EntryData.proto";
|
||||
|
||||
message DocumentStructure {
|
||||
// The root EntryData represents the Document.
|
||||
EntryData root = 1;
|
||||
}
|
||||
@ -0,0 +1,29 @@
|
||||
syntax = "proto3";
|
||||
|
||||
message AllDocumentTextData {
|
||||
|
||||
repeated DocumentTextData documentTextData = 1;
|
||||
}
|
||||
|
||||
message DocumentTextData {
|
||||
// Identifier of the text block.
|
||||
int64 id = 1;
|
||||
|
||||
// The page the text block occurs on.
|
||||
int64 page = 2;
|
||||
|
||||
// The text of the text block.
|
||||
string searchText = 3;
|
||||
|
||||
// Each text block is assigned a number on a page, starting from 0.
|
||||
int32 numberOnPage = 4;
|
||||
|
||||
// The text blocks are ordered, this number represents the start of the text block as a string offset.
|
||||
int32 start = 5;
|
||||
|
||||
// The text blocks are ordered, this number represents the end of the text block as a string offset.
|
||||
int32 end = 6;
|
||||
|
||||
// The line breaks in the text of this semantic node, given as string offsets. Each offset is an exclusive end. At the end of each semantic node there is an implicit line break.
|
||||
repeated int32 lineBreaks = 7;
|
||||
}
|
||||
@ -0,0 +1,27 @@
|
||||
syntax = "proto3";
|
||||
|
||||
import "LayoutEngine.proto";
|
||||
import "NodeType.proto";
|
||||
|
||||
message EntryData {
|
||||
// Type of the semantic node.
|
||||
NodeType type = 1;
|
||||
|
||||
// Specifies the position in the parsed tree structure.
|
||||
repeated int32 treeId = 2;
|
||||
|
||||
// Specifies the text block IDs associated with this semantic node.
|
||||
repeated int64 atomicBlockIds = 3;
|
||||
|
||||
// Specifies the pages this semantic node appears on.
|
||||
repeated int64 pageNumbers = 4;
|
||||
|
||||
// Some semantic nodes have additional information, this information is stored in this Map.
|
||||
map<string, string> properties = 5;
|
||||
|
||||
// All child Entries of this Entry.
|
||||
repeated EntryData children = 6;
|
||||
|
||||
// Describes the origin of the semantic node.
|
||||
repeated LayoutEngine engines = 7;
|
||||
}
|
||||
@ -0,0 +1,7 @@
|
||||
syntax = "proto3";
|
||||
|
||||
enum LayoutEngine {
|
||||
ALGORITHM = 0;
|
||||
AI = 1;
|
||||
OUTLINE = 2;
|
||||
}
|
||||
@ -0,0 +1,14 @@
|
||||
syntax = "proto3";
|
||||
|
||||
enum NodeType {
|
||||
DOCUMENT = 0;
|
||||
SECTION = 1;
|
||||
SUPER_SECTION = 2;
|
||||
HEADLINE = 3;
|
||||
PARAGRAPH = 4;
|
||||
TABLE = 5;
|
||||
TABLE_CELL = 6;
|
||||
IMAGE = 7;
|
||||
HEADER = 8;
|
||||
FOOTER = 9;
|
||||
}
|
||||
@ -16,8 +16,10 @@ dependencies {
|
||||
exclude("org.springframework.boot", "spring-boot-starter-security")
|
||||
exclude("org.springframework.boot", "spring-boot-starter-validation")
|
||||
}
|
||||
implementation("com.knecon.fforesight:tenant-commons:0.30.0")
|
||||
implementation("com.iqser.red.commons:storage-commons:2.45.0")
|
||||
implementation("com.knecon.fforesight:tenant-commons:0.30.0") {
|
||||
exclude("com.iqser.red.commons", "storage-commons")
|
||||
}
|
||||
implementation("com.iqser.red.commons:storage-commons:2.50.0")
|
||||
|
||||
implementation("org.apache.pdfbox:pdfbox:${pdfBoxVersion}")
|
||||
implementation("org.apache.pdfbox:pdfbox-tools:${pdfBoxVersion}")
|
||||
@ -34,4 +36,5 @@ dependencies {
|
||||
implementation("com.pdftron:PDFNet:10.11.0")
|
||||
implementation("org.apache.commons:commons-text:1.12.0")
|
||||
|
||||
implementation("com.google.protobuf:protobuf-java-util:4.27.1")
|
||||
}
|
||||
|
||||
@ -20,7 +20,7 @@ import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
|
||||
@ -11,7 +11,9 @@ import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.util.Optional;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
|
||||
import org.springframework.core.task.TaskExecutor;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
@ -39,6 +41,8 @@ public class LayoutParsingStorageService {
|
||||
private final StorageService storageService;
|
||||
private final ObjectMapper objectMapper;
|
||||
|
||||
private final TaskExecutor taskExecutor;
|
||||
|
||||
|
||||
@Observed(name = "LayoutParsingStorageService", contextualName = "get-origin-file")
|
||||
public File getOriginFile(String storageId) throws IOException {
|
||||
@ -100,13 +104,35 @@ public class LayoutParsingStorageService {
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@Observed(name = "LayoutParsingStorageService", contextualName = "store-document-data")
|
||||
public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, DocumentData documentData) {
|
||||
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.structureFileStorageId(), documentData.getDocumentStructure());
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.textBlockFileStorageId(), documentData.getDocumentTextData());
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.positionBlockFileStorageId(), documentData.getDocumentPositions());
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.pageFileStorageId(), documentData.getDocumentPages());
|
||||
Runnable storeDocumentStructureRunnable = () -> storageService.storeProtoObject(TenantContext.getTenantId(),
|
||||
layoutParsingRequest.structureFileStorageId(),
|
||||
documentData.getDocumentStructure());
|
||||
|
||||
CompletableFuture<Void> storeDocumentStructureFuture = CompletableFuture.runAsync(storeDocumentStructureRunnable, taskExecutor);
|
||||
|
||||
Runnable storeDocumentTextDataRunnable = () -> storageService.storeProtoObject(TenantContext.getTenantId(),
|
||||
layoutParsingRequest.textBlockFileStorageId(),
|
||||
documentData.getDocumentTextData());
|
||||
|
||||
CompletableFuture<Void> storeDocumentTextDataFuture = CompletableFuture.runAsync(storeDocumentTextDataRunnable, taskExecutor);
|
||||
|
||||
Runnable storeDocumentPositionsRunnable = () -> storageService.storeProtoObject(TenantContext.getTenantId(),
|
||||
layoutParsingRequest.positionBlockFileStorageId(),
|
||||
documentData.getDocumentPositions());
|
||||
|
||||
CompletableFuture<Void> storeDocumentPositionsFuture = CompletableFuture.runAsync(storeDocumentPositionsRunnable, taskExecutor);
|
||||
|
||||
Runnable storeDocumentPagesRunnable = () -> storageService.storeProtoObject(TenantContext.getTenantId(),
|
||||
layoutParsingRequest.pageFileStorageId(),
|
||||
documentData.getDocumentPages());
|
||||
|
||||
CompletableFuture<Void> storeDocumentPagesFuture = CompletableFuture.runAsync(storeDocumentPagesRunnable, taskExecutor);
|
||||
|
||||
CompletableFuture.allOf(storeDocumentStructureFuture, storeDocumentTextDataFuture, storeDocumentPositionsFuture, storeDocumentPagesFuture).join();
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -4,7 +4,7 @@ import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
|
||||
@ -8,7 +8,7 @@ import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||
|
||||
@ -6,7 +6,7 @@ import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.TextEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
|
||||
@ -9,7 +9,7 @@ import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
|
||||
|
||||
@ -6,7 +6,7 @@ import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
|
||||
@ -12,8 +12,8 @@ import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.EntityType;
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
|
||||
@ -12,8 +12,8 @@ import java.util.Set;
|
||||
import java.util.stream.IntStream;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.TextEntity;
|
||||
|
||||
@ -4,7 +4,7 @@ import java.awt.geom.Rectangle2D;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock;
|
||||
|
||||
import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto.DocumentTextData;
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
@ -13,8 +14,8 @@ import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto.DocumentPositionData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto.DocumentPositionData.Position;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||
@ -153,11 +154,9 @@ public class AtomicTextBlock implements TextBlock {
|
||||
.page(page)
|
||||
.textRange(new TextRange(documentTextData.getStart(), documentTextData.getEnd()))
|
||||
.searchText(documentTextData.getSearchText())
|
||||
.lineBreaks(Arrays.stream(documentTextData.getLineBreaks()).boxed()
|
||||
.toList())
|
||||
.stringIdxToPositionIdx(Arrays.stream(documentPositionData.getStringIdxToPositionIdx()).boxed()
|
||||
.toList())
|
||||
.positions(toRectangle2DList(documentPositionData.getPositions()))
|
||||
.lineBreaks(documentTextData.getLineBreaksList())
|
||||
.stringIdxToPositionIdx(documentPositionData.getStringIdxToPositionIdxList())
|
||||
.positions(toRectangle2DList(documentPositionData.getPositionsList()))
|
||||
.parent(parent)
|
||||
.build();
|
||||
}
|
||||
@ -171,6 +170,14 @@ public class AtomicTextBlock implements TextBlock {
|
||||
}
|
||||
|
||||
|
||||
private static List<Rectangle2D> toRectangle2DList(List<Position> positions) {
|
||||
|
||||
return positions.stream()
|
||||
.map(pos -> (Rectangle2D) new Rectangle2D.Float(pos.getValue(0), pos.getValue(1), pos.getValue(2), pos.getValue(3)))
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
public CharSequence getLine(int lineNumber) {
|
||||
|
||||
if (lineNumber >= numberOfLines() || lineNumber < 0) {
|
||||
|
||||
@ -11,7 +11,7 @@ import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.text.similarity.LevenshteinDistance;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentifier;
|
||||
|
||||
@ -1,6 +1,10 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.mapper;
|
||||
|
||||
import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto.AllDocumentTextData;
|
||||
import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto.DocumentTextData;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
@ -9,11 +13,15 @@ import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPageProto.AllDocumentPages;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPageProto.DocumentPage;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto.AllDocumentPositionData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto.DocumentPositionData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto.DocumentPositionData.Position;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureProto.DocumentStructure;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureWrapper;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.EntryDataProto.EntryData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
|
||||
@ -32,44 +40,57 @@ public class DocumentDataMapper {
|
||||
public DocumentData toDocumentData(Document document) {
|
||||
|
||||
List<DocumentTextData> documentTextData = document.streamTerminalTextBlocksInOrder()
|
||||
.flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream())
|
||||
.flatMap(textBlock -> textBlock.getAtomicTextBlocks()
|
||||
.stream())
|
||||
.distinct()
|
||||
.map(DocumentDataMapper::toAtomicTextBlockData)
|
||||
.toList();
|
||||
|
||||
AllDocumentTextData allDocumentTextData = AllDocumentTextData.newBuilder().addAllDocumentTextData(documentTextData).build();
|
||||
|
||||
List<DocumentPositionData> atomicPositionBlockData = document.streamTerminalTextBlocksInOrder()
|
||||
.flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream())
|
||||
.flatMap(textBlock -> textBlock.getAtomicTextBlocks()
|
||||
.stream())
|
||||
.distinct()
|
||||
.map(DocumentDataMapper::toAtomicPositionBlockData)
|
||||
.toList();
|
||||
|
||||
Set<Long> nonEmptyTextBlocks = documentTextData.stream().mapToLong(DocumentTextData::getId).boxed().collect(Collectors.toSet());
|
||||
AllDocumentPositionData allDocumentPositionData = AllDocumentPositionData.newBuilder().addAllDocumentPositionData(atomicPositionBlockData).build();
|
||||
|
||||
List<DocumentPage> documentPageData = document.getPages().stream().map(DocumentDataMapper::toPageData).toList();
|
||||
DocumentStructure tableOfContentsData = toDocumentTreeData(document.getDocumentTree());
|
||||
Set<Long> nonEmptyTextBlocks = documentTextData.stream()
|
||||
.mapToLong(DocumentTextData::getId).boxed()
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
List<DocumentPage> documentPageData = document.getPages()
|
||||
.stream()
|
||||
.map(DocumentDataMapper::toPageData)
|
||||
.toList();
|
||||
|
||||
AllDocumentPages allDocumentPages = AllDocumentPages.newBuilder().addAllDocumentPages(documentPageData).build();
|
||||
DocumentStructureWrapper tableOfContentsData = toDocumentTreeData(document.getDocumentTree());
|
||||
return DocumentData.builder()
|
||||
.documentTextData(documentTextData.toArray(new DocumentTextData[0]))
|
||||
.documentPositions(atomicPositionBlockData.toArray(new DocumentPositionData[0]))
|
||||
.documentPages(documentPageData.toArray(new DocumentPage[0]))
|
||||
.documentStructure(tableOfContentsData)
|
||||
.documentTextData(allDocumentTextData)
|
||||
.documentPositions(allDocumentPositionData)
|
||||
.documentPages(allDocumentPages)
|
||||
.documentStructureWrapper(tableOfContentsData)
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
private DocumentStructure toDocumentTreeData(DocumentTree documentTree) {
|
||||
private DocumentStructureWrapper toDocumentTreeData(DocumentTree documentTree) {
|
||||
|
||||
return new DocumentStructure(toEntryData(documentTree.getRoot()));
|
||||
return new DocumentStructureWrapper(DocumentStructure.newBuilder().setRoot(toEntryData(documentTree.getRoot())).build());
|
||||
}
|
||||
|
||||
|
||||
private DocumentStructure.EntryData toEntryData(DocumentTree.Entry entry) {
|
||||
private EntryData toEntryData(DocumentTree.Entry entry) {
|
||||
|
||||
Long[] atomicTextBlocks;
|
||||
List<Long> atomicTextBlocks;
|
||||
|
||||
if (entry.getNode().isLeaf()) {
|
||||
atomicTextBlocks = toAtomicTextBlockIds(entry.getNode().getLeafTextBlock());
|
||||
} else {
|
||||
atomicTextBlocks = new Long[]{};
|
||||
atomicTextBlocks = new ArrayList<>();
|
||||
}
|
||||
|
||||
Map<String, String> properties = switch (entry.getType()) {
|
||||
@ -81,77 +102,81 @@ public class DocumentDataMapper {
|
||||
default -> new HashMap<>();
|
||||
};
|
||||
|
||||
DocumentStructure.EntryData.EntryDataBuilder documentBuilder = DocumentStructure.EntryData.builder()
|
||||
.treeId(toPrimitiveIntArray(entry.getTreeId()))
|
||||
.children(entry.getChildren().stream().map(DocumentDataMapper::toEntryData).toList())
|
||||
.type(entry.getType())
|
||||
.atomicBlockIds(atomicTextBlocks)
|
||||
.pageNumbers(entry.getNode().getPages().stream().map(Page::getNumber).map(Integer::longValue).toArray(Long[]::new))
|
||||
.properties(properties);
|
||||
var documentBuilder = EntryData.newBuilder()
|
||||
.addAllTreeId(entry.getTreeId())
|
||||
.addAllChildren(entry.getChildren()
|
||||
.stream()
|
||||
.map(DocumentDataMapper::toEntryData)
|
||||
.toList())
|
||||
.setType(entry.getType())
|
||||
.addAllAtomicBlockIds(atomicTextBlocks)
|
||||
.addAllPageNumbers(entry.getNode().getPages()
|
||||
.stream()
|
||||
.map(Page::getNumber)
|
||||
.map(Integer::longValue)
|
||||
.toList())
|
||||
.putAllProperties(properties);
|
||||
if (entry.getNode() != null) {
|
||||
documentBuilder.engines(entry.getNode().getEngines());
|
||||
documentBuilder.addAllEngines(entry.getNode().getEngines());
|
||||
} else {
|
||||
documentBuilder.engines(new HashSet<>(Set.of(LayoutEngine.ALGORITHM)));
|
||||
documentBuilder.addAllEngines(new HashSet<>(Set.of(LayoutEngine.ALGORITHM)));
|
||||
}
|
||||
return documentBuilder.build();
|
||||
}
|
||||
|
||||
|
||||
private Long[] toAtomicTextBlockIds(TextBlock textBlock) {
|
||||
private List<Long> toAtomicTextBlockIds(TextBlock textBlock) {
|
||||
|
||||
return textBlock.getAtomicTextBlocks().stream().map(AtomicTextBlock::getId).toArray(Long[]::new);
|
||||
return textBlock.getAtomicTextBlocks()
|
||||
.stream()
|
||||
.map(AtomicTextBlock::getId)
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
private DocumentPage toPageData(Page p) {
|
||||
|
||||
return DocumentPage.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).build();
|
||||
return DocumentPage.newBuilder().setRotation(p.getRotation()).setHeight(p.getHeight()).setWidth(p.getWidth()).setNumber(p.getNumber()).build();
|
||||
}
|
||||
|
||||
|
||||
private DocumentTextData toAtomicTextBlockData(AtomicTextBlock atomicTextBlock) {
|
||||
|
||||
return DocumentTextData.builder()
|
||||
.id(atomicTextBlock.getId())
|
||||
.page(atomicTextBlock.getPage().getNumber().longValue())
|
||||
.searchText(atomicTextBlock.getSearchText())
|
||||
.numberOnPage(atomicTextBlock.getNumberOnPage())
|
||||
.start(atomicTextBlock.getTextRange().start())
|
||||
.end(atomicTextBlock.getTextRange().end())
|
||||
.lineBreaks(toPrimitiveIntArray(atomicTextBlock.getLineBreaks()))
|
||||
return DocumentTextData.newBuilder()
|
||||
.setId(atomicTextBlock.getId())
|
||||
.setPage(atomicTextBlock.getPage().getNumber().longValue())
|
||||
.setSearchText(atomicTextBlock.getSearchText())
|
||||
.setNumberOnPage(atomicTextBlock.getNumberOnPage())
|
||||
.setStart(atomicTextBlock.getTextRange().start())
|
||||
.setEnd(atomicTextBlock.getTextRange().end())
|
||||
.addAllLineBreaks(atomicTextBlock.getLineBreaks())
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
private DocumentPositionData toAtomicPositionBlockData(AtomicTextBlock atomicTextBlock) {
|
||||
|
||||
return DocumentPositionData.builder()
|
||||
.id(atomicTextBlock.getId())
|
||||
.positions(toPrimitiveFloatMatrix(atomicTextBlock.getPositions()))
|
||||
.stringIdxToPositionIdx(toPrimitiveIntArray(atomicTextBlock.getStringIdxToPositionIdx()))
|
||||
return DocumentPositionData.newBuilder()
|
||||
.setId(atomicTextBlock.getId())
|
||||
.addAllPositions(toPositions(atomicTextBlock.getPositions()))
|
||||
.addAllStringIdxToPositionIdx(atomicTextBlock.getStringIdxToPositionIdx())
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
private static float[][] toPrimitiveFloatMatrix(List<Rectangle2D> positions) {
|
||||
private static List<Position> toPositions(List<Rectangle2D> rects) {
|
||||
|
||||
float[][] positionMatrix = new float[positions.size()][];
|
||||
for (int i = 0; i < positions.size(); i++) {
|
||||
positionMatrix[i] = toArray(positions.get(i));
|
||||
List<Position> positions = new ArrayList<>();
|
||||
for (Rectangle2D rect : rects) {
|
||||
positions.add(toPosition(rect));
|
||||
}
|
||||
return positionMatrix;
|
||||
return positions;
|
||||
}
|
||||
|
||||
|
||||
private static float[] toArray(Rectangle2D positions) {
|
||||
private static Position toPosition(Rectangle2D rect) {
|
||||
|
||||
return new float[]{(float) positions.getMinX(), (float) positions.getMinY(), (float) positions.getWidth(), (float) positions.getHeight()};
|
||||
}
|
||||
|
||||
|
||||
private int[] toPrimitiveIntArray(List<Integer> list) {
|
||||
|
||||
return list.stream().mapToInt(Integer::intValue).toArray();
|
||||
return Position.newBuilder().addValue((float) rect.getMinX()).addValue((float) rect.getMinY()).addValue((float) rect.getWidth()).addValue((float) rect.getHeight()).build();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -8,10 +8,10 @@ import java.util.Map;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPageProto.DocumentPage;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto.AllDocumentPositionData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto.AllDocumentTextData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.EntryDataProto.EntryData;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
|
||||
@ -41,27 +41,29 @@ public class DocumentGraphMapper {
|
||||
DocumentTree documentTree = new DocumentTree(document);
|
||||
Context context = new Context(documentData, documentTree);
|
||||
|
||||
context.pages.addAll(Arrays.stream(documentData.getDocumentPages())
|
||||
context.pages.addAll(documentData.getDocumentPages().getDocumentPagesList()
|
||||
.stream()
|
||||
.map(DocumentGraphMapper::buildPage)
|
||||
.toList());
|
||||
|
||||
context.documentTree.getRoot().getChildren().addAll(buildEntries(documentData.getDocumentStructure().getRoot().getChildren(), context));
|
||||
context.documentTree.getRoot().getChildren().addAll(buildEntries(documentData.getDocumentStructure().getRoot().getChildrenList(), context));
|
||||
|
||||
document.setDocumentTree(context.documentTree);
|
||||
document.setPages(new HashSet<>(context.pages));
|
||||
document.setNumberOfPages(documentData.getDocumentPages().length);
|
||||
document.setNumberOfPages(documentData.getDocumentPages().getDocumentPagesCount());
|
||||
|
||||
document.setTextBlock(document.getTextBlock());
|
||||
return document;
|
||||
}
|
||||
|
||||
|
||||
private List<DocumentTree.Entry> buildEntries(List<DocumentStructure.EntryData> entries, Context context) {
|
||||
private List<DocumentTree.Entry> buildEntries(List<EntryData> entries, Context context) {
|
||||
|
||||
List<DocumentTree.Entry> newEntries = new LinkedList<>();
|
||||
for (DocumentStructure.EntryData entryData : entries) {
|
||||
for (EntryData entryData : entries) {
|
||||
|
||||
List<Page> pages = Arrays.stream(entryData.getPageNumbers())
|
||||
List<Page> pages = entryData.getPageNumbersList()
|
||||
.stream()
|
||||
.map(pageNumber -> getPage(pageNumber, context))
|
||||
.toList();
|
||||
|
||||
@ -74,12 +76,12 @@ public class DocumentGraphMapper {
|
||||
case FOOTER -> buildFooter(context);
|
||||
case TABLE -> buildTable(context, entryData.getProperties());
|
||||
case TABLE_CELL -> buildTableCell(context, entryData.getProperties());
|
||||
case IMAGE -> buildImage(context, entryData.getProperties(), entryData.getPageNumbers());
|
||||
case IMAGE -> buildImage(context, entryData.getProperties(), entryData.getPageNumbersList());
|
||||
default -> throw new UnsupportedOperationException("Not yet implemented for type " + entryData.getType());
|
||||
};
|
||||
|
||||
if (entryData.getAtomicBlockIds().length > 0) {
|
||||
TextBlock textBlock = toTextBlock(entryData.getAtomicBlockIds(), context, node);
|
||||
if (entryData.getAtomicBlockIdsCount() > 0) {
|
||||
TextBlock textBlock = toTextBlock(entryData.getAtomicBlockIdsList(), context, node);
|
||||
node.setLeafTextBlock(textBlock);
|
||||
switch (entryData.getType()) {
|
||||
case HEADER -> pages.forEach(page -> page.setHeader((Header) node));
|
||||
@ -89,11 +91,10 @@ public class DocumentGraphMapper {
|
||||
.forEach(atb -> atb.getPage().getTextBlocksOnPage().add(atb));
|
||||
}
|
||||
}
|
||||
List<Integer> treeId = Arrays.stream(entryData.getTreeId()).boxed()
|
||||
.toList();
|
||||
List<Integer> treeId = entryData.getTreeIdList();
|
||||
node.setTreeId(treeId);
|
||||
|
||||
newEntries.add(DocumentTree.Entry.builder().treeId(treeId).children(buildEntries(entryData.getChildren(), context)).node(node).build());
|
||||
newEntries.add(DocumentTree.Entry.builder().treeId(treeId).children(buildEntries(entryData.getChildrenList(), context)).node(node).build());
|
||||
}
|
||||
return newEntries;
|
||||
}
|
||||
@ -105,10 +106,10 @@ public class DocumentGraphMapper {
|
||||
}
|
||||
|
||||
|
||||
private Image buildImage(Context context, Map<String, String> properties, Long[] pageNumbers) {
|
||||
private Image buildImage(Context context, Map<String, String> properties, List<Long> pageNumbers) {
|
||||
|
||||
assert pageNumbers.length == 1;
|
||||
Page page = getPage(pageNumbers[0], context);
|
||||
assert pageNumbers.size() == 1;
|
||||
Page page = getPage(pageNumbers.get(0), context);
|
||||
var builder = Image.builder();
|
||||
PropertiesMapper.parseImageProperties(properties, builder);
|
||||
return builder.documentTree(context.documentTree).page(page).build();
|
||||
@ -161,7 +162,7 @@ public class DocumentGraphMapper {
|
||||
|
||||
DuplicatedParagraph duplicatedParagraph = DuplicatedParagraph.builder().documentTree(context.documentTree).build();
|
||||
|
||||
Long[] unsortedTextblockIds = PropertiesMapper.getUnsortedTextblockIds(properties);
|
||||
var unsortedTextblockIds = PropertiesMapper.getUnsortedTextblockIds(properties);
|
||||
duplicatedParagraph.setUnsortedLeafTextBlock(toTextBlock(unsortedTextblockIds, context, duplicatedParagraph));
|
||||
return duplicatedParagraph;
|
||||
|
||||
@ -171,9 +172,9 @@ public class DocumentGraphMapper {
|
||||
}
|
||||
|
||||
|
||||
private TextBlock toTextBlock(Long[] atomicTextBlockIds, Context context, SemanticNode parent) {
|
||||
private TextBlock toTextBlock(List<Long> atomicTextBlockIds, Context context, SemanticNode parent) {
|
||||
|
||||
return Arrays.stream(atomicTextBlockIds)
|
||||
return atomicTextBlockIds.stream()
|
||||
.map(atomicTextBlockId -> getAtomicTextBlock(context, parent, atomicTextBlockId))
|
||||
.collect(new TextBlockCollector());
|
||||
}
|
||||
@ -181,10 +182,13 @@ public class DocumentGraphMapper {
|
||||
|
||||
private AtomicTextBlock getAtomicTextBlock(Context context, SemanticNode parent, Long atomicTextBlockId) {
|
||||
|
||||
return AtomicTextBlock.fromAtomicTextBlockData(context.documentTextDataBlockData.get(Math.toIntExact(atomicTextBlockId)),
|
||||
context.atomicPositionBlockData.get(Math.toIntExact(atomicTextBlockId)),
|
||||
return AtomicTextBlock.fromAtomicTextBlockData(context.documentTextDataBlockData.getDocumentTextDataList()
|
||||
.get(Math.toIntExact(atomicTextBlockId)),
|
||||
context.atomicPositionBlockData.getDocumentPositionDataList()
|
||||
.get(Math.toIntExact(atomicTextBlockId)),
|
||||
parent,
|
||||
getPage(context.documentTextDataBlockData.get(Math.toIntExact(atomicTextBlockId)).getPage(), context));
|
||||
getPage(context.documentTextDataBlockData.getDocumentTextDataList()
|
||||
.get(Math.toIntExact(atomicTextBlockId)).getPage(), context));
|
||||
}
|
||||
|
||||
|
||||
@ -207,18 +211,16 @@ public class DocumentGraphMapper {
|
||||
|
||||
private final DocumentTree documentTree;
|
||||
private final List<Page> pages;
|
||||
private final List<DocumentTextData> documentTextDataBlockData;
|
||||
private final List<DocumentPositionData> atomicPositionBlockData;
|
||||
private final AllDocumentTextData documentTextDataBlockData;
|
||||
private final AllDocumentPositionData atomicPositionBlockData;
|
||||
|
||||
|
||||
/**
 * Creates a mapping context with an empty page list and references to the
 * serialized text and position data of the given document data.
 *
 * @param documentData the serialized document
 * @param documentTree the tree being rebuilt
 */
Context(DocumentData documentData, DocumentTree documentTree) {

    this.documentTextDataBlockData = documentData.getDocumentTextData();
    this.atomicPositionBlockData = documentData.getDocumentPositions();
    this.pages = new LinkedList<>();
    this.documentTree = documentTree;
}
|
||||
|
||||
|
||||
@ -3,10 +3,11 @@ package com.knecon.fforesight.service.layoutparser.processor.services.mapper;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureWrapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
|
||||
@ -20,11 +21,11 @@ public class PropertiesMapper {
|
||||
public static Map<String, String> buildImageProperties(Image image) {
|
||||
|
||||
Map<String, String> properties = new HashMap<>();
|
||||
properties.put(DocumentStructure.ImageProperties.IMAGE_TYPE, image.getImageType().toString());
|
||||
properties.put(DocumentStructure.ImageProperties.TRANSPARENT, String.valueOf(image.isTransparent()));
|
||||
properties.put(DocumentStructure.ImageProperties.POSITION, toString(image.getPosition()));
|
||||
properties.put(DocumentStructure.ImageProperties.ID, image.getId());
|
||||
properties.put(DocumentStructure.ImageProperties.REPRESENTATION_HASH, image.getRepresentationHash());
|
||||
properties.put(DocumentStructureWrapper.ImageProperties.IMAGE_TYPE, image.getImageType().toString());
|
||||
properties.put(DocumentStructureWrapper.ImageProperties.TRANSPARENT, String.valueOf(image.isTransparent()));
|
||||
properties.put(DocumentStructureWrapper.ImageProperties.POSITION, toString(image.getPosition()));
|
||||
properties.put(DocumentStructureWrapper.ImageProperties.ID, image.getId());
|
||||
properties.put(DocumentStructureWrapper.ImageProperties.REPRESENTATION_HASH, image.getRepresentationHash());
|
||||
return properties;
|
||||
}
|
||||
|
||||
@ -32,15 +33,19 @@ public class PropertiesMapper {
|
||||
public static Map<String, String> buildTableCellProperties(TableCell tableCell) {
|
||||
|
||||
Map<String, String> properties = new HashMap<>();
|
||||
properties.put(DocumentStructure.TableCellProperties.ROW, String.valueOf(tableCell.getRow()));
|
||||
properties.put(DocumentStructure.TableCellProperties.COL, String.valueOf(tableCell.getCol()));
|
||||
properties.put(DocumentStructure.TableCellProperties.HEADER, String.valueOf(tableCell.isHeader()));
|
||||
properties.put(DocumentStructureWrapper.TableCellProperties.ROW, String.valueOf(tableCell.getRow()));
|
||||
properties.put(DocumentStructureWrapper.TableCellProperties.COL, String.valueOf(tableCell.getCol()));
|
||||
properties.put(DocumentStructureWrapper.TableCellProperties.HEADER, String.valueOf(tableCell.isHeader()));
|
||||
|
||||
if (tableCell.getPages().size() > 1 || tableCell.getBBox().keySet().size() > 1) {
|
||||
throw new IllegalArgumentException("TableCell can only occur on a single page!");
|
||||
}
|
||||
String bBoxString = toString(tableCell.getBBox().get(tableCell.getPages().stream().findFirst().get()));
|
||||
properties.put(DocumentStructure.TableCellProperties.B_BOX, bBoxString);
|
||||
String bBoxString = toString(tableCell.getBBox()
|
||||
.get(tableCell.getPages()
|
||||
.stream()
|
||||
.findFirst()
|
||||
.get()));
|
||||
properties.put(DocumentStructureWrapper.TableCellProperties.B_BOX, bBoxString);
|
||||
|
||||
return properties;
|
||||
}
|
||||
@ -49,59 +54,62 @@ public class PropertiesMapper {
|
||||
public static Map<String, String> buildTableProperties(Table table) {
|
||||
|
||||
Map<String, String> properties = new HashMap<>();
|
||||
properties.put(DocumentStructure.TableProperties.NUMBER_OF_ROWS, String.valueOf(table.getNumberOfRows()));
|
||||
properties.put(DocumentStructure.TableProperties.NUMBER_OF_COLS, String.valueOf(table.getNumberOfCols()));
|
||||
properties.put(DocumentStructureWrapper.TableProperties.NUMBER_OF_ROWS, String.valueOf(table.getNumberOfRows()));
|
||||
properties.put(DocumentStructureWrapper.TableProperties.NUMBER_OF_COLS, String.valueOf(table.getNumberOfCols()));
|
||||
return properties;
|
||||
}
|
||||
|
||||
|
||||
public static void parseImageProperties(Map<String, String> properties, Image.ImageBuilder builder) {
|
||||
|
||||
builder.imageType(parseImageType(properties.get(DocumentStructure.ImageProperties.IMAGE_TYPE)));
|
||||
builder.transparent(Boolean.parseBoolean(properties.get(DocumentStructure.ImageProperties.TRANSPARENT)));
|
||||
builder.position(DocumentStructure.parseRectangle2D(properties.get(DocumentStructure.ImageProperties.POSITION)));
|
||||
builder.imageType(parseImageType(properties.get(DocumentStructureWrapper.ImageProperties.IMAGE_TYPE)));
|
||||
builder.transparent(Boolean.parseBoolean(properties.get(DocumentStructureWrapper.ImageProperties.TRANSPARENT)));
|
||||
builder.position(DocumentStructureWrapper.parseRectangle2D(properties.get(DocumentStructureWrapper.ImageProperties.POSITION)));
|
||||
}
|
||||
|
||||
|
||||
public static void parseTableCellProperties(Map<String, String> properties, TableCell.TableCellBuilder builder) {
|
||||
|
||||
builder.row(Integer.parseInt(properties.get(DocumentStructure.TableCellProperties.ROW)));
|
||||
builder.col(Integer.parseInt(properties.get(DocumentStructure.TableCellProperties.COL)));
|
||||
builder.header(Boolean.parseBoolean(properties.get(DocumentStructure.TableCellProperties.HEADER)));
|
||||
builder.bBox(DocumentStructure.parseRectangle2D(properties.get(DocumentStructure.TableCellProperties.B_BOX)));
|
||||
builder.row(Integer.parseInt(properties.get(DocumentStructureWrapper.TableCellProperties.ROW)));
|
||||
builder.col(Integer.parseInt(properties.get(DocumentStructureWrapper.TableCellProperties.COL)));
|
||||
builder.header(Boolean.parseBoolean(properties.get(DocumentStructureWrapper.TableCellProperties.HEADER)));
|
||||
builder.bBox(DocumentStructureWrapper.parseRectangle2D(properties.get(DocumentStructureWrapper.TableCellProperties.B_BOX)));
|
||||
}
|
||||
|
||||
|
||||
public static void parseTableProperties(Map<String, String> properties, Table.TableBuilder builder) {
|
||||
|
||||
builder.numberOfRows(Integer.parseInt(properties.get(DocumentStructure.TableProperties.NUMBER_OF_ROWS)));
|
||||
builder.numberOfCols(Integer.parseInt(properties.get(DocumentStructure.TableProperties.NUMBER_OF_COLS)));
|
||||
builder.numberOfRows(Integer.parseInt(properties.get(DocumentStructureWrapper.TableProperties.NUMBER_OF_ROWS)));
|
||||
builder.numberOfCols(Integer.parseInt(properties.get(DocumentStructureWrapper.TableProperties.NUMBER_OF_COLS)));
|
||||
}
|
||||
|
||||
|
||||
public static Map<String, String> buildDuplicateParagraphProperties(DuplicatedParagraph duplicatedParagraph) {
|
||||
|
||||
Map<String, String> properties = new HashMap<>();
|
||||
properties.put(DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID, Arrays.toString(toAtomicTextBlockIds(duplicatedParagraph.getUnsortedLeafTextBlock())));
|
||||
properties.put(DocumentStructureWrapper.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID,
|
||||
Arrays.toString(toAtomicTextBlockIds(duplicatedParagraph.getUnsortedLeafTextBlock())));
|
||||
return properties;
|
||||
}
|
||||
|
||||
|
||||
public static boolean isDuplicateParagraph(Map<String, String> properties) {
|
||||
|
||||
return properties.containsKey(DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID);
|
||||
return properties.containsKey(DocumentStructureWrapper.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID);
|
||||
}
|
||||
|
||||
|
||||
public static Long[] getUnsortedTextblockIds(Map<String, String> properties) {
|
||||
public static List<Long> getUnsortedTextblockIds(Map<String, String> properties) {
|
||||
|
||||
return toLongArray(properties.get(DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID));
|
||||
return toLongList(properties.get(DocumentStructureWrapper.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID));
|
||||
}
|
||||
|
||||
|
||||
public static Long[] toLongArray(String ids) {
|
||||
public static List<Long> toLongList(String ids) {
|
||||
|
||||
return Arrays.stream(ids.substring(1, ids.length() - 1).trim().split(",")).map(Long::valueOf).toArray(Long[]::new);
|
||||
return Arrays.stream(ids.substring(1, ids.length() - 1).trim().split(","))
|
||||
.map(Long::valueOf)
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
@ -120,20 +128,23 @@ public class PropertiesMapper {
|
||||
public static String toString(Rectangle2D rectangle2D) {
|
||||
|
||||
return String.format(Locale.US,
|
||||
"%f%s%f%s%f%s%f",
|
||||
rectangle2D.getX(),
|
||||
DocumentStructure.RECTANGLE_DELIMITER,
|
||||
rectangle2D.getY(),
|
||||
DocumentStructure.RECTANGLE_DELIMITER,
|
||||
rectangle2D.getWidth(),
|
||||
DocumentStructure.RECTANGLE_DELIMITER,
|
||||
rectangle2D.getHeight());
|
||||
"%f%s%f%s%f%s%f",
|
||||
rectangle2D.getX(),
|
||||
DocumentStructureWrapper.RECTANGLE_DELIMITER,
|
||||
rectangle2D.getY(),
|
||||
DocumentStructureWrapper.RECTANGLE_DELIMITER,
|
||||
rectangle2D.getWidth(),
|
||||
DocumentStructureWrapper.RECTANGLE_DELIMITER,
|
||||
rectangle2D.getHeight());
|
||||
}
|
||||
|
||||
|
||||
private static Long[] toAtomicTextBlockIds(TextBlock textBlock) {
|
||||
|
||||
return textBlock.getAtomicTextBlocks().stream().map(AtomicTextBlock::getId).toArray(Long[]::new);
|
||||
return textBlock.getAtomicTextBlocks()
|
||||
.stream()
|
||||
.map(AtomicTextBlock::getId)
|
||||
.toArray(Long[]::new);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -6,7 +6,7 @@ import java.util.Locale;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ParagraphData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.Range;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData;
|
||||
|
||||
@ -14,7 +14,7 @@ import org.apache.pdfbox.pdmodel.PDPageContentStream;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType1Font;
|
||||
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||
@ -117,6 +117,7 @@ public class PdfVisualisationUtility {
|
||||
case TABLE -> Color.ORANGE;
|
||||
case TABLE_CELL -> Color.GRAY;
|
||||
case IMAGE -> Color.MAGENTA;
|
||||
case UNRECOGNIZED -> Color.PINK;
|
||||
}).build();
|
||||
}
|
||||
|
||||
|
||||
@ -0,0 +1,42 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import java.io.BufferedOutputStream;
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.zip.GZIPOutputStream;
|
||||
|
||||
import com.google.protobuf.Message;
|
||||
import com.google.protobuf.MessageOrBuilder;
|
||||
import com.google.protobuf.Struct;
|
||||
import com.google.protobuf.util.JsonFormat;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class ProtobufUtil {
|
||||
|
||||
public static String toJson(MessageOrBuilder messageOrBuilder) throws IOException {
|
||||
return JsonFormat.printer().print(messageOrBuilder);
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public static Message fromJson(String json) throws IOException {
|
||||
Message.Builder structBuilder = Struct.newBuilder();
|
||||
JsonFormat.parser().ignoringUnknownFields().merge(json, structBuilder);
|
||||
return structBuilder.build();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public <T extends Message> File serializeToTempFile(T any) {
|
||||
var tempFile = File.createTempFile("storage-protobuf", ".data");
|
||||
|
||||
try (var fos = new GZIPOutputStream(new BufferedOutputStream(new FileOutputStream(tempFile)))) {
|
||||
any.writeTo(fos);
|
||||
return tempFile;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -16,7 +16,7 @@ import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
|
||||
@ -125,7 +125,7 @@ public class LayoutGrid extends LayoutGridLayerConfig {
|
||||
|
||||
Map<Page, Rectangle2D> bBoxMap = section.getBBox();
|
||||
|
||||
List<SemanticNode> subSections = section.streamAllSubNodesOfType(NodeType.SECTION)
|
||||
List<SemanticNode> subSections = section.streamAllSubNodesOfType(NodeTypeProto.NodeType.SECTION)
|
||||
.toList();
|
||||
Integer maxChildDepth = subSections.stream()
|
||||
.map(node -> node.getTreeId().size())
|
||||
|
||||
@ -29,7 +29,7 @@ dependencies {
|
||||
implementation(project(":layoutparser-service-processor"))
|
||||
implementation(project(":layoutparser-service-internal-api"))
|
||||
|
||||
implementation("com.iqser.red.commons:storage-commons:2.45.0")
|
||||
implementation("com.iqser.red.commons:storage-commons:2.50.0")
|
||||
implementation("com.knecon.fforesight:tenant-commons:0.30.0")
|
||||
implementation("com.knecon.fforesight:tracing-commons:0.5.0")
|
||||
implementation("com.knecon.fforesight:lifecycle-commons:0.6.0")
|
||||
@ -38,11 +38,14 @@ dependencies {
|
||||
implementation("com.amazonaws:aws-java-sdk-s3:1.12.536")
|
||||
implementation("org.springframework.cloud:spring-cloud-starter-openfeign:4.0.4")
|
||||
implementation("net.logstash.logback:logstash-logback-encoder:7.4")
|
||||
implementation("ch.qos.logback:logback-classic")
|
||||
|
||||
implementation("com.pdftron:PDFNet:10.11.0")
|
||||
|
||||
// for integration testing only
|
||||
testImplementation(project(":viewer-doc-processor"))
|
||||
testImplementation(project(":layoutparser-service-internal-api"))
|
||||
testImplementation("com.google.protobuf:protobuf-java-util:4.27.1")
|
||||
|
||||
testImplementation("org.springframework.boot:spring-boot-starter-amqp:${springBootStarterVersion}")
|
||||
testImplementation("org.springframework.boot:spring-boot-starter-test:${springBootStarterVersion}")
|
||||
|
||||
@ -70,7 +70,7 @@ public class DocumentGraphJsonWritingTest extends BuildDocumentTest {
|
||||
var stem = Path.of("/tmp/DocumentGraphJsonWritingTest");
|
||||
stem.toFile().mkdirs();
|
||||
var tmpFilePath = stem.resolve(filename.getFileName());
|
||||
mapper.writeValue(new FileOutputStream(new File(tmpFilePath + "_structure" + ".json")), documentData.getDocumentStructure());
|
||||
mapper.writeValue(new FileOutputStream(new File(tmpFilePath + "_structure" + ".json")), documentData.getDocumentStructureWrapper());
|
||||
mapper.writeValue(new FileOutputStream(new File(tmpFilePath + "_text" + ".json")), documentData.getDocumentTextData());
|
||||
mapper.writeValue(new FileOutputStream(new File(tmpFilePath + "_positions" + ".json")), documentData.getDocumentPositions());
|
||||
mapper.writeValue(new FileOutputStream(new File(tmpFilePath + "_pages" + ".json")), documentData.getDocumentPages());
|
||||
|
||||
@ -6,11 +6,13 @@ import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPageProto.AllDocumentPages;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto.AllDocumentPositionData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureProto.DocumentStructure;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureWrapper;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto.AllDocumentTextData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.EntryDataProto.EntryData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
|
||||
@ -32,21 +34,21 @@ public class DocumentGraphMappingTest extends BuildDocumentTest {
|
||||
Document document = buildGraph(filename);
|
||||
DocumentData documentData = DocumentDataMapper.toDocumentData(document);
|
||||
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), filename + "_PAGES" + ".json", documentData.getDocumentPages());
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), filename + "_TEXT" + ".json", documentData.getDocumentTextData());
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), filename + "_POSITIONS" + ".json", documentData.getDocumentPositions());
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), filename + "_STRUCTURE" + ".json", documentData.getDocumentStructure());
|
||||
storageService.storeProtoObject(TenantContext.getTenantId(), filename + "_PAGES" + ".proto", documentData.getDocumentPages());
|
||||
storageService.storeProtoObject(TenantContext.getTenantId(), filename + "_TEXT" + ".proto", documentData.getDocumentTextData());
|
||||
storageService.storeProtoObject(TenantContext.getTenantId(), filename + "_POSITIONS" + ".proto", documentData.getDocumentPositions());
|
||||
storageService.storeProtoObject(TenantContext.getTenantId(), filename + "_STRUCTURE" + ".proto", documentData.getDocumentStructure());
|
||||
|
||||
DocumentPage[] pageData = storageService.readJSONObject(TenantContext.getTenantId(), filename + "_PAGES" + ".json", DocumentPage[].class);
|
||||
DocumentTextData[] atomicTextBlockData = storageService.readJSONObject(TenantContext.getTenantId(), filename + "_TEXT" + ".json", DocumentTextData[].class);
|
||||
DocumentPositionData[] atomicPositionBlockData = storageService.readJSONObject(TenantContext.getTenantId(),
|
||||
filename + "_POSITIONS" + ".json",
|
||||
DocumentPositionData[].class);
|
||||
DocumentStructure documentTreeData = storageService.readJSONObject(TenantContext.getTenantId(), filename + "_STRUCTURE" + ".json", DocumentStructure.class);
|
||||
AllDocumentPages pageData = storageService.readProtoObject(TenantContext.getTenantId(), filename + "_PAGES" + ".proto", AllDocumentPages.parser());
|
||||
AllDocumentTextData atomicTextBlockData = storageService.readProtoObject(TenantContext.getTenantId(), filename + "_TEXT" + ".proto", AllDocumentTextData.parser());
|
||||
AllDocumentPositionData atomicPositionBlockData = storageService.readProtoObject(TenantContext.getTenantId(),
|
||||
filename + "_POSITIONS" + ".proto",
|
||||
AllDocumentPositionData.parser());
|
||||
DocumentStructure documentTreeData = storageService.readProtoObject(TenantContext.getTenantId(), filename + "_STRUCTURE" + ".proto", DocumentStructure.parser());
|
||||
|
||||
DocumentData documentData2 = DocumentData.builder()
|
||||
.documentPages(pageData)
|
||||
.documentStructure(documentTreeData)
|
||||
.documentStructureWrapper(new DocumentStructureWrapper(documentTreeData))
|
||||
.documentTextData(atomicTextBlockData)
|
||||
.documentPositions(atomicPositionBlockData)
|
||||
.build();
|
||||
@ -73,10 +75,9 @@ public class DocumentGraphMappingTest extends BuildDocumentTest {
|
||||
|
||||
private static boolean allTablesHavePositiveNumberOfRowsAndColumns(DocumentData documentData) {
|
||||
|
||||
return documentData.getDocumentStructure()
|
||||
.streamAllEntries()
|
||||
return documentData.getDocumentStructureWrapper().streamAllEntries()
|
||||
.filter(entryData -> entryData.getType().equals(NodeType.TABLE))
|
||||
.map(DocumentStructure.EntryData::getProperties)
|
||||
.map(EntryData::getPropertiesMap)
|
||||
.map(properties -> {
|
||||
var builder = Table.builder();
|
||||
PropertiesMapper.parseTableProperties(properties, builder);
|
||||
|
||||
@ -17,8 +17,9 @@ import org.junit.jupiter.api.Test;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureWrapper;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.EntryDataProto.EntryData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
@ -119,7 +120,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
Map.of("file",filename.toFile().toString())));
|
||||
DocumentData documentDataBefore = DocumentDataMapper.toDocumentData(documentGraphBefore);
|
||||
DocumentData documentDataAfter = DocumentDataMapper.toDocumentData(documentGraphAfter);
|
||||
if (!compareStructures(documentDataBefore.getDocumentStructure(), documentDataAfter.getDocumentStructure())) {
|
||||
if (!compareStructures(documentDataBefore.getDocumentStructureWrapper(), documentDataAfter.getDocumentStructureWrapper())) {
|
||||
String tmpFileNameBefore = "/tmp/before." + filename.getFileName().toString();
|
||||
try (PDDocument pdDocument = Loader.loadPDF(filename.toFile())) {
|
||||
PdfDraw.drawDocumentGraph(pdDocument, documentGraphBefore);
|
||||
@ -136,11 +137,11 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private boolean compareStructures(DocumentStructure structure1, DocumentStructure structure2) {
|
||||
private boolean compareStructures(DocumentStructureWrapper structure1, DocumentStructureWrapper structure2) {
|
||||
|
||||
List<Table> listStructure1 = structure1.streamAllEntries()
|
||||
.filter(entryData -> entryData.getType().equals(NodeType.TABLE))
|
||||
.map(DocumentStructure.EntryData::getProperties)
|
||||
.map(EntryData::getPropertiesMap)
|
||||
.map(properties -> {
|
||||
var builder = Table.builder();
|
||||
PropertiesMapper.parseTableProperties(properties, builder);
|
||||
@ -150,7 +151,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
|
||||
List<Table> listStructure2 = structure2.streamAllEntries()
|
||||
.filter(entryData -> entryData.getType().equals(NodeType.TABLE))
|
||||
.map(DocumentStructure.EntryData::getProperties)
|
||||
.map(EntryData::getPropertiesMap)
|
||||
.map(properties -> {
|
||||
var builder = Table.builder();
|
||||
PropertiesMapper.parseTableProperties(properties, builder);
|
||||
|
||||
@ -17,7 +17,7 @@ import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||
@ -233,6 +233,7 @@ public class PdfDraw {
|
||||
case TABLE -> Color.ORANGE;
|
||||
case TABLE_CELL -> Color.GRAY;
|
||||
case IMAGE -> Color.MAGENTA;
|
||||
case UNRECOGNIZED -> Color.PINK;
|
||||
}).build();
|
||||
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user