Compare commits
3 Commits
main
...
RED-9139-n
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4b86307936 | ||
|
|
ce41014d4b | ||
|
|
e6cd889444 |
@ -7,5 +7,5 @@ description = "layoutparser-service-internal-api"
|
|||||||
|
|
||||||
dependencies {
|
dependencies {
|
||||||
implementation("io.swagger.core.v3:swagger-annotations:2.2.15")
|
implementation("io.swagger.core.v3:swagger-annotations:2.2.15")
|
||||||
implementation("com.google.protobuf:protobuf-java-util:4.27.1")
|
api("com.google.protobuf:protobuf-java-util:4.28.3")
|
||||||
}
|
}
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,16 +1,14 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||||
|
|
||||||
import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureProto.DocumentStructure;
|
import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureProto.DocumentStructure;
|
||||||
|
import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.EntryDataProto.EntryData;
|
||||||
|
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.io.ObjectStreamException;
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.EntryDataProto.EntryData;
|
|
||||||
|
|
||||||
import io.swagger.v3.oas.annotations.media.Schema;
|
import io.swagger.v3.oas.annotations.media.Schema;
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
import lombok.Getter;
|
import lombok.Getter;
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,193 +1,177 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
|
||||||
|
|
||||||
// Generated by the protocol buffer compiler. DO NOT EDIT!
|
// Generated by the protocol buffer compiler. DO NOT EDIT!
|
||||||
// NO CHECKED-IN PROTOBUF GENCODE
|
// NO CHECKED-IN PROTOBUF GENCODE
|
||||||
// source: LayoutEngine.proto
|
// source: LayoutEngine.proto
|
||||||
// Protobuf Java Version: 4.27.1
|
// Protobuf Java Version: 4.28.3
|
||||||
@SuppressWarnings("all")
|
|
||||||
|
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||||
|
|
||||||
public final class LayoutEngineProto {
|
public final class LayoutEngineProto {
|
||||||
|
private LayoutEngineProto() {}
|
||||||
|
static {
|
||||||
|
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(
|
||||||
|
com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
|
||||||
|
/* major= */ 4,
|
||||||
|
/* minor= */ 28,
|
||||||
|
/* patch= */ 3,
|
||||||
|
/* suffix= */ "",
|
||||||
|
LayoutEngineProto.class.getName());
|
||||||
|
}
|
||||||
|
public static void registerAllExtensions(
|
||||||
|
com.google.protobuf.ExtensionRegistryLite registry) {
|
||||||
|
}
|
||||||
|
|
||||||
private LayoutEngineProto() {}
|
public static void registerAllExtensions(
|
||||||
|
com.google.protobuf.ExtensionRegistry registry) {
|
||||||
|
registerAllExtensions(
|
||||||
|
(com.google.protobuf.ExtensionRegistryLite) registry);
|
||||||
|
}
|
||||||
|
/**
|
||||||
|
* Protobuf enum {@code LayoutEngine}
|
||||||
|
*/
|
||||||
|
public enum LayoutEngine
|
||||||
|
implements com.google.protobuf.ProtocolMessageEnum {
|
||||||
|
/**
|
||||||
|
* <code>ALGORITHM = 0;</code>
|
||||||
|
*/
|
||||||
|
ALGORITHM(0),
|
||||||
|
/**
|
||||||
|
* <code>AI = 1;</code>
|
||||||
|
*/
|
||||||
|
AI(1),
|
||||||
|
/**
|
||||||
|
* <code>OUTLINE = 2;</code>
|
||||||
|
*/
|
||||||
|
OUTLINE(2),
|
||||||
|
UNRECOGNIZED(-1),
|
||||||
|
;
|
||||||
|
|
||||||
static {
|
static {
|
||||||
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
|
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(
|
||||||
/* major= */ 4,
|
com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
|
||||||
/* minor= */ 27,
|
/* major= */ 4,
|
||||||
/* patch= */ 1,
|
/* minor= */ 28,
|
||||||
/* suffix= */ "", LayoutEngineProto.class.getName());
|
/* patch= */ 3,
|
||||||
|
/* suffix= */ "",
|
||||||
|
LayoutEngine.class.getName());
|
||||||
}
|
}
|
||||||
|
/**
|
||||||
|
* <code>ALGORITHM = 0;</code>
|
||||||
|
*/
|
||||||
|
public static final int ALGORITHM_VALUE = 0;
|
||||||
|
/**
|
||||||
|
* <code>AI = 1;</code>
|
||||||
|
*/
|
||||||
|
public static final int AI_VALUE = 1;
|
||||||
|
/**
|
||||||
|
* <code>OUTLINE = 2;</code>
|
||||||
|
*/
|
||||||
|
public static final int OUTLINE_VALUE = 2;
|
||||||
|
|
||||||
public static void registerAllExtensions(com.google.protobuf.ExtensionRegistryLite registry) {
|
|
||||||
|
|
||||||
|
public final int getNumber() {
|
||||||
|
if (this == UNRECOGNIZED) {
|
||||||
|
throw new java.lang.IllegalArgumentException(
|
||||||
|
"Can't get the number of an unknown enum value.");
|
||||||
|
}
|
||||||
|
return value;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static void registerAllExtensions(com.google.protobuf.ExtensionRegistry registry) {
|
|
||||||
|
|
||||||
registerAllExtensions((com.google.protobuf.ExtensionRegistryLite) registry);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Protobuf enum {@code LayoutEngine}
|
* @param value The numeric wire value of the corresponding enum entry.
|
||||||
|
* @return The enum associated with the given numeric wire value.
|
||||||
|
* @deprecated Use {@link #forNumber(int)} instead.
|
||||||
*/
|
*/
|
||||||
public enum LayoutEngine implements com.google.protobuf.ProtocolMessageEnum {
|
@java.lang.Deprecated
|
||||||
/**
|
public static LayoutEngine valueOf(int value) {
|
||||||
* <code>ALGORITHM = 0;</code>
|
return forNumber(value);
|
||||||
*/
|
}
|
||||||
ALGORITHM(0),
|
|
||||||
/**
|
|
||||||
* <code>AI = 1;</code>
|
|
||||||
*/
|
|
||||||
AI(1),
|
|
||||||
/**
|
|
||||||
* <code>OUTLINE = 2;</code>
|
|
||||||
*/
|
|
||||||
OUTLINE(2),
|
|
||||||
UNRECOGNIZED(-1),
|
|
||||||
;
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param value The numeric wire value of the corresponding enum entry.
|
||||||
|
* @return The enum associated with the given numeric wire value.
|
||||||
|
*/
|
||||||
|
public static LayoutEngine forNumber(int value) {
|
||||||
|
switch (value) {
|
||||||
|
case 0: return ALGORITHM;
|
||||||
|
case 1: return AI;
|
||||||
|
case 2: return OUTLINE;
|
||||||
|
default: return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static {
|
public static com.google.protobuf.Internal.EnumLiteMap<LayoutEngine>
|
||||||
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
|
internalGetValueMap() {
|
||||||
/* major= */ 4,
|
return internalValueMap;
|
||||||
/* minor= */ 27,
|
}
|
||||||
/* patch= */ 1,
|
private static final com.google.protobuf.Internal.EnumLiteMap<
|
||||||
/* suffix= */ "", LayoutEngine.class.getName());
|
LayoutEngine> internalValueMap =
|
||||||
}
|
new com.google.protobuf.Internal.EnumLiteMap<LayoutEngine>() {
|
||||||
|
|
||||||
/**
|
|
||||||
* <code>ALGORITHM = 0;</code>
|
|
||||||
*/
|
|
||||||
public static final int ALGORITHM_VALUE = 0;
|
|
||||||
/**
|
|
||||||
* <code>AI = 1;</code>
|
|
||||||
*/
|
|
||||||
public static final int AI_VALUE = 1;
|
|
||||||
/**
|
|
||||||
* <code>OUTLINE = 2;</code>
|
|
||||||
*/
|
|
||||||
public static final int OUTLINE_VALUE = 2;
|
|
||||||
|
|
||||||
|
|
||||||
public final int getNumber() {
|
|
||||||
|
|
||||||
if (this == UNRECOGNIZED) {
|
|
||||||
throw new IllegalArgumentException("Can't get the number of an unknown enum value.");
|
|
||||||
}
|
|
||||||
return value;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @param value The numeric wire value of the corresponding enum entry.
|
|
||||||
* @return The enum associated with the given numeric wire value.
|
|
||||||
* @deprecated Use {@link #forNumber(int)} instead.
|
|
||||||
*/
|
|
||||||
@Deprecated
|
|
||||||
public static LayoutEngine valueOf(int value) {
|
|
||||||
|
|
||||||
return forNumber(value);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @param value The numeric wire value of the corresponding enum entry.
|
|
||||||
* @return The enum associated with the given numeric wire value.
|
|
||||||
*/
|
|
||||||
public static LayoutEngine forNumber(int value) {
|
|
||||||
|
|
||||||
switch (value) {
|
|
||||||
case 0:
|
|
||||||
return ALGORITHM;
|
|
||||||
case 1:
|
|
||||||
return AI;
|
|
||||||
case 2:
|
|
||||||
return OUTLINE;
|
|
||||||
default:
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public static com.google.protobuf.Internal.EnumLiteMap<LayoutEngine> internalGetValueMap() {
|
|
||||||
|
|
||||||
return internalValueMap;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private static final com.google.protobuf.Internal.EnumLiteMap<LayoutEngine> internalValueMap = new com.google.protobuf.Internal.EnumLiteMap<LayoutEngine>() {
|
|
||||||
public LayoutEngine findValueByNumber(int number) {
|
public LayoutEngine findValueByNumber(int number) {
|
||||||
|
return LayoutEngine.forNumber(number);
|
||||||
return LayoutEngine.forNumber(number);
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
public final com.google.protobuf.Descriptors.EnumValueDescriptor
|
||||||
public final com.google.protobuf.Descriptors.EnumValueDescriptor getValueDescriptor() {
|
getValueDescriptor() {
|
||||||
|
if (this == UNRECOGNIZED) {
|
||||||
if (this == UNRECOGNIZED) {
|
throw new java.lang.IllegalStateException(
|
||||||
throw new IllegalStateException("Can't get the descriptor of an unrecognized enum value.");
|
"Can't get the descriptor of an unrecognized enum value.");
|
||||||
}
|
}
|
||||||
return getDescriptor().getValues()
|
return getDescriptor().getValues().get(ordinal());
|
||||||
.get(ordinal());
|
}
|
||||||
}
|
public final com.google.protobuf.Descriptors.EnumDescriptor
|
||||||
|
getDescriptorForType() {
|
||||||
|
return getDescriptor();
|
||||||
public final com.google.protobuf.Descriptors.EnumDescriptor getDescriptorForType() {
|
}
|
||||||
|
public static final com.google.protobuf.Descriptors.EnumDescriptor
|
||||||
return getDescriptor();
|
getDescriptor() {
|
||||||
}
|
return com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto.getDescriptor().getEnumTypes().get(0);
|
||||||
|
|
||||||
|
|
||||||
public static final com.google.protobuf.Descriptors.EnumDescriptor getDescriptor() {
|
|
||||||
|
|
||||||
return LayoutEngineProto.getDescriptor().getEnumTypes()
|
|
||||||
.get(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private static final LayoutEngine[] VALUES = values();
|
|
||||||
|
|
||||||
|
|
||||||
public static LayoutEngine valueOf(com.google.protobuf.Descriptors.EnumValueDescriptor desc) {
|
|
||||||
|
|
||||||
if (desc.getType() != getDescriptor()) {
|
|
||||||
throw new IllegalArgumentException("EnumValueDescriptor is not for this type.");
|
|
||||||
}
|
|
||||||
if (desc.getIndex() == -1) {
|
|
||||||
return UNRECOGNIZED;
|
|
||||||
}
|
|
||||||
return VALUES[desc.getIndex()];
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private final int value;
|
|
||||||
|
|
||||||
|
|
||||||
private LayoutEngine(int value) {
|
|
||||||
|
|
||||||
this.value = value;
|
|
||||||
}
|
|
||||||
|
|
||||||
// @@protoc_insertion_point(enum_scope:LayoutEngine)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static final LayoutEngine[] VALUES = values();
|
||||||
|
|
||||||
public static com.google.protobuf.Descriptors.FileDescriptor getDescriptor() {
|
public static LayoutEngine valueOf(
|
||||||
|
com.google.protobuf.Descriptors.EnumValueDescriptor desc) {
|
||||||
return descriptor;
|
if (desc.getType() != getDescriptor()) {
|
||||||
|
throw new java.lang.IllegalArgumentException(
|
||||||
|
"EnumValueDescriptor is not for this type.");
|
||||||
|
}
|
||||||
|
if (desc.getIndex() == -1) {
|
||||||
|
return UNRECOGNIZED;
|
||||||
|
}
|
||||||
|
return VALUES[desc.getIndex()];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private final int value;
|
||||||
|
|
||||||
private static com.google.protobuf.Descriptors.FileDescriptor descriptor;
|
private LayoutEngine(int value) {
|
||||||
|
this.value = value;
|
||||||
static {
|
|
||||||
String[] descriptorData = {"\n\022LayoutEngine.proto*2\n\014LayoutEngine\022\r\n\t" + "ALGORITHM\020\000\022\006\n\002AI\020\001\022\013\n\007OUTLINE\020\002b\006proto3"};
|
|
||||||
descriptor = com.google.protobuf.Descriptors.FileDescriptor.internalBuildGeneratedFileFrom(descriptorData, new com.google.protobuf.Descriptors.FileDescriptor[]{});
|
|
||||||
descriptor.resolveAllFeaturesImmutable();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// @@protoc_insertion_point(outer_class_scope)
|
// @@protoc_insertion_point(enum_scope:LayoutEngine)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static com.google.protobuf.Descriptors.FileDescriptor
|
||||||
|
getDescriptor() {
|
||||||
|
return descriptor;
|
||||||
|
}
|
||||||
|
private static com.google.protobuf.Descriptors.FileDescriptor
|
||||||
|
descriptor;
|
||||||
|
static {
|
||||||
|
java.lang.String[] descriptorData = {
|
||||||
|
"\n\022LayoutEngine.proto*2\n\014LayoutEngine\022\r\n\t" +
|
||||||
|
"ALGORITHM\020\000\022\006\n\002AI\020\001\022\013\n\007OUTLINE\020\002B[\nFcom." +
|
||||||
|
"knecon.fforesight.service.layoutparser.i" +
|
||||||
|
"nternal.api.data.redactionB\021LayoutEngine" +
|
||||||
|
"Protob\006proto3"
|
||||||
|
};
|
||||||
|
descriptor = com.google.protobuf.Descriptors.FileDescriptor
|
||||||
|
.internalBuildGeneratedFileFrom(descriptorData,
|
||||||
|
new com.google.protobuf.Descriptors.FileDescriptor[] {
|
||||||
|
});
|
||||||
|
descriptor.resolveAllFeaturesImmutable();
|
||||||
|
}
|
||||||
|
|
||||||
|
// @@protoc_insertion_point(outer_class_scope)
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,274 +1,261 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
|
||||||
|
|
||||||
import java.util.Locale;
|
|
||||||
// Generated by the protocol buffer compiler. DO NOT EDIT!
|
// Generated by the protocol buffer compiler. DO NOT EDIT!
|
||||||
// NO CHECKED-IN PROTOBUF GENCODE
|
// NO CHECKED-IN PROTOBUF GENCODE
|
||||||
// source: NodeType.proto
|
// source: NodeType.proto
|
||||||
// Protobuf Java Version: 4.27.1
|
// Protobuf Java Version: 4.28.3
|
||||||
@SuppressWarnings("all")
|
|
||||||
|
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||||
|
|
||||||
public final class NodeTypeProto {
|
public final class NodeTypeProto {
|
||||||
|
private NodeTypeProto() {}
|
||||||
|
static {
|
||||||
|
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(
|
||||||
|
com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
|
||||||
|
/* major= */ 4,
|
||||||
|
/* minor= */ 28,
|
||||||
|
/* patch= */ 3,
|
||||||
|
/* suffix= */ "",
|
||||||
|
NodeTypeProto.class.getName());
|
||||||
|
}
|
||||||
|
public static void registerAllExtensions(
|
||||||
|
com.google.protobuf.ExtensionRegistryLite registry) {
|
||||||
|
}
|
||||||
|
|
||||||
private NodeTypeProto() {}
|
public static void registerAllExtensions(
|
||||||
|
com.google.protobuf.ExtensionRegistry registry) {
|
||||||
|
registerAllExtensions(
|
||||||
|
(com.google.protobuf.ExtensionRegistryLite) registry);
|
||||||
|
}
|
||||||
|
/**
|
||||||
|
* Protobuf enum {@code NodeType}
|
||||||
|
*/
|
||||||
|
public enum NodeType
|
||||||
|
implements com.google.protobuf.ProtocolMessageEnum {
|
||||||
|
/**
|
||||||
|
* <code>DOCUMENT = 0;</code>
|
||||||
|
*/
|
||||||
|
DOCUMENT(0),
|
||||||
|
/**
|
||||||
|
* <code>SECTION = 1;</code>
|
||||||
|
*/
|
||||||
|
SECTION(1),
|
||||||
|
/**
|
||||||
|
* <code>SUPER_SECTION = 2;</code>
|
||||||
|
*/
|
||||||
|
SUPER_SECTION(2),
|
||||||
|
/**
|
||||||
|
* <code>HEADLINE = 3;</code>
|
||||||
|
*/
|
||||||
|
HEADLINE(3),
|
||||||
|
/**
|
||||||
|
* <code>PARAGRAPH = 4;</code>
|
||||||
|
*/
|
||||||
|
PARAGRAPH(4),
|
||||||
|
/**
|
||||||
|
* <code>TABLE = 5;</code>
|
||||||
|
*/
|
||||||
|
TABLE(5),
|
||||||
|
/**
|
||||||
|
* <code>TABLE_CELL = 6;</code>
|
||||||
|
*/
|
||||||
|
TABLE_CELL(6),
|
||||||
|
/**
|
||||||
|
* <code>IMAGE = 7;</code>
|
||||||
|
*/
|
||||||
|
IMAGE(7),
|
||||||
|
/**
|
||||||
|
* <code>HEADER = 8;</code>
|
||||||
|
*/
|
||||||
|
HEADER(8),
|
||||||
|
/**
|
||||||
|
* <code>FOOTER = 9;</code>
|
||||||
|
*/
|
||||||
|
FOOTER(9),
|
||||||
|
/**
|
||||||
|
* <code>TABLE_OF_CONTENTS = 10;</code>
|
||||||
|
*/
|
||||||
|
TABLE_OF_CONTENTS(10),
|
||||||
|
/**
|
||||||
|
* <code>TABLE_OF_CONTENTS_ITEM = 11;</code>
|
||||||
|
*/
|
||||||
|
TABLE_OF_CONTENTS_ITEM(11),
|
||||||
|
UNRECOGNIZED(-1),
|
||||||
|
;
|
||||||
|
|
||||||
static {
|
static {
|
||||||
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
|
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(
|
||||||
/* major= */ 4,
|
com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
|
||||||
/* minor= */ 27,
|
/* major= */ 4,
|
||||||
/* patch= */ 1,
|
/* minor= */ 28,
|
||||||
/* suffix= */ "", NodeTypeProto.class.getName());
|
/* patch= */ 3,
|
||||||
|
/* suffix= */ "",
|
||||||
|
NodeType.class.getName());
|
||||||
}
|
}
|
||||||
|
/**
|
||||||
|
* <code>DOCUMENT = 0;</code>
|
||||||
|
*/
|
||||||
|
public static final int DOCUMENT_VALUE = 0;
|
||||||
|
/**
|
||||||
|
* <code>SECTION = 1;</code>
|
||||||
|
*/
|
||||||
|
public static final int SECTION_VALUE = 1;
|
||||||
|
/**
|
||||||
|
* <code>SUPER_SECTION = 2;</code>
|
||||||
|
*/
|
||||||
|
public static final int SUPER_SECTION_VALUE = 2;
|
||||||
|
/**
|
||||||
|
* <code>HEADLINE = 3;</code>
|
||||||
|
*/
|
||||||
|
public static final int HEADLINE_VALUE = 3;
|
||||||
|
/**
|
||||||
|
* <code>PARAGRAPH = 4;</code>
|
||||||
|
*/
|
||||||
|
public static final int PARAGRAPH_VALUE = 4;
|
||||||
|
/**
|
||||||
|
* <code>TABLE = 5;</code>
|
||||||
|
*/
|
||||||
|
public static final int TABLE_VALUE = 5;
|
||||||
|
/**
|
||||||
|
* <code>TABLE_CELL = 6;</code>
|
||||||
|
*/
|
||||||
|
public static final int TABLE_CELL_VALUE = 6;
|
||||||
|
/**
|
||||||
|
* <code>IMAGE = 7;</code>
|
||||||
|
*/
|
||||||
|
public static final int IMAGE_VALUE = 7;
|
||||||
|
/**
|
||||||
|
* <code>HEADER = 8;</code>
|
||||||
|
*/
|
||||||
|
public static final int HEADER_VALUE = 8;
|
||||||
|
/**
|
||||||
|
* <code>FOOTER = 9;</code>
|
||||||
|
*/
|
||||||
|
public static final int FOOTER_VALUE = 9;
|
||||||
|
/**
|
||||||
|
* <code>TABLE_OF_CONTENTS = 10;</code>
|
||||||
|
*/
|
||||||
|
public static final int TABLE_OF_CONTENTS_VALUE = 10;
|
||||||
|
/**
|
||||||
|
* <code>TABLE_OF_CONTENTS_ITEM = 11;</code>
|
||||||
|
*/
|
||||||
|
public static final int TABLE_OF_CONTENTS_ITEM_VALUE = 11;
|
||||||
|
|
||||||
public static void registerAllExtensions(com.google.protobuf.ExtensionRegistryLite registry) {
|
|
||||||
|
|
||||||
|
public final int getNumber() {
|
||||||
|
if (this == UNRECOGNIZED) {
|
||||||
|
throw new java.lang.IllegalArgumentException(
|
||||||
|
"Can't get the number of an unknown enum value.");
|
||||||
|
}
|
||||||
|
return value;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static void registerAllExtensions(com.google.protobuf.ExtensionRegistry registry) {
|
|
||||||
|
|
||||||
registerAllExtensions((com.google.protobuf.ExtensionRegistryLite) registry);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Protobuf enum {@code NodeType}
|
* @param value The numeric wire value of the corresponding enum entry.
|
||||||
|
* @return The enum associated with the given numeric wire value.
|
||||||
|
* @deprecated Use {@link #forNumber(int)} instead.
|
||||||
*/
|
*/
|
||||||
public enum NodeType implements com.google.protobuf.ProtocolMessageEnum {
|
@java.lang.Deprecated
|
||||||
/**
|
public static NodeType valueOf(int value) {
|
||||||
* <code>DOCUMENT = 0;</code>
|
return forNumber(value);
|
||||||
*/
|
}
|
||||||
DOCUMENT(0),
|
|
||||||
/**
|
|
||||||
* <code>SECTION = 1;</code>
|
|
||||||
*/
|
|
||||||
SECTION(1),
|
|
||||||
/**
|
|
||||||
* <code>SUPER_SECTION = 2;</code>
|
|
||||||
*/
|
|
||||||
SUPER_SECTION(2),
|
|
||||||
/**
|
|
||||||
* <code>HEADLINE = 3;</code>
|
|
||||||
*/
|
|
||||||
HEADLINE(3),
|
|
||||||
/**
|
|
||||||
* <code>PARAGRAPH = 4;</code>
|
|
||||||
*/
|
|
||||||
PARAGRAPH(4),
|
|
||||||
/**
|
|
||||||
* <code>TABLE = 5;</code>
|
|
||||||
*/
|
|
||||||
TABLE(5),
|
|
||||||
/**
|
|
||||||
* <code>TABLE_CELL = 6;</code>
|
|
||||||
*/
|
|
||||||
TABLE_CELL(6),
|
|
||||||
/**
|
|
||||||
* <code>IMAGE = 7;</code>
|
|
||||||
*/
|
|
||||||
IMAGE(7),
|
|
||||||
/**
|
|
||||||
* <code>HEADER = 8;</code>
|
|
||||||
*/
|
|
||||||
HEADER(8),
|
|
||||||
/**
|
|
||||||
* <code>FOOTER = 9;</code>
|
|
||||||
*/
|
|
||||||
FOOTER(9),
|
|
||||||
UNRECOGNIZED(-1),
|
|
||||||
;
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param value The numeric wire value of the corresponding enum entry.
|
||||||
|
* @return The enum associated with the given numeric wire value.
|
||||||
|
*/
|
||||||
|
public static NodeType forNumber(int value) {
|
||||||
|
switch (value) {
|
||||||
|
case 0: return DOCUMENT;
|
||||||
|
case 1: return SECTION;
|
||||||
|
case 2: return SUPER_SECTION;
|
||||||
|
case 3: return HEADLINE;
|
||||||
|
case 4: return PARAGRAPH;
|
||||||
|
case 5: return TABLE;
|
||||||
|
case 6: return TABLE_CELL;
|
||||||
|
case 7: return IMAGE;
|
||||||
|
case 8: return HEADER;
|
||||||
|
case 9: return FOOTER;
|
||||||
|
case 10: return TABLE_OF_CONTENTS;
|
||||||
|
case 11: return TABLE_OF_CONTENTS_ITEM;
|
||||||
|
default: return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public String toString() {
|
public static com.google.protobuf.Internal.EnumLiteMap<NodeType>
|
||||||
|
internalGetValueMap() {
|
||||||
return this.name().charAt(0) + this.name().substring(1).toLowerCase(Locale.ROOT);
|
return internalValueMap;
|
||||||
}
|
}
|
||||||
|
private static final com.google.protobuf.Internal.EnumLiteMap<
|
||||||
|
NodeType> internalValueMap =
|
||||||
static {
|
new com.google.protobuf.Internal.EnumLiteMap<NodeType>() {
|
||||||
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
|
|
||||||
/* major= */ 4,
|
|
||||||
/* minor= */ 27,
|
|
||||||
/* patch= */ 1,
|
|
||||||
/* suffix= */ "", NodeType.class.getName());
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* <code>DOCUMENT = 0;</code>
|
|
||||||
*/
|
|
||||||
public static final int DOCUMENT_VALUE = 0;
|
|
||||||
/**
|
|
||||||
* <code>SECTION = 1;</code>
|
|
||||||
*/
|
|
||||||
public static final int SECTION_VALUE = 1;
|
|
||||||
/**
|
|
||||||
* <code>SUPER_SECTION = 2;</code>
|
|
||||||
*/
|
|
||||||
public static final int SUPER_SECTION_VALUE = 2;
|
|
||||||
/**
|
|
||||||
* <code>HEADLINE = 3;</code>
|
|
||||||
*/
|
|
||||||
public static final int HEADLINE_VALUE = 3;
|
|
||||||
/**
|
|
||||||
* <code>PARAGRAPH = 4;</code>
|
|
||||||
*/
|
|
||||||
public static final int PARAGRAPH_VALUE = 4;
|
|
||||||
/**
|
|
||||||
* <code>TABLE = 5;</code>
|
|
||||||
*/
|
|
||||||
public static final int TABLE_VALUE = 5;
|
|
||||||
/**
|
|
||||||
* <code>TABLE_CELL = 6;</code>
|
|
||||||
*/
|
|
||||||
public static final int TABLE_CELL_VALUE = 6;
|
|
||||||
/**
|
|
||||||
* <code>IMAGE = 7;</code>
|
|
||||||
*/
|
|
||||||
public static final int IMAGE_VALUE = 7;
|
|
||||||
/**
|
|
||||||
* <code>HEADER = 8;</code>
|
|
||||||
*/
|
|
||||||
public static final int HEADER_VALUE = 8;
|
|
||||||
/**
|
|
||||||
* <code>FOOTER = 9;</code>
|
|
||||||
*/
|
|
||||||
public static final int FOOTER_VALUE = 9;
|
|
||||||
|
|
||||||
|
|
||||||
public final int getNumber() {
|
|
||||||
|
|
||||||
if (this == UNRECOGNIZED) {
|
|
||||||
throw new IllegalArgumentException("Can't get the number of an unknown enum value.");
|
|
||||||
}
|
|
||||||
return value;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @param value The numeric wire value of the corresponding enum entry.
|
|
||||||
* @return The enum associated with the given numeric wire value.
|
|
||||||
* @deprecated Use {@link #forNumber(int)} instead.
|
|
||||||
*/
|
|
||||||
@Deprecated
|
|
||||||
public static NodeType valueOf(int value) {
|
|
||||||
|
|
||||||
return forNumber(value);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @param value The numeric wire value of the corresponding enum entry.
|
|
||||||
* @return The enum associated with the given numeric wire value.
|
|
||||||
*/
|
|
||||||
public static NodeType forNumber(int value) {
|
|
||||||
|
|
||||||
switch (value) {
|
|
||||||
case 0:
|
|
||||||
return DOCUMENT;
|
|
||||||
case 1:
|
|
||||||
return SECTION;
|
|
||||||
case 2:
|
|
||||||
return SUPER_SECTION;
|
|
||||||
case 3:
|
|
||||||
return HEADLINE;
|
|
||||||
case 4:
|
|
||||||
return PARAGRAPH;
|
|
||||||
case 5:
|
|
||||||
return TABLE;
|
|
||||||
case 6:
|
|
||||||
return TABLE_CELL;
|
|
||||||
case 7:
|
|
||||||
return IMAGE;
|
|
||||||
case 8:
|
|
||||||
return HEADER;
|
|
||||||
case 9:
|
|
||||||
return FOOTER;
|
|
||||||
default:
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public static com.google.protobuf.Internal.EnumLiteMap<NodeType> internalGetValueMap() {
|
|
||||||
|
|
||||||
return internalValueMap;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private static final com.google.protobuf.Internal.EnumLiteMap<NodeType> internalValueMap = new com.google.protobuf.Internal.EnumLiteMap<NodeType>() {
|
|
||||||
public NodeType findValueByNumber(int number) {
|
public NodeType findValueByNumber(int number) {
|
||||||
|
return NodeType.forNumber(number);
|
||||||
return NodeType.forNumber(number);
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
public final com.google.protobuf.Descriptors.EnumValueDescriptor
|
||||||
public final com.google.protobuf.Descriptors.EnumValueDescriptor getValueDescriptor() {
|
getValueDescriptor() {
|
||||||
|
if (this == UNRECOGNIZED) {
|
||||||
if (this == UNRECOGNIZED) {
|
throw new java.lang.IllegalStateException(
|
||||||
throw new IllegalStateException("Can't get the descriptor of an unrecognized enum value.");
|
"Can't get the descriptor of an unrecognized enum value.");
|
||||||
}
|
}
|
||||||
return getDescriptor().getValues()
|
return getDescriptor().getValues().get(ordinal());
|
||||||
.get(ordinal());
|
}
|
||||||
}
|
public final com.google.protobuf.Descriptors.EnumDescriptor
|
||||||
|
getDescriptorForType() {
|
||||||
|
return getDescriptor();
|
||||||
public final com.google.protobuf.Descriptors.EnumDescriptor getDescriptorForType() {
|
}
|
||||||
|
public static final com.google.protobuf.Descriptors.EnumDescriptor
|
||||||
return getDescriptor();
|
getDescriptor() {
|
||||||
}
|
return com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.getDescriptor().getEnumTypes().get(0);
|
||||||
|
|
||||||
|
|
||||||
public static final com.google.protobuf.Descriptors.EnumDescriptor getDescriptor() {
|
|
||||||
|
|
||||||
return NodeTypeProto.getDescriptor().getEnumTypes()
|
|
||||||
.get(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private static final NodeType[] VALUES = values();
|
|
||||||
|
|
||||||
|
|
||||||
public static NodeType valueOf(com.google.protobuf.Descriptors.EnumValueDescriptor desc) {
|
|
||||||
|
|
||||||
if (desc.getType() != getDescriptor()) {
|
|
||||||
throw new IllegalArgumentException("EnumValueDescriptor is not for this type.");
|
|
||||||
}
|
|
||||||
if (desc.getIndex() == -1) {
|
|
||||||
return UNRECOGNIZED;
|
|
||||||
}
|
|
||||||
return VALUES[desc.getIndex()];
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private final int value;
|
|
||||||
|
|
||||||
|
|
||||||
private NodeType(int value) {
|
|
||||||
|
|
||||||
this.value = value;
|
|
||||||
}
|
|
||||||
|
|
||||||
// @@protoc_insertion_point(enum_scope:NodeType)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static final NodeType[] VALUES = values();
|
||||||
|
|
||||||
public static com.google.protobuf.Descriptors.FileDescriptor getDescriptor() {
|
public static NodeType valueOf(
|
||||||
|
com.google.protobuf.Descriptors.EnumValueDescriptor desc) {
|
||||||
return descriptor;
|
if (desc.getType() != getDescriptor()) {
|
||||||
|
throw new java.lang.IllegalArgumentException(
|
||||||
|
"EnumValueDescriptor is not for this type.");
|
||||||
|
}
|
||||||
|
if (desc.getIndex() == -1) {
|
||||||
|
return UNRECOGNIZED;
|
||||||
|
}
|
||||||
|
return VALUES[desc.getIndex()];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private final int value;
|
||||||
|
|
||||||
private static com.google.protobuf.Descriptors.FileDescriptor descriptor;
|
private NodeType(int value) {
|
||||||
|
this.value = value;
|
||||||
static {
|
|
||||||
String[] descriptorData = {"\n\016NodeType.proto*\223\001\n\010NodeType\022\014\n\010DOCUMEN"
|
|
||||||
+ "T\020\000\022\013\n\007SECTION\020\001\022\021\n\rSUPER_SECTION\020\002\022\014\n\010H"
|
|
||||||
+ "EADLINE\020\003\022\r\n\tPARAGRAPH\020\004\022\t\n\005TABLE\020\005\022\016\n\nT"
|
|
||||||
+ "ABLE_CELL\020\006\022\t\n\005IMAGE\020\007\022\n\n\006HEADER\020\010\022\n\n\006FO"
|
|
||||||
+ "OTER\020\tb\006proto3"};
|
|
||||||
descriptor = com.google.protobuf.Descriptors.FileDescriptor.internalBuildGeneratedFileFrom(descriptorData, new com.google.protobuf.Descriptors.FileDescriptor[]{});
|
|
||||||
descriptor.resolveAllFeaturesImmutable();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// @@protoc_insertion_point(outer_class_scope)
|
// @@protoc_insertion_point(enum_scope:NodeType)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static com.google.protobuf.Descriptors.FileDescriptor
|
||||||
|
getDescriptor() {
|
||||||
|
return descriptor;
|
||||||
|
}
|
||||||
|
private static com.google.protobuf.Descriptors.FileDescriptor
|
||||||
|
descriptor;
|
||||||
|
static {
|
||||||
|
java.lang.String[] descriptorData = {
|
||||||
|
"\n\016NodeType.proto*\306\001\n\010NodeType\022\014\n\010DOCUMEN" +
|
||||||
|
"T\020\000\022\013\n\007SECTION\020\001\022\021\n\rSUPER_SECTION\020\002\022\014\n\010H" +
|
||||||
|
"EADLINE\020\003\022\r\n\tPARAGRAPH\020\004\022\t\n\005TABLE\020\005\022\016\n\nT" +
|
||||||
|
"ABLE_CELL\020\006\022\t\n\005IMAGE\020\007\022\n\n\006HEADER\020\010\022\n\n\006FO" +
|
||||||
|
"OTER\020\t\022\025\n\021TABLE_OF_CONTENTS\020\n\022\032\n\026TABLE_O" +
|
||||||
|
"F_CONTENTS_ITEM\020\013BW\nFcom.knecon.fforesig" +
|
||||||
|
"ht.service.layoutparser.internal.api.dat" +
|
||||||
|
"a.redactionB\rNodeTypeProtob\006proto3"
|
||||||
|
};
|
||||||
|
descriptor = com.google.protobuf.Descriptors.FileDescriptor
|
||||||
|
.internalBuildGeneratedFileFrom(descriptorData,
|
||||||
|
new com.google.protobuf.Descriptors.FileDescriptor[] {
|
||||||
|
});
|
||||||
|
descriptor.resolveAllFeaturesImmutable();
|
||||||
|
}
|
||||||
|
|
||||||
|
// @@protoc_insertion_point(outer_class_scope)
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,5 +1,9 @@
|
|||||||
syntax = "proto3";
|
syntax = "proto3";
|
||||||
|
|
||||||
|
option java_outer_classname = "DocumentPageProto";
|
||||||
|
option java_package = "com.knecon.fforesight.service.layoutparser.internal.api.data.redaction";
|
||||||
|
|
||||||
|
|
||||||
message AllDocumentPages {
|
message AllDocumentPages {
|
||||||
|
|
||||||
repeated DocumentPage documentPages = 1;
|
repeated DocumentPage documentPages = 1;
|
||||||
|
|||||||
@ -1,5 +1,8 @@
|
|||||||
syntax = "proto3";
|
syntax = "proto3";
|
||||||
|
|
||||||
|
option java_outer_classname = "DocumentPositionDataProto";
|
||||||
|
option java_package = "com.knecon.fforesight.service.layoutparser.internal.api.data.redaction";
|
||||||
|
|
||||||
message AllDocumentPositionData {
|
message AllDocumentPositionData {
|
||||||
|
|
||||||
repeated DocumentPositionData documentPositionData = 1;
|
repeated DocumentPositionData documentPositionData = 1;
|
||||||
|
|||||||
@ -1,5 +1,9 @@
|
|||||||
syntax = "proto3";
|
syntax = "proto3";
|
||||||
|
|
||||||
|
option java_outer_classname = "DocumentStructureProto";
|
||||||
|
option java_package = "com.knecon.fforesight.service.layoutparser.internal.api.data.redaction";
|
||||||
|
|
||||||
|
|
||||||
import "EntryData.proto";
|
import "EntryData.proto";
|
||||||
|
|
||||||
message DocumentStructure {
|
message DocumentStructure {
|
||||||
|
|||||||
@ -1,5 +1,8 @@
|
|||||||
syntax = "proto3";
|
syntax = "proto3";
|
||||||
|
|
||||||
|
option java_outer_classname = "DocumentTextDataProto";
|
||||||
|
option java_package = "com.knecon.fforesight.service.layoutparser.internal.api.data.redaction";
|
||||||
|
|
||||||
message AllDocumentTextData {
|
message AllDocumentTextData {
|
||||||
|
|
||||||
repeated DocumentTextData documentTextData = 1;
|
repeated DocumentTextData documentTextData = 1;
|
||||||
|
|||||||
@ -3,6 +3,9 @@ syntax = "proto3";
|
|||||||
import "LayoutEngine.proto";
|
import "LayoutEngine.proto";
|
||||||
import "NodeType.proto";
|
import "NodeType.proto";
|
||||||
|
|
||||||
|
option java_outer_classname = "EntryDataProto";
|
||||||
|
option java_package = "com.knecon.fforesight.service.layoutparser.internal.api.data.redaction";
|
||||||
|
|
||||||
message EntryData {
|
message EntryData {
|
||||||
// Type of the semantic node.
|
// Type of the semantic node.
|
||||||
NodeType type = 1;
|
NodeType type = 1;
|
||||||
|
|||||||
@ -1,5 +1,6 @@
|
|||||||
syntax = "proto3";
|
syntax = "proto3";
|
||||||
|
option java_outer_classname = "LayoutEngineProto";
|
||||||
|
option java_package = "com.knecon.fforesight.service.layoutparser.internal.api.data.redaction";
|
||||||
enum LayoutEngine {
|
enum LayoutEngine {
|
||||||
ALGORITHM = 0;
|
ALGORITHM = 0;
|
||||||
AI = 1;
|
AI = 1;
|
||||||
|
|||||||
@ -1,5 +1,8 @@
|
|||||||
syntax = "proto3";
|
syntax = "proto3";
|
||||||
|
|
||||||
|
option java_outer_classname = "NodeTypeProto";
|
||||||
|
option java_package = "com.knecon.fforesight.service.layoutparser.internal.api.data.redaction";
|
||||||
|
|
||||||
enum NodeType {
|
enum NodeType {
|
||||||
DOCUMENT = 0;
|
DOCUMENT = 0;
|
||||||
SECTION = 1;
|
SECTION = 1;
|
||||||
@ -11,4 +14,6 @@ enum NodeType {
|
|||||||
IMAGE = 7;
|
IMAGE = 7;
|
||||||
HEADER = 8;
|
HEADER = 8;
|
||||||
FOOTER = 9;
|
FOOTER = 9;
|
||||||
|
TABLE_OF_CONTENTS = 10;
|
||||||
|
TABLE_OF_CONTENTS_ITEM = 11;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -0,0 +1,26 @@
|
|||||||
|
#!/bin/bash
#
# Regenerates the Java protobuf classes from the .proto files.
#
# The protoc invocation at the bottom uses paths relative to the current
# working directory (./*.proto as input, ../java as output), so run this
# script from the directory that contains the .proto files.

# Minimum required protoc version
MIN_VERSION="28.3"

# Succeeds (exit status 0) when version $1 is strictly older than version $2.
#
# `sort -V` orders the two version strings component-wise, so the older one
# comes out first. $1 is older than $2 only if it sorts first AND the two
# strings differ; equal versions are not "less than".
version_lt() {
    [ "$1" != "$2" ] && [ "$(printf '%s\n' "$1" "$2" | sort -V | head -n1)" = "$1" ]
}

# Make sure protoc exists before asking it for its version; otherwise the
# version lookup below would only produce a "command not found" error.
if ! command -v protoc &> /dev/null; then
    echo "Error: protoc is not installed. Please install version $MIN_VERSION or later."
    exit 1
fi

# Get the installed protoc version (second field of `protoc --version`)
INSTALLED_VERSION=$(protoc --version | awk '{print $2}')

# Refuse to generate code with a compiler older than the required minimum
if version_lt "$INSTALLED_VERSION" "$MIN_VERSION"; then
    echo "Error: protoc version $INSTALLED_VERSION is too old. Please upgrade to version $MIN_VERSION or later."
    exit 1
fi

# Generate Java files from proto files
protoc --java_out=../java ./*.proto
|
||||||
@ -35,6 +35,4 @@ dependencies {
|
|||||||
implementation("org.commonmark:commonmark-ext-gfm-tables:0.22.0")
|
implementation("org.commonmark:commonmark-ext-gfm-tables:0.22.0")
|
||||||
implementation("com.pdftron:PDFNet:10.11.0")
|
implementation("com.pdftron:PDFNet:10.11.0")
|
||||||
implementation("org.apache.commons:commons-text:1.12.0")
|
implementation("org.apache.commons:commons-text:1.12.0")
|
||||||
|
|
||||||
implementation("com.google.protobuf:protobuf-java-util:4.27.1")
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -39,10 +39,9 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Do
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineValidationService;
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineValidationService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TOCEnrichmentService;
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeBuilderService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTree;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
@ -107,7 +106,7 @@ public class LayoutParsingPipeline {
|
|||||||
GraphicExtractorService graphicExtractorService;
|
GraphicExtractorService graphicExtractorService;
|
||||||
OutlineExtractorService outlineExtractorService;
|
OutlineExtractorService outlineExtractorService;
|
||||||
OutlineValidationService outlineValidationService;
|
OutlineValidationService outlineValidationService;
|
||||||
TOCEnrichmentService tocEnrichmentService;
|
SectionTreeBuilderService sectionTreeBuilderService;
|
||||||
LayoutparserSettings settings;
|
LayoutparserSettings settings;
|
||||||
ClassificationService classificationService;
|
ClassificationService classificationService;
|
||||||
|
|
||||||
@ -345,14 +344,14 @@ public class LayoutParsingPipeline {
|
|||||||
|
|
||||||
classificationService.classify(classificationDocument, layoutParsingType, identifier);
|
classificationService.classify(classificationDocument, layoutParsingType, identifier);
|
||||||
|
|
||||||
TableOfContents tableOfContents = outlineValidationService.createToC(classificationDocument);
|
SectionTree sectionTree = outlineValidationService.createSectionTree(classificationDocument);
|
||||||
classificationDocument.setTableOfContents(tableOfContents);
|
classificationDocument.setSectionTree(sectionTree);
|
||||||
|
|
||||||
log.info("Building Sections for {}", identifier);
|
log.info("Building Sections for {}", identifier);
|
||||||
|
|
||||||
switch (layoutParsingType) {
|
switch (layoutParsingType) {
|
||||||
case CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_PARAGRAPH_DEBUG -> sectionsBuilderService.buildParagraphDebugSections(classificationDocument);
|
case CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_PARAGRAPH_DEBUG -> sectionsBuilderService.buildParagraphDebugSections(classificationDocument);
|
||||||
default -> tocEnrichmentService.assignSectionBlocksAndImages(classificationDocument);
|
default -> sectionTreeBuilderService.assignSectionBlocksAndImages(classificationDocument);
|
||||||
}
|
}
|
||||||
|
|
||||||
return classificationDocument;
|
return classificationDocument;
|
||||||
|
|||||||
@ -4,7 +4,7 @@ import java.util.ArrayList;
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTree;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
|
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
|
||||||
@ -31,6 +31,6 @@ public class ClassificationDocument {
|
|||||||
private long rulesVersion;
|
private long rulesVersion;
|
||||||
|
|
||||||
private OutlineObjectTree outlineObjectTree;
|
private OutlineObjectTree outlineObjectTree;
|
||||||
private TableOfContents tableOfContents;
|
private SectionTree sectionTree;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -14,6 +14,7 @@ public enum PageBlockType {
|
|||||||
PARAGRAPH_ITALIC,
|
PARAGRAPH_ITALIC,
|
||||||
PARAGRAPH_UNKNOWN,
|
PARAGRAPH_UNKNOWN,
|
||||||
OTHER,
|
OTHER,
|
||||||
|
TABLE_OF_CONTENTS_HEADLINE,
|
||||||
TABLE_OF_CONTENTS_ITEM,
|
TABLE_OF_CONTENTS_ITEM,
|
||||||
LIST_ITEM,
|
LIST_ITEM,
|
||||||
TABLE;
|
TABLE;
|
||||||
@ -35,7 +36,7 @@ public enum PageBlockType {
|
|||||||
public static int getHeadlineNumber(PageBlockType pageBlockType) {
|
public static int getHeadlineNumber(PageBlockType pageBlockType) {
|
||||||
|
|
||||||
return switch (pageBlockType) {
|
return switch (pageBlockType) {
|
||||||
case H1 -> 1;
|
case H1, TABLE_OF_CONTENTS_HEADLINE -> 1;
|
||||||
case H2 -> 2;
|
case H2 -> 2;
|
||||||
case H3 -> 3;
|
case H3 -> 3;
|
||||||
case H4 -> 4;
|
case H4 -> 4;
|
||||||
@ -47,6 +48,6 @@ public enum PageBlockType {
|
|||||||
|
|
||||||
public boolean isHeadline() {
|
public boolean isHeadline() {
|
||||||
|
|
||||||
return this.equals(H1) || this.equals(H2) || this.equals(H3) || this.equals(H4) || this.equals(H5) || this.equals(H6);
|
return this.equals(H1) || this.equals(H2) || this.equals(H3) || this.equals(H4) || this.equals(H5) || this.equals(H6) || this.equals(TABLE_OF_CONTENTS_HEADLINE);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -16,7 +16,7 @@ import lombok.experimental.FieldDefaults;
|
|||||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
public class SectionIdentifier {
|
public class SectionIdentifier {
|
||||||
|
|
||||||
public static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d{1,2})[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?");
|
public static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d{1,2})(?:[\\s.,;](\\d{1,2}))?(?:[\\s.,;](\\d{1,2}))?(?:[\\s.,;](\\d{1,2}))?");
|
||||||
public static Pattern alphanumericIdentifierPattern = Pattern.compile("^[\\s]?[A-Za-z][\\s.,;]?(\\d{1,2})[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?");
|
public static Pattern alphanumericIdentifierPattern = Pattern.compile("^[\\s]?[A-Za-z][\\s.,;]?(\\d{1,2})[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?");
|
||||||
|
|
||||||
public enum Format {
|
public enum Format {
|
||||||
|
|||||||
@ -11,6 +11,8 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Se
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableOfContents;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableOfContentsItem;
|
||||||
|
|
||||||
public abstract class AbstractNodeVisitor implements NodeVisitor {
|
public abstract class AbstractNodeVisitor implements NodeVisitor {
|
||||||
|
|
||||||
@ -83,6 +85,18 @@ public abstract class AbstractNodeVisitor implements NodeVisitor {
|
|||||||
visitChildren(tableCell);
|
visitChildren(tableCell);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void visit(TableOfContents toc) {
|
||||||
|
|
||||||
|
visitChildren(toc);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void visit(TableOfContentsItem toci) {
|
||||||
|
|
||||||
|
visitChildren(toci);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
protected void visitChildren(SemanticNode semanticNode) {
|
protected void visitChildren(SemanticNode semanticNode) {
|
||||||
|
|
||||||
|
|||||||
@ -10,6 +10,10 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Se
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableOfContents;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableOfContentsItem;
|
||||||
|
|
||||||
|
import software.amazon.awssdk.utils.builder.ToCopyableBuilder;
|
||||||
|
|
||||||
public interface NodeVisitor {
|
public interface NodeVisitor {
|
||||||
|
|
||||||
@ -42,4 +46,10 @@ public interface NodeVisitor {
|
|||||||
|
|
||||||
void visit(TableCell tableCell);
|
void visit(TableCell tableCell);
|
||||||
|
|
||||||
|
|
||||||
|
void visit(TableOfContents tableOfContents);
|
||||||
|
|
||||||
|
|
||||||
|
void visit(TableOfContentsItem tableOfContentsItem);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -0,0 +1,41 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.EqualsAndHashCode;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
import lombok.experimental.SuperBuilder;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@SuperBuilder
|
||||||
|
@AllArgsConstructor
|
||||||
|
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||||
|
@EqualsAndHashCode(callSuper = true)
|
||||||
|
public class TableOfContents extends AbstractSemanticNode {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public NodeTypeProto.NodeType getType() {
|
||||||
|
|
||||||
|
return NodeTypeProto.NodeType.TABLE_OF_CONTENTS;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Headline getHeadline() {
|
||||||
|
|
||||||
|
return streamChildrenOfType(NodeTypeProto.NodeType.HEADLINE).map(node -> (Headline) node)
|
||||||
|
.findFirst()
|
||||||
|
.orElseGet(() -> getParent().getHeadline());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void accept(NodeVisitor visitor) {
|
||||||
|
|
||||||
|
visitor.visit(this);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,51 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.EqualsAndHashCode;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
import lombok.experimental.SuperBuilder;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@SuperBuilder
|
||||||
|
@AllArgsConstructor
|
||||||
|
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||||
|
@EqualsAndHashCode(callSuper = true)
|
||||||
|
public class TableOfContentsItem extends AbstractSemanticNode {
|
||||||
|
|
||||||
|
TextBlock leafTextBlock;
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public NodeTypeProto.NodeType getType() {
|
||||||
|
|
||||||
|
return NodeTypeProto.NodeType.TABLE_OF_CONTENTS_ITEM;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean isLeaf() {
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void accept(NodeVisitor visitor) {
|
||||||
|
|
||||||
|
visitor.visit(this);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TextBlock getTextBlock() {
|
||||||
|
|
||||||
|
return leafTextBlock;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -1,5 +1,6 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||||
|
|
||||||
|
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.TABLE_OF_CONTENTS_HEADLINE;
|
||||||
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.getHeadlineNumber;
|
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.getHeadlineNumber;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
@ -21,20 +22,20 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
public class OutlineValidationService {
|
public class OutlineValidationService {
|
||||||
|
|
||||||
@Observed(name = "OutlineValidationService", contextualName = "create-toc")
|
@Observed(name = "OutlineValidationService", contextualName = "create-toc")
|
||||||
public TableOfContents createToC(ClassificationDocument classificationDocument) {
|
public SectionTree createSectionTree(ClassificationDocument classificationDocument) {
|
||||||
|
|
||||||
List<TextPageBlock> headlines = extractHeadlines(classificationDocument);
|
List<TextPageBlock> headlines = extractHeadlines(classificationDocument);
|
||||||
|
|
||||||
List<TableOfContentItem> mainSections = new ArrayList<>();
|
List<SectionTreeEntry> mainSections = new ArrayList<>();
|
||||||
Map<Integer, TableOfContentItem> lastItemsPerDepth = new HashMap<>();
|
Map<Integer, SectionTreeEntry> lastItemsPerDepth = new HashMap<>();
|
||||||
TableOfContentItem last = null;
|
SectionTreeEntry last = null;
|
||||||
TreeSet<Integer> depths = new TreeSet<>();
|
TreeSet<Integer> depths = new TreeSet<>();
|
||||||
|
|
||||||
for (TextPageBlock current : headlines) {
|
for (TextPageBlock current : headlines) {
|
||||||
int currentDepth = getHeadlineNumber(current.getClassification());
|
int currentDepth = getHeadlineNumber(current.getClassification());
|
||||||
Integer parentDepth = depths.floor(currentDepth - 1);
|
Integer parentDepth = depths.floor(currentDepth - 1);
|
||||||
|
|
||||||
var tocItem = new TableOfContentItem(current);
|
var tocItem = new SectionTreeEntry(current);
|
||||||
|
|
||||||
if (parentDepth == null) {
|
if (parentDepth == null) {
|
||||||
mainSections.add(tocItem);
|
mainSections.add(tocItem);
|
||||||
@ -44,14 +45,16 @@ public class OutlineValidationService {
|
|||||||
} else {
|
} else {
|
||||||
assert last != null;
|
assert last != null;
|
||||||
int lastDepth = getHeadlineNumber(last.getHeadline().getClassification());
|
int lastDepth = getHeadlineNumber(last.getHeadline().getClassification());
|
||||||
|
if (last.getHeadline().getClassification().equals(TABLE_OF_CONTENTS_HEADLINE) && !current.getClassification().equals(TABLE_OF_CONTENTS_HEADLINE)) {
|
||||||
if (lastDepth < parentDepth) {
|
// headline after toc should always start a main section
|
||||||
|
parentDepth = 1;
|
||||||
|
} else if (lastDepth < parentDepth) {
|
||||||
parentDepth = lastDepth;
|
parentDepth = lastDepth;
|
||||||
} else if (lastDepth == currentDepth && last.getParent() != null) {
|
} else if (lastDepth == currentDepth && last.getParent() != null) {
|
||||||
parentDepth = getHeadlineNumber(last.getParent().getHeadline().getClassification());
|
parentDepth = getHeadlineNumber(last.getParent().getHeadline().getClassification());
|
||||||
}
|
}
|
||||||
|
|
||||||
TableOfContentItem parent = lastItemsPerDepth.get(parentDepth);
|
SectionTreeEntry parent = lastItemsPerDepth.get(parentDepth);
|
||||||
parent.addChild(tocItem);
|
parent.addChild(tocItem);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -60,7 +63,10 @@ public class OutlineValidationService {
|
|||||||
depths.add(currentDepth);
|
depths.add(currentDepth);
|
||||||
}
|
}
|
||||||
|
|
||||||
return new TableOfContents(mainSections);
|
return new
|
||||||
|
|
||||||
|
SectionTree(mainSections);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -14,12 +14,12 @@ import lombok.RequiredArgsConstructor;
|
|||||||
|
|
||||||
@Data
|
@Data
|
||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
public class TableOfContents implements Iterable<TableOfContentItem> {
|
public class SectionTree implements Iterable<SectionTreeEntry> {
|
||||||
|
|
||||||
private List<TableOfContentItem> mainSections = new ArrayList<>();
|
private List<SectionTreeEntry> mainSections = new ArrayList<>();
|
||||||
|
|
||||||
|
|
||||||
public TableOfContents(List<TableOfContentItem> mainSections) {
|
public SectionTree(List<SectionTreeEntry> mainSections) {
|
||||||
|
|
||||||
this.mainSections = mainSections;
|
this.mainSections = mainSections;
|
||||||
}
|
}
|
||||||
@ -28,36 +28,36 @@ public class TableOfContents implements Iterable<TableOfContentItem> {
|
|||||||
public List<TextPageBlock> getAllTextPageBlocks() {
|
public List<TextPageBlock> getAllTextPageBlocks() {
|
||||||
|
|
||||||
List<TextPageBlock> allTextPageBlocks = new ArrayList<>();
|
List<TextPageBlock> allTextPageBlocks = new ArrayList<>();
|
||||||
for (TableOfContentItem item : mainSections) {
|
for (SectionTreeEntry item : mainSections) {
|
||||||
collectTextPageBlocks(item, allTextPageBlocks);
|
collectTextPageBlocks(item, allTextPageBlocks);
|
||||||
}
|
}
|
||||||
return allTextPageBlocks;
|
return allTextPageBlocks;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void collectTextPageBlocks(TableOfContentItem item, List<TextPageBlock> textPageBlocks) {
|
private void collectTextPageBlocks(SectionTreeEntry item, List<TextPageBlock> textPageBlocks) {
|
||||||
|
|
||||||
textPageBlocks.add(item.getHeadline());
|
textPageBlocks.add(item.getHeadline());
|
||||||
for (TableOfContentItem child : item.getChildren()) {
|
for (SectionTreeEntry child : item.getChildren()) {
|
||||||
collectTextPageBlocks(child, textPageBlocks);
|
collectTextPageBlocks(child, textPageBlocks);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public List<TableOfContentItem> getAllTableOfContentItems() {
|
public List<SectionTreeEntry> getAllTableOfContentItems() {
|
||||||
|
|
||||||
List<TableOfContentItem> allItems = new ArrayList<>();
|
List<SectionTreeEntry> allItems = new ArrayList<>();
|
||||||
for (TableOfContentItem item : mainSections) {
|
for (SectionTreeEntry item : mainSections) {
|
||||||
collectTableOfContentItems(item, allItems);
|
collectTableOfContentItems(item, allItems);
|
||||||
}
|
}
|
||||||
return allItems;
|
return allItems;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void collectTableOfContentItems(TableOfContentItem item, List<TableOfContentItem> allItems) {
|
private void collectTableOfContentItems(SectionTreeEntry item, List<SectionTreeEntry> allItems) {
|
||||||
|
|
||||||
allItems.add(item);
|
allItems.add(item);
|
||||||
for (TableOfContentItem child : item.getChildren()) {
|
for (SectionTreeEntry child : item.getChildren()) {
|
||||||
collectTableOfContentItems(child, allItems);
|
collectTableOfContentItems(child, allItems);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -65,7 +65,7 @@ public class TableOfContents implements Iterable<TableOfContentItem> {
|
|||||||
|
|
||||||
private boolean containsBlock(TextPageBlock block) {
|
private boolean containsBlock(TextPageBlock block) {
|
||||||
|
|
||||||
for (TableOfContentItem existingItem : this.getMainSections()) {
|
for (SectionTreeEntry existingItem : this.getMainSections()) {
|
||||||
if (existingItem.getHeadline().equals(block) || existingItem.contains(block)) {
|
if (existingItem.getHeadline().equals(block) || existingItem.contains(block)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -74,9 +74,9 @@ public class TableOfContents implements Iterable<TableOfContentItem> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private boolean containsItem(TableOfContentItem tocItem) {
|
private boolean containsItem(SectionTreeEntry tocItem) {
|
||||||
|
|
||||||
for (TableOfContentItem existingItem : this.getMainSections()) {
|
for (SectionTreeEntry existingItem : this.getMainSections()) {
|
||||||
if (existingItem.equals(tocItem) || existingItem.contains(tocItem)) {
|
if (existingItem.equals(tocItem) || existingItem.contains(tocItem)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -86,18 +86,18 @@ public class TableOfContents implements Iterable<TableOfContentItem> {
|
|||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public @NonNull Iterator<TableOfContentItem> iterator() {
|
public @NonNull Iterator<SectionTreeEntry> iterator() {
|
||||||
|
|
||||||
return new TableOfContentItemIterator(mainSections);
|
return new SectionTreeEntryIterator(mainSections);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static class TableOfContentItemIterator implements Iterator<TableOfContentItem> {
|
private static class SectionTreeEntryIterator implements Iterator<SectionTreeEntry> {
|
||||||
|
|
||||||
private final Stack<Iterator<TableOfContentItem>> stack = new Stack<>();
|
private final Stack<Iterator<SectionTreeEntry>> stack = new Stack<>();
|
||||||
|
|
||||||
|
|
||||||
TableOfContentItemIterator(List<TableOfContentItem> mainSections) {
|
SectionTreeEntryIterator(List<SectionTreeEntry> mainSections) {
|
||||||
|
|
||||||
stack.push(mainSections.iterator());
|
stack.push(mainSections.iterator());
|
||||||
}
|
}
|
||||||
@ -112,10 +112,10 @@ public class TableOfContents implements Iterable<TableOfContentItem> {
|
|||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public TableOfContentItem next() {
|
public SectionTreeEntry next() {
|
||||||
|
|
||||||
ensureStackTopIsCurrent();
|
ensureStackTopIsCurrent();
|
||||||
TableOfContentItem currentItem = stack.peek().next();
|
SectionTreeEntry currentItem = stack.peek().next();
|
||||||
if (currentItem.getChildren() != null && !currentItem.getChildren().isEmpty()) {
|
if (currentItem.getChildren() != null && !currentItem.getChildren().isEmpty()) {
|
||||||
stack.push(currentItem.getChildren()
|
stack.push(currentItem.getChildren()
|
||||||
.iterator());
|
.iterator());
|
||||||
@ -23,28 +23,28 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
|
|
||||||
@Slf4j
|
@Slf4j
|
||||||
@Service
|
@Service
|
||||||
public class TOCEnrichmentService {
|
public class SectionTreeBuilderService {
|
||||||
|
|
||||||
public void assignSectionBlocksAndImages(ClassificationDocument document) {
|
public void assignSectionBlocksAndImages(ClassificationDocument document) {
|
||||||
|
|
||||||
TableOfContents toc = document.getTableOfContents();
|
SectionTree toc = document.getSectionTree();
|
||||||
Iterator<TableOfContentItem> iterator = toc.iterator();
|
Iterator<SectionTreeEntry> iterator = toc.iterator();
|
||||||
TableOfContentItem currentTOCItem = null;
|
SectionTreeEntry currentTOCItem = null;
|
||||||
if (iterator.hasNext()) {
|
if (iterator.hasNext()) {
|
||||||
currentTOCItem = iterator.next();
|
currentTOCItem = iterator.next();
|
||||||
}
|
}
|
||||||
List<AbstractPageBlock> startBlocks = new ArrayList<>();
|
List<AbstractPageBlock> startBlocks = new ArrayList<>();
|
||||||
List<ClassifiedImage> startImages = new ArrayList<>();
|
List<ClassifiedImage> startImages = new ArrayList<>();
|
||||||
TableOfContentItem currentSection = null;
|
SectionTreeEntry currentSection = null;
|
||||||
boolean foundFirstHeadline = false;
|
boolean foundFirstHeadline = false;
|
||||||
|
|
||||||
List<ClassificationHeader> headers = new ArrayList<>();
|
List<ClassificationHeader> headers = new ArrayList<>();
|
||||||
List<ClassificationFooter> footers = new ArrayList<>();
|
List<ClassificationFooter> footers = new ArrayList<>();
|
||||||
TablePageBlock previousTable = null;
|
TablePageBlock previousTable = null;
|
||||||
List<TableOfContentItem> lastFoundTOCItems = new ArrayList<>();
|
List<SectionTreeEntry> lastFoundTOCItems = new ArrayList<>();
|
||||||
|
|
||||||
for (ClassificationPage page : document.getPages()) {
|
for (ClassificationPage page : document.getPages()) {
|
||||||
List<TableOfContentItem> currentPageTOCItems = new ArrayList<>();
|
List<SectionTreeEntry> currentPageTOCItems = new ArrayList<>();
|
||||||
List<TextPageBlock> header = new ArrayList<>();
|
List<TextPageBlock> header = new ArrayList<>();
|
||||||
List<TextPageBlock> footer = new ArrayList<>();
|
List<TextPageBlock> footer = new ArrayList<>();
|
||||||
for (AbstractPageBlock current : page.getTextBlocks()) {
|
for (AbstractPageBlock current : page.getTextBlocks()) {
|
||||||
@ -101,7 +101,7 @@ public class TOCEnrichmentService {
|
|||||||
Double xMax = null;
|
Double xMax = null;
|
||||||
Double yMax = null;
|
Double yMax = null;
|
||||||
|
|
||||||
for (TableOfContentItem tocItem : lastFoundTOCItems) {
|
for (SectionTreeEntry tocItem : lastFoundTOCItems) {
|
||||||
var headline = tocItem.getHeadline();
|
var headline = tocItem.getHeadline();
|
||||||
|
|
||||||
if (headline.getPage() != page.getPageNumber()) {
|
if (headline.getPage() != page.getPageNumber()) {
|
||||||
@ -169,10 +169,10 @@ public class TOCEnrichmentService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (!startBlocks.isEmpty() || !startImages.isEmpty()) {
|
if (!startBlocks.isEmpty() || !startImages.isEmpty()) {
|
||||||
TableOfContentItem unassigned = new TableOfContentItem(null);
|
SectionTreeEntry unassigned = new SectionTreeEntry(null);
|
||||||
unassigned.setSectionBlocks(startBlocks);
|
unassigned.setSectionBlocks(startBlocks);
|
||||||
unassigned.setImages(startImages);
|
unassigned.setImages(startImages);
|
||||||
document.getTableOfContents().getMainSections().add(0, unassigned);
|
document.getSectionTree().getMainSections().add(0, unassigned);
|
||||||
}
|
}
|
||||||
document.setHeaders(headers);
|
document.setHeaders(headers);
|
||||||
document.setFooters(footers);
|
document.setFooters(footers);
|
||||||
@ -5,6 +5,7 @@ import java.util.List;
|
|||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
@ -14,12 +15,18 @@ import lombok.EqualsAndHashCode;
|
|||||||
|
|
||||||
@Data
|
@Data
|
||||||
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
|
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
|
||||||
public class TableOfContentItem {
|
public class SectionTreeEntry {
|
||||||
|
|
||||||
|
public enum Type {
|
||||||
|
SECTION,
|
||||||
|
SUPER_SECTION,
|
||||||
|
TOC_SECTION
|
||||||
|
}
|
||||||
|
|
||||||
@EqualsAndHashCode.Include
|
@EqualsAndHashCode.Include
|
||||||
private TextPageBlock headline;
|
private TextPageBlock headline;
|
||||||
private List<TableOfContentItem> children = new ArrayList<>();
|
private List<SectionTreeEntry> children = new ArrayList<>();
|
||||||
private TableOfContentItem parent;
|
private SectionTreeEntry parent;
|
||||||
|
|
||||||
private List<AbstractPageBlock> sectionBlocks = new ArrayList<>();
|
private List<AbstractPageBlock> sectionBlocks = new ArrayList<>();
|
||||||
private List<ClassifiedImage> images = new ArrayList<>();
|
private List<ClassifiedImage> images = new ArrayList<>();
|
||||||
@ -27,20 +34,32 @@ public class TableOfContentItem {
|
|||||||
private GenericSemanticNode section;
|
private GenericSemanticNode section;
|
||||||
|
|
||||||
|
|
||||||
public TableOfContentItem(TextPageBlock headline) {
|
public SectionTreeEntry(TextPageBlock headline) {
|
||||||
|
|
||||||
this.headline = headline;
|
this.headline = headline;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void addChild(TableOfContentItem tableOfContentItem) {
|
public Type getType() {
|
||||||
|
|
||||||
children.add(tableOfContentItem);
|
if (headline.getClassification().equals(PageBlockType.TABLE_OF_CONTENTS_HEADLINE)) {
|
||||||
tableOfContentItem.setParent(this);
|
return Type.TOC_SECTION;
|
||||||
|
}
|
||||||
|
if (children.isEmpty()) {
|
||||||
|
return Type.SECTION;
|
||||||
|
}
|
||||||
|
return Type.SUPER_SECTION;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public TableOfContentItem getSiblingBefore() {
|
public void addChild(SectionTreeEntry sectionTreeEntry) {
|
||||||
|
|
||||||
|
children.add(sectionTreeEntry);
|
||||||
|
sectionTreeEntry.setParent(this);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public SectionTreeEntry getSiblingBefore() {
|
||||||
|
|
||||||
if (parent != null) {
|
if (parent != null) {
|
||||||
int index = parent.getChildren().indexOf(this);
|
int index = parent.getChildren().indexOf(this);
|
||||||
@ -52,7 +71,7 @@ public class TableOfContentItem {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public TableOfContentItem getSiblingAfter() {
|
public SectionTreeEntry getSiblingAfter() {
|
||||||
|
|
||||||
if (parent != null) {
|
if (parent != null) {
|
||||||
int index = parent.getChildren().indexOf(this);
|
int index = parent.getChildren().indexOf(this);
|
||||||
@ -69,7 +88,7 @@ public class TableOfContentItem {
|
|||||||
if (headline.equals(block)) {
|
if (headline.equals(block)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
for (TableOfContentItem child : children) {
|
for (SectionTreeEntry child : children) {
|
||||||
if (child.contains(block)) {
|
if (child.contains(block)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -78,12 +97,12 @@ public class TableOfContentItem {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public boolean contains(TableOfContentItem tocItem) {
|
public boolean contains(SectionTreeEntry tocItem) {
|
||||||
|
|
||||||
if (this.equals(tocItem)) {
|
if (this.equals(tocItem)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
for (TableOfContentItem child : children) {
|
for (SectionTreeEntry child : children) {
|
||||||
if (child.contains(tocItem)) {
|
if (child.contains(tocItem)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -1,34 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
|
||||||
|
|
||||||
import java.util.Comparator;
|
|
||||||
import java.util.HashMap;
|
|
||||||
|
|
||||||
public class TextPositionSequenceComparator implements Comparator<Word> {
|
|
||||||
|
|
||||||
private HashMap<Word, TextBlockOnPage> lookup;
|
|
||||||
|
|
||||||
|
|
||||||
public TextPositionSequenceComparator(HashMap<Word, TextBlockOnPage> lookup) {
|
|
||||||
|
|
||||||
this.lookup = lookup;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int compare(Word number1, Word number2) {
|
|
||||||
|
|
||||||
int page1 = lookup.get(number1).page().getPageNumber();
|
|
||||||
int page2 = lookup.get(number2).page().getPageNumber();
|
|
||||||
|
|
||||||
if (page1 != page2) {
|
|
||||||
return Integer.compare(page1, page2);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (number1.getY() != number2.getY()) {
|
|
||||||
return Double.compare(number1.getY(), number2.getY());
|
|
||||||
}
|
|
||||||
|
|
||||||
return Integer.compare(Integer.parseInt(number1.toString()), Integer.parseInt(number2.toString()));
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -0,0 +1,36 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||||
|
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.HashMap;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.classification.NumberWord;
|
||||||
|
|
||||||
|
public class TocNumberComparator implements Comparator<NumberWord> {
|
||||||
|
|
||||||
|
private HashMap<NumberWord, TextBlockOnPage> lookup;
|
||||||
|
|
||||||
|
|
||||||
|
public TocNumberComparator(HashMap<NumberWord, TextBlockOnPage> lookup) {
|
||||||
|
|
||||||
|
this.lookup = lookup;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int compare(NumberWord number1, NumberWord number2) {
|
||||||
|
|
||||||
|
int page1 = lookup.get(number1).page().getPageNumber();
|
||||||
|
int page2 = lookup.get(number2).page().getPageNumber();
|
||||||
|
|
||||||
|
if (page1 != page2) {
|
||||||
|
return Integer.compare(page1, page2);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (number1.word().getY() != number2.word().getY()) {
|
||||||
|
return Double.compare(number1.word().getY(), number2.word().getY());
|
||||||
|
}
|
||||||
|
|
||||||
|
return Integer.compare(number1.number(), number2.number());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,7 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||||
|
|
||||||
|
public record NumberWord(Word word, int number) {
|
||||||
|
|
||||||
|
}
|
||||||
@ -14,6 +14,7 @@ import java.util.List;
|
|||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
@ -23,10 +24,11 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentifier;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextBlockOnPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextBlockOnPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequenceComparator;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TocNumberComparator;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
|
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
|
||||||
|
|
||||||
@ -59,7 +61,7 @@ public class TableOfContentsClassificationService {
|
|||||||
|
|
||||||
if (end > i + 1) {
|
if (end > i + 1) {
|
||||||
if (textBlock.textBlock().getClassification() == null) {
|
if (textBlock.textBlock().getClassification() == null) {
|
||||||
textBlock.textBlock().setClassification(PageBlockType.H1);
|
textBlock.textBlock().setClassification(PageBlockType.TABLE_OF_CONTENTS_HEADLINE);
|
||||||
}
|
}
|
||||||
i = end;
|
i = end;
|
||||||
}
|
}
|
||||||
@ -71,9 +73,9 @@ public class TableOfContentsClassificationService {
|
|||||||
|
|
||||||
ClassificationPage startPage = textBlocks.get(start).page();
|
ClassificationPage startPage = textBlocks.get(start).page();
|
||||||
List<TextBlockOnPage> initialLookAhead = textBlocks.subList(start, Math.min(start + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()));
|
List<TextBlockOnPage> initialLookAhead = textBlocks.subList(start, Math.min(start + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()));
|
||||||
HashMap<Word, TextBlockOnPage> lookup = new HashMap<>();
|
HashMap<NumberWord, TextBlockOnPage> numberToBlockLookup = new HashMap<>();
|
||||||
List<Word> numbers = extractNumbers(initialLookAhead, lookup, document.getPages().size());
|
List<NumberWord> numbers = extractNumbers(initialLookAhead, numberToBlockLookup, document.getPages().size());
|
||||||
TocNumberFinder tocNumberFinder = new TocNumberFinder(numbers, lookup);
|
TocNumberFinder tocNumberFinder = new TocNumberFinder(numbers, numberToBlockLookup);
|
||||||
|
|
||||||
int lastCandidate = start;
|
int lastCandidate = start;
|
||||||
for (int i = start; i < Math.min(lastCandidate + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()); i++) {
|
for (int i = start; i < Math.min(lastCandidate + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()); i++) {
|
||||||
@ -93,28 +95,28 @@ public class TableOfContentsClassificationService {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
List<Word> numbersFromBlock = extractNumbers(textBlockOnPage, lookup, document.getPages().size());
|
List<NumberWord> numbersFromBlock = extractNumbers(textBlockOnPage, numberToBlockLookup, document.getPages().size());
|
||||||
|
|
||||||
List<Word> currentRightmostCluster = tocNumberFinder.getCurrentRightmostCluster();
|
List<NumberWord> currentRightmostCluster = tocNumberFinder.getCurrentRightmostCluster();
|
||||||
|
|
||||||
if (currentRightmostCluster.size() < MINIMUM_MATCHES) {
|
if (currentRightmostCluster.size() < MINIMUM_MATCHES) {
|
||||||
log.debug("No numbers indicating a table of contents here.");
|
log.debug("No numbers indicating a table of contents here.");
|
||||||
return start;
|
return start;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (anyIntersection(currentRightmostCluster, numbersFromBlock, lookup)) {
|
if (anyIntersection(currentRightmostCluster, numbersFromBlock, numberToBlockLookup)) {
|
||||||
lastCandidate = i;
|
lastCandidate = i;
|
||||||
numbersFromBlock.forEach(tocNumberFinder::add);
|
numbersFromBlock.forEach(tocNumberFinder::add);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
addVisualization(document.getLayoutDebugLayer(), tocNumberFinder, lookup);
|
|
||||||
|
|
||||||
Set<TextBlockOnPage> blocksWithNumberInCluster = tocNumberFinder.getCurrentRightmostCluster()
|
Set<TextBlockOnPage> blocksWithNumberInCluster = tocNumberFinder.getCurrentRightmostCluster()
|
||||||
.stream()
|
.stream()
|
||||||
.map(lookup::get)
|
.map(numberToBlockLookup::get)
|
||||||
.collect(Collectors.toSet());
|
.collect(Collectors.toSet());
|
||||||
|
|
||||||
|
addVisualization(document.getLayoutDebugLayer(), tocNumberFinder, numberToBlockLookup, blocksWithNumberInCluster, textBlocks.get(start - 1));
|
||||||
|
|
||||||
int lastConfirmed = start;
|
int lastConfirmed = start;
|
||||||
for (int i = start; i < lastCandidate + 1; i++) {
|
for (int i = start; i < lastCandidate + 1; i++) {
|
||||||
TextBlockOnPage textBlockOnPage = textBlocks.get(i);
|
TextBlockOnPage textBlockOnPage = textBlocks.get(i);
|
||||||
@ -132,18 +134,22 @@ public class TableOfContentsClassificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static void addVisualization(LayoutDebugLayer layoutDebugLayer, TocNumberFinder tocNumberFinder, Map<Word, TextBlockOnPage> lookup) {
|
private static void addVisualization(LayoutDebugLayer layoutDebugLayer,
|
||||||
|
TocNumberFinder tocNumberFinder,
|
||||||
|
Map<NumberWord, TextBlockOnPage> lookup,
|
||||||
|
Set<TextBlockOnPage> blocksWithNumberInCluster,
|
||||||
|
TextBlockOnPage startingHeadline) {
|
||||||
|
|
||||||
tocNumberFinder.getCurrentRightmostCluster()
|
tocNumberFinder.getCurrentRightmostCluster()
|
||||||
.stream()
|
.stream()
|
||||||
.collect(Collectors.groupingBy(key -> lookup.get(key).page().getPageNumber()))
|
.collect(Collectors.groupingBy(key -> lookup.get(key).page().getPageNumber()))
|
||||||
.forEach((pageNumber, number) -> layoutDebugLayer.addTocPages(number, pageNumber));
|
.forEach((pageNumber, number) -> layoutDebugLayer.addTocPages(number, pageNumber));
|
||||||
|
layoutDebugLayer.addTocBlocks(blocksWithNumberInCluster);
|
||||||
|
layoutDebugLayer.addTocBlocks(Set.of(startingHeadline));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static boolean anyIntersection(Collection<Word> numbers1,
|
private static boolean anyIntersection(Collection<NumberWord> numbers1, Collection<NumberWord> numbers2, Map<NumberWord, TextBlockOnPage> lookup) {
|
||||||
Collection<Word> numbers2,
|
|
||||||
Map<Word, TextBlockOnPage> lookup) {
|
|
||||||
|
|
||||||
return numbers1.stream()
|
return numbers1.stream()
|
||||||
.anyMatch(numberFromCluster -> numbers2.stream()
|
.anyMatch(numberFromCluster -> numbers2.stream()
|
||||||
@ -151,9 +157,9 @@ public class TableOfContentsClassificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static List<Word> extractNumbers(List<TextBlockOnPage> textBlocks, Map<Word, TextBlockOnPage> lookup, int numberOfPages) {
|
private static List<NumberWord> extractNumbers(List<TextBlockOnPage> textBlocks, Map<NumberWord, TextBlockOnPage> lookup, int numberOfPages) {
|
||||||
|
|
||||||
List<Word> blocks = new LinkedList<>();
|
List<NumberWord> blocks = new LinkedList<>();
|
||||||
for (TextBlockOnPage textBlock : textBlocks) {
|
for (TextBlockOnPage textBlock : textBlocks) {
|
||||||
blocks.addAll(extractNumbers(textBlock, lookup, numberOfPages));
|
blocks.addAll(extractNumbers(textBlock, lookup, numberOfPages));
|
||||||
}
|
}
|
||||||
@ -161,30 +167,40 @@ public class TableOfContentsClassificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static List<Word> extractNumbers(TextBlockOnPage textBlock, Map<Word, TextBlockOnPage> lookup, int numberOfPages) {
|
private static List<NumberWord> extractNumbers(TextBlockOnPage textBlock, Map<NumberWord, TextBlockOnPage> lookup, int numberOfPages) {
|
||||||
|
|
||||||
List<Word> blocks = new LinkedList<>();
|
List<NumberWord> blocks = new LinkedList<>();
|
||||||
TextPageBlock block = textBlock.textBlock();
|
TextPageBlock block = textBlock.textBlock();
|
||||||
List<Word> sequences = block.getWords();
|
List<Word> words = block.getWords();
|
||||||
for (int i = 0; i < sequences.size(); i++) {
|
for (int i = 0; i < words.size(); i++) {
|
||||||
|
|
||||||
Word word = sequences.get(i);
|
Word word = words.get(i);
|
||||||
|
if (!wordIsEndOfLine(i, words)) {
|
||||||
if (!NUMERIC.matcher(word).matches() || word.length() > 5) {
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (AMOUNT_PATTERN.matcher(getSurroundingString(i, sequences)).matches()) {
|
if (AMOUNT_PATTERN.matcher(getSurroundingString(i, words)).matches()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
Matcher matcher = SectionIdentifier.numericalIdentifierPattern.matcher(word.toString());
|
||||||
|
if (matcher.find() && matcher.group(2) != null) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
Matcher numberFinder = NUMERIC.matcher(word);
|
||||||
|
if (!numberFinder.find() || word.length() > 5) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
int pageNumber = Integer.parseInt(word.toString());
|
int pageNumber = Integer.parseInt(numberFinder.group());
|
||||||
if (0 >= pageNumber || pageNumber > numberOfPages) {
|
if (0 >= pageNumber || pageNumber > numberOfPages) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
lookup.put(word, textBlock);
|
NumberWord numberWord = new NumberWord(word, pageNumber);
|
||||||
blocks.add(word);
|
lookup.put(numberWord, textBlock);
|
||||||
|
blocks.add(numberWord);
|
||||||
} catch (NumberFormatException e) {
|
} catch (NumberFormatException e) {
|
||||||
log.debug("That wasn't a number! Should not happen, due to numeric check beforehand.");
|
log.debug("That wasn't a number! Should not happen, due to numeric check beforehand.");
|
||||||
}
|
}
|
||||||
@ -193,6 +209,17 @@ public class TableOfContentsClassificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static boolean wordIsEndOfLine(int i, List<Word> words) {
|
||||||
|
|
||||||
|
if (i == words.size() - 1) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
Word word = words.get(i);
|
||||||
|
Word nextWord = words.get(i + 1);
|
||||||
|
return !nextWord.rightOf(word);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private static CharSequence getSurroundingString(int i, List<Word> sequences) {
|
private static CharSequence getSurroundingString(int i, List<Word> sequences) {
|
||||||
|
|
||||||
int end = Math.min(i + 5, sequences.size());
|
int end = Math.min(i + 5, sequences.size());
|
||||||
@ -203,13 +230,13 @@ public class TableOfContentsClassificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static boolean matches(Word number1, Word number2, Map<Word, TextBlockOnPage> lookup) {
|
private static boolean matches(NumberWord number1, NumberWord number2, Map<NumberWord, TextBlockOnPage> lookup) {
|
||||||
|
|
||||||
if (number1.getDir() != number2.getDir()) {
|
if (number1.word().getDir() != number2.word().getDir()) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
return number1.intersectsXDirAdj(number2, INTERSECTION_TOLERANCE);
|
return number1.word().intersectsXDirAdj(number2.word(), INTERSECTION_TOLERANCE);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -247,11 +274,11 @@ public class TableOfContentsClassificationService {
|
|||||||
|
|
||||||
private static class TocNumberFinder {
|
private static class TocNumberFinder {
|
||||||
|
|
||||||
final UnionFind<Word> numberClusters;
|
final UnionFind<NumberWord> numberClusters;
|
||||||
final HashMap<Word, TextBlockOnPage> lookup;
|
final HashMap<NumberWord, TextBlockOnPage> lookup;
|
||||||
|
|
||||||
|
|
||||||
TocNumberFinder(List<Word> blocks, HashMap<Word, TextBlockOnPage> lookup) {
|
TocNumberFinder(List<NumberWord> blocks, HashMap<NumberWord, TextBlockOnPage> lookup) {
|
||||||
|
|
||||||
this.numberClusters = new UnionFind<>(new HashSet<>(blocks));
|
this.numberClusters = new UnionFind<>(new HashSet<>(blocks));
|
||||||
for (int i = 0; i < blocks.size(); i++) {
|
for (int i = 0; i < blocks.size(); i++) {
|
||||||
@ -265,14 +292,14 @@ public class TableOfContentsClassificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void add(Word number) {
|
public void add(NumberWord number) {
|
||||||
|
|
||||||
if (numberClusters.getElements().contains(number)) {
|
if (numberClusters.getElements().contains(number)) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
numberClusters.addElement(number);
|
numberClusters.addElement(number);
|
||||||
for (Word element : numberClusters.getElements()) {
|
for (NumberWord element : numberClusters.getElements()) {
|
||||||
if (matches(number, element, lookup)) {
|
if (matches(number, element, lookup)) {
|
||||||
numberClusters.union(element, number);
|
numberClusters.union(element, number);
|
||||||
}
|
}
|
||||||
@ -280,73 +307,100 @@ public class TableOfContentsClassificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public List<Word> getCurrentRightmostCluster() {
|
public List<NumberWord> getCurrentRightmostCluster() {
|
||||||
|
|
||||||
return numberClusters.getGroups()
|
return numberClusters.getGroups()
|
||||||
.stream()
|
.stream()
|
||||||
.filter(cluster -> cluster.size() > MINIMUM_MATCHES)
|
.filter(cluster -> cluster.size() > MINIMUM_MATCHES)
|
||||||
.map(cluster -> cluster.stream()
|
.map(cluster -> cluster.stream()
|
||||||
.sorted(new TextPositionSequenceComparator(lookup))
|
.sorted(new TocNumberComparator(lookup))
|
||||||
.toList())
|
.toList())
|
||||||
.map(this::removeOutliers)
|
.map(this::removeOutliers)
|
||||||
// .map(this::filterByMinimumDensity)
|
.map(this::removeOnNonConsecutivePages)
|
||||||
|
.map(this::filterByWordNearTopOfPage)
|
||||||
.filter(cluster -> cluster.size() > MINIMUM_MATCHES)
|
.filter(cluster -> cluster.size() > MINIMUM_MATCHES)
|
||||||
.max(Comparator.comparingDouble(cluster -> cluster.get(0).getBBox().getMaxX())).orElse(Collections.emptyList());
|
.max(Comparator.comparingDouble(cluster -> cluster.get(0).word().getBBox().getMaxX())).orElse(Collections.emptyList());
|
||||||
}
|
}
|
||||||
|
|
||||||
// does not seem to be doing much, ideally instead of using the height of the blocks i would like to use the height, beginning from the MainBody top,
|
|
||||||
// but as the MainBody is often wrong, this results in some numbers being discarded even though they are correct.
|
|
||||||
// private List<TextPositionSequence> filterByMinimumDensity(List<TextPositionSequence> numbers) {
|
|
||||||
//
|
|
||||||
// Map<ClassificationPage, List<TextPositionSequence>> clustersPerPage = numbers.stream()
|
|
||||||
// .collect(Collectors.groupingBy(number -> lookup.get(number).page()));
|
|
||||||
//
|
|
||||||
// List<TextPositionSequence> result = new ArrayList<>(numbers.size());
|
|
||||||
// clustersPerPage.keySet()
|
|
||||||
// .stream()
|
|
||||||
// .sorted(Comparator.comparingInt(ClassificationPage::getPageNumber))
|
|
||||||
// .forEach(page -> {
|
|
||||||
// var numbersOnPage = clustersPerPage.get(page);
|
|
||||||
//
|
|
||||||
// double height = numbersOnPage.stream()
|
|
||||||
// .map(BoundingBox::getBBox)
|
|
||||||
// .collect(RectangleTransformations.collectBBox()).getHeight();
|
|
||||||
//
|
|
||||||
// double count = numbersOnPage.size();
|
|
||||||
//
|
|
||||||
// if ((count / height) >= (DENSITY_THRESHOLD_COUNT / page.getPageHeight())) {
|
|
||||||
// result.addAll(numbers);
|
|
||||||
// }
|
|
||||||
// });
|
|
||||||
// return result;
|
|
||||||
// }
|
|
||||||
|
|
||||||
|
private List<NumberWord> removeOnNonConsecutivePages(List<NumberWord> numbers) {
|
||||||
|
|
||||||
public List<Word> removeOutliers(List<Word> numbers) {
|
List<NumberWord> result = new ArrayList<>();
|
||||||
|
|
||||||
List<Word> result = new ArrayList<>();
|
|
||||||
|
|
||||||
result.add(numbers.get(0));
|
result.add(numbers.get(0));
|
||||||
|
|
||||||
|
for (int i = 1; i < numbers.size(); i++) {
|
||||||
|
int prev = getPageNumber(numbers, i - 1);
|
||||||
|
int curr = getPageNumber(numbers, i);
|
||||||
|
|
||||||
|
if (Math.abs(prev - curr) > 1) {
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
|
result.add(numbers.get(i));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private int getPageNumber(List<NumberWord> numbers, int i) {
|
||||||
|
|
||||||
|
return lookup.get(numbers.get(i)).page().getPageNumber();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private List<NumberWord> filterByWordNearTopOfPage(List<NumberWord> numbers) {
|
||||||
|
|
||||||
|
List<NumberWord> result = new ArrayList<>();
|
||||||
|
|
||||||
|
result.add(numbers.get(0));
|
||||||
|
|
||||||
|
for (int i = 1; i < numbers.size(); i++) {
|
||||||
|
NumberWord prev = numbers.get(i - 1);
|
||||||
|
NumberWord curr = numbers.get(i);
|
||||||
|
ClassificationPage prevPage = lookup.get(prev).page();
|
||||||
|
ClassificationPage currPage = lookup.get(curr).page();
|
||||||
|
if (prevPage == currPage) {
|
||||||
|
result.add(curr);
|
||||||
|
} else if (curr.word().getBBox().getMinY() < currPage.getPageHeight() * 0.33) {
|
||||||
|
result.add(curr);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public List<NumberWord> removeOutliers(List<NumberWord> numbers) {
|
||||||
|
|
||||||
|
List<NumberWord> confirmedClusterNumbers = new ArrayList<>();
|
||||||
|
|
||||||
|
confirmedClusterNumbers.add(numbers.get(0));
|
||||||
|
|
||||||
for (int i = 1; i < numbers.size() - 1; i++) {
|
for (int i = 1; i < numbers.size() - 1; i++) {
|
||||||
int prev = getNumberAsInt(numbers, i - 1);
|
int prev = getNumberAsInt(numbers, i - 1);
|
||||||
int curr = getNumberAsInt(numbers, i);
|
int curr = getNumberAsInt(numbers, i);
|
||||||
int next = getNumberAsInt(numbers, i + 1);
|
int next = getNumberAsInt(numbers, i + 1);
|
||||||
|
|
||||||
if (!(curr <= prev || curr >= next) || !isBetterWithout(numbers, i)) {
|
if (!(curr <= prev || curr >= next) || !isBetterWithout(numbers, i)) {
|
||||||
result.add(numbers.get(i));
|
confirmedClusterNumbers.add(numbers.get(i));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (getNumberAsInt(numbers, numbers.size() - 1) >= getNumberAsInt(numbers, Math.max(0, numbers.size() - 2))) {
|
if (getNumberAsInt(numbers, numbers.size() - 1) >= getLatestNumber(confirmedClusterNumbers)) {
|
||||||
result.add(numbers.get(numbers.size() - 1));
|
confirmedClusterNumbers.add(numbers.get(numbers.size() - 1));
|
||||||
}
|
}
|
||||||
|
|
||||||
return result;
|
return confirmedClusterNumbers;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static int getLatestNumber(List<NumberWord> confirmedClusterNumbers) {
|
||||||
|
|
||||||
|
return confirmedClusterNumbers.get(confirmedClusterNumbers.size() - 1).number();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// Helper method to check if removing the current number results in a better order
|
// Helper method to check if removing the current number results in a better order
|
||||||
public static boolean isBetterWithout(List<Word> numbers, int i) {
|
public static boolean isBetterWithout(List<NumberWord> numbers, int i) {
|
||||||
|
|
||||||
if (i == 0 || i == numbers.size() - 1) {
|
if (i == 0 || i == numbers.size() - 1) {
|
||||||
return false;
|
return false;
|
||||||
@ -362,9 +416,9 @@ public class TableOfContentsClassificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static int getNumberAsInt(List<Word> numbers, int i) {
|
private static int getNumberAsInt(List<NumberWord> numbers, int i) {
|
||||||
|
|
||||||
return Integer.parseInt(numbers.get(i).toString());
|
return numbers.get(i).number();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -16,13 +16,14 @@ import java.util.Optional;
|
|||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.AbstractSemanticNode;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.AbstractSemanticNode;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
@ -35,10 +36,11 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Im
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableOfContentsItem;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContentItem;
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeEntry;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
|
||||||
@ -65,7 +67,7 @@ public class DocumentGraphFactory {
|
|||||||
|
|
||||||
document.getPages()
|
document.getPages()
|
||||||
.forEach(context::buildAndAddPageWithCounter);
|
.forEach(context::buildAndAddPageWithCounter);
|
||||||
addSectionsForToC(layoutParsingType, document, context, documentGraph);
|
addSections(layoutParsingType, document, context, documentGraph);
|
||||||
addHeaderAndFooterToEachPage(document, context);
|
addHeaderAndFooterToEachPage(document, context);
|
||||||
|
|
||||||
documentGraph.setNumberOfPages(context.pages.size());
|
documentGraph.setNumberOfPages(context.pages.size());
|
||||||
@ -82,9 +84,9 @@ public class DocumentGraphFactory {
|
|||||||
|
|
||||||
documentGraph.streamAllSubNodes()
|
documentGraph.streamAllSubNodes()
|
||||||
.filter(SemanticNode::isLeaf)
|
.filter(SemanticNode::isLeaf)
|
||||||
.filter(node -> !node.getType().equals(NodeType.HEADER))
|
.filter(node -> !node.getType().equals(NodeTypeProto.NodeType.HEADER))
|
||||||
.filter(node -> !node.getType().equals(NodeType.FOOTER))
|
.filter(node -> !node.getType().equals(NodeTypeProto.NodeType.FOOTER))
|
||||||
.filter(node -> !node.getType().equals(NodeType.IMAGE))
|
.filter(node -> !node.getType().equals(NodeTypeProto.NodeType.IMAGE))
|
||||||
.map(SemanticNode::getTextBlock)
|
.map(SemanticNode::getTextBlock)
|
||||||
.map(TextBlock::getAtomicTextBlocks)
|
.map(TextBlock::getAtomicTextBlocks)
|
||||||
.flatMap(Collection::stream)
|
.flatMap(Collection::stream)
|
||||||
@ -92,18 +94,18 @@ public class DocumentGraphFactory {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void addSectionsForToC(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument, Context context, Document document) {
|
private void addSections(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument, Context context, Document document) {
|
||||||
|
|
||||||
for (TableOfContentItem tocItem : classificationDocument.getTableOfContents()) {
|
for (SectionTreeEntry sectionTreeEntry : classificationDocument.getSectionTree()) {
|
||||||
GenericSemanticNode parent = tocItem.getParent() == null ? null : tocItem.getParent().getSection();
|
GenericSemanticNode parent = sectionTreeEntry.getParent() == null ? null : sectionTreeEntry.getParent().getSection();
|
||||||
Optional<GenericSemanticNode> section = SectionNodeFactory.addSection(layoutParsingType,
|
Optional<GenericSemanticNode> section = SectionNodeFactory.addSection(layoutParsingType,
|
||||||
parent,
|
parent,
|
||||||
tocItem.getChildren().isEmpty(),
|
sectionTreeEntry.getType(),
|
||||||
tocItem.getNonEmptySectionBlocks(),
|
sectionTreeEntry.getNonEmptySectionBlocks(),
|
||||||
tocItem.getImages(),
|
sectionTreeEntry.getImages(),
|
||||||
context,
|
context,
|
||||||
document);
|
document);
|
||||||
tocItem.setSection(section.orElse(null));
|
sectionTreeEntry.setSection(section.orElse(null));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -121,6 +123,8 @@ public class DocumentGraphFactory {
|
|||||||
node = Headline.builder().documentTree(context.getDocumentTree()).build();
|
node = Headline.builder().documentTree(context.getDocumentTree()).build();
|
||||||
} else if (originalTextBlock.isToDuplicate() && layoutParsingType.equals(LayoutParsingType.REDACT_MANAGER)) {
|
} else if (originalTextBlock.isToDuplicate() && layoutParsingType.equals(LayoutParsingType.REDACT_MANAGER)) {
|
||||||
node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree()).build();
|
node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree()).build();
|
||||||
|
} else if (originalTextBlock.getClassification().equals(PageBlockType.TABLE_OF_CONTENTS_ITEM)) {
|
||||||
|
node = TableOfContentsItem.builder().documentTree(context.getDocumentTree()).build();
|
||||||
} else {
|
} else {
|
||||||
node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
|
node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
|
||||||
}
|
}
|
||||||
|
|||||||
@ -17,7 +17,9 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Do
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableOfContents;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeEntry;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TableMergingUtility;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.TableMergingUtility;
|
||||||
@ -29,7 +31,7 @@ public class SectionNodeFactory {
|
|||||||
|
|
||||||
public Optional<GenericSemanticNode> addSection(LayoutParsingType layoutParsingType,
|
public Optional<GenericSemanticNode> addSection(LayoutParsingType layoutParsingType,
|
||||||
GenericSemanticNode parentNode,
|
GenericSemanticNode parentNode,
|
||||||
boolean isLeaf,
|
SectionTreeEntry.Type type,
|
||||||
List<AbstractPageBlock> pageBlocks,
|
List<AbstractPageBlock> pageBlocks,
|
||||||
List<ClassifiedImage> images,
|
List<ClassifiedImage> images,
|
||||||
DocumentGraphFactory.Context context,
|
DocumentGraphFactory.Context context,
|
||||||
@ -48,12 +50,11 @@ public class SectionNodeFactory {
|
|||||||
return Optional.empty();
|
return Optional.empty();
|
||||||
}
|
}
|
||||||
|
|
||||||
AbstractSemanticNode section;
|
AbstractSemanticNode section = switch (type) {
|
||||||
if (isLeaf) {
|
case SECTION -> Section.builder().documentTree(context.getDocumentTree()).build();
|
||||||
section = Section.builder().documentTree(context.getDocumentTree()).build();
|
case SUPER_SECTION -> SuperSection.builder().documentTree(context.getDocumentTree()).build();
|
||||||
} else {
|
case TOC_SECTION -> TableOfContents.builder().documentTree(context.getDocumentTree()).build();
|
||||||
section = SuperSection.builder().documentTree(context.getDocumentTree()).build();
|
};
|
||||||
}
|
|
||||||
|
|
||||||
context.getSections().add(section);
|
context.getSections().add(section);
|
||||||
|
|
||||||
@ -64,13 +65,14 @@ public class SectionNodeFactory {
|
|||||||
if (containsTablesAndTextBlocks) {
|
if (containsTablesAndTextBlocks) {
|
||||||
splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(layoutParsingType,
|
splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(layoutParsingType,
|
||||||
section,
|
section,
|
||||||
true,
|
SectionTreeEntry.Type.SECTION,
|
||||||
subSectionPageBlocks,
|
subSectionPageBlocks,
|
||||||
emptyList(),
|
emptyList(),
|
||||||
context,
|
context,
|
||||||
document));
|
document));
|
||||||
} else if (!isLeaf) {
|
} else if (type.equals(SectionTreeEntry.Type.SUPER_SECTION)) {
|
||||||
addSection(layoutParsingType, section, true, pageBlocks, emptyList(), context, document);
|
// If a SuperSection contains more blocks than just a headline, we add a Section which contains the remaining textblocks.
|
||||||
|
addSection(layoutParsingType, section, SectionTreeEntry.Type.SECTION, pageBlocks, emptyList(), context, document);
|
||||||
} else {
|
} else {
|
||||||
addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, pageBlocks, context, section, document);
|
addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, pageBlocks, context, section, document);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -14,6 +14,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Pa
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeEntry;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||||
@ -120,7 +121,7 @@ public class TableNodeFactory {
|
|||||||
} else if (firstTextBlockIsHeadline(cell)) {
|
} else if (firstTextBlockIsHeadline(cell)) {
|
||||||
SectionNodeFactory.addSection(layoutParsingType,
|
SectionNodeFactory.addSection(layoutParsingType,
|
||||||
tableCell,
|
tableCell,
|
||||||
true,
|
SectionTreeEntry.Type.SECTION,
|
||||||
cell.getTextBlocks()
|
cell.getTextBlocks()
|
||||||
.stream()
|
.stream()
|
||||||
.map(tb -> (AbstractPageBlock) tb)
|
.map(tb -> (AbstractPageBlock) tb)
|
||||||
|
|||||||
@ -12,6 +12,7 @@ import java.util.Map;
|
|||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.EntryDataProto.EntryData;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPageProto.AllDocumentPages;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPageProto.AllDocumentPages;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPageProto.DocumentPage;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPageProto.DocumentPage;
|
||||||
@ -20,7 +21,6 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.Do
|
|||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto.DocumentPositionData.Position;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto.DocumentPositionData.Position;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureProto.DocumentStructure;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureProto.DocumentStructure;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureWrapper;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureWrapper;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.EntryDataProto.EntryData;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto.LayoutEngine;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto.LayoutEngine;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
|
|||||||
@ -1,6 +1,5 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.services.mapper;
|
package com.knecon.fforesight.service.layoutparser.processor.services.mapper;
|
||||||
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@ -9,9 +8,9 @@ import java.util.NoSuchElementException;
|
|||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPageProto.DocumentPage;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPageProto.DocumentPage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.EntryDataProto.EntryData;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto.AllDocumentPositionData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto.AllDocumentPositionData;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto.AllDocumentTextData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto.AllDocumentTextData;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.EntryDataProto.EntryData;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
|
||||||
@ -26,6 +25,8 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Se
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableOfContents;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableOfContentsItem;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
|
||||||
@ -70,13 +71,15 @@ public class DocumentGraphMapper {
|
|||||||
SemanticNode node = switch (entryData.getType()) {
|
SemanticNode node = switch (entryData.getType()) {
|
||||||
case SECTION -> buildSection(context);
|
case SECTION -> buildSection(context);
|
||||||
case SUPER_SECTION -> buildSuperSection(context);
|
case SUPER_SECTION -> buildSuperSection(context);
|
||||||
case PARAGRAPH -> buildParagraph(context, entryData.getProperties());
|
case PARAGRAPH -> buildParagraph(context, entryData.getPropertiesMap());
|
||||||
case HEADLINE -> buildHeadline(context);
|
case HEADLINE -> buildHeadline(context);
|
||||||
case HEADER -> buildHeader(context);
|
case HEADER -> buildHeader(context);
|
||||||
case FOOTER -> buildFooter(context);
|
case FOOTER -> buildFooter(context);
|
||||||
case TABLE -> buildTable(context, entryData.getProperties());
|
case TABLE -> buildTable(context, entryData.getPropertiesMap());
|
||||||
case TABLE_CELL -> buildTableCell(context, entryData.getProperties());
|
case TABLE_CELL -> buildTableCell(context, entryData.getPropertiesMap());
|
||||||
case IMAGE -> buildImage(context, entryData.getProperties(), entryData.getPageNumbersList());
|
case IMAGE -> buildImage(context, entryData.getPropertiesMap(), entryData.getPageNumbersList());
|
||||||
|
case TABLE_OF_CONTENTS -> buildTableOfContents(context);
|
||||||
|
case TABLE_OF_CONTENTS_ITEM -> buildTableOfContentsItem(context);
|
||||||
default -> throw new UnsupportedOperationException("Not yet implemented for type " + entryData.getType());
|
default -> throw new UnsupportedOperationException("Not yet implemented for type " + entryData.getType());
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -100,6 +103,18 @@ public class DocumentGraphMapper {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static SemanticNode buildTableOfContents(Context context) {
|
||||||
|
|
||||||
|
return TableOfContents.builder().documentTree(context.documentTree).build();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static SemanticNode buildTableOfContentsItem(Context context) {
|
||||||
|
|
||||||
|
return TableOfContentsItem.builder().documentTree(context.documentTree).build();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private Headline buildHeadline(Context context) {
|
private Headline buildHeadline(Context context) {
|
||||||
|
|
||||||
return Headline.builder().documentTree(context.documentTree).build();
|
return Headline.builder().documentTree(context.documentTree).build();
|
||||||
@ -182,13 +197,11 @@ public class DocumentGraphMapper {
|
|||||||
|
|
||||||
private AtomicTextBlock getAtomicTextBlock(Context context, SemanticNode parent, Long atomicTextBlockId) {
|
private AtomicTextBlock getAtomicTextBlock(Context context, SemanticNode parent, Long atomicTextBlockId) {
|
||||||
|
|
||||||
return AtomicTextBlock.fromAtomicTextBlockData(context.documentTextDataBlockData.getDocumentTextDataList()
|
return AtomicTextBlock.fromAtomicTextBlockData(context.documentTextDataBlockData.getDocumentTextDataList().get(Math.toIntExact(atomicTextBlockId)),
|
||||||
.get(Math.toIntExact(atomicTextBlockId)),
|
context.atomicPositionBlockData.getDocumentPositionDataList().get(Math.toIntExact(atomicTextBlockId)),
|
||||||
context.atomicPositionBlockData.getDocumentPositionDataList()
|
|
||||||
.get(Math.toIntExact(atomicTextBlockId)),
|
|
||||||
parent,
|
parent,
|
||||||
getPage(context.documentTextDataBlockData.getDocumentTextDataList()
|
getPage(context.documentTextDataBlockData.getDocumentTextDataList().get(Math.toIntExact(atomicTextBlockId)).getPage(),
|
||||||
.get(Math.toIntExact(atomicTextBlockId)).getPage(), context));
|
context));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -38,6 +38,7 @@ public class LayoutGridService {
|
|||||||
layoutGrid.setVisibleByDefault(layerVisibilityDefaultValue);
|
layoutGrid.setVisibleByDefault(layerVisibilityDefaultValue);
|
||||||
|
|
||||||
document.getLayoutDebugLayer().addSentenceVisualization(document.getTextBlock());
|
document.getLayoutDebugLayer().addSentenceVisualization(document.getTextBlock());
|
||||||
|
document.getLayoutDebugLayer().addOutlineHeadlines(document);
|
||||||
|
|
||||||
if (document.getLayoutDebugLayer().isActive()) {
|
if (document.getLayoutDebugLayer().isActive()) {
|
||||||
viewerDocumentService.addLayerGroups(originFile, destinationFile, List.of(layoutGrid, document.getLayoutDebugLayer()), outline);
|
viewerDocumentService.addLayerGroups(originFile, destinationFile, List.of(layoutGrid, document.getLayoutDebugLayer()), outline);
|
||||||
@ -54,12 +55,13 @@ public class LayoutGridService {
|
|||||||
.peek(layoutGrid::addTreeId)
|
.peek(layoutGrid::addTreeId)
|
||||||
.forEach(semanticNode -> {
|
.forEach(semanticNode -> {
|
||||||
switch (semanticNode.getType()) {
|
switch (semanticNode.getType()) {
|
||||||
case SECTION, SUPER_SECTION -> layoutGrid.addSection(semanticNode);
|
case SECTION, SUPER_SECTION, TABLE_OF_CONTENTS -> layoutGrid.addSection(semanticNode);
|
||||||
case HEADLINE -> layoutGrid.addHeadline((Headline) semanticNode);
|
case HEADLINE -> layoutGrid.addHeadline((Headline) semanticNode);
|
||||||
case PARAGRAPH -> layoutGrid.addParagraph((Paragraph) semanticNode);
|
case PARAGRAPH -> layoutGrid.addParagraph((Paragraph) semanticNode);
|
||||||
case TABLE -> layoutGrid.addTable((Table) semanticNode);
|
case TABLE -> layoutGrid.addTable((Table) semanticNode);
|
||||||
case IMAGE -> layoutGrid.addImage((Image) semanticNode);
|
case IMAGE -> layoutGrid.addImage((Image) semanticNode);
|
||||||
case HEADER, FOOTER -> layoutGrid.addHeaderOrFooter(semanticNode);
|
case HEADER, FOOTER -> layoutGrid.addHeaderOrFooter(semanticNode);
|
||||||
|
case TABLE_OF_CONTENTS_ITEM -> layoutGrid.addTableOfContentsItem(semanticNode);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
return layoutGrid;
|
return layoutGrid;
|
||||||
|
|||||||
@ -111,8 +111,8 @@ public class PdfVisualisationUtility {
|
|||||||
return DrawingOptions.builder().stroke(true).strokeColor(switch (entry.getType()) {
|
return DrawingOptions.builder().stroke(true).strokeColor(switch (entry.getType()) {
|
||||||
case DOCUMENT -> Color.LIGHT_GRAY;
|
case DOCUMENT -> Color.LIGHT_GRAY;
|
||||||
case HEADER, FOOTER -> Color.GREEN;
|
case HEADER, FOOTER -> Color.GREEN;
|
||||||
case PARAGRAPH -> Color.BLUE;
|
case PARAGRAPH, TABLE_OF_CONTENTS_ITEM -> Color.BLUE;
|
||||||
case SUPER_SECTION, SECTION -> Color.BLACK;
|
case SUPER_SECTION, SECTION, TABLE_OF_CONTENTS -> Color.BLACK;
|
||||||
case HEADLINE -> Color.RED;
|
case HEADLINE -> Color.RED;
|
||||||
case TABLE -> Color.ORANGE;
|
case TABLE -> Color.ORANGE;
|
||||||
case TABLE_CELL -> Color.GRAY;
|
case TABLE_CELL -> Color.GRAY;
|
||||||
|
|||||||
@ -15,19 +15,25 @@ import java.util.Set;
|
|||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
|
||||||
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
||||||
|
import org.checkerframework.checker.units.qual.C;
|
||||||
|
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.ListIdentifier;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.ListIdentifier;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextBlockOnPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.classification.NumberWord;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||||
@ -293,7 +299,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void addTocPages(List<Word> numbers, int page) {
|
public void addTocPages(List<NumberWord> numbers, int page) {
|
||||||
|
|
||||||
if (!active) {
|
if (!active) {
|
||||||
return;
|
return;
|
||||||
@ -302,13 +308,10 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
|||||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, this.tocPages);
|
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, this.tocPages);
|
||||||
visualizationsOnPage.getColoredRectangles()
|
visualizationsOnPage.getColoredRectangles()
|
||||||
.addAll(numbers.stream()
|
.addAll(numbers.stream()
|
||||||
|
.map(NumberWord::word)
|
||||||
.map(BoundingBox::getBBoxPdf)
|
.map(BoundingBox::getBBoxPdf)
|
||||||
.map(line -> new ColoredRectangle(line, LINES_COLOR, LINE_WIDTH))
|
.map(line -> new ColoredRectangle(line, LINES_COLOR, LINE_WIDTH))
|
||||||
.toList());
|
.toList());
|
||||||
visualizationsOnPage.getColoredRectangles()
|
|
||||||
.add(new ColoredRectangle(numbers.stream()
|
|
||||||
.map(BoundingBox::getBBoxPdf)
|
|
||||||
.collect(RectangleTransformations.collectBBox()), LINES_COLOR, LINE_WIDTH));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -332,8 +335,10 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
|||||||
|
|
||||||
private void addOutlineObject(OutlineObject outlineObject, PageInformation pageInformation) {
|
private void addOutlineObject(OutlineObject outlineObject, PageInformation pageInformation) {
|
||||||
|
|
||||||
|
if (!active) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
int rectSize = 5;
|
int rectSize = 5;
|
||||||
|
|
||||||
Point2D point2D;
|
Point2D point2D;
|
||||||
if (outlineObject.getPoint().isPresent()) {
|
if (outlineObject.getPoint().isPresent()) {
|
||||||
point2D = outlineObject.getPoint().get();
|
point2D = outlineObject.getPoint().get();
|
||||||
@ -357,10 +362,40 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
|||||||
|
|
||||||
public void addListIdentifiers(List<ListIdentifier> listIdentifiers) {
|
public void addListIdentifiers(List<ListIdentifier> listIdentifiers) {
|
||||||
|
|
||||||
|
if (!active) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
for (ListIdentifier listIdentifier : listIdentifiers) {
|
for (ListIdentifier listIdentifier : listIdentifiers) {
|
||||||
getOrCreateVisualizationsOnPage(listIdentifier.getPage(), this.listIdentifiers).getColoredRectangles()
|
getOrCreateVisualizationsOnPage(listIdentifier.getPage(), this.listIdentifiers).getColoredRectangles()
|
||||||
.add(new ColoredRectangle(listIdentifier.getWord().getBBoxPdf(), WORDS_COLOR, LINE_WIDTH));
|
.add(new ColoredRectangle(listIdentifier.getWord().getBBoxPdf(), WORDS_COLOR, LINE_WIDTH));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void addTocBlocks(Set<TextBlockOnPage> blocksWithNumberInCluster) {
|
||||||
|
|
||||||
|
if (!active) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
for (TextBlockOnPage textBlockOnPage : blocksWithNumberInCluster) {
|
||||||
|
getOrCreateVisualizationsOnPage(textBlockOnPage.page().getPageNumber(), this.tocBlocks).getColoredRectangles()
|
||||||
|
.add(new ColoredRectangle(textBlockOnPage.textBlock().getBBoxPdf(), TOC_COLOR, LINE_WIDTH));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void addOutlineHeadlines(Document document) {
|
||||||
|
|
||||||
|
if (!active) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
document.streamAllSubNodes()
|
||||||
|
.filter(node -> node.getType().equals(NodeTypeProto.NodeType.HEADLINE))
|
||||||
|
.filter(node -> node.getEngines().contains(LayoutEngineProto.LayoutEngine.OUTLINE))
|
||||||
|
.forEach(headline -> headline.getBBox()
|
||||||
|
.forEach((page, bbox) -> getOrCreateVisualizationsOnPage(page.getNumber(), this.outlineHeadlines).getColoredRectangles()
|
||||||
|
.add(new ColoredRectangle(bbox, HEADLINE_COLOR, LINE_WIDTH))));
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -26,6 +26,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Se
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableOfContents;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||||
import com.knecon.fforesight.service.viewerdoc.layers.LayoutGridLayerConfig;
|
import com.knecon.fforesight.service.viewerdoc.layers.LayoutGridLayerConfig;
|
||||||
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
|
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
|
||||||
@ -72,10 +73,12 @@ public class LayoutGrid extends LayoutGridLayerConfig {
|
|||||||
|
|
||||||
public void addHeadline(Headline headline) {
|
public void addHeadline(Headline headline) {
|
||||||
|
|
||||||
addAsRectangle(headline, headlines, HEADLINE_COLOR);
|
if (headline.getParent().getType().equals(NodeTypeProto.NodeType.TABLE_OF_CONTENTS)) {
|
||||||
if (headline.getEngines().contains(LayoutEngine.OUTLINE)) {
|
addAsRectangle(headline, toc, HEADLINE_COLOR);
|
||||||
addAsRectangle(headline, outlineHeadlines, HEADLINE_COLOR);
|
} else {
|
||||||
|
addAsRectangle(headline, headlines, HEADLINE_COLOR);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -88,19 +91,10 @@ public class LayoutGrid extends LayoutGridLayerConfig {
|
|||||||
public void addTreeId(SemanticNode semanticNode) {
|
public void addTreeId(SemanticNode semanticNode) {
|
||||||
|
|
||||||
Page page = semanticNode.getFirstPage();
|
Page page = semanticNode.getFirstPage();
|
||||||
if (semanticNode.getBBox()
|
if (semanticNode.getBBox().get(page) == null) {
|
||||||
.get(page) == null) {
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
addPlacedText(page,
|
addPlacedText(page, semanticNode.getBBox().get(page), semanticNode.getBBox().get(page), buildTreeIdString(semanticNode), 1, treeIds, TREEID_COLOR);
|
||||||
semanticNode.getBBox()
|
|
||||||
.get(page),
|
|
||||||
semanticNode.getBBox()
|
|
||||||
.get(page),
|
|
||||||
buildTreeIdString(semanticNode),
|
|
||||||
1,
|
|
||||||
treeIds,
|
|
||||||
TREEID_COLOR);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -124,20 +118,19 @@ public class LayoutGrid extends LayoutGridLayerConfig {
|
|||||||
public void addSection(SemanticNode section) {
|
public void addSection(SemanticNode section) {
|
||||||
|
|
||||||
Map<Page, Rectangle2D> bBoxMap = section.getBBox();
|
Map<Page, Rectangle2D> bBoxMap = section.getBBox();
|
||||||
|
Color color = section.getType().equals(NodeTypeProto.NodeType.TABLE_OF_CONTENTS) ? TOC_COLOR : SECTION_COLOR;
|
||||||
List<SemanticNode> subSections = section.streamAllSubNodesOfType(NodeTypeProto.NodeType.SECTION)
|
List<SemanticNode> subSections = section.streamAllSubNodesOfType(NodeTypeProto.NodeType.SECTION)
|
||||||
.toList();
|
.toList();
|
||||||
Integer maxChildDepth = subSections.stream()
|
Integer maxChildDepth = subSections.stream()
|
||||||
.map(node -> node.getTreeId().size())
|
.map(node -> node.getTreeId().size())
|
||||||
.max(Integer::compareTo)
|
.max(Integer::compareTo).orElse(section.getTreeId().size());
|
||||||
.orElse(section.getTreeId().size());
|
|
||||||
int ownDepth = section.getTreeId().size();
|
int ownDepth = section.getTreeId().size();
|
||||||
|
|
||||||
Page firstPage = section.getFirstPage();
|
Page firstPage = section.getFirstPage();
|
||||||
String treeIdString = buildTreeIdString(section);
|
String treeIdString = buildTreeIdString(section);
|
||||||
|
|
||||||
if (bBoxMap.values().size() == 1) {
|
if (bBoxMap.values().size() == 1) {
|
||||||
handleSinglePage(section, firstPage, bBoxMap.get(firstPage), treeIdString, maxChildDepth, ownDepth);
|
handleSinglePage(section, firstPage, bBoxMap.get(firstPage), treeIdString, maxChildDepth, ownDepth, color);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
List<Page> pagesInOrder = bBoxMap.keySet()
|
List<Page> pagesInOrder = bBoxMap.keySet()
|
||||||
@ -145,12 +138,12 @@ public class LayoutGrid extends LayoutGridLayerConfig {
|
|||||||
.sorted(Comparator.comparingInt(Page::getNumber))
|
.sorted(Comparator.comparingInt(Page::getNumber))
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
pagesInOrder.remove(0);
|
pagesInOrder.remove(0);
|
||||||
handleFirstPageOfSection(section, firstPage, bBoxMap.get(firstPage), treeIdString, maxChildDepth, ownDepth);
|
handleFirstPageOfSection(section, firstPage, bBoxMap.get(firstPage), treeIdString, maxChildDepth, ownDepth, color);
|
||||||
for (Page middlePage : pagesInOrder.subList(0, pagesInOrder.size() - 1)) {
|
for (Page middlePage : pagesInOrder.subList(0, pagesInOrder.size() - 1)) {
|
||||||
handleForMiddlePageOfSection(section, middlePage, bBoxMap.get(middlePage), treeIdString, maxChildDepth, ownDepth);
|
handleForMiddlePageOfSection(section, middlePage, bBoxMap.get(middlePage), treeIdString, maxChildDepth, ownDepth, color);
|
||||||
}
|
}
|
||||||
var lastPage = pagesInOrder.remove(pagesInOrder.size() - 1);
|
var lastPage = pagesInOrder.remove(pagesInOrder.size() - 1);
|
||||||
handleLastPageOfSection(section, lastPage, bBoxMap.get(lastPage), treeIdString, maxChildDepth, ownDepth);
|
handleLastPageOfSection(section, lastPage, bBoxMap.get(lastPage), treeIdString, maxChildDepth, ownDepth, color);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -232,33 +225,45 @@ public class LayoutGrid extends LayoutGridLayerConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void handleSinglePage(SemanticNode semanticNode, Page page, Rectangle2D rectangle2D, String treeIdString, Integer maxChildDepth, Integer ownDepth) {
|
private void handleSinglePage(SemanticNode semanticNode, Page page, Rectangle2D rectangle2D, String treeIdString, Integer maxChildDepth, Integer ownDepth, Color color) {
|
||||||
|
|
||||||
RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, page, rectangle2D, treeIdString, maxChildDepth, ownDepth);
|
RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, page, rectangle2D, treeIdString, maxChildDepth, ownDepth);
|
||||||
// add string to top line
|
// add string to top line
|
||||||
var firstLine = result.pageLines().remove(0);
|
var firstLine = result.pageLines().remove(0);
|
||||||
result.coloredLines().add(new ColoredLine(firstLine, SECTION_COLOR, LINE_WIDTH));
|
result.coloredLines().add(new ColoredLine(firstLine, color, LINE_WIDTH));
|
||||||
for (Line2D line : result.pageLines()) {
|
for (Line2D line : result.pageLines()) {
|
||||||
result.coloredLines().add(new ColoredLine(line, SECTION_COLOR, LINE_WIDTH));
|
result.coloredLines().add(new ColoredLine(line, color, LINE_WIDTH));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void handleFirstPageOfSection(SemanticNode semanticNode, Page firstPage, Rectangle2D rectangle2D, String treeIdString, Integer maxChildDepth, Integer ownDepth) {
|
private void handleFirstPageOfSection(SemanticNode semanticNode,
|
||||||
|
Page firstPage,
|
||||||
|
Rectangle2D rectangle2D,
|
||||||
|
String treeIdString,
|
||||||
|
Integer maxChildDepth,
|
||||||
|
Integer ownDepth,
|
||||||
|
Color color) {
|
||||||
|
|
||||||
RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, firstPage, rectangle2D, treeIdString, maxChildDepth, ownDepth);
|
RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, firstPage, rectangle2D, treeIdString, maxChildDepth, ownDepth);
|
||||||
// remove bottom line
|
// remove bottom line
|
||||||
result.pageLines().remove(2);
|
result.pageLines().remove(2);
|
||||||
// add string to top line
|
// add string to top line
|
||||||
var firstLine = result.pageLines().remove(0);
|
var firstLine = result.pageLines().remove(0);
|
||||||
result.coloredLines().add(new ColoredLine(firstLine, SECTION_COLOR, LINE_WIDTH));
|
result.coloredLines().add(new ColoredLine(firstLine, color, LINE_WIDTH));
|
||||||
for (Line2D line : result.pageLines()) {
|
for (Line2D line : result.pageLines()) {
|
||||||
result.coloredLines().add(new ColoredLine(line, SECTION_COLOR, LINE_WIDTH));
|
result.coloredLines().add(new ColoredLine(line, color, LINE_WIDTH));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void handleForMiddlePageOfSection(SemanticNode semanticNode, Page middlePage, Rectangle2D rectangle2D, String treeIdString, Integer maxChildDepth, Integer ownDepth) {
|
private void handleForMiddlePageOfSection(SemanticNode semanticNode,
|
||||||
|
Page middlePage,
|
||||||
|
Rectangle2D rectangle2D,
|
||||||
|
String treeIdString,
|
||||||
|
Integer maxChildDepth,
|
||||||
|
Integer ownDepth,
|
||||||
|
Color color) {
|
||||||
|
|
||||||
RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, middlePage, rectangle2D, treeIdString, maxChildDepth, ownDepth);
|
RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, middlePage, rectangle2D, treeIdString, maxChildDepth, ownDepth);
|
||||||
// remove top line
|
// remove top line
|
||||||
@ -267,23 +272,29 @@ public class LayoutGrid extends LayoutGridLayerConfig {
|
|||||||
result.pageLines().remove(1);
|
result.pageLines().remove(1);
|
||||||
// add string to left line
|
// add string to left line
|
||||||
var leftLine = result.pageLines().remove(1);
|
var leftLine = result.pageLines().remove(1);
|
||||||
result.coloredLines().add(new ColoredLine(leftLine, SECTION_COLOR, LINE_WIDTH));
|
result.coloredLines().add(new ColoredLine(leftLine, color, LINE_WIDTH));
|
||||||
for (Line2D line : result.pageLines()) {
|
for (Line2D line : result.pageLines()) {
|
||||||
result.coloredLines().add(new ColoredLine(line, SECTION_COLOR, LINE_WIDTH));
|
result.coloredLines().add(new ColoredLine(line, color, LINE_WIDTH));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void handleLastPageOfSection(SemanticNode semanticNode, Page lastPage, Rectangle2D rectangle2D, String treeIdString, Integer maxChildDepth, Integer ownDepth) {
|
private void handleLastPageOfSection(SemanticNode semanticNode,
|
||||||
|
Page lastPage,
|
||||||
|
Rectangle2D rectangle2D,
|
||||||
|
String treeIdString,
|
||||||
|
Integer maxChildDepth,
|
||||||
|
Integer ownDepth,
|
||||||
|
Color color) {
|
||||||
|
|
||||||
RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, lastPage, rectangle2D, treeIdString, maxChildDepth, ownDepth);
|
RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, lastPage, rectangle2D, treeIdString, maxChildDepth, ownDepth);
|
||||||
// remove top line
|
// remove top line
|
||||||
result.pageLines().remove(0);
|
result.pageLines().remove(0);
|
||||||
// add string to left line
|
// add string to left line
|
||||||
var leftLine = result.pageLines().remove(2);
|
var leftLine = result.pageLines().remove(2);
|
||||||
result.coloredLines().add(new ColoredLine(leftLine, SECTION_COLOR, LINE_WIDTH));
|
result.coloredLines().add(new ColoredLine(leftLine, color, LINE_WIDTH));
|
||||||
for (Line2D line : result.pageLines()) {
|
for (Line2D line : result.pageLines()) {
|
||||||
result.coloredLines().add(new ColoredLine(line, SECTION_COLOR, LINE_WIDTH));
|
result.coloredLines().add(new ColoredLine(line, color, LINE_WIDTH));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -295,14 +306,14 @@ public class LayoutGrid extends LayoutGridLayerConfig {
|
|||||||
Integer maxChildDepth,
|
Integer maxChildDepth,
|
||||||
Integer ownDepth) {
|
Integer ownDepth) {
|
||||||
|
|
||||||
List<ColoredLine> coloredLines = getOrCreateVisualizationsOnPage(page.getNumber(), sections).getColoredLines();
|
Visualizations visualizations = semanticNode.getType().equals(NodeTypeProto.NodeType.TABLE_OF_CONTENTS) ? toc : sections;
|
||||||
|
List<ColoredLine> coloredLines = getOrCreateVisualizationsOnPage(page.getNumber(), visualizations).getColoredLines();
|
||||||
int lineWidthModifier = maxChildDepth - ownDepth;
|
int lineWidthModifier = maxChildDepth - ownDepth;
|
||||||
Rectangle2D r = RectangleTransformations.pad(semanticNode.getBBox()
|
Rectangle2D r = RectangleTransformations.pad(semanticNode.getBBox().get(page), LINE_WIDTH * (1 + lineWidthModifier), LINE_WIDTH * (1 + lineWidthModifier));
|
||||||
.get(page), LINE_WIDTH * (1 + lineWidthModifier), LINE_WIDTH * (1 + lineWidthModifier));
|
|
||||||
|
|
||||||
SemanticNode highestParent = semanticNode.getHighestParent();
|
SemanticNode highestParent = semanticNode.getHighestParent();
|
||||||
Rectangle2D highestParentRect = rectangleMap.get(new RectangleIdentifier(highestParent.getTreeId(), page.getNumber()));
|
Rectangle2D highestParentRect = rectangleMap.get(new RectangleIdentifier(highestParent.getTreeId(), page.getNumber()));
|
||||||
addPlacedText(page, rectangle2D, highestParentRect, treeIdString, maxChildDepth, sections, SECTION_COLOR);
|
addPlacedText(page, rectangle2D, highestParentRect, treeIdString, maxChildDepth, visualizations, SECTION_COLOR);
|
||||||
var lastPageLines = createLinesFromRectangle(r, page.getRotation());
|
var lastPageLines = createLinesFromRectangle(r, page.getRotation());
|
||||||
|
|
||||||
if (semanticNode instanceof SuperSection) {
|
if (semanticNode instanceof SuperSection) {
|
||||||
@ -347,8 +358,7 @@ public class LayoutGrid extends LayoutGridLayerConfig {
|
|||||||
List<Double> ys = yStream.collect(Collectors.toList());
|
List<Double> ys = yStream.collect(Collectors.toList());
|
||||||
ys.remove(0);
|
ys.remove(0);
|
||||||
|
|
||||||
Rectangle2D tableBBox = table.getBBox()
|
Rectangle2D tableBBox = table.getBBox().get(page);
|
||||||
.get(page);
|
|
||||||
List<ColoredLine> coloredLines = getOrCreateVisualizationsOnPage(page.getNumber(), tables).getColoredLines();
|
List<ColoredLine> coloredLines = getOrCreateVisualizationsOnPage(page.getNumber(), tables).getColoredLines();
|
||||||
|
|
||||||
xs.forEach(x -> {
|
xs.forEach(x -> {
|
||||||
@ -384,6 +394,12 @@ public class LayoutGrid extends LayoutGridLayerConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void addTableOfContentsItem(SemanticNode semanticNode) {
|
||||||
|
|
||||||
|
addAsRectangle(semanticNode, toc, PARAGRAPH_COLOR);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private record RectangleAndLinesResult(List<ColoredLine> coloredLines, Rectangle2D rectangle, List<Line2D> pageLines) {
|
private record RectangleAndLinesResult(List<ColoredLine> coloredLines, Rectangle2D rectangle, List<Line2D> pageLines) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -45,7 +45,6 @@ dependencies {
|
|||||||
// for integration testing only
|
// for integration testing only
|
||||||
testImplementation(project(":viewer-doc-processor"))
|
testImplementation(project(":viewer-doc-processor"))
|
||||||
testImplementation(project(":layoutparser-service-internal-api"))
|
testImplementation(project(":layoutparser-service-internal-api"))
|
||||||
testImplementation("com.google.protobuf:protobuf-java-util:4.27.1")
|
|
||||||
|
|
||||||
testImplementation("org.springframework.boot:spring-boot-starter-amqp:${springBootStarterVersion}")
|
testImplementation("org.springframework.boot:spring-boot-starter-amqp:${springBootStarterVersion}")
|
||||||
testImplementation("org.springframework.boot:spring-boot-starter-test:${springBootStarterVersion}")
|
testImplementation("org.springframework.boot:spring-boot-starter-test:${springBootStarterVersion}")
|
||||||
|
|||||||
@ -83,4 +83,11 @@ class SectionIdentifierTest {
|
|||||||
assertEquals(List.of(4, 1, 2, 4), identifier.getIdentifiers());
|
assertEquals(List.of(4, 1, 2, 4), identifier.getIdentifiers());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testFalsePositive111() {
|
||||||
|
SectionIdentifier identifier = SectionIdentifier.fromSearchText("111: Headline");
|
||||||
|
assertEquals(SectionIdentifier.Format.NUMERICAL, identifier.getFormat());
|
||||||
|
assertEquals(1, identifier.level());
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -10,7 +10,6 @@ import java.util.Collection;
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.function.Predicate;
|
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.junit.jupiter.api.BeforeEach;
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
@ -28,7 +27,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Se
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTree;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||||
@ -100,10 +99,10 @@ public class OutlineDetectionTest extends AbstractTest {
|
|||||||
.flatMap(Collection::stream)
|
.flatMap(Collection::stream)
|
||||||
.allMatch(OutlineObject::isFound));
|
.allMatch(OutlineObject::isFound));
|
||||||
|
|
||||||
TableOfContents tableOfContents = classificationDocument.getTableOfContents();
|
SectionTree sectionTree = classificationDocument.getSectionTree();
|
||||||
|
|
||||||
assertEquals(tableOfContents.getMainSections().size(), 9);
|
assertEquals(sectionTree.getMainSections().size(), 9);
|
||||||
assertEquals(tableOfContents.getMainSections().subList(1, 9)
|
assertEquals(sectionTree.getMainSections().subList(1, 9)
|
||||||
.stream()
|
.stream()
|
||||||
.map(tableOfContentItem -> sanitizeString(tableOfContentItem.getHeadline().toString()))
|
.map(tableOfContentItem -> sanitizeString(tableOfContentItem.getHeadline().toString()))
|
||||||
.toList(),
|
.toList(),
|
||||||
@ -121,14 +120,14 @@ public class OutlineDetectionTest extends AbstractTest {
|
|||||||
// assertEquals(tableOfContents.getMainSections().get(6).getImages().size(), 1);
|
// assertEquals(tableOfContents.getMainSections().get(6).getImages().size(), 1);
|
||||||
// assertEquals(tableOfContents.getMainSections().get(8).getChildren().get(2).getChildren().get(0).getChildren().get(2).getImages().size(), 1);
|
// assertEquals(tableOfContents.getMainSections().get(8).getChildren().get(2).getChildren().get(0).getChildren().get(2).getImages().size(), 1);
|
||||||
|
|
||||||
assertTrue(tableOfContents.getAllTableOfContentItems()
|
assertTrue(sectionTree.getAllTableOfContentItems()
|
||||||
.stream()
|
.stream()
|
||||||
.allMatch(tableOfContentItem -> tableOfContentItem.getSection() != null));
|
.allMatch(tableOfContentItem -> tableOfContentItem.getSection() != null));
|
||||||
assertTrue(tableOfContents.getAllTableOfContentItems()
|
assertTrue(sectionTree.getAllTableOfContentItems()
|
||||||
.stream()
|
.stream()
|
||||||
.filter(tableOfContentItem -> tableOfContentItem.getChildren().isEmpty())
|
.filter(tableOfContentItem -> tableOfContentItem.getChildren().isEmpty())
|
||||||
.allMatch(tableOfContentItem -> tableOfContentItem.getSection() instanceof Section));
|
.allMatch(tableOfContentItem -> tableOfContentItem.getSection() instanceof Section));
|
||||||
assertTrue(tableOfContents.getAllTableOfContentItems()
|
assertTrue(sectionTree.getAllTableOfContentItems()
|
||||||
.stream()
|
.stream()
|
||||||
.filter(tableOfContentItem -> !tableOfContentItem.getChildren().isEmpty())
|
.filter(tableOfContentItem -> !tableOfContentItem.getChildren().isEmpty())
|
||||||
.allMatch(tableOfContentItem -> tableOfContentItem.getSection() instanceof SuperSection));
|
.allMatch(tableOfContentItem -> tableOfContentItem.getSection() instanceof SuperSection));
|
||||||
|
|||||||
@ -150,14 +150,14 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class);
|
var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class);
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile(), tableServiceResponse);
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile(), tableServiceResponse);
|
||||||
assertThat(document.getTableOfContents().getAllTableOfContentItems()
|
assertThat(document.getSectionTree().getAllTableOfContentItems()
|
||||||
.stream()
|
.stream()
|
||||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||||
.stream()
|
.stream()
|
||||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||||
.toList()).isNotEmpty();
|
.toList()).isNotEmpty();
|
||||||
var tables = document.getTableOfContents().getAllTableOfContentItems()
|
var tables = document.getSectionTree().getAllTableOfContentItems()
|
||||||
.stream()
|
.stream()
|
||||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||||
.stream()
|
.stream()
|
||||||
@ -203,14 +203,14 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
ClassPathResource pdfFileResource = new ClassPathResource("files/syngenta/CustomerFiles/SinglePages/Spanning Cells - Page131_S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
|
ClassPathResource pdfFileResource = new ClassPathResource("files/syngenta/CustomerFiles/SinglePages/Spanning Cells - Page131_S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
assertThat(document.getTableOfContents().getAllTableOfContentItems()
|
assertThat(document.getSectionTree().getAllTableOfContentItems()
|
||||||
.stream()
|
.stream()
|
||||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||||
.stream()
|
.stream()
|
||||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||||
.toList()).isNotEmpty();
|
.toList()).isNotEmpty();
|
||||||
TablePageBlock table = document.getTableOfContents().getAllTableOfContentItems()
|
TablePageBlock table = document.getSectionTree().getAllTableOfContentItems()
|
||||||
.stream()
|
.stream()
|
||||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||||
.stream()
|
.stream()
|
||||||
@ -233,14 +233,14 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
"files/syngenta/CustomerFiles/SinglePages/Merge Table - Page5_26 A8637C - EU AIR3 - LCP Section 10 - Ecotoxicological studies on the plant protection product - Reference list.pdf");
|
"files/syngenta/CustomerFiles/SinglePages/Merge Table - Page5_26 A8637C - EU AIR3 - LCP Section 10 - Ecotoxicological studies on the plant protection product - Reference list.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
assertThat(document.getTableOfContents().getAllTableOfContentItems()
|
assertThat(document.getSectionTree().getAllTableOfContentItems()
|
||||||
.stream()
|
.stream()
|
||||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||||
.stream()
|
.stream()
|
||||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||||
.toList()).isNotEmpty();
|
.toList()).isNotEmpty();
|
||||||
TablePageBlock firstTable = document.getTableOfContents().getAllTableOfContentItems()
|
TablePageBlock firstTable = document.getSectionTree().getAllTableOfContentItems()
|
||||||
.stream()
|
.stream()
|
||||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||||
.stream()
|
.stream()
|
||||||
@ -250,7 +250,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
.get(0);
|
.get(0);
|
||||||
assertThat(firstTable.getColCount()).isEqualTo(8);
|
assertThat(firstTable.getColCount()).isEqualTo(8);
|
||||||
assertThat(firstTable.getRowCount()).isEqualTo(1);
|
assertThat(firstTable.getRowCount()).isEqualTo(1);
|
||||||
TablePageBlock secondTable = document.getTableOfContents().getAllTableOfContentItems()
|
TablePageBlock secondTable = document.getSectionTree().getAllTableOfContentItems()
|
||||||
.stream()
|
.stream()
|
||||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||||
.stream()
|
.stream()
|
||||||
@ -280,14 +280,14 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
"files/syngenta/CustomerFiles/SinglePages/Merge Multi Page Table - Page4_Page5_51 Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf");
|
"files/syngenta/CustomerFiles/SinglePages/Merge Multi Page Table - Page4_Page5_51 Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
assertThat(document.getTableOfContents().getAllTableOfContentItems()
|
assertThat(document.getSectionTree().getAllTableOfContentItems()
|
||||||
.stream()
|
.stream()
|
||||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||||
.stream()
|
.stream()
|
||||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||||
.toList()).isNotEmpty();
|
.toList()).isNotEmpty();
|
||||||
TablePageBlock firstTable = document.getTableOfContents().getAllTableOfContentItems()
|
TablePageBlock firstTable = document.getSectionTree().getAllTableOfContentItems()
|
||||||
.stream()
|
.stream()
|
||||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||||
.stream()
|
.stream()
|
||||||
@ -297,7 +297,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
.get(0);
|
.get(0);
|
||||||
assertThat(firstTable.getColCount()).isEqualTo(9);
|
assertThat(firstTable.getColCount()).isEqualTo(9);
|
||||||
assertThat(firstTable.getRowCount()).isEqualTo(5);
|
assertThat(firstTable.getRowCount()).isEqualTo(5);
|
||||||
TablePageBlock secondTable = document.getTableOfContents().getAllTableOfContentItems()
|
TablePageBlock secondTable = document.getSectionTree().getAllTableOfContentItems()
|
||||||
.stream()
|
.stream()
|
||||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||||
.stream()
|
.stream()
|
||||||
@ -327,14 +327,14 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
"files/syngenta/CustomerFiles/SinglePages/Rotated Table Headers - Page4_65 Mesotrione - EU AIR3 - LCA Section 1 Supplement Reference List.pdf");
|
"files/syngenta/CustomerFiles/SinglePages/Rotated Table Headers - Page4_65 Mesotrione - EU AIR3 - LCA Section 1 Supplement Reference List.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
assertThat(document.getTableOfContents().getAllTableOfContentItems()
|
assertThat(document.getSectionTree().getAllTableOfContentItems()
|
||||||
.stream()
|
.stream()
|
||||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||||
.stream()
|
.stream()
|
||||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||||
.toList()).isNotEmpty();
|
.toList()).isNotEmpty();
|
||||||
TablePageBlock firstTable = document.getTableOfContents().getAllTableOfContentItems()
|
TablePageBlock firstTable = document.getSectionTree().getAllTableOfContentItems()
|
||||||
.stream()
|
.stream()
|
||||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||||
.stream()
|
.stream()
|
||||||
@ -344,7 +344,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
.get(0);
|
.get(0);
|
||||||
assertThat(firstTable.getColCount()).isEqualTo(8);
|
assertThat(firstTable.getColCount()).isEqualTo(8);
|
||||||
assertThat(firstTable.getRowCount()).isEqualTo(1);
|
assertThat(firstTable.getRowCount()).isEqualTo(1);
|
||||||
TablePageBlock secondTable = document.getTableOfContents().getAllTableOfContentItems()
|
TablePageBlock secondTable = document.getSectionTree().getAllTableOfContentItems()
|
||||||
.stream()
|
.stream()
|
||||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||||
.stream()
|
.stream()
|
||||||
@ -844,7 +844,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
private void toHtml(ClassificationDocument document, String filename) {
|
private void toHtml(ClassificationDocument document, String filename) {
|
||||||
|
|
||||||
var tables = document.getTableOfContents().getAllTableOfContentItems()
|
var tables = document.getSectionTree().getAllTableOfContentItems()
|
||||||
.stream()
|
.stream()
|
||||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||||
.stream()
|
.stream()
|
||||||
@ -871,7 +871,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) {
|
private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) {
|
||||||
|
|
||||||
TablePageBlock table = document.getTableOfContents().getAllTableOfContentItems()
|
TablePageBlock table = document.getSectionTree().getAllTableOfContentItems()
|
||||||
.stream()
|
.stream()
|
||||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||||
.stream()
|
.stream()
|
||||||
@ -901,7 +901,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
private void validateTable(ClassificationDocument document, int tableIndex, List<List<String>> values) {
|
private void validateTable(ClassificationDocument document, int tableIndex, List<List<String>> values) {
|
||||||
|
|
||||||
TablePageBlock table = document.getTableOfContents().getAllTableOfContentItems()
|
TablePageBlock table = document.getSectionTree().getAllTableOfContentItems()
|
||||||
.stream()
|
.stream()
|
||||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||||
.stream()
|
.stream()
|
||||||
@ -929,7 +929,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
private void validateTableSize(ClassificationDocument document, int tableSize) {
|
private void validateTableSize(ClassificationDocument document, int tableSize) {
|
||||||
|
|
||||||
assertThat(document.getTableOfContents().getAllTableOfContentItems()
|
assertThat(document.getSectionTree().getAllTableOfContentItems()
|
||||||
.stream()
|
.stream()
|
||||||
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||||
.stream()
|
.stream()
|
||||||
|
|||||||
@ -18,8 +18,8 @@ import org.springframework.core.io.ClassPathResource;
|
|||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureWrapper;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureWrapper;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.EntryDataProto.EntryData;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.EntryDataProto.EntryData;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
|
|||||||
@ -227,9 +227,9 @@ public class PdfDraw {
|
|||||||
return DrawingOptions.builder().stroke(true).strokeColor(switch (entry.getType()) {
|
return DrawingOptions.builder().stroke(true).strokeColor(switch (entry.getType()) {
|
||||||
case DOCUMENT -> Color.LIGHT_GRAY;
|
case DOCUMENT -> Color.LIGHT_GRAY;
|
||||||
case HEADER, FOOTER -> Color.GREEN;
|
case HEADER, FOOTER -> Color.GREEN;
|
||||||
case PARAGRAPH -> Color.BLUE;
|
case PARAGRAPH, TABLE_OF_CONTENTS_ITEM -> Color.BLUE;
|
||||||
case HEADLINE -> Color.RED;
|
case HEADLINE -> Color.RED;
|
||||||
case SECTION, SUPER_SECTION -> Color.BLACK;
|
case SECTION, SUPER_SECTION, TABLE_OF_CONTENTS -> Color.BLACK;
|
||||||
case TABLE -> Color.ORANGE;
|
case TABLE -> Color.ORANGE;
|
||||||
case TABLE_CELL -> Color.GRAY;
|
case TABLE_CELL -> Color.GRAY;
|
||||||
case IMAGE -> Color.MAGENTA;
|
case IMAGE -> Color.MAGENTA;
|
||||||
|
|||||||
@ -40,7 +40,7 @@ public record LayerIdentifier(String name, String markedContentName) {
|
|||||||
public static final LayerIdentifier KNECON_LAYOUT_FIGURES = new LayerIdentifier("Figures", "LAYOUT_FIGURES");
|
public static final LayerIdentifier KNECON_LAYOUT_FIGURES = new LayerIdentifier("Figures", "LAYOUT_FIGURES");
|
||||||
public static final LayerIdentifier KNECON_LAYOUT_IMAGES = new LayerIdentifier("Images", "LAYOUT_IMAGES");
|
public static final LayerIdentifier KNECON_LAYOUT_IMAGES = new LayerIdentifier("Images", "LAYOUT_IMAGES");
|
||||||
public static final LayerIdentifier KNECON_LAYOUT_TREE_IDs = new LayerIdentifier("Tree IDs", "LAYOUT_TREE_IDs");
|
public static final LayerIdentifier KNECON_LAYOUT_TREE_IDs = new LayerIdentifier("Tree IDs", "LAYOUT_TREE_IDs");
|
||||||
public static final LayerIdentifier OUTLINE_HEADLINES = new LayerIdentifier("Outline Headlines", "OUTLINE_HEADLINES");
|
public static final LayerIdentifier KNECON_LAYOUT_TOC = new LayerIdentifier("Table of Contents", "TABLE_OF_CONTENTS");
|
||||||
|
|
||||||
//layout grid debug
|
//layout grid debug
|
||||||
public static final LayerIdentifier KNECON_LAYOUT_DEBUG = new LayerIdentifier("Layout elements", "DEBUG_LAYOUT");
|
public static final LayerIdentifier KNECON_LAYOUT_DEBUG = new LayerIdentifier("Layout elements", "DEBUG_LAYOUT");
|
||||||
@ -55,8 +55,10 @@ public record LayerIdentifier(String name, String markedContentName) {
|
|||||||
public static final LayerIdentifier NEIGHBOURS = new LayerIdentifier("Neighbours", "NEIGHBOURS");
|
public static final LayerIdentifier NEIGHBOURS = new LayerIdentifier("Neighbours", "NEIGHBOURS");
|
||||||
public static final LayerIdentifier CHARACTERS = new LayerIdentifier("Characters", "CHARACTERS");
|
public static final LayerIdentifier CHARACTERS = new LayerIdentifier("Characters", "CHARACTERS");
|
||||||
public static final LayerIdentifier OUTLINE_OBJECTS = new LayerIdentifier("Outline Positions", "OUTLINE_OBJECTS");
|
public static final LayerIdentifier OUTLINE_OBJECTS = new LayerIdentifier("Outline Positions", "OUTLINE_OBJECTS");
|
||||||
|
public static final LayerIdentifier OUTLINE_HEADLINES = new LayerIdentifier("Outline Headlines", "OUTLINE_HEADLINES");
|
||||||
public static final LayerIdentifier SENTENCES = new LayerIdentifier("Sentences", "SENTENCES");
|
public static final LayerIdentifier SENTENCES = new LayerIdentifier("Sentences", "SENTENCES");
|
||||||
public static final LayerIdentifier TOC_PAGES = new LayerIdentifier("TOC pages", "TOC_PAGES");
|
public static final LayerIdentifier TOC_PAGES = new LayerIdentifier("TOC pages", "TOC_PAGES");
|
||||||
|
public static final LayerIdentifier TOC_BLOCKS = new LayerIdentifier("TOC blocks", "TOC_BLOCKS");
|
||||||
public static final LayerIdentifier LIST_IDENTIFIERS = new LayerIdentifier("List identifiers", "LIST_IDENTIFIERS");
|
public static final LayerIdentifier LIST_IDENTIFIERS = new LayerIdentifier("List identifiers", "LIST_IDENTIFIERS");
|
||||||
|
|
||||||
// Visual layout parser
|
// Visual layout parser
|
||||||
|
|||||||
@ -22,6 +22,7 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
|
|||||||
|
|
||||||
protected static final Color WORDS_COLOR = new Color(68, 84, 147);
|
protected static final Color WORDS_COLOR = new Color(68, 84, 147);
|
||||||
protected static final Color LINES_COLOR = new Color(152, 45, 179);
|
protected static final Color LINES_COLOR = new Color(152, 45, 179);
|
||||||
|
protected static final Color TOC_COLOR = new Color(33, 159, 144);
|
||||||
protected static final Color ZONES_COLOR = new Color(131, 38, 38);
|
protected static final Color ZONES_COLOR = new Color(131, 38, 38);
|
||||||
|
|
||||||
protected static final Color RULINGS_COLOR = new Color(21, 221, 174);
|
protected static final Color RULINGS_COLOR = new Color(21, 221, 174);
|
||||||
@ -31,6 +32,8 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
|
|||||||
protected static final Color UNDERLINE_RULING_COLOR = new Color(6, 39, 171);
|
protected static final Color UNDERLINE_RULING_COLOR = new Color(6, 39, 171);
|
||||||
protected static final Color STRIKETROUGH_RULING_COLOR = new Color(171, 6, 6);
|
protected static final Color STRIKETROUGH_RULING_COLOR = new Color(171, 6, 6);
|
||||||
|
|
||||||
|
protected static final Color HEADLINE_COLOR = new Color(162, 56, 56);
|
||||||
|
|
||||||
protected static final Color CELLS_COLOR = new Color(31, 214, 27);
|
protected static final Color CELLS_COLOR = new Color(31, 214, 27);
|
||||||
protected static final Color OUTLINE_OBJECT_COLOR = new Color(214, 27, 183);
|
protected static final Color OUTLINE_OBJECT_COLOR = new Color(214, 27, 183);
|
||||||
|
|
||||||
@ -59,8 +62,9 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
|
|||||||
protected final Visualizations outlineObjects = Visualizations.builder().layer(LayerIdentifier.OUTLINE_OBJECTS).build();
|
protected final Visualizations outlineObjects = Visualizations.builder().layer(LayerIdentifier.OUTLINE_OBJECTS).build();
|
||||||
protected final Visualizations sentences = Visualizations.builder().layer(LayerIdentifier.SENTENCES).build();
|
protected final Visualizations sentences = Visualizations.builder().layer(LayerIdentifier.SENTENCES).build();
|
||||||
protected final Visualizations tocPages = Visualizations.builder().layer(LayerIdentifier.TOC_PAGES).build();
|
protected final Visualizations tocPages = Visualizations.builder().layer(LayerIdentifier.TOC_PAGES).build();
|
||||||
|
protected final Visualizations tocBlocks = Visualizations.builder().layer(LayerIdentifier.TOC_BLOCKS).build();
|
||||||
protected final Visualizations listIdentifiers = Visualizations.builder().layer(LayerIdentifier.LIST_IDENTIFIERS).build();
|
protected final Visualizations listIdentifiers = Visualizations.builder().layer(LayerIdentifier.LIST_IDENTIFIERS).build();
|
||||||
|
protected final Visualizations outlineHeadlines = Visualizations.builder().layer(LayerIdentifier.OUTLINE_HEADLINES).build();
|
||||||
|
|
||||||
public List<Visualizations> getVisualizations() {
|
public List<Visualizations> getVisualizations() {
|
||||||
|
|
||||||
@ -76,10 +80,11 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
|
|||||||
mainBody, //
|
mainBody, //
|
||||||
markedContent, //
|
markedContent, //
|
||||||
outlineObjects, //
|
outlineObjects, //
|
||||||
|
outlineHeadlines, //
|
||||||
tocPages, //
|
tocPages, //
|
||||||
|
tocBlocks, //
|
||||||
listIdentifiers //
|
listIdentifiers //
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -34,8 +34,10 @@ public class LayoutGridLayerConfig extends AbstractLayerGroup {
|
|||||||
protected static final Color KEY_VALUE_BBOX_COLOR = new Color(0, 39, 85);
|
protected static final Color KEY_VALUE_BBOX_COLOR = new Color(0, 39, 85);
|
||||||
protected static final Color KEY_COLOR = new Color(30, 92, 172);
|
protected static final Color KEY_COLOR = new Color(30, 92, 172);
|
||||||
protected static final Color VALUE_COLOR = new Color(30, 172, 146);
|
protected static final Color VALUE_COLOR = new Color(30, 172, 146);
|
||||||
|
protected static final Color TOC_COLOR = new Color(0, 86, 198);
|
||||||
|
|
||||||
protected final Visualizations sections = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_SECTION).visibleByDefault(true).build();
|
protected final Visualizations sections = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_SECTION).visibleByDefault(true).build();
|
||||||
|
protected final Visualizations toc = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_TOC).visibleByDefault(true).build();
|
||||||
protected final Visualizations paragraphs = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_PARAGRAPH).visibleByDefault(true).build();
|
protected final Visualizations paragraphs = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_PARAGRAPH).visibleByDefault(true).build();
|
||||||
protected final Visualizations headlines = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_HEADLINE).visibleByDefault(true).build();
|
protected final Visualizations headlines = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_HEADLINE).visibleByDefault(true).build();
|
||||||
protected final Visualizations tables = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_TABLE).visibleByDefault(true).build();
|
protected final Visualizations tables = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_TABLE).visibleByDefault(true).build();
|
||||||
@ -44,12 +46,12 @@ public class LayoutGridLayerConfig extends AbstractLayerGroup {
|
|||||||
protected final Visualizations images = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_IMAGES).build();
|
protected final Visualizations images = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_IMAGES).build();
|
||||||
protected final Visualizations keyValue = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_KEY_VALUE).build();
|
protected final Visualizations keyValue = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_KEY_VALUE).build();
|
||||||
protected final Visualizations treeIds = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_TREE_IDs).build();
|
protected final Visualizations treeIds = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_TREE_IDs).build();
|
||||||
protected final Visualizations outlineHeadlines = Visualizations.builder().layer(LayerIdentifier.OUTLINE_HEADLINES).build();
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<Visualizations> getVisualizations() {
|
public List<Visualizations> getVisualizations() {
|
||||||
|
|
||||||
return List.of(headlines, paragraphs, tables, sections, headerFooter, keyValue, figures, images, treeIds, outlineHeadlines);
|
return List.of(headlines, paragraphs, tables, sections, headerFooter, toc, keyValue, figures, images, treeIds);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user