Compare commits

...

3 Commits

Author SHA1 Message Date
Kilian Schuettler
4b86307936 RED-9139: add new TableOfContents Node
* rename previous TableOfContent to SectionTree
* added protobuf compile script
2024-11-08 14:40:54 +01:00
Kilian Schuettler
ce41014d4b RED-9139: more robust TOC detection
* detect numbers in words, and not just whole words that are numbers
2024-11-08 12:15:56 +01:00
Kilian Schuettler
e6cd889444 RED-9139: more robust TOC detection
* detect numbers in words, and not just whole words that are numbers
2024-11-08 10:41:56 +01:00
52 changed files with 10070 additions and 11347 deletions

View File

@ -7,5 +7,5 @@ description = "layoutparser-service-internal-api"
dependencies {
implementation("io.swagger.core.v3:swagger-annotations:2.2.15")
implementation("com.google.protobuf:protobuf-java-util:4.27.1")
api("com.google.protobuf:protobuf-java-util:4.28.3")
}

View File

@ -1,16 +1,14 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureProto.DocumentStructure;
import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.EntryDataProto.EntryData;
import java.awt.geom.Rectangle2D;
import java.io.ObjectStreamException;
import java.io.Serializable;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.EntryDataProto.EntryData;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.AllArgsConstructor;
import lombok.Getter;

View File

@ -1,193 +1,177 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
// Generated by the protocol buffer compiler. DO NOT EDIT!
// NO CHECKED-IN PROTOBUF GENCODE
// source: LayoutEngine.proto
// Protobuf Java Version: 4.27.1
@SuppressWarnings("all")
// Protobuf Java Version: 4.28.3
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
public final class LayoutEngineProto {
private LayoutEngineProto() {}
static {
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(
com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
/* major= */ 4,
/* minor= */ 28,
/* patch= */ 3,
/* suffix= */ "",
LayoutEngineProto.class.getName());
}
public static void registerAllExtensions(
com.google.protobuf.ExtensionRegistryLite registry) {
}
private LayoutEngineProto() {}
public static void registerAllExtensions(
com.google.protobuf.ExtensionRegistry registry) {
registerAllExtensions(
(com.google.protobuf.ExtensionRegistryLite) registry);
}
/**
* Protobuf enum {@code LayoutEngine}
*/
public enum LayoutEngine
implements com.google.protobuf.ProtocolMessageEnum {
/**
* <code>ALGORITHM = 0;</code>
*/
ALGORITHM(0),
/**
* <code>AI = 1;</code>
*/
AI(1),
/**
* <code>OUTLINE = 2;</code>
*/
OUTLINE(2),
UNRECOGNIZED(-1),
;
static {
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
/* major= */ 4,
/* minor= */ 27,
/* patch= */ 1,
/* suffix= */ "", LayoutEngineProto.class.getName());
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(
com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
/* major= */ 4,
/* minor= */ 28,
/* patch= */ 3,
/* suffix= */ "",
LayoutEngine.class.getName());
}
/**
* <code>ALGORITHM = 0;</code>
*/
public static final int ALGORITHM_VALUE = 0;
/**
* <code>AI = 1;</code>
*/
public static final int AI_VALUE = 1;
/**
* <code>OUTLINE = 2;</code>
*/
public static final int OUTLINE_VALUE = 2;
public static void registerAllExtensions(com.google.protobuf.ExtensionRegistryLite registry) {
public final int getNumber() {
if (this == UNRECOGNIZED) {
throw new java.lang.IllegalArgumentException(
"Can't get the number of an unknown enum value.");
}
return value;
}
public static void registerAllExtensions(com.google.protobuf.ExtensionRegistry registry) {
registerAllExtensions((com.google.protobuf.ExtensionRegistryLite) registry);
}
/**
* Protobuf enum {@code LayoutEngine}
* @param value The numeric wire value of the corresponding enum entry.
* @return The enum associated with the given numeric wire value.
* @deprecated Use {@link #forNumber(int)} instead.
*/
public enum LayoutEngine implements com.google.protobuf.ProtocolMessageEnum {
/**
* <code>ALGORITHM = 0;</code>
*/
ALGORITHM(0),
/**
* <code>AI = 1;</code>
*/
AI(1),
/**
* <code>OUTLINE = 2;</code>
*/
OUTLINE(2),
UNRECOGNIZED(-1),
;
@java.lang.Deprecated
public static LayoutEngine valueOf(int value) {
return forNumber(value);
}
/**
* @param value The numeric wire value of the corresponding enum entry.
* @return The enum associated with the given numeric wire value.
*/
public static LayoutEngine forNumber(int value) {
switch (value) {
case 0: return ALGORITHM;
case 1: return AI;
case 2: return OUTLINE;
default: return null;
}
}
static {
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
/* major= */ 4,
/* minor= */ 27,
/* patch= */ 1,
/* suffix= */ "", LayoutEngine.class.getName());
}
/**
* <code>ALGORITHM = 0;</code>
*/
public static final int ALGORITHM_VALUE = 0;
/**
* <code>AI = 1;</code>
*/
public static final int AI_VALUE = 1;
/**
* <code>OUTLINE = 2;</code>
*/
public static final int OUTLINE_VALUE = 2;
public final int getNumber() {
if (this == UNRECOGNIZED) {
throw new IllegalArgumentException("Can't get the number of an unknown enum value.");
}
return value;
}
/**
* @param value The numeric wire value of the corresponding enum entry.
* @return The enum associated with the given numeric wire value.
* @deprecated Use {@link #forNumber(int)} instead.
*/
@Deprecated
public static LayoutEngine valueOf(int value) {
return forNumber(value);
}
/**
* @param value The numeric wire value of the corresponding enum entry.
* @return The enum associated with the given numeric wire value.
*/
public static LayoutEngine forNumber(int value) {
switch (value) {
case 0:
return ALGORITHM;
case 1:
return AI;
case 2:
return OUTLINE;
default:
return null;
}
}
public static com.google.protobuf.Internal.EnumLiteMap<LayoutEngine> internalGetValueMap() {
return internalValueMap;
}
private static final com.google.protobuf.Internal.EnumLiteMap<LayoutEngine> internalValueMap = new com.google.protobuf.Internal.EnumLiteMap<LayoutEngine>() {
public static com.google.protobuf.Internal.EnumLiteMap<LayoutEngine>
internalGetValueMap() {
return internalValueMap;
}
private static final com.google.protobuf.Internal.EnumLiteMap<
LayoutEngine> internalValueMap =
new com.google.protobuf.Internal.EnumLiteMap<LayoutEngine>() {
public LayoutEngine findValueByNumber(int number) {
return LayoutEngine.forNumber(number);
return LayoutEngine.forNumber(number);
}
};
};
public final com.google.protobuf.Descriptors.EnumValueDescriptor getValueDescriptor() {
if (this == UNRECOGNIZED) {
throw new IllegalStateException("Can't get the descriptor of an unrecognized enum value.");
}
return getDescriptor().getValues()
.get(ordinal());
}
public final com.google.protobuf.Descriptors.EnumDescriptor getDescriptorForType() {
return getDescriptor();
}
public static final com.google.protobuf.Descriptors.EnumDescriptor getDescriptor() {
return LayoutEngineProto.getDescriptor().getEnumTypes()
.get(0);
}
private static final LayoutEngine[] VALUES = values();
public static LayoutEngine valueOf(com.google.protobuf.Descriptors.EnumValueDescriptor desc) {
if (desc.getType() != getDescriptor()) {
throw new IllegalArgumentException("EnumValueDescriptor is not for this type.");
}
if (desc.getIndex() == -1) {
return UNRECOGNIZED;
}
return VALUES[desc.getIndex()];
}
private final int value;
private LayoutEngine(int value) {
this.value = value;
}
// @@protoc_insertion_point(enum_scope:LayoutEngine)
public final com.google.protobuf.Descriptors.EnumValueDescriptor
getValueDescriptor() {
if (this == UNRECOGNIZED) {
throw new java.lang.IllegalStateException(
"Can't get the descriptor of an unrecognized enum value.");
}
return getDescriptor().getValues().get(ordinal());
}
public final com.google.protobuf.Descriptors.EnumDescriptor
getDescriptorForType() {
return getDescriptor();
}
public static final com.google.protobuf.Descriptors.EnumDescriptor
getDescriptor() {
return com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto.getDescriptor().getEnumTypes().get(0);
}
private static final LayoutEngine[] VALUES = values();
public static com.google.protobuf.Descriptors.FileDescriptor getDescriptor() {
return descriptor;
public static LayoutEngine valueOf(
com.google.protobuf.Descriptors.EnumValueDescriptor desc) {
if (desc.getType() != getDescriptor()) {
throw new java.lang.IllegalArgumentException(
"EnumValueDescriptor is not for this type.");
}
if (desc.getIndex() == -1) {
return UNRECOGNIZED;
}
return VALUES[desc.getIndex()];
}
private final int value;
private static com.google.protobuf.Descriptors.FileDescriptor descriptor;
static {
String[] descriptorData = {"\n\022LayoutEngine.proto*2\n\014LayoutEngine\022\r\n\t" + "ALGORITHM\020\000\022\006\n\002AI\020\001\022\013\n\007OUTLINE\020\002b\006proto3"};
descriptor = com.google.protobuf.Descriptors.FileDescriptor.internalBuildGeneratedFileFrom(descriptorData, new com.google.protobuf.Descriptors.FileDescriptor[]{});
descriptor.resolveAllFeaturesImmutable();
private LayoutEngine(int value) {
this.value = value;
}
// @@protoc_insertion_point(outer_class_scope)
// @@protoc_insertion_point(enum_scope:LayoutEngine)
}
public static com.google.protobuf.Descriptors.FileDescriptor
getDescriptor() {
return descriptor;
}
private static com.google.protobuf.Descriptors.FileDescriptor
descriptor;
static {
java.lang.String[] descriptorData = {
"\n\022LayoutEngine.proto*2\n\014LayoutEngine\022\r\n\t" +
"ALGORITHM\020\000\022\006\n\002AI\020\001\022\013\n\007OUTLINE\020\002B[\nFcom." +
"knecon.fforesight.service.layoutparser.i" +
"nternal.api.data.redactionB\021LayoutEngine" +
"Protob\006proto3"
};
descriptor = com.google.protobuf.Descriptors.FileDescriptor
.internalBuildGeneratedFileFrom(descriptorData,
new com.google.protobuf.Descriptors.FileDescriptor[] {
});
descriptor.resolveAllFeaturesImmutable();
}
// @@protoc_insertion_point(outer_class_scope)
}

View File

@ -1,274 +1,261 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
import java.util.Locale;
// Generated by the protocol buffer compiler. DO NOT EDIT!
// NO CHECKED-IN PROTOBUF GENCODE
// source: NodeType.proto
// Protobuf Java Version: 4.27.1
@SuppressWarnings("all")
// Protobuf Java Version: 4.28.3
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
public final class NodeTypeProto {
private NodeTypeProto() {}
static {
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(
com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
/* major= */ 4,
/* minor= */ 28,
/* patch= */ 3,
/* suffix= */ "",
NodeTypeProto.class.getName());
}
public static void registerAllExtensions(
com.google.protobuf.ExtensionRegistryLite registry) {
}
private NodeTypeProto() {}
public static void registerAllExtensions(
com.google.protobuf.ExtensionRegistry registry) {
registerAllExtensions(
(com.google.protobuf.ExtensionRegistryLite) registry);
}
/**
* Protobuf enum {@code NodeType}
*/
public enum NodeType
implements com.google.protobuf.ProtocolMessageEnum {
/**
* <code>DOCUMENT = 0;</code>
*/
DOCUMENT(0),
/**
* <code>SECTION = 1;</code>
*/
SECTION(1),
/**
* <code>SUPER_SECTION = 2;</code>
*/
SUPER_SECTION(2),
/**
* <code>HEADLINE = 3;</code>
*/
HEADLINE(3),
/**
* <code>PARAGRAPH = 4;</code>
*/
PARAGRAPH(4),
/**
* <code>TABLE = 5;</code>
*/
TABLE(5),
/**
* <code>TABLE_CELL = 6;</code>
*/
TABLE_CELL(6),
/**
* <code>IMAGE = 7;</code>
*/
IMAGE(7),
/**
* <code>HEADER = 8;</code>
*/
HEADER(8),
/**
* <code>FOOTER = 9;</code>
*/
FOOTER(9),
/**
* <code>TABLE_OF_CONTENTS = 10;</code>
*/
TABLE_OF_CONTENTS(10),
/**
* <code>TABLE_OF_CONTENTS_ITEM = 11;</code>
*/
TABLE_OF_CONTENTS_ITEM(11),
UNRECOGNIZED(-1),
;
static {
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
/* major= */ 4,
/* minor= */ 27,
/* patch= */ 1,
/* suffix= */ "", NodeTypeProto.class.getName());
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(
com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
/* major= */ 4,
/* minor= */ 28,
/* patch= */ 3,
/* suffix= */ "",
NodeType.class.getName());
}
/**
* <code>DOCUMENT = 0;</code>
*/
public static final int DOCUMENT_VALUE = 0;
/**
* <code>SECTION = 1;</code>
*/
public static final int SECTION_VALUE = 1;
/**
* <code>SUPER_SECTION = 2;</code>
*/
public static final int SUPER_SECTION_VALUE = 2;
/**
* <code>HEADLINE = 3;</code>
*/
public static final int HEADLINE_VALUE = 3;
/**
* <code>PARAGRAPH = 4;</code>
*/
public static final int PARAGRAPH_VALUE = 4;
/**
* <code>TABLE = 5;</code>
*/
public static final int TABLE_VALUE = 5;
/**
* <code>TABLE_CELL = 6;</code>
*/
public static final int TABLE_CELL_VALUE = 6;
/**
* <code>IMAGE = 7;</code>
*/
public static final int IMAGE_VALUE = 7;
/**
* <code>HEADER = 8;</code>
*/
public static final int HEADER_VALUE = 8;
/**
* <code>FOOTER = 9;</code>
*/
public static final int FOOTER_VALUE = 9;
/**
* <code>TABLE_OF_CONTENTS = 10;</code>
*/
public static final int TABLE_OF_CONTENTS_VALUE = 10;
/**
* <code>TABLE_OF_CONTENTS_ITEM = 11;</code>
*/
public static final int TABLE_OF_CONTENTS_ITEM_VALUE = 11;
public static void registerAllExtensions(com.google.protobuf.ExtensionRegistryLite registry) {
public final int getNumber() {
if (this == UNRECOGNIZED) {
throw new java.lang.IllegalArgumentException(
"Can't get the number of an unknown enum value.");
}
return value;
}
public static void registerAllExtensions(com.google.protobuf.ExtensionRegistry registry) {
registerAllExtensions((com.google.protobuf.ExtensionRegistryLite) registry);
}
/**
* Protobuf enum {@code NodeType}
* @param value The numeric wire value of the corresponding enum entry.
* @return The enum associated with the given numeric wire value.
* @deprecated Use {@link #forNumber(int)} instead.
*/
public enum NodeType implements com.google.protobuf.ProtocolMessageEnum {
/**
* <code>DOCUMENT = 0;</code>
*/
DOCUMENT(0),
/**
* <code>SECTION = 1;</code>
*/
SECTION(1),
/**
* <code>SUPER_SECTION = 2;</code>
*/
SUPER_SECTION(2),
/**
* <code>HEADLINE = 3;</code>
*/
HEADLINE(3),
/**
* <code>PARAGRAPH = 4;</code>
*/
PARAGRAPH(4),
/**
* <code>TABLE = 5;</code>
*/
TABLE(5),
/**
* <code>TABLE_CELL = 6;</code>
*/
TABLE_CELL(6),
/**
* <code>IMAGE = 7;</code>
*/
IMAGE(7),
/**
* <code>HEADER = 8;</code>
*/
HEADER(8),
/**
* <code>FOOTER = 9;</code>
*/
FOOTER(9),
UNRECOGNIZED(-1),
;
@java.lang.Deprecated
public static NodeType valueOf(int value) {
return forNumber(value);
}
/**
* @param value The numeric wire value of the corresponding enum entry.
* @return The enum associated with the given numeric wire value.
*/
public static NodeType forNumber(int value) {
switch (value) {
case 0: return DOCUMENT;
case 1: return SECTION;
case 2: return SUPER_SECTION;
case 3: return HEADLINE;
case 4: return PARAGRAPH;
case 5: return TABLE;
case 6: return TABLE_CELL;
case 7: return IMAGE;
case 8: return HEADER;
case 9: return FOOTER;
case 10: return TABLE_OF_CONTENTS;
case 11: return TABLE_OF_CONTENTS_ITEM;
default: return null;
}
}
public String toString() {
return this.name().charAt(0) + this.name().substring(1).toLowerCase(Locale.ROOT);
}
static {
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
/* major= */ 4,
/* minor= */ 27,
/* patch= */ 1,
/* suffix= */ "", NodeType.class.getName());
}
/**
* <code>DOCUMENT = 0;</code>
*/
public static final int DOCUMENT_VALUE = 0;
/**
* <code>SECTION = 1;</code>
*/
public static final int SECTION_VALUE = 1;
/**
* <code>SUPER_SECTION = 2;</code>
*/
public static final int SUPER_SECTION_VALUE = 2;
/**
* <code>HEADLINE = 3;</code>
*/
public static final int HEADLINE_VALUE = 3;
/**
* <code>PARAGRAPH = 4;</code>
*/
public static final int PARAGRAPH_VALUE = 4;
/**
* <code>TABLE = 5;</code>
*/
public static final int TABLE_VALUE = 5;
/**
* <code>TABLE_CELL = 6;</code>
*/
public static final int TABLE_CELL_VALUE = 6;
/**
* <code>IMAGE = 7;</code>
*/
public static final int IMAGE_VALUE = 7;
/**
* <code>HEADER = 8;</code>
*/
public static final int HEADER_VALUE = 8;
/**
* <code>FOOTER = 9;</code>
*/
public static final int FOOTER_VALUE = 9;
public final int getNumber() {
if (this == UNRECOGNIZED) {
throw new IllegalArgumentException("Can't get the number of an unknown enum value.");
}
return value;
}
/**
* @param value The numeric wire value of the corresponding enum entry.
* @return The enum associated with the given numeric wire value.
* @deprecated Use {@link #forNumber(int)} instead.
*/
@Deprecated
public static NodeType valueOf(int value) {
return forNumber(value);
}
/**
* @param value The numeric wire value of the corresponding enum entry.
* @return The enum associated with the given numeric wire value.
*/
public static NodeType forNumber(int value) {
switch (value) {
case 0:
return DOCUMENT;
case 1:
return SECTION;
case 2:
return SUPER_SECTION;
case 3:
return HEADLINE;
case 4:
return PARAGRAPH;
case 5:
return TABLE;
case 6:
return TABLE_CELL;
case 7:
return IMAGE;
case 8:
return HEADER;
case 9:
return FOOTER;
default:
return null;
}
}
public static com.google.protobuf.Internal.EnumLiteMap<NodeType> internalGetValueMap() {
return internalValueMap;
}
private static final com.google.protobuf.Internal.EnumLiteMap<NodeType> internalValueMap = new com.google.protobuf.Internal.EnumLiteMap<NodeType>() {
public static com.google.protobuf.Internal.EnumLiteMap<NodeType>
internalGetValueMap() {
return internalValueMap;
}
private static final com.google.protobuf.Internal.EnumLiteMap<
NodeType> internalValueMap =
new com.google.protobuf.Internal.EnumLiteMap<NodeType>() {
public NodeType findValueByNumber(int number) {
return NodeType.forNumber(number);
return NodeType.forNumber(number);
}
};
};
public final com.google.protobuf.Descriptors.EnumValueDescriptor getValueDescriptor() {
if (this == UNRECOGNIZED) {
throw new IllegalStateException("Can't get the descriptor of an unrecognized enum value.");
}
return getDescriptor().getValues()
.get(ordinal());
}
public final com.google.protobuf.Descriptors.EnumDescriptor getDescriptorForType() {
return getDescriptor();
}
public static final com.google.protobuf.Descriptors.EnumDescriptor getDescriptor() {
return NodeTypeProto.getDescriptor().getEnumTypes()
.get(0);
}
private static final NodeType[] VALUES = values();
public static NodeType valueOf(com.google.protobuf.Descriptors.EnumValueDescriptor desc) {
if (desc.getType() != getDescriptor()) {
throw new IllegalArgumentException("EnumValueDescriptor is not for this type.");
}
if (desc.getIndex() == -1) {
return UNRECOGNIZED;
}
return VALUES[desc.getIndex()];
}
private final int value;
private NodeType(int value) {
this.value = value;
}
// @@protoc_insertion_point(enum_scope:NodeType)
public final com.google.protobuf.Descriptors.EnumValueDescriptor
getValueDescriptor() {
if (this == UNRECOGNIZED) {
throw new java.lang.IllegalStateException(
"Can't get the descriptor of an unrecognized enum value.");
}
return getDescriptor().getValues().get(ordinal());
}
public final com.google.protobuf.Descriptors.EnumDescriptor
getDescriptorForType() {
return getDescriptor();
}
public static final com.google.protobuf.Descriptors.EnumDescriptor
getDescriptor() {
return com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.getDescriptor().getEnumTypes().get(0);
}
private static final NodeType[] VALUES = values();
public static com.google.protobuf.Descriptors.FileDescriptor getDescriptor() {
return descriptor;
public static NodeType valueOf(
com.google.protobuf.Descriptors.EnumValueDescriptor desc) {
if (desc.getType() != getDescriptor()) {
throw new java.lang.IllegalArgumentException(
"EnumValueDescriptor is not for this type.");
}
if (desc.getIndex() == -1) {
return UNRECOGNIZED;
}
return VALUES[desc.getIndex()];
}
private final int value;
private static com.google.protobuf.Descriptors.FileDescriptor descriptor;
static {
String[] descriptorData = {"\n\016NodeType.proto*\223\001\n\010NodeType\022\014\n\010DOCUMEN"
+ "T\020\000\022\013\n\007SECTION\020\001\022\021\n\rSUPER_SECTION\020\002\022\014\n\010H"
+ "EADLINE\020\003\022\r\n\tPARAGRAPH\020\004\022\t\n\005TABLE\020\005\022\016\n\nT"
+ "ABLE_CELL\020\006\022\t\n\005IMAGE\020\007\022\n\n\006HEADER\020\010\022\n\n\006FO"
+ "OTER\020\tb\006proto3"};
descriptor = com.google.protobuf.Descriptors.FileDescriptor.internalBuildGeneratedFileFrom(descriptorData, new com.google.protobuf.Descriptors.FileDescriptor[]{});
descriptor.resolveAllFeaturesImmutable();
private NodeType(int value) {
this.value = value;
}
// @@protoc_insertion_point(outer_class_scope)
// @@protoc_insertion_point(enum_scope:NodeType)
}
public static com.google.protobuf.Descriptors.FileDescriptor
getDescriptor() {
return descriptor;
}
private static com.google.protobuf.Descriptors.FileDescriptor
descriptor;
static {
java.lang.String[] descriptorData = {
"\n\016NodeType.proto*\306\001\n\010NodeType\022\014\n\010DOCUMEN" +
"T\020\000\022\013\n\007SECTION\020\001\022\021\n\rSUPER_SECTION\020\002\022\014\n\010H" +
"EADLINE\020\003\022\r\n\tPARAGRAPH\020\004\022\t\n\005TABLE\020\005\022\016\n\nT" +
"ABLE_CELL\020\006\022\t\n\005IMAGE\020\007\022\n\n\006HEADER\020\010\022\n\n\006FO" +
"OTER\020\t\022\025\n\021TABLE_OF_CONTENTS\020\n\022\032\n\026TABLE_O" +
"F_CONTENTS_ITEM\020\013BW\nFcom.knecon.fforesig" +
"ht.service.layoutparser.internal.api.dat" +
"a.redactionB\rNodeTypeProtob\006proto3"
};
descriptor = com.google.protobuf.Descriptors.FileDescriptor
.internalBuildGeneratedFileFrom(descriptorData,
new com.google.protobuf.Descriptors.FileDescriptor[] {
});
descriptor.resolveAllFeaturesImmutable();
}
// @@protoc_insertion_point(outer_class_scope)
}

View File

@ -1,5 +1,9 @@
syntax = "proto3";
option java_outer_classname = "DocumentPageProto";
option java_package = "com.knecon.fforesight.service.layoutparser.internal.api.data.redaction";
message AllDocumentPages {
repeated DocumentPage documentPages = 1;

View File

@ -1,5 +1,8 @@
syntax = "proto3";
option java_outer_classname = "DocumentPositionDataProto";
option java_package = "com.knecon.fforesight.service.layoutparser.internal.api.data.redaction";
message AllDocumentPositionData {
repeated DocumentPositionData documentPositionData = 1;

View File

@ -1,5 +1,9 @@
syntax = "proto3";
option java_outer_classname = "DocumentStructureProto";
option java_package = "com.knecon.fforesight.service.layoutparser.internal.api.data.redaction";
import "EntryData.proto";
message DocumentStructure {

View File

@ -1,5 +1,8 @@
syntax = "proto3";
option java_outer_classname = "DocumentTextDataProto";
option java_package = "com.knecon.fforesight.service.layoutparser.internal.api.data.redaction";
message AllDocumentTextData {
repeated DocumentTextData documentTextData = 1;

View File

@ -3,6 +3,9 @@ syntax = "proto3";
import "LayoutEngine.proto";
import "NodeType.proto";
option java_outer_classname = "EntryDataProto";
option java_package = "com.knecon.fforesight.service.layoutparser.internal.api.data.redaction";
message EntryData {
// Type of the semantic node.
NodeType type = 1;

View File

@ -1,5 +1,6 @@
syntax = "proto3";
option java_outer_classname = "LayoutEngineProto";
option java_package = "com.knecon.fforesight.service.layoutparser.internal.api.data.redaction";
enum LayoutEngine {
ALGORITHM = 0;
AI = 1;

View File

@ -1,5 +1,8 @@
syntax = "proto3";
option java_outer_classname = "NodeTypeProto";
option java_package = "com.knecon.fforesight.service.layoutparser.internal.api.data.redaction";
enum NodeType {
DOCUMENT = 0;
SECTION = 1;
@ -11,4 +14,6 @@ enum NodeType {
IMAGE = 7;
HEADER = 8;
FOOTER = 9;
TABLE_OF_CONTENTS = 10;
TABLE_OF_CONTENTS_ITEM = 11;
}

View File

@ -0,0 +1,26 @@
#!/bin/bash
# Regenerates Java sources from the .proto files in the current directory,
# after checking that a sufficiently recent protoc is available.
#
# Minimum required protoc version
MIN_VERSION="28.3"
# Get the installed protoc version: the second whitespace-separated field of
# the `protoc --version` output (presumably "libprotoc <version>" - confirm).
# NOTE(review): this runs before the "is protoc installed" check further down,
# so when protoc is missing the command fails here and the variable stays empty.
INSTALLED_VERSION=$(protoc --version | awk '{print $2}')
# Succeeds (exit status 0) only when version $1 is strictly lower than
# version $2. Both arguments are dotted version strings such as "28.3";
# the ordering is the one produced by `sort -V`.
#
# The previous test (`smallest != $1`) was inverted: it succeeded when $1 was
# the NEWER version, so outdated protoc builds passed the caller's check and
# newer ones were rejected as "too old".
version_lt() {
    # Equal versions are never "less than".
    [ "$1" != "$2" ] || return 1
    # $1 is the lower version exactly when it sorts first.
    [ "$(printf '%s\n' "$1" "$2" | sort -V | head -n1)" = "$1" ]
}
# Abort early when no protoc executable can be found.
command -v protoc > /dev/null 2>&1 || {
    echo "Error: protoc is not installed. Please install version $MIN_VERSION or later."
    exit 1
}

# Abort when version_lt reports the installed protoc as older than the minimum.
version_lt "$INSTALLED_VERSION" "$MIN_VERSION" && {
    echo "Error: protoc version $INSTALLED_VERSION is too old. Please upgrade to version $MIN_VERSION or later."
    exit 1
}

# Compile every .proto file in the current directory into Java sources under ../java.
protoc --java_out=../java ./*.proto

View File

@ -35,6 +35,4 @@ dependencies {
implementation("org.commonmark:commonmark-ext-gfm-tables:0.22.0")
implementation("com.pdftron:PDFNet:10.11.0")
implementation("org.apache.commons:commons-text:1.12.0")
implementation("com.google.protobuf:protobuf-java-util:4.27.1")
}

View File

@ -39,10 +39,9 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Do
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineValidationService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TOCEnrichmentService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTree;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
@ -107,7 +106,7 @@ public class LayoutParsingPipeline {
GraphicExtractorService graphicExtractorService;
OutlineExtractorService outlineExtractorService;
OutlineValidationService outlineValidationService;
TOCEnrichmentService tocEnrichmentService;
SectionTreeBuilderService sectionTreeBuilderService;
LayoutparserSettings settings;
ClassificationService classificationService;
@ -345,14 +344,14 @@ public class LayoutParsingPipeline {
classificationService.classify(classificationDocument, layoutParsingType, identifier);
TableOfContents tableOfContents = outlineValidationService.createToC(classificationDocument);
classificationDocument.setTableOfContents(tableOfContents);
SectionTree sectionTree = outlineValidationService.createSectionTree(classificationDocument);
classificationDocument.setSectionTree(sectionTree);
log.info("Building Sections for {}", identifier);
switch (layoutParsingType) {
case CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_PARAGRAPH_DEBUG -> sectionsBuilderService.buildParagraphDebugSections(classificationDocument);
default -> tocEnrichmentService.assignSectionBlocksAndImages(classificationDocument);
default -> sectionTreeBuilderService.assignSectionBlocksAndImages(classificationDocument);
}
return classificationDocument;

View File

@ -4,7 +4,7 @@ import java.util.ArrayList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTree;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
@ -31,6 +31,6 @@ public class ClassificationDocument {
private long rulesVersion;
private OutlineObjectTree outlineObjectTree;
private TableOfContents tableOfContents;
private SectionTree sectionTree;
}

View File

@ -14,6 +14,7 @@ public enum PageBlockType {
PARAGRAPH_ITALIC,
PARAGRAPH_UNKNOWN,
OTHER,
TABLE_OF_CONTENTS_HEADLINE,
TABLE_OF_CONTENTS_ITEM,
LIST_ITEM,
TABLE;
@ -35,7 +36,7 @@ public enum PageBlockType {
public static int getHeadlineNumber(PageBlockType pageBlockType) {
return switch (pageBlockType) {
case H1 -> 1;
case H1, TABLE_OF_CONTENTS_HEADLINE -> 1;
case H2 -> 2;
case H3 -> 3;
case H4 -> 4;
@ -47,6 +48,6 @@ public enum PageBlockType {
public boolean isHeadline() {
return this.equals(H1) || this.equals(H2) || this.equals(H3) || this.equals(H4) || this.equals(H5) || this.equals(H6);
return this.equals(H1) || this.equals(H2) || this.equals(H3) || this.equals(H4) || this.equals(H5) || this.equals(H6) || this.equals(TABLE_OF_CONTENTS_HEADLINE);
}
}

View File

@ -16,7 +16,7 @@ import lombok.experimental.FieldDefaults;
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class SectionIdentifier {
public static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d{1,2})[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?");
public static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d{1,2})(?:[\\s.,;](\\d{1,2}))?(?:[\\s.,;](\\d{1,2}))?(?:[\\s.,;](\\d{1,2}))?");
public static Pattern alphanumericIdentifierPattern = Pattern.compile("^[\\s]?[A-Za-z][\\s.,;]?(\\d{1,2})[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?");
public enum Format {

View File

@ -11,6 +11,8 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Se
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableOfContents;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableOfContentsItem;
public abstract class AbstractNodeVisitor implements NodeVisitor {
@ -83,6 +85,18 @@ public abstract class AbstractNodeVisitor implements NodeVisitor {
visitChildren(tableCell);
}
/**
 * Default handling for a {@link TableOfContents} node: hands the node to
 * {@code visitChildren}.
 *
 * @param toc the table-of-contents node being visited
 */
@Override
public void visit(TableOfContents toc) {
visitChildren(toc);
}
/**
 * Default handling for a {@link TableOfContentsItem} node: hands the node to
 * {@code visitChildren}.
 *
 * @param toci the table-of-contents item being visited
 */
@Override
public void visit(TableOfContentsItem toci) {
visitChildren(toci);
}
protected void visitChildren(SemanticNode semanticNode) {

View File

@ -10,6 +10,10 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Se
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableOfContents;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableOfContentsItem;
import software.amazon.awssdk.utils.builder.ToCopyableBuilder;
public interface NodeVisitor {
@ -42,4 +46,10 @@ public interface NodeVisitor {
void visit(TableCell tableCell);
/**
 * Visits a {@link TableOfContents} node.
 *
 * @param tableOfContents the node to visit
 */
void visit(TableOfContents tableOfContents);
/**
 * Visits a {@link TableOfContentsItem} node.
 *
 * @param tableOfContentsItem the node to visit
 */
void visit(TableOfContentsItem tableOfContentsItem);
}

View File

@ -0,0 +1,41 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
/**
 * Semantic node of type {@link NodeTypeProto.NodeType#TABLE_OF_CONTENTS}.
 */
@Data
@SuperBuilder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(callSuper = true)
public class TableOfContents extends AbstractSemanticNode {

    /**
     * Double-dispatch entry point: forwards this node to the matching {@code visit} overload.
     *
     * @param visitor the visitor to call back
     */
    @Override
    public void accept(NodeVisitor visitor) {

        visitor.visit(this);
    }


    /**
     * @return always {@link NodeTypeProto.NodeType#TABLE_OF_CONTENTS}
     */
    @Override
    public NodeTypeProto.NodeType getType() {

        return NodeTypeProto.NodeType.TABLE_OF_CONTENTS;
    }


    /**
     * Resolves the headline belonging to this table of contents.
     *
     * @return the first child of type HEADLINE if one exists, otherwise the headline reported by the parent node
     */
    public Headline getHeadline() {

        return streamChildrenOfType(NodeTypeProto.NodeType.HEADLINE)
                .map(Headline.class::cast)
                .findFirst()
                // no headline among the children: fall back to whatever the parent considers its headline
                .orElseGet(() -> getParent().getHeadline());
    }

}

View File

@ -0,0 +1,51 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
/**
 * Leaf semantic node of type {@link NodeTypeProto.NodeType#TABLE_OF_CONTENTS_ITEM}.
 * Its text is held directly in {@code leafTextBlock}.
 */
@Data
@SuperBuilder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(callSuper = true)
public class TableOfContentsItem extends AbstractSemanticNode {

    // text content of this item; returned as-is by getTextBlock()
    TextBlock leafTextBlock;


    /**
     * Double-dispatch entry point: forwards this node to the matching {@code visit} overload.
     *
     * @param visitor the visitor to call back
     */
    @Override
    public void accept(NodeVisitor visitor) {

        visitor.visit(this);
    }


    /**
     * @return always {@link NodeTypeProto.NodeType#TABLE_OF_CONTENTS_ITEM}
     */
    @Override
    public NodeTypeProto.NodeType getType() {

        return NodeTypeProto.NodeType.TABLE_OF_CONTENTS_ITEM;
    }


    /**
     * @return always {@code true}; this node type reports itself as a leaf
     */
    @Override
    public boolean isLeaf() {

        return true;
    }


    /**
     * @return the text block stored on this item
     */
    @Override
    public TextBlock getTextBlock() {

        return leafTextBlock;
    }

}

View File

@ -1,5 +1,6 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.TABLE_OF_CONTENTS_HEADLINE;
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.getHeadlineNumber;
import java.util.ArrayList;
@ -21,20 +22,20 @@ import lombok.extern.slf4j.Slf4j;
public class OutlineValidationService {
@Observed(name = "OutlineValidationService", contextualName = "create-toc")
public TableOfContents createToC(ClassificationDocument classificationDocument) {
public SectionTree createSectionTree(ClassificationDocument classificationDocument) {
List<TextPageBlock> headlines = extractHeadlines(classificationDocument);
List<TableOfContentItem> mainSections = new ArrayList<>();
Map<Integer, TableOfContentItem> lastItemsPerDepth = new HashMap<>();
TableOfContentItem last = null;
List<SectionTreeEntry> mainSections = new ArrayList<>();
Map<Integer, SectionTreeEntry> lastItemsPerDepth = new HashMap<>();
SectionTreeEntry last = null;
TreeSet<Integer> depths = new TreeSet<>();
for (TextPageBlock current : headlines) {
int currentDepth = getHeadlineNumber(current.getClassification());
Integer parentDepth = depths.floor(currentDepth - 1);
var tocItem = new TableOfContentItem(current);
var tocItem = new SectionTreeEntry(current);
if (parentDepth == null) {
mainSections.add(tocItem);
@ -44,14 +45,16 @@ public class OutlineValidationService {
} else {
assert last != null;
int lastDepth = getHeadlineNumber(last.getHeadline().getClassification());
if (lastDepth < parentDepth) {
if (last.getHeadline().getClassification().equals(TABLE_OF_CONTENTS_HEADLINE) && !current.getClassification().equals(TABLE_OF_CONTENTS_HEADLINE)) {
// headline after toc should always start a main section
parentDepth = 1;
} else if (lastDepth < parentDepth) {
parentDepth = lastDepth;
} else if (lastDepth == currentDepth && last.getParent() != null) {
parentDepth = getHeadlineNumber(last.getParent().getHeadline().getClassification());
}
TableOfContentItem parent = lastItemsPerDepth.get(parentDepth);
SectionTreeEntry parent = lastItemsPerDepth.get(parentDepth);
parent.addChild(tocItem);
}
@ -60,7 +63,10 @@ public class OutlineValidationService {
depths.add(currentDepth);
}
return new TableOfContents(mainSections);
return new
SectionTree(mainSections);
}

View File

@ -14,12 +14,12 @@ import lombok.RequiredArgsConstructor;
@Data
@RequiredArgsConstructor
public class TableOfContents implements Iterable<TableOfContentItem> {
public class SectionTree implements Iterable<SectionTreeEntry> {
private List<TableOfContentItem> mainSections = new ArrayList<>();
private List<SectionTreeEntry> mainSections = new ArrayList<>();
public TableOfContents(List<TableOfContentItem> mainSections) {
public SectionTree(List<SectionTreeEntry> mainSections) {
this.mainSections = mainSections;
}
@ -28,36 +28,36 @@ public class TableOfContents implements Iterable<TableOfContentItem> {
public List<TextPageBlock> getAllTextPageBlocks() {
List<TextPageBlock> allTextPageBlocks = new ArrayList<>();
for (TableOfContentItem item : mainSections) {
for (SectionTreeEntry item : mainSections) {
collectTextPageBlocks(item, allTextPageBlocks);
}
return allTextPageBlocks;
}
private void collectTextPageBlocks(TableOfContentItem item, List<TextPageBlock> textPageBlocks) {
private void collectTextPageBlocks(SectionTreeEntry item, List<TextPageBlock> textPageBlocks) {
textPageBlocks.add(item.getHeadline());
for (TableOfContentItem child : item.getChildren()) {
for (SectionTreeEntry child : item.getChildren()) {
collectTextPageBlocks(child, textPageBlocks);
}
}
public List<TableOfContentItem> getAllTableOfContentItems() {
public List<SectionTreeEntry> getAllTableOfContentItems() {
List<TableOfContentItem> allItems = new ArrayList<>();
for (TableOfContentItem item : mainSections) {
List<SectionTreeEntry> allItems = new ArrayList<>();
for (SectionTreeEntry item : mainSections) {
collectTableOfContentItems(item, allItems);
}
return allItems;
}
private void collectTableOfContentItems(TableOfContentItem item, List<TableOfContentItem> allItems) {
private void collectTableOfContentItems(SectionTreeEntry item, List<SectionTreeEntry> allItems) {
allItems.add(item);
for (TableOfContentItem child : item.getChildren()) {
for (SectionTreeEntry child : item.getChildren()) {
collectTableOfContentItems(child, allItems);
}
}
@ -65,7 +65,7 @@ public class TableOfContents implements Iterable<TableOfContentItem> {
private boolean containsBlock(TextPageBlock block) {
for (TableOfContentItem existingItem : this.getMainSections()) {
for (SectionTreeEntry existingItem : this.getMainSections()) {
if (existingItem.getHeadline().equals(block) || existingItem.contains(block)) {
return true;
}
@ -74,9 +74,9 @@ public class TableOfContents implements Iterable<TableOfContentItem> {
}
private boolean containsItem(TableOfContentItem tocItem) {
private boolean containsItem(SectionTreeEntry tocItem) {
for (TableOfContentItem existingItem : this.getMainSections()) {
for (SectionTreeEntry existingItem : this.getMainSections()) {
if (existingItem.equals(tocItem) || existingItem.contains(tocItem)) {
return true;
}
@ -86,18 +86,18 @@ public class TableOfContents implements Iterable<TableOfContentItem> {
@Override
public @NonNull Iterator<TableOfContentItem> iterator() {
public @NonNull Iterator<SectionTreeEntry> iterator() {
return new TableOfContentItemIterator(mainSections);
return new SectionTreeEntryIterator(mainSections);
}
private static class TableOfContentItemIterator implements Iterator<TableOfContentItem> {
private static class SectionTreeEntryIterator implements Iterator<SectionTreeEntry> {
private final Stack<Iterator<TableOfContentItem>> stack = new Stack<>();
private final Stack<Iterator<SectionTreeEntry>> stack = new Stack<>();
TableOfContentItemIterator(List<TableOfContentItem> mainSections) {
SectionTreeEntryIterator(List<SectionTreeEntry> mainSections) {
stack.push(mainSections.iterator());
}
@ -112,10 +112,10 @@ public class TableOfContents implements Iterable<TableOfContentItem> {
@Override
public TableOfContentItem next() {
public SectionTreeEntry next() {
ensureStackTopIsCurrent();
TableOfContentItem currentItem = stack.peek().next();
SectionTreeEntry currentItem = stack.peek().next();
if (currentItem.getChildren() != null && !currentItem.getChildren().isEmpty()) {
stack.push(currentItem.getChildren()
.iterator());

View File

@ -23,28 +23,28 @@ import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
public class TOCEnrichmentService {
public class SectionTreeBuilderService {
public void assignSectionBlocksAndImages(ClassificationDocument document) {
TableOfContents toc = document.getTableOfContents();
Iterator<TableOfContentItem> iterator = toc.iterator();
TableOfContentItem currentTOCItem = null;
SectionTree toc = document.getSectionTree();
Iterator<SectionTreeEntry> iterator = toc.iterator();
SectionTreeEntry currentTOCItem = null;
if (iterator.hasNext()) {
currentTOCItem = iterator.next();
}
List<AbstractPageBlock> startBlocks = new ArrayList<>();
List<ClassifiedImage> startImages = new ArrayList<>();
TableOfContentItem currentSection = null;
SectionTreeEntry currentSection = null;
boolean foundFirstHeadline = false;
List<ClassificationHeader> headers = new ArrayList<>();
List<ClassificationFooter> footers = new ArrayList<>();
TablePageBlock previousTable = null;
List<TableOfContentItem> lastFoundTOCItems = new ArrayList<>();
List<SectionTreeEntry> lastFoundTOCItems = new ArrayList<>();
for (ClassificationPage page : document.getPages()) {
List<TableOfContentItem> currentPageTOCItems = new ArrayList<>();
List<SectionTreeEntry> currentPageTOCItems = new ArrayList<>();
List<TextPageBlock> header = new ArrayList<>();
List<TextPageBlock> footer = new ArrayList<>();
for (AbstractPageBlock current : page.getTextBlocks()) {
@ -101,7 +101,7 @@ public class TOCEnrichmentService {
Double xMax = null;
Double yMax = null;
for (TableOfContentItem tocItem : lastFoundTOCItems) {
for (SectionTreeEntry tocItem : lastFoundTOCItems) {
var headline = tocItem.getHeadline();
if (headline.getPage() != page.getPageNumber()) {
@ -169,10 +169,10 @@ public class TOCEnrichmentService {
}
if (!startBlocks.isEmpty() || !startImages.isEmpty()) {
TableOfContentItem unassigned = new TableOfContentItem(null);
SectionTreeEntry unassigned = new SectionTreeEntry(null);
unassigned.setSectionBlocks(startBlocks);
unassigned.setImages(startImages);
document.getTableOfContents().getMainSections().add(0, unassigned);
document.getSectionTree().getMainSections().add(0, unassigned);
}
document.setHeaders(headers);
document.setFooters(footers);

View File

@ -5,6 +5,7 @@ import java.util.List;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
@ -14,12 +15,18 @@ import lombok.EqualsAndHashCode;
@Data
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class TableOfContentItem {
public class SectionTreeEntry {
public enum Type {
SECTION,
SUPER_SECTION,
TOC_SECTION
}
@EqualsAndHashCode.Include
private TextPageBlock headline;
private List<TableOfContentItem> children = new ArrayList<>();
private TableOfContentItem parent;
private List<SectionTreeEntry> children = new ArrayList<>();
private SectionTreeEntry parent;
private List<AbstractPageBlock> sectionBlocks = new ArrayList<>();
private List<ClassifiedImage> images = new ArrayList<>();
@ -27,20 +34,32 @@ public class TableOfContentItem {
private GenericSemanticNode section;
public TableOfContentItem(TextPageBlock headline) {
public SectionTreeEntry(TextPageBlock headline) {
this.headline = headline;
}
public void addChild(TableOfContentItem tableOfContentItem) {
public Type getType() {
children.add(tableOfContentItem);
tableOfContentItem.setParent(this);
if (headline.getClassification().equals(PageBlockType.TABLE_OF_CONTENTS_HEADLINE)) {
return Type.TOC_SECTION;
}
if (children.isEmpty()) {
return Type.SECTION;
}
return Type.SUPER_SECTION;
}
public TableOfContentItem getSiblingBefore() {
public void addChild(SectionTreeEntry sectionTreeEntry) {
children.add(sectionTreeEntry);
sectionTreeEntry.setParent(this);
}
public SectionTreeEntry getSiblingBefore() {
if (parent != null) {
int index = parent.getChildren().indexOf(this);
@ -52,7 +71,7 @@ public class TableOfContentItem {
}
public TableOfContentItem getSiblingAfter() {
public SectionTreeEntry getSiblingAfter() {
if (parent != null) {
int index = parent.getChildren().indexOf(this);
@ -69,7 +88,7 @@ public class TableOfContentItem {
if (headline.equals(block)) {
return true;
}
for (TableOfContentItem child : children) {
for (SectionTreeEntry child : children) {
if (child.contains(block)) {
return true;
}
@ -78,12 +97,12 @@ public class TableOfContentItem {
}
public boolean contains(TableOfContentItem tocItem) {
public boolean contains(SectionTreeEntry tocItem) {
if (this.equals(tocItem)) {
return true;
}
for (TableOfContentItem child : children) {
for (SectionTreeEntry child : children) {
if (child.contains(tocItem)) {
return true;
}

View File

@ -1,34 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import java.util.Comparator;
import java.util.HashMap;
/**
 * Orders {@link Word}s by the page number of the block they are registered under in the lookup,
 * then by their Y value, and finally by the integer parsed from their text.
 */
public class TextPositionSequenceComparator implements Comparator<Word> {

    private HashMap<Word, TextBlockOnPage> lookup;


    /**
     * @param lookup maps each word to the block/page it was found on; every compared word must be present
     */
    public TextPositionSequenceComparator(HashMap<Word, TextBlockOnPage> lookup) {

        this.lookup = lookup;
    }


    /**
     * Compares by page number, then Y, then numeric value of the word text.
     *
     * @throws NumberFormatException if both words share page and Y and either text is not a parsable integer
     */
    @Override
    public int compare(Word number1, Word number2) {

        int firstPage = lookup.get(number1).page().getPageNumber();
        int secondPage = lookup.get(number2).page().getPageNumber();
        if (firstPage != secondPage) {
            return Integer.compare(firstPage, secondPage);
        }

        boolean sameY = number1.getY() == number2.getY();
        if (sameY) {
            // identical page and Y: the numeric value decides
            return Integer.compare(Integer.parseInt(number1.toString()), Integer.parseInt(number2.toString()));
        }
        return Double.compare(number1.getY(), number2.getY());
    }

}

View File

@ -0,0 +1,36 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import java.util.Comparator;
import java.util.HashMap;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.NumberWord;
/**
 * Orders {@link NumberWord}s by the page number of the block they are registered under in the lookup,
 * then by the Y value of the underlying word, and finally by their numeric value.
 */
public class TocNumberComparator implements Comparator<NumberWord> {

    private HashMap<NumberWord, TextBlockOnPage> lookup;


    /**
     * @param lookup maps each number to the block/page it was found on; every compared number must be present
     */
    public TocNumberComparator(HashMap<NumberWord, TextBlockOnPage> lookup) {

        this.lookup = lookup;
    }


    /**
     * Compares by page number, then Y of the word, then {@link NumberWord#number()}.
     */
    @Override
    public int compare(NumberWord number1, NumberWord number2) {

        int firstPage = lookup.get(number1).page().getPageNumber();
        int secondPage = lookup.get(number2).page().getPageNumber();
        if (firstPage != secondPage) {
            return Integer.compare(firstPage, secondPage);
        }

        boolean sameY = number1.word().getY() == number2.word().getY();
        if (sameY) {
            // identical page and Y: the numeric value decides
            return Integer.compare(number1.number(), number2.number());
        }
        return Double.compare(number1.word().getY(), number2.word().getY());
    }

}

View File

@ -0,0 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
/**
 * Pairs a {@link Word} with an integer value associated with it.
 *
 * @param word   the word the number belongs to
 * @param number the integer value carried alongside the word
 */
public record NumberWord(Word word, int number) {
}

View File

@ -14,6 +14,7 @@ import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
@ -23,10 +24,11 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentifier;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextBlockOnPage;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequenceComparator;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TocNumberComparator;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
@ -59,7 +61,7 @@ public class TableOfContentsClassificationService {
if (end > i + 1) {
if (textBlock.textBlock().getClassification() == null) {
textBlock.textBlock().setClassification(PageBlockType.H1);
textBlock.textBlock().setClassification(PageBlockType.TABLE_OF_CONTENTS_HEADLINE);
}
i = end;
}
@ -71,9 +73,9 @@ public class TableOfContentsClassificationService {
ClassificationPage startPage = textBlocks.get(start).page();
List<TextBlockOnPage> initialLookAhead = textBlocks.subList(start, Math.min(start + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()));
HashMap<Word, TextBlockOnPage> lookup = new HashMap<>();
List<Word> numbers = extractNumbers(initialLookAhead, lookup, document.getPages().size());
TocNumberFinder tocNumberFinder = new TocNumberFinder(numbers, lookup);
HashMap<NumberWord, TextBlockOnPage> numberToBlockLookup = new HashMap<>();
List<NumberWord> numbers = extractNumbers(initialLookAhead, numberToBlockLookup, document.getPages().size());
TocNumberFinder tocNumberFinder = new TocNumberFinder(numbers, numberToBlockLookup);
int lastCandidate = start;
for (int i = start; i < Math.min(lastCandidate + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()); i++) {
@ -93,28 +95,28 @@ public class TableOfContentsClassificationService {
break;
}
List<Word> numbersFromBlock = extractNumbers(textBlockOnPage, lookup, document.getPages().size());
List<NumberWord> numbersFromBlock = extractNumbers(textBlockOnPage, numberToBlockLookup, document.getPages().size());
List<Word> currentRightmostCluster = tocNumberFinder.getCurrentRightmostCluster();
List<NumberWord> currentRightmostCluster = tocNumberFinder.getCurrentRightmostCluster();
if (currentRightmostCluster.size() < MINIMUM_MATCHES) {
log.debug("No numbers indicating a table of contents here.");
return start;
}
if (anyIntersection(currentRightmostCluster, numbersFromBlock, lookup)) {
if (anyIntersection(currentRightmostCluster, numbersFromBlock, numberToBlockLookup)) {
lastCandidate = i;
numbersFromBlock.forEach(tocNumberFinder::add);
}
}
addVisualization(document.getLayoutDebugLayer(), tocNumberFinder, lookup);
Set<TextBlockOnPage> blocksWithNumberInCluster = tocNumberFinder.getCurrentRightmostCluster()
.stream()
.map(lookup::get)
.map(numberToBlockLookup::get)
.collect(Collectors.toSet());
addVisualization(document.getLayoutDebugLayer(), tocNumberFinder, numberToBlockLookup, blocksWithNumberInCluster, textBlocks.get(start - 1));
int lastConfirmed = start;
for (int i = start; i < lastCandidate + 1; i++) {
TextBlockOnPage textBlockOnPage = textBlocks.get(i);
@ -132,18 +134,22 @@ public class TableOfContentsClassificationService {
}
private static void addVisualization(LayoutDebugLayer layoutDebugLayer, TocNumberFinder tocNumberFinder, Map<Word, TextBlockOnPage> lookup) {
private static void addVisualization(LayoutDebugLayer layoutDebugLayer,
TocNumberFinder tocNumberFinder,
Map<NumberWord, TextBlockOnPage> lookup,
Set<TextBlockOnPage> blocksWithNumberInCluster,
TextBlockOnPage startingHeadline) {
tocNumberFinder.getCurrentRightmostCluster()
.stream()
.collect(Collectors.groupingBy(key -> lookup.get(key).page().getPageNumber()))
.forEach((pageNumber, number) -> layoutDebugLayer.addTocPages(number, pageNumber));
layoutDebugLayer.addTocBlocks(blocksWithNumberInCluster);
layoutDebugLayer.addTocBlocks(Set.of(startingHeadline));
}
private static boolean anyIntersection(Collection<Word> numbers1,
Collection<Word> numbers2,
Map<Word, TextBlockOnPage> lookup) {
private static boolean anyIntersection(Collection<NumberWord> numbers1, Collection<NumberWord> numbers2, Map<NumberWord, TextBlockOnPage> lookup) {
return numbers1.stream()
.anyMatch(numberFromCluster -> numbers2.stream()
@ -151,9 +157,9 @@ public class TableOfContentsClassificationService {
}
private static List<Word> extractNumbers(List<TextBlockOnPage> textBlocks, Map<Word, TextBlockOnPage> lookup, int numberOfPages) {
private static List<NumberWord> extractNumbers(List<TextBlockOnPage> textBlocks, Map<NumberWord, TextBlockOnPage> lookup, int numberOfPages) {
List<Word> blocks = new LinkedList<>();
List<NumberWord> blocks = new LinkedList<>();
for (TextBlockOnPage textBlock : textBlocks) {
blocks.addAll(extractNumbers(textBlock, lookup, numberOfPages));
}
@ -161,30 +167,40 @@ public class TableOfContentsClassificationService {
}
private static List<Word> extractNumbers(TextBlockOnPage textBlock, Map<Word, TextBlockOnPage> lookup, int numberOfPages) {
private static List<NumberWord> extractNumbers(TextBlockOnPage textBlock, Map<NumberWord, TextBlockOnPage> lookup, int numberOfPages) {
List<Word> blocks = new LinkedList<>();
List<NumberWord> blocks = new LinkedList<>();
TextPageBlock block = textBlock.textBlock();
List<Word> sequences = block.getWords();
for (int i = 0; i < sequences.size(); i++) {
List<Word> words = block.getWords();
for (int i = 0; i < words.size(); i++) {
Word word = sequences.get(i);
if (!NUMERIC.matcher(word).matches() || word.length() > 5) {
Word word = words.get(i);
if (!wordIsEndOfLine(i, words)) {
continue;
}
if (AMOUNT_PATTERN.matcher(getSurroundingString(i, sequences)).matches()) {
if (AMOUNT_PATTERN.matcher(getSurroundingString(i, words)).matches()) {
continue;
}
Matcher matcher = SectionIdentifier.numericalIdentifierPattern.matcher(word.toString());
if (matcher.find() && matcher.group(2) != null) {
continue;
}
Matcher numberFinder = NUMERIC.matcher(word);
if (!numberFinder.find() || word.length() > 5) {
continue;
}
try {
int pageNumber = Integer.parseInt(word.toString());
int pageNumber = Integer.parseInt(numberFinder.group());
if (0 >= pageNumber || pageNumber > numberOfPages) {
continue;
}
lookup.put(word, textBlock);
blocks.add(word);
NumberWord numberWord = new NumberWord(word, pageNumber);
lookup.put(numberWord, textBlock);
blocks.add(numberWord);
} catch (NumberFormatException e) {
log.debug("That wasn't a number! Should not happen, due to numeric check beforehand.");
}
@ -193,6 +209,17 @@ public class TableOfContentsClassificationService {
}
/**
 * Decides whether the word at index {@code i} is treated as the last word on its line.
 *
 * @param i     index of the word to test
 * @param words all words of the block, in order
 * @return {@code true} if the word is the final element of {@code words}, or if the following word
 *         is not reported as being to the right of it
 */
private static boolean wordIsEndOfLine(int i, List<Word> words) {

    boolean isLastWord = i == words.size() - 1;
    if (!isLastWord) {
        Word current = words.get(i);
        Word following = words.get(i + 1);
        // a successor that is not to the right is taken to start a new line
        return !following.rightOf(current);
    }
    return true;
}
private static CharSequence getSurroundingString(int i, List<Word> sequences) {
int end = Math.min(i + 5, sequences.size());
@ -203,13 +230,13 @@ public class TableOfContentsClassificationService {
}
private static boolean matches(Word number1, Word number2, Map<Word, TextBlockOnPage> lookup) {
private static boolean matches(NumberWord number1, NumberWord number2, Map<NumberWord, TextBlockOnPage> lookup) {
if (number1.getDir() != number2.getDir()) {
if (number1.word().getDir() != number2.word().getDir()) {
return false;
}
return number1.intersectsXDirAdj(number2, INTERSECTION_TOLERANCE);
return number1.word().intersectsXDirAdj(number2.word(), INTERSECTION_TOLERANCE);
}
@ -247,11 +274,11 @@ public class TableOfContentsClassificationService {
private static class TocNumberFinder {
final UnionFind<Word> numberClusters;
final HashMap<Word, TextBlockOnPage> lookup;
final UnionFind<NumberWord> numberClusters;
final HashMap<NumberWord, TextBlockOnPage> lookup;
TocNumberFinder(List<Word> blocks, HashMap<Word, TextBlockOnPage> lookup) {
TocNumberFinder(List<NumberWord> blocks, HashMap<NumberWord, TextBlockOnPage> lookup) {
this.numberClusters = new UnionFind<>(new HashSet<>(blocks));
for (int i = 0; i < blocks.size(); i++) {
@ -265,14 +292,14 @@ public class TableOfContentsClassificationService {
}
public void add(Word number) {
public void add(NumberWord number) {
if (numberClusters.getElements().contains(number)) {
return;
}
numberClusters.addElement(number);
for (Word element : numberClusters.getElements()) {
for (NumberWord element : numberClusters.getElements()) {
if (matches(number, element, lookup)) {
numberClusters.union(element, number);
}
@ -280,73 +307,100 @@ public class TableOfContentsClassificationService {
}
public List<Word> getCurrentRightmostCluster() {
public List<NumberWord> getCurrentRightmostCluster() {
return numberClusters.getGroups()
.stream()
.filter(cluster -> cluster.size() > MINIMUM_MATCHES)
.map(cluster -> cluster.stream()
.sorted(new TextPositionSequenceComparator(lookup))
.sorted(new TocNumberComparator(lookup))
.toList())
.map(this::removeOutliers)
// .map(this::filterByMinimumDensity)
.map(this::removeOnNonConsecutivePages)
.map(this::filterByWordNearTopOfPage)
.filter(cluster -> cluster.size() > MINIMUM_MATCHES)
.max(Comparator.comparingDouble(cluster -> cluster.get(0).getBBox().getMaxX())).orElse(Collections.emptyList());
.max(Comparator.comparingDouble(cluster -> cluster.get(0).word().getBBox().getMaxX())).orElse(Collections.emptyList());
}
// does not seem to be doing much, ideally instead of using the height of the blocks i would like to use the height, beginning from the MainBody top,
// but as the MainBody is often wrong, this results in some numbers being discarded even though they are correct.
// private List<TextPositionSequence> filterByMinimumDensity(List<TextPositionSequence> numbers) {
//
// Map<ClassificationPage, List<TextPositionSequence>> clustersPerPage = numbers.stream()
// .collect(Collectors.groupingBy(number -> lookup.get(number).page()));
//
// List<TextPositionSequence> result = new ArrayList<>(numbers.size());
// clustersPerPage.keySet()
// .stream()
// .sorted(Comparator.comparingInt(ClassificationPage::getPageNumber))
// .forEach(page -> {
// var numbersOnPage = clustersPerPage.get(page);
//
// double height = numbersOnPage.stream()
// .map(BoundingBox::getBBox)
// .collect(RectangleTransformations.collectBBox()).getHeight();
//
// double count = numbersOnPage.size();
//
// if ((count / height) >= (DENSITY_THRESHOLD_COUNT / page.getPageHeight())) {
// result.addAll(numbers);
// }
// });
// return result;
// }
private List<NumberWord> removeOnNonConsecutivePages(List<NumberWord> numbers) {
public List<Word> removeOutliers(List<Word> numbers) {
List<Word> result = new ArrayList<>();
List<NumberWord> result = new ArrayList<>();
result.add(numbers.get(0));
for (int i = 1; i < numbers.size(); i++) {
int prev = getPageNumber(numbers, i - 1);
int curr = getPageNumber(numbers, i);
if (Math.abs(prev - curr) > 1) {
break;
} else {
result.add(numbers.get(i));
}
}
return result;
}
/**
 * Looks up the page number of the block that the {@code i}-th number is registered under.
 *
 * @param numbers list of numbers to index into
 * @param i       index of the number of interest
 * @return page number of the page holding that number's block
 */
private int getPageNumber(List<NumberWord> numbers, int i) {

    NumberWord number = numbers.get(i);
    TextBlockOnPage blockOnPage = lookup.get(number);
    return blockOnPage.page().getPageNumber();
}
/**
 * Filters a cluster at page transitions. The first number is always kept. Every later number is kept
 * if it lies on the same page object as its predecessor in {@code numbers}; if it lies on a different
 * page, it is kept only when the minimum Y of its bounding box is below 33% of that page's height.
 *
 * @param numbers the cluster to filter; must contain at least one element
 * @return a new list containing the numbers that passed the check, in their original order
 */
private List<NumberWord> filterByWordNearTopOfPage(List<NumberWord> numbers) {

    List<NumberWord> kept = new ArrayList<>();
    NumberWord previous = numbers.get(0);
    kept.add(previous);

    for (NumberWord current : numbers.subList(1, numbers.size())) {
        ClassificationPage previousPage = lookup.get(previous).page();
        ClassificationPage currentPage = lookup.get(current).page();

        // identity comparison on purpose: both are expected to be the same page instance
        boolean samePage = previousPage == currentPage;
        if (samePage || current.word().getBBox().getMinY() < currentPage.getPageHeight() * 0.33) {
            kept.add(current);
        }
        // the predecessor is always the previous input element, whether or not it was kept
        previous = current;
    }
    return kept;
}
public List<NumberWord> removeOutliers(List<NumberWord> numbers) {
List<NumberWord> confirmedClusterNumbers = new ArrayList<>();
confirmedClusterNumbers.add(numbers.get(0));
for (int i = 1; i < numbers.size() - 1; i++) {
int prev = getNumberAsInt(numbers, i - 1);
int curr = getNumberAsInt(numbers, i);
int next = getNumberAsInt(numbers, i + 1);
if (!(curr <= prev || curr >= next) || !isBetterWithout(numbers, i)) {
result.add(numbers.get(i));
confirmedClusterNumbers.add(numbers.get(i));
}
}
if (getNumberAsInt(numbers, numbers.size() - 1) >= getNumberAsInt(numbers, Math.max(0, numbers.size() - 2))) {
result.add(numbers.get(numbers.size() - 1));
if (getNumberAsInt(numbers, numbers.size() - 1) >= getLatestNumber(confirmedClusterNumbers)) {
confirmedClusterNumbers.add(numbers.get(numbers.size() - 1));
}
return result;
return confirmedClusterNumbers;
}
/**
 * Reads the numeric value of the most recently confirmed number.
 *
 * @param confirmedClusterNumbers non-empty list of confirmed numbers
 * @return {@link NumberWord#number()} of the last list element
 */
private static int getLatestNumber(List<NumberWord> confirmedClusterNumbers) {

    int lastIndex = confirmedClusterNumbers.size() - 1;
    NumberWord latest = confirmedClusterNumbers.get(lastIndex);
    return latest.number();
}
// Helper method to check if removing the current number results in a better order
public static boolean isBetterWithout(List<Word> numbers, int i) {
public static boolean isBetterWithout(List<NumberWord> numbers, int i) {
if (i == 0 || i == numbers.size() - 1) {
return false;
@ -362,9 +416,9 @@ public class TableOfContentsClassificationService {
}
private static int getNumberAsInt(List<Word> numbers, int i) {
private static int getNumberAsInt(List<NumberWord> numbers, int i) {
return Integer.parseInt(numbers.get(i).toString());
return numbers.get(i).number();
}
}

View File

@ -16,13 +16,14 @@ import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.AbstractSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
@ -35,10 +36,11 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Im
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableOfContentsItem;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContentItem;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeEntry;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
@ -65,7 +67,7 @@ public class DocumentGraphFactory {
document.getPages()
.forEach(context::buildAndAddPageWithCounter);
addSectionsForToC(layoutParsingType, document, context, documentGraph);
addSections(layoutParsingType, document, context, documentGraph);
addHeaderAndFooterToEachPage(document, context);
documentGraph.setNumberOfPages(context.pages.size());
@ -82,9 +84,9 @@ public class DocumentGraphFactory {
documentGraph.streamAllSubNodes()
.filter(SemanticNode::isLeaf)
.filter(node -> !node.getType().equals(NodeType.HEADER))
.filter(node -> !node.getType().equals(NodeType.FOOTER))
.filter(node -> !node.getType().equals(NodeType.IMAGE))
.filter(node -> !node.getType().equals(NodeTypeProto.NodeType.HEADER))
.filter(node -> !node.getType().equals(NodeTypeProto.NodeType.FOOTER))
.filter(node -> !node.getType().equals(NodeTypeProto.NodeType.IMAGE))
.map(SemanticNode::getTextBlock)
.map(TextBlock::getAtomicTextBlocks)
.flatMap(Collection::stream)
@ -92,18 +94,18 @@ public class DocumentGraphFactory {
}
private void addSectionsForToC(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument, Context context, Document document) {
private void addSections(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument, Context context, Document document) {
for (TableOfContentItem tocItem : classificationDocument.getTableOfContents()) {
GenericSemanticNode parent = tocItem.getParent() == null ? null : tocItem.getParent().getSection();
for (SectionTreeEntry sectionTreeEntry : classificationDocument.getSectionTree()) {
GenericSemanticNode parent = sectionTreeEntry.getParent() == null ? null : sectionTreeEntry.getParent().getSection();
Optional<GenericSemanticNode> section = SectionNodeFactory.addSection(layoutParsingType,
parent,
tocItem.getChildren().isEmpty(),
tocItem.getNonEmptySectionBlocks(),
tocItem.getImages(),
sectionTreeEntry.getType(),
sectionTreeEntry.getNonEmptySectionBlocks(),
sectionTreeEntry.getImages(),
context,
document);
tocItem.setSection(section.orElse(null));
sectionTreeEntry.setSection(section.orElse(null));
}
}
@ -121,6 +123,8 @@ public class DocumentGraphFactory {
node = Headline.builder().documentTree(context.getDocumentTree()).build();
} else if (originalTextBlock.isToDuplicate() && layoutParsingType.equals(LayoutParsingType.REDACT_MANAGER)) {
node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree()).build();
} else if (originalTextBlock.getClassification().equals(PageBlockType.TABLE_OF_CONTENTS_ITEM)) {
node = TableOfContentsItem.builder().documentTree(context.getDocumentTree()).build();
} else {
node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
}

View File

@ -17,7 +17,9 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Do
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableOfContents;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeEntry;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.TableMergingUtility;
@ -29,7 +31,7 @@ public class SectionNodeFactory {
public Optional<GenericSemanticNode> addSection(LayoutParsingType layoutParsingType,
GenericSemanticNode parentNode,
boolean isLeaf,
SectionTreeEntry.Type type,
List<AbstractPageBlock> pageBlocks,
List<ClassifiedImage> images,
DocumentGraphFactory.Context context,
@ -48,12 +50,11 @@ public class SectionNodeFactory {
return Optional.empty();
}
AbstractSemanticNode section;
if (isLeaf) {
section = Section.builder().documentTree(context.getDocumentTree()).build();
} else {
section = SuperSection.builder().documentTree(context.getDocumentTree()).build();
}
AbstractSemanticNode section = switch (type) {
case SECTION -> Section.builder().documentTree(context.getDocumentTree()).build();
case SUPER_SECTION -> SuperSection.builder().documentTree(context.getDocumentTree()).build();
case TOC_SECTION -> TableOfContents.builder().documentTree(context.getDocumentTree()).build();
};
context.getSections().add(section);
@ -64,13 +65,14 @@ public class SectionNodeFactory {
if (containsTablesAndTextBlocks) {
splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(layoutParsingType,
section,
true,
SectionTreeEntry.Type.SECTION,
subSectionPageBlocks,
emptyList(),
context,
document));
} else if (!isLeaf) {
addSection(layoutParsingType, section, true, pageBlocks, emptyList(), context, document);
} else if (type.equals(SectionTreeEntry.Type.SUPER_SECTION)) {
// If a SuperSection contains more blocks than just a headline, we add a Section which contains the remaining textblocks.
addSection(layoutParsingType, section, SectionTreeEntry.Type.SECTION, pageBlocks, emptyList(), context, document);
} else {
addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, pageBlocks, context, section, document);
}

View File

@ -14,6 +14,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Pa
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeEntry;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
@ -120,7 +121,7 @@ public class TableNodeFactory {
} else if (firstTextBlockIsHeadline(cell)) {
SectionNodeFactory.addSection(layoutParsingType,
tableCell,
true,
SectionTreeEntry.Type.SECTION,
cell.getTextBlocks()
.stream()
.map(tb -> (AbstractPageBlock) tb)

View File

@ -12,6 +12,7 @@ import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.EntryDataProto.EntryData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPageProto.AllDocumentPages;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPageProto.DocumentPage;
@ -20,7 +21,6 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.Do
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto.DocumentPositionData.Position;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureProto.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureWrapper;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.EntryDataProto.EntryData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;

View File

@ -1,6 +1,5 @@
package com.knecon.fforesight.service.layoutparser.processor.services.mapper;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
@ -9,9 +8,9 @@ import java.util.NoSuchElementException;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPageProto.DocumentPage;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.EntryDataProto.EntryData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto.AllDocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto.AllDocumentTextData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.EntryDataProto.EntryData;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
@ -26,6 +25,8 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Se
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableOfContents;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableOfContentsItem;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
@ -70,13 +71,15 @@ public class DocumentGraphMapper {
SemanticNode node = switch (entryData.getType()) {
case SECTION -> buildSection(context);
case SUPER_SECTION -> buildSuperSection(context);
case PARAGRAPH -> buildParagraph(context, entryData.getProperties());
case PARAGRAPH -> buildParagraph(context, entryData.getPropertiesMap());
case HEADLINE -> buildHeadline(context);
case HEADER -> buildHeader(context);
case FOOTER -> buildFooter(context);
case TABLE -> buildTable(context, entryData.getProperties());
case TABLE_CELL -> buildTableCell(context, entryData.getProperties());
case IMAGE -> buildImage(context, entryData.getProperties(), entryData.getPageNumbersList());
case TABLE -> buildTable(context, entryData.getPropertiesMap());
case TABLE_CELL -> buildTableCell(context, entryData.getPropertiesMap());
case IMAGE -> buildImage(context, entryData.getPropertiesMap(), entryData.getPageNumbersList());
case TABLE_OF_CONTENTS -> buildTableOfContents(context);
case TABLE_OF_CONTENTS_ITEM -> buildTableOfContentsItem(context);
default -> throw new UnsupportedOperationException("Not yet implemented for type " + entryData.getType());
};
@ -100,6 +103,18 @@ public class DocumentGraphMapper {
}
/**
 * Creates a new {@link TableOfContents} node attached to the document tree of the given context.
 *
 * @param context mapping context whose {@code documentTree} the new node is bound to
 * @return the freshly built node; no other properties are set here
 */
private static SemanticNode buildTableOfContents(Context context) {

    TableOfContents tableOfContents = TableOfContents.builder().documentTree(context.documentTree).build();
    return tableOfContents;
}
/**
 * Creates a new {@link TableOfContentsItem} node attached to the document tree of the given context.
 *
 * @param context mapping context whose {@code documentTree} the new node is bound to
 * @return the freshly built node; no other properties are set here
 */
private static SemanticNode buildTableOfContentsItem(Context context) {

    TableOfContentsItem tableOfContentsItem = TableOfContentsItem.builder().documentTree(context.documentTree).build();
    return tableOfContentsItem;
}
private Headline buildHeadline(Context context) {
return Headline.builder().documentTree(context.documentTree).build();
@ -182,13 +197,11 @@ public class DocumentGraphMapper {
private AtomicTextBlock getAtomicTextBlock(Context context, SemanticNode parent, Long atomicTextBlockId) {
return AtomicTextBlock.fromAtomicTextBlockData(context.documentTextDataBlockData.getDocumentTextDataList()
.get(Math.toIntExact(atomicTextBlockId)),
context.atomicPositionBlockData.getDocumentPositionDataList()
.get(Math.toIntExact(atomicTextBlockId)),
return AtomicTextBlock.fromAtomicTextBlockData(context.documentTextDataBlockData.getDocumentTextDataList().get(Math.toIntExact(atomicTextBlockId)),
context.atomicPositionBlockData.getDocumentPositionDataList().get(Math.toIntExact(atomicTextBlockId)),
parent,
getPage(context.documentTextDataBlockData.getDocumentTextDataList()
.get(Math.toIntExact(atomicTextBlockId)).getPage(), context));
getPage(context.documentTextDataBlockData.getDocumentTextDataList().get(Math.toIntExact(atomicTextBlockId)).getPage(),
context));
}

View File

@ -38,6 +38,7 @@ public class LayoutGridService {
layoutGrid.setVisibleByDefault(layerVisibilityDefaultValue);
document.getLayoutDebugLayer().addSentenceVisualization(document.getTextBlock());
document.getLayoutDebugLayer().addOutlineHeadlines(document);
if (document.getLayoutDebugLayer().isActive()) {
viewerDocumentService.addLayerGroups(originFile, destinationFile, List.of(layoutGrid, document.getLayoutDebugLayer()), outline);
@ -54,12 +55,13 @@ public class LayoutGridService {
.peek(layoutGrid::addTreeId)
.forEach(semanticNode -> {
switch (semanticNode.getType()) {
case SECTION, SUPER_SECTION -> layoutGrid.addSection(semanticNode);
case SECTION, SUPER_SECTION, TABLE_OF_CONTENTS -> layoutGrid.addSection(semanticNode);
case HEADLINE -> layoutGrid.addHeadline((Headline) semanticNode);
case PARAGRAPH -> layoutGrid.addParagraph((Paragraph) semanticNode);
case TABLE -> layoutGrid.addTable((Table) semanticNode);
case IMAGE -> layoutGrid.addImage((Image) semanticNode);
case HEADER, FOOTER -> layoutGrid.addHeaderOrFooter(semanticNode);
case TABLE_OF_CONTENTS_ITEM -> layoutGrid.addTableOfContentsItem(semanticNode);
}
});
return layoutGrid;

View File

@ -111,8 +111,8 @@ public class PdfVisualisationUtility {
return DrawingOptions.builder().stroke(true).strokeColor(switch (entry.getType()) {
case DOCUMENT -> Color.LIGHT_GRAY;
case HEADER, FOOTER -> Color.GREEN;
case PARAGRAPH -> Color.BLUE;
case SUPER_SECTION, SECTION -> Color.BLACK;
case PARAGRAPH, TABLE_OF_CONTENTS_ITEM -> Color.BLUE;
case SUPER_SECTION, SECTION, TABLE_OF_CONTENTS -> Color.BLACK;
case HEADLINE -> Color.RED;
case TABLE -> Color.ORANGE;
case TABLE_CELL -> Color.GRAY;

View File

@ -15,19 +15,25 @@ import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.checkerframework.checker.units.qual.C;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.ListIdentifier;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextBlockOnPage;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.NumberWord;
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
@ -293,7 +299,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
}
public void addTocPages(List<Word> numbers, int page) {
public void addTocPages(List<NumberWord> numbers, int page) {
if (!active) {
return;
@ -302,13 +308,10 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, this.tocPages);
visualizationsOnPage.getColoredRectangles()
.addAll(numbers.stream()
.map(NumberWord::word)
.map(BoundingBox::getBBoxPdf)
.map(line -> new ColoredRectangle(line, LINES_COLOR, LINE_WIDTH))
.toList());
visualizationsOnPage.getColoredRectangles()
.add(new ColoredRectangle(numbers.stream()
.map(BoundingBox::getBBoxPdf)
.collect(RectangleTransformations.collectBBox()), LINES_COLOR, LINE_WIDTH));
}
@ -332,8 +335,10 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
private void addOutlineObject(OutlineObject outlineObject, PageInformation pageInformation) {
if (!active) {
return;
}
int rectSize = 5;
Point2D point2D;
if (outlineObject.getPoint().isPresent()) {
point2D = outlineObject.getPoint().get();
@ -357,10 +362,40 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
/**
 * Adds one rectangle per list identifier to the list-identifier visualizations.
 * <p>
 * Each rectangle uses the PDF bounding box of the identifier's word and is drawn with
 * {@code WORDS_COLOR} and {@code LINE_WIDTH} on the page reported by the identifier.
 * Does nothing when this layer is not active.
 *
 * @param listIdentifiers the identifiers to visualize
 */
public void addListIdentifiers(List<ListIdentifier> listIdentifiers) {

    if (active) {
        listIdentifiers.forEach(identifier -> {
            // Resolve the target page first, then build the rectangle (same order as a chained call).
            var rectanglesOnPage = getOrCreateVisualizationsOnPage(identifier.getPage(), this.listIdentifiers).getColoredRectangles();
            rectanglesOnPage.add(new ColoredRectangle(identifier.getWord().getBBoxPdf(), WORDS_COLOR, LINE_WIDTH));
        });
    }
}
/**
 * Adds one rectangle per given text block to the {@code tocBlocks} visualizations.
 * <p>
 * Each rectangle uses the PDF bounding box of the text block and is drawn with
 * {@code TOC_COLOR} and {@code LINE_WIDTH} on the page number of the block's page.
 * Does nothing when this layer is not active.
 *
 * @param blocksWithNumberInCluster the text blocks (with their pages) to visualize
 */
public void addTocBlocks(Set<TextBlockOnPage> blocksWithNumberInCluster) {

    if (active) {
        blocksWithNumberInCluster.forEach(block -> {
            // Resolve the target page first, then build the rectangle (same order as a chained call).
            var rectanglesOnPage = getOrCreateVisualizationsOnPage(block.page().getPageNumber(), this.tocBlocks).getColoredRectangles();
            rectanglesOnPage.add(new ColoredRectangle(block.textBlock().getBBoxPdf(), TOC_COLOR, LINE_WIDTH));
        });
    }
}
/**
 * Adds a rectangle for every headline of the document that lists the {@code OUTLINE} layout engine.
 * <p>
 * All sub nodes of the document are scanned; nodes of type {@code HEADLINE} whose engines contain
 * {@code OUTLINE} contribute one rectangle per entry of their bounding-box map, drawn with
 * {@code HEADLINE_COLOR} and {@code LINE_WIDTH} into the {@code outlineHeadlines} visualizations.
 * Does nothing when this layer is not active.
 *
 * @param document the document whose headlines are inspected
 */
public void addOutlineHeadlines(Document document) {

    if (active) {
        document.streamAllSubNodes()
                // Short-circuit && keeps the same evaluation order as two consecutive filters.
                .filter(node -> node.getType().equals(NodeTypeProto.NodeType.HEADLINE) //
                        && node.getEngines().contains(LayoutEngineProto.LayoutEngine.OUTLINE))
                .forEach(outlineHeadline -> outlineHeadline.getBBox().forEach((page, bbox) -> {
                    var rectanglesOnPage = getOrCreateVisualizationsOnPage(page.getNumber(), this.outlineHeadlines).getColoredRectangles();
                    rectanglesOnPage.add(new ColoredRectangle(bbox, HEADLINE_COLOR, LINE_WIDTH));
                }));
    }
}
}

View File

@ -26,6 +26,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Se
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableOfContents;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.viewerdoc.layers.LayoutGridLayerConfig;
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
@ -72,10 +73,12 @@ public class LayoutGrid extends LayoutGridLayerConfig {
public void addHeadline(Headline headline) {
addAsRectangle(headline, headlines, HEADLINE_COLOR);
if (headline.getEngines().contains(LayoutEngine.OUTLINE)) {
addAsRectangle(headline, outlineHeadlines, HEADLINE_COLOR);
if (headline.getParent().getType().equals(NodeTypeProto.NodeType.TABLE_OF_CONTENTS)) {
addAsRectangle(headline, toc, HEADLINE_COLOR);
} else {
addAsRectangle(headline, headlines, HEADLINE_COLOR);
}
}
@ -88,19 +91,10 @@ public class LayoutGrid extends LayoutGridLayerConfig {
public void addTreeId(SemanticNode semanticNode) {
Page page = semanticNode.getFirstPage();
if (semanticNode.getBBox()
.get(page) == null) {
if (semanticNode.getBBox().get(page) == null) {
return;
}
addPlacedText(page,
semanticNode.getBBox()
.get(page),
semanticNode.getBBox()
.get(page),
buildTreeIdString(semanticNode),
1,
treeIds,
TREEID_COLOR);
addPlacedText(page, semanticNode.getBBox().get(page), semanticNode.getBBox().get(page), buildTreeIdString(semanticNode), 1, treeIds, TREEID_COLOR);
}
@ -124,20 +118,19 @@ public class LayoutGrid extends LayoutGridLayerConfig {
public void addSection(SemanticNode section) {
Map<Page, Rectangle2D> bBoxMap = section.getBBox();
Color color = section.getType().equals(NodeTypeProto.NodeType.TABLE_OF_CONTENTS) ? TOC_COLOR : SECTION_COLOR;
List<SemanticNode> subSections = section.streamAllSubNodesOfType(NodeTypeProto.NodeType.SECTION)
.toList();
Integer maxChildDepth = subSections.stream()
.map(node -> node.getTreeId().size())
.max(Integer::compareTo)
.orElse(section.getTreeId().size());
.max(Integer::compareTo).orElse(section.getTreeId().size());
int ownDepth = section.getTreeId().size();
Page firstPage = section.getFirstPage();
String treeIdString = buildTreeIdString(section);
if (bBoxMap.values().size() == 1) {
handleSinglePage(section, firstPage, bBoxMap.get(firstPage), treeIdString, maxChildDepth, ownDepth);
handleSinglePage(section, firstPage, bBoxMap.get(firstPage), treeIdString, maxChildDepth, ownDepth, color);
return;
}
List<Page> pagesInOrder = bBoxMap.keySet()
@ -145,12 +138,12 @@ public class LayoutGrid extends LayoutGridLayerConfig {
.sorted(Comparator.comparingInt(Page::getNumber))
.collect(Collectors.toList());
pagesInOrder.remove(0);
handleFirstPageOfSection(section, firstPage, bBoxMap.get(firstPage), treeIdString, maxChildDepth, ownDepth);
handleFirstPageOfSection(section, firstPage, bBoxMap.get(firstPage), treeIdString, maxChildDepth, ownDepth, color);
for (Page middlePage : pagesInOrder.subList(0, pagesInOrder.size() - 1)) {
handleForMiddlePageOfSection(section, middlePage, bBoxMap.get(middlePage), treeIdString, maxChildDepth, ownDepth);
handleForMiddlePageOfSection(section, middlePage, bBoxMap.get(middlePage), treeIdString, maxChildDepth, ownDepth, color);
}
var lastPage = pagesInOrder.remove(pagesInOrder.size() - 1);
handleLastPageOfSection(section, lastPage, bBoxMap.get(lastPage), treeIdString, maxChildDepth, ownDepth);
handleLastPageOfSection(section, lastPage, bBoxMap.get(lastPage), treeIdString, maxChildDepth, ownDepth, color);
}
@ -232,33 +225,45 @@ public class LayoutGrid extends LayoutGridLayerConfig {
}
private void handleSinglePage(SemanticNode semanticNode, Page page, Rectangle2D rectangle2D, String treeIdString, Integer maxChildDepth, Integer ownDepth) {
private void handleSinglePage(SemanticNode semanticNode, Page page, Rectangle2D rectangle2D, String treeIdString, Integer maxChildDepth, Integer ownDepth, Color color) {
RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, page, rectangle2D, treeIdString, maxChildDepth, ownDepth);
// add string to top line
var firstLine = result.pageLines().remove(0);
result.coloredLines().add(new ColoredLine(firstLine, SECTION_COLOR, LINE_WIDTH));
result.coloredLines().add(new ColoredLine(firstLine, color, LINE_WIDTH));
for (Line2D line : result.pageLines()) {
result.coloredLines().add(new ColoredLine(line, SECTION_COLOR, LINE_WIDTH));
result.coloredLines().add(new ColoredLine(line, color, LINE_WIDTH));
}
}
private void handleFirstPageOfSection(SemanticNode semanticNode, Page firstPage, Rectangle2D rectangle2D, String treeIdString, Integer maxChildDepth, Integer ownDepth) {
private void handleFirstPageOfSection(SemanticNode semanticNode,
Page firstPage,
Rectangle2D rectangle2D,
String treeIdString,
Integer maxChildDepth,
Integer ownDepth,
Color color) {
RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, firstPage, rectangle2D, treeIdString, maxChildDepth, ownDepth);
// remove bottom line
result.pageLines().remove(2);
// add string to top line
var firstLine = result.pageLines().remove(0);
result.coloredLines().add(new ColoredLine(firstLine, SECTION_COLOR, LINE_WIDTH));
result.coloredLines().add(new ColoredLine(firstLine, color, LINE_WIDTH));
for (Line2D line : result.pageLines()) {
result.coloredLines().add(new ColoredLine(line, SECTION_COLOR, LINE_WIDTH));
result.coloredLines().add(new ColoredLine(line, color, LINE_WIDTH));
}
}
private void handleForMiddlePageOfSection(SemanticNode semanticNode, Page middlePage, Rectangle2D rectangle2D, String treeIdString, Integer maxChildDepth, Integer ownDepth) {
private void handleForMiddlePageOfSection(SemanticNode semanticNode,
Page middlePage,
Rectangle2D rectangle2D,
String treeIdString,
Integer maxChildDepth,
Integer ownDepth,
Color color) {
RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, middlePage, rectangle2D, treeIdString, maxChildDepth, ownDepth);
// remove top line
@ -267,23 +272,29 @@ public class LayoutGrid extends LayoutGridLayerConfig {
result.pageLines().remove(1);
// add string to left line
var leftLine = result.pageLines().remove(1);
result.coloredLines().add(new ColoredLine(leftLine, SECTION_COLOR, LINE_WIDTH));
result.coloredLines().add(new ColoredLine(leftLine, color, LINE_WIDTH));
for (Line2D line : result.pageLines()) {
result.coloredLines().add(new ColoredLine(line, SECTION_COLOR, LINE_WIDTH));
result.coloredLines().add(new ColoredLine(line, color, LINE_WIDTH));
}
}
private void handleLastPageOfSection(SemanticNode semanticNode, Page lastPage, Rectangle2D rectangle2D, String treeIdString, Integer maxChildDepth, Integer ownDepth) {
private void handleLastPageOfSection(SemanticNode semanticNode,
Page lastPage,
Rectangle2D rectangle2D,
String treeIdString,
Integer maxChildDepth,
Integer ownDepth,
Color color) {
RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, lastPage, rectangle2D, treeIdString, maxChildDepth, ownDepth);
// remove top line
result.pageLines().remove(0);
// add string to left line
var leftLine = result.pageLines().remove(2);
result.coloredLines().add(new ColoredLine(leftLine, SECTION_COLOR, LINE_WIDTH));
result.coloredLines().add(new ColoredLine(leftLine, color, LINE_WIDTH));
for (Line2D line : result.pageLines()) {
result.coloredLines().add(new ColoredLine(line, SECTION_COLOR, LINE_WIDTH));
result.coloredLines().add(new ColoredLine(line, color, LINE_WIDTH));
}
}
@ -295,14 +306,14 @@ public class LayoutGrid extends LayoutGridLayerConfig {
Integer maxChildDepth,
Integer ownDepth) {
List<ColoredLine> coloredLines = getOrCreateVisualizationsOnPage(page.getNumber(), sections).getColoredLines();
Visualizations visualizations = semanticNode.getType().equals(NodeTypeProto.NodeType.TABLE_OF_CONTENTS) ? toc : sections;
List<ColoredLine> coloredLines = getOrCreateVisualizationsOnPage(page.getNumber(), visualizations).getColoredLines();
int lineWidthModifier = maxChildDepth - ownDepth;
Rectangle2D r = RectangleTransformations.pad(semanticNode.getBBox()
.get(page), LINE_WIDTH * (1 + lineWidthModifier), LINE_WIDTH * (1 + lineWidthModifier));
Rectangle2D r = RectangleTransformations.pad(semanticNode.getBBox().get(page), LINE_WIDTH * (1 + lineWidthModifier), LINE_WIDTH * (1 + lineWidthModifier));
SemanticNode highestParent = semanticNode.getHighestParent();
Rectangle2D highestParentRect = rectangleMap.get(new RectangleIdentifier(highestParent.getTreeId(), page.getNumber()));
addPlacedText(page, rectangle2D, highestParentRect, treeIdString, maxChildDepth, sections, SECTION_COLOR);
addPlacedText(page, rectangle2D, highestParentRect, treeIdString, maxChildDepth, visualizations, SECTION_COLOR);
var lastPageLines = createLinesFromRectangle(r, page.getRotation());
if (semanticNode instanceof SuperSection) {
@ -347,8 +358,7 @@ public class LayoutGrid extends LayoutGridLayerConfig {
List<Double> ys = yStream.collect(Collectors.toList());
ys.remove(0);
Rectangle2D tableBBox = table.getBBox()
.get(page);
Rectangle2D tableBBox = table.getBBox().get(page);
List<ColoredLine> coloredLines = getOrCreateVisualizationsOnPage(page.getNumber(), tables).getColoredLines();
xs.forEach(x -> {
@ -384,6 +394,12 @@ public class LayoutGrid extends LayoutGridLayerConfig {
}
/**
 * Adds the given node as a rectangle to the {@code toc} visualizations, drawn in {@code PARAGRAPH_COLOR}.
 *
 * @param semanticNode the node to visualize; the caller passes nodes of type TABLE_OF_CONTENTS_ITEM
 */
public void addTableOfContentsItem(SemanticNode semanticNode) {

    this.addAsRectangle(semanticNode, this.toc, PARAGRAPH_COLOR);
}
private record RectangleAndLinesResult(List<ColoredLine> coloredLines, Rectangle2D rectangle, List<Line2D> pageLines) {
}

View File

@ -45,7 +45,6 @@ dependencies {
// for integration testing only
testImplementation(project(":viewer-doc-processor"))
testImplementation(project(":layoutparser-service-internal-api"))
testImplementation("com.google.protobuf:protobuf-java-util:4.27.1")
testImplementation("org.springframework.boot:spring-boot-starter-amqp:${springBootStarterVersion}")
testImplementation("org.springframework.boot:spring-boot-starter-test:${springBootStarterVersion}")

View File

@ -83,4 +83,11 @@ class SectionIdentifierTest {
assertEquals(List.of(4, 1, 2, 4), identifier.getIdentifiers());
}
/**
 * A search text starting with "111:" must be parsed as a single-level numerical identifier.
 */
@Test
void testFalsePositive111() {

    SectionIdentifier.Format expectedFormat = SectionIdentifier.Format.NUMERICAL;
    int expectedLevel = 1;

    SectionIdentifier parsed = SectionIdentifier.fromSearchText("111: Headline");

    assertEquals(expectedFormat, parsed.getFormat());
    assertEquals(expectedLevel, parsed.level());
}
}

View File

@ -10,7 +10,6 @@ import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.function.Predicate;
import org.apache.commons.lang3.StringUtils;
import org.junit.jupiter.api.BeforeEach;
@ -28,7 +27,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Se
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTree;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
@ -100,10 +99,10 @@ public class OutlineDetectionTest extends AbstractTest {
.flatMap(Collection::stream)
.allMatch(OutlineObject::isFound));
TableOfContents tableOfContents = classificationDocument.getTableOfContents();
SectionTree sectionTree = classificationDocument.getSectionTree();
assertEquals(tableOfContents.getMainSections().size(), 9);
assertEquals(tableOfContents.getMainSections().subList(1, 9)
assertEquals(sectionTree.getMainSections().size(), 9);
assertEquals(sectionTree.getMainSections().subList(1, 9)
.stream()
.map(tableOfContentItem -> sanitizeString(tableOfContentItem.getHeadline().toString()))
.toList(),
@ -121,14 +120,14 @@ public class OutlineDetectionTest extends AbstractTest {
// assertEquals(tableOfContents.getMainSections().get(6).getImages().size(), 1);
// assertEquals(tableOfContents.getMainSections().get(8).getChildren().get(2).getChildren().get(0).getChildren().get(2).getImages().size(), 1);
assertTrue(tableOfContents.getAllTableOfContentItems()
assertTrue(sectionTree.getAllTableOfContentItems()
.stream()
.allMatch(tableOfContentItem -> tableOfContentItem.getSection() != null));
assertTrue(tableOfContents.getAllTableOfContentItems()
assertTrue(sectionTree.getAllTableOfContentItems()
.stream()
.filter(tableOfContentItem -> tableOfContentItem.getChildren().isEmpty())
.allMatch(tableOfContentItem -> tableOfContentItem.getSection() instanceof Section));
assertTrue(tableOfContents.getAllTableOfContentItems()
assertTrue(sectionTree.getAllTableOfContentItems()
.stream()
.filter(tableOfContentItem -> !tableOfContentItem.getChildren().isEmpty())
.allMatch(tableOfContentItem -> tableOfContentItem.getSection() instanceof SuperSection));

View File

@ -150,14 +150,14 @@ public class PdfSegmentationServiceTest extends AbstractTest {
var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class);
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile(), tableServiceResponse);
assertThat(document.getTableOfContents().getAllTableOfContentItems()
assertThat(document.getSectionTree().getAllTableOfContentItems()
.stream()
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()).isNotEmpty();
var tables = document.getTableOfContents().getAllTableOfContentItems()
var tables = document.getSectionTree().getAllTableOfContentItems()
.stream()
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
@ -203,14 +203,14 @@ public class PdfSegmentationServiceTest extends AbstractTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/syngenta/CustomerFiles/SinglePages/Spanning Cells - Page131_S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
assertThat(document.getTableOfContents().getAllTableOfContentItems()
assertThat(document.getSectionTree().getAllTableOfContentItems()
.stream()
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()).isNotEmpty();
TablePageBlock table = document.getTableOfContents().getAllTableOfContentItems()
TablePageBlock table = document.getSectionTree().getAllTableOfContentItems()
.stream()
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
@ -233,14 +233,14 @@ public class PdfSegmentationServiceTest extends AbstractTest {
"files/syngenta/CustomerFiles/SinglePages/Merge Table - Page5_26 A8637C - EU AIR3 - LCP Section 10 - Ecotoxicological studies on the plant protection product - Reference list.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
assertThat(document.getTableOfContents().getAllTableOfContentItems()
assertThat(document.getSectionTree().getAllTableOfContentItems()
.stream()
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()).isNotEmpty();
TablePageBlock firstTable = document.getTableOfContents().getAllTableOfContentItems()
TablePageBlock firstTable = document.getSectionTree().getAllTableOfContentItems()
.stream()
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
@ -250,7 +250,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
.get(0);
assertThat(firstTable.getColCount()).isEqualTo(8);
assertThat(firstTable.getRowCount()).isEqualTo(1);
TablePageBlock secondTable = document.getTableOfContents().getAllTableOfContentItems()
TablePageBlock secondTable = document.getSectionTree().getAllTableOfContentItems()
.stream()
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
@ -280,14 +280,14 @@ public class PdfSegmentationServiceTest extends AbstractTest {
"files/syngenta/CustomerFiles/SinglePages/Merge Multi Page Table - Page4_Page5_51 Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
assertThat(document.getTableOfContents().getAllTableOfContentItems()
assertThat(document.getSectionTree().getAllTableOfContentItems()
.stream()
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()).isNotEmpty();
TablePageBlock firstTable = document.getTableOfContents().getAllTableOfContentItems()
TablePageBlock firstTable = document.getSectionTree().getAllTableOfContentItems()
.stream()
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
@ -297,7 +297,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
.get(0);
assertThat(firstTable.getColCount()).isEqualTo(9);
assertThat(firstTable.getRowCount()).isEqualTo(5);
TablePageBlock secondTable = document.getTableOfContents().getAllTableOfContentItems()
TablePageBlock secondTable = document.getSectionTree().getAllTableOfContentItems()
.stream()
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
@ -327,14 +327,14 @@ public class PdfSegmentationServiceTest extends AbstractTest {
"files/syngenta/CustomerFiles/SinglePages/Rotated Table Headers - Page4_65 Mesotrione - EU AIR3 - LCA Section 1 Supplement Reference List.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
assertThat(document.getTableOfContents().getAllTableOfContentItems()
assertThat(document.getSectionTree().getAllTableOfContentItems()
.stream()
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()).isNotEmpty();
TablePageBlock firstTable = document.getTableOfContents().getAllTableOfContentItems()
TablePageBlock firstTable = document.getSectionTree().getAllTableOfContentItems()
.stream()
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
@ -344,7 +344,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
.get(0);
assertThat(firstTable.getColCount()).isEqualTo(8);
assertThat(firstTable.getRowCount()).isEqualTo(1);
TablePageBlock secondTable = document.getTableOfContents().getAllTableOfContentItems()
TablePageBlock secondTable = document.getSectionTree().getAllTableOfContentItems()
.stream()
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
@ -844,7 +844,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
@SneakyThrows
private void toHtml(ClassificationDocument document, String filename) {
var tables = document.getTableOfContents().getAllTableOfContentItems()
var tables = document.getSectionTree().getAllTableOfContentItems()
.stream()
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
@ -871,7 +871,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) {
TablePageBlock table = document.getTableOfContents().getAllTableOfContentItems()
TablePageBlock table = document.getSectionTree().getAllTableOfContentItems()
.stream()
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
@ -901,7 +901,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
private void validateTable(ClassificationDocument document, int tableIndex, List<List<String>> values) {
TablePageBlock table = document.getTableOfContents().getAllTableOfContentItems()
TablePageBlock table = document.getSectionTree().getAllTableOfContentItems()
.stream()
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()
@ -929,7 +929,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
private void validateTableSize(ClassificationDocument document, int tableSize) {
assertThat(document.getTableOfContents().getAllTableOfContentItems()
assertThat(document.getSectionTree().getAllTableOfContentItems()
.stream()
.flatMap(tocItem -> tocItem.getSectionBlocks()
.stream()

View File

@ -18,8 +18,8 @@ import org.springframework.core.io.ClassPathResource;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureWrapper;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.EntryDataProto.EntryData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.EntryDataProto.EntryData;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;

View File

@ -227,9 +227,9 @@ public class PdfDraw {
return DrawingOptions.builder().stroke(true).strokeColor(switch (entry.getType()) {
case DOCUMENT -> Color.LIGHT_GRAY;
case HEADER, FOOTER -> Color.GREEN;
case PARAGRAPH -> Color.BLUE;
case PARAGRAPH, TABLE_OF_CONTENTS_ITEM -> Color.BLUE;
case HEADLINE -> Color.RED;
case SECTION, SUPER_SECTION -> Color.BLACK;
case SECTION, SUPER_SECTION, TABLE_OF_CONTENTS -> Color.BLACK;
case TABLE -> Color.ORANGE;
case TABLE_CELL -> Color.GRAY;
case IMAGE -> Color.MAGENTA;

View File

@ -40,7 +40,7 @@ public record LayerIdentifier(String name, String markedContentName) {
public static final LayerIdentifier KNECON_LAYOUT_FIGURES = new LayerIdentifier("Figures", "LAYOUT_FIGURES");
public static final LayerIdentifier KNECON_LAYOUT_IMAGES = new LayerIdentifier("Images", "LAYOUT_IMAGES");
public static final LayerIdentifier KNECON_LAYOUT_TREE_IDs = new LayerIdentifier("Tree IDs", "LAYOUT_TREE_IDs");
public static final LayerIdentifier OUTLINE_HEADLINES = new LayerIdentifier("Outline Headlines", "OUTLINE_HEADLINES");
public static final LayerIdentifier KNECON_LAYOUT_TOC = new LayerIdentifier("Table of Contents", "TABLE_OF_CONTENTS");
//layout grid debug
public static final LayerIdentifier KNECON_LAYOUT_DEBUG = new LayerIdentifier("Layout elements", "DEBUG_LAYOUT");
@ -55,8 +55,10 @@ public record LayerIdentifier(String name, String markedContentName) {
public static final LayerIdentifier NEIGHBOURS = new LayerIdentifier("Neighbours", "NEIGHBOURS");
public static final LayerIdentifier CHARACTERS = new LayerIdentifier("Characters", "CHARACTERS");
public static final LayerIdentifier OUTLINE_OBJECTS = new LayerIdentifier("Outline Positions", "OUTLINE_OBJECTS");
public static final LayerIdentifier OUTLINE_HEADLINES = new LayerIdentifier("Outline Headlines", "OUTLINE_HEADLINES");
public static final LayerIdentifier SENTENCES = new LayerIdentifier("Sentences", "SENTENCES");
public static final LayerIdentifier TOC_PAGES = new LayerIdentifier("TOC pages", "TOC_PAGES");
public static final LayerIdentifier TOC_BLOCKS = new LayerIdentifier("TOC blocks", "TOC_BLOCKS");
public static final LayerIdentifier LIST_IDENTIFIERS = new LayerIdentifier("List identifiers", "LIST_IDENTIFIERS");
// Visual layout parser

View File

@ -22,6 +22,7 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
protected static final Color WORDS_COLOR = new Color(68, 84, 147);
protected static final Color LINES_COLOR = new Color(152, 45, 179);
protected static final Color TOC_COLOR = new Color(33, 159, 144);
protected static final Color ZONES_COLOR = new Color(131, 38, 38);
protected static final Color RULINGS_COLOR = new Color(21, 221, 174);
@ -31,6 +32,8 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
protected static final Color UNDERLINE_RULING_COLOR = new Color(6, 39, 171);
protected static final Color STRIKETROUGH_RULING_COLOR = new Color(171, 6, 6);
protected static final Color HEADLINE_COLOR = new Color(162, 56, 56);
protected static final Color CELLS_COLOR = new Color(31, 214, 27);
protected static final Color OUTLINE_OBJECT_COLOR = new Color(214, 27, 183);
@ -59,8 +62,9 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
protected final Visualizations outlineObjects = Visualizations.builder().layer(LayerIdentifier.OUTLINE_OBJECTS).build();
protected final Visualizations sentences = Visualizations.builder().layer(LayerIdentifier.SENTENCES).build();
protected final Visualizations tocPages = Visualizations.builder().layer(LayerIdentifier.TOC_PAGES).build();
protected final Visualizations tocBlocks = Visualizations.builder().layer(LayerIdentifier.TOC_BLOCKS).build();
protected final Visualizations listIdentifiers = Visualizations.builder().layer(LayerIdentifier.LIST_IDENTIFIERS).build();
protected final Visualizations outlineHeadlines = Visualizations.builder().layer(LayerIdentifier.OUTLINE_HEADLINES).build();
public List<Visualizations> getVisualizations() {
@ -76,10 +80,11 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
mainBody, //
markedContent, //
outlineObjects, //
outlineHeadlines, //
tocPages, //
tocBlocks, //
listIdentifiers //
);
}
}

View File

@ -34,8 +34,10 @@ public class LayoutGridLayerConfig extends AbstractLayerGroup {
protected static final Color KEY_VALUE_BBOX_COLOR = new Color(0, 39, 85);
protected static final Color KEY_COLOR = new Color(30, 92, 172);
protected static final Color VALUE_COLOR = new Color(30, 172, 146);
protected static final Color TOC_COLOR = new Color(0, 86, 198);
protected final Visualizations sections = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_SECTION).visibleByDefault(true).build();
protected final Visualizations toc = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_TOC).visibleByDefault(true).build();
protected final Visualizations paragraphs = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_PARAGRAPH).visibleByDefault(true).build();
protected final Visualizations headlines = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_HEADLINE).visibleByDefault(true).build();
protected final Visualizations tables = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_TABLE).visibleByDefault(true).build();
@ -44,12 +46,12 @@ public class LayoutGridLayerConfig extends AbstractLayerGroup {
protected final Visualizations images = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_IMAGES).build();
protected final Visualizations keyValue = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_KEY_VALUE).build();
protected final Visualizations treeIds = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_TREE_IDs).build();
protected final Visualizations outlineHeadlines = Visualizations.builder().layer(LayerIdentifier.OUTLINE_HEADLINES).build();
@Override
public List<Visualizations> getVisualizations() {
return List.of(headlines, paragraphs, tables, sections, headerFooter, keyValue, figures, images, treeIds, outlineHeadlines);
return List.of(headlines, paragraphs, tables, sections, headerFooter, toc, keyValue, figures, images, treeIds);
}
}