Compare commits

...

45 Commits

Author SHA1 Message Date
Dominique Eifländer
fdd2b954fe Merge branch 'OPS-284' into 'master'
OPS-284: add prometheus endpoint

Closes OPS-284

See merge request redactmanager/redaction-service!582
2025-01-24 11:00:50 +01:00
Christoph Schabert
2d3a048487 OPS-284: add prometheus endpoint 2025-01-23 13:39:25 +01:00
Maverick Studer
518c38c2e9 Merge branch 'RED-10687' into 'master'
RED-10687: Filter out overlapping recommendations if they are of the same type

Closes RED-10687

See merge request redactmanager/redaction-service!580
2025-01-08 13:19:37 +01:00
maverickstuder
21097a6419 RED-10687: Filter out overlapping recommendations if they are of the same type 2025-01-08 11:34:38 +01:00
Maverick Studer
c8dd167606 Merge branch 'RED-10633' into 'master'
RED-10633: Duplicated values when extracting from table in DM 1.3.0

Closes RED-10633

See merge request redactmanager/redaction-service!579
2024-12-13 09:24:10 +01:00
maverickstuder
9bd5577986 RED-10633: Duplicated values when extracting from table in DM 1.3.0 2024-12-12 13:23:17 +01:00
Maverick Studer
c1990ef4aa Merge branch 'RED-10639-fp' into 'master'
RED-10639: RM-224: ERROR state for document after re-upload same document with...

Closes RED-10639

See merge request redactmanager/redaction-service!577
2024-12-12 12:57:47 +01:00
Maverick Studer
3dfa05bd67 RED-10639: RM-224: ERROR state for document after re-upload same document with... 2024-12-12 12:57:46 +01:00
Dominique Eifländer
22b2a6474b Merge branch 'RED-10644-master' into 'master'
RED-10644: Fixed dublicated entries with whitespace at the end

Closes RED-10644

See merge request redactmanager/redaction-service!573
2024-12-10 12:41:36 +01:00
Dominique Eifländer
cf21b75f2e RED-10644: Fixed dublicated entries with whitespace at the end 2024-12-10 12:41:36 +01:00
Maverick Studer
a1e6361c3e Merge branch 'feature/RED-10200' into 'master'
RED-10200: Spike performant rules update logic

Closes RED-10200

See merge request redactmanager/redaction-service!572
2024-12-04 14:41:24 +01:00
Maverick Studer
3c2db795c8 RED-10200: Spike performant rules update logic 2024-12-04 14:41:24 +01:00
Dominique Eifländer
ef1810b658 Merge branch 'RED-10526' into 'master'
RED-10526: Set liquibase to 4.29.2 as 4.30.0 is 3 times slower

Closes RED-10526

See merge request redactmanager/redaction-service!571
2024-12-02 11:27:50 +01:00
Dominique Eifländer
26025a5621 RED-10526: Set liquibase to 4.29.2 as 4.30.0 is 3 times slower 2024-12-02 11:27:50 +01:00
Dominique Eifländer
4fa91a59e0 Merge branch 'RED-10526' into 'master'
RED-10526: Upgrade liquibase to 4.30.0

Closes RED-10526

See merge request redactmanager/redaction-service!569
2024-11-27 11:26:32 +01:00
Dominique Eifländer
7c37776af4 RED-10526: Upgrade liquibase to 4.30.0 2024-11-27 11:13:27 +01:00
Corina Olariu
9000f755a3 Merge branch 'RED-3300' into 'master'
RED-3300 Improve impurity rule

Closes RED-3300

See merge request redactmanager/redaction-service!567
2024-11-26 13:10:14 +01:00
Corina Olariu
62ec63cc55 RED-3300 Improve impurity rule 2024-11-26 13:10:14 +01:00
Corina Olariu
db59ae014b Merge branch 'RED-10046' into 'master'
RED-10046 The punctuation mark “.” should be treated as a word boundary when...

Closes RED-10046

See merge request redactmanager/redaction-service!565
2024-11-21 10:20:44 +01:00
Corina Olariu
dfd262e9e1 RED-10046 The punctuation mark “.” should be treated as a word boundary when... 2024-11-21 10:20:44 +01:00
Maverick Studer
4fd36768b2 Merge branch 'hotfix-persistence-dependency' into 'master'
persistence dependency update

See merge request redactmanager/redaction-service!566
2024-11-20 11:34:08 +01:00
Maverick Studer
e04c6dadd7 persistence dependency update 2024-11-20 11:34:07 +01:00
Maverick Studer
213d3bf645 Merge branch 'feature/RED-10115' into 'master'
RED-10115: Refactoring of justifications

Closes RED-10115

See merge request redactmanager/redaction-service!559
2024-11-20 10:53:47 +01:00
Maverick Studer
66f3f6ce59 RED-10115: Refactoring of justifications 2024-11-20 10:53:47 +01:00
Dominique Eifländer
3f606ad567 Merge branch 'RED-10456-master' into 'master'
RED-10456: Enabled to remove imported redactions

Closes RED-10456

See merge request redactmanager/redaction-service!563
2024-11-18 13:18:42 +01:00
Dominique Eifländer
7b1c6beb11 RED-10456: Enabled to remove imported redactions 2024-11-18 12:51:33 +01:00
Kilian Schuettler
e660184646 RED-9139-pageOrderHotfix 2024-11-15 16:33:44 +01:00
Kilian Schuettler
f1f2d02266 RED-9139-pageOrderHotfix 2024-11-15 16:28:13 +01:00
Kilian Schüttler
947cbe4cd2 Merge branch 'RED-9139' into 'master'
RED-9139: refactor some code in DocumentGraphMapper

Closes RED-9139

See merge request redactmanager/redaction-service!560
2024-11-15 15:37:49 +01:00
Kilian Schuettler
e8dc37374e RED-9139: refactor some code in DocumentGraphMapper 2024-11-15 15:25:32 +01:00
Kilian Schuettler
8769922bf2 RED-9139: fix Image IDs 2024-11-15 15:15:49 +01:00
Corina Olariu
21f2ded6c6 Merge branch 'RED-10425' into 'master'
RED-10425 Annotation added twice when bulk-force while auto-analysis is disabled

Closes RED-10425

See merge request redactmanager/redaction-service!557
2024-11-15 09:28:13 +01:00
Corina Olariu
9f20a14aec RED-10425 Annotation added twice when bulk-force while auto-analysis is disabled 2024-11-15 09:28:13 +01:00
Maverick Studer
681d6328ef Merge branch 'RED-10471-fp' into 'master'
RED-10471: PII.11.0 does not redact anymore

Closes RED-10471

See merge request redactmanager/redaction-service!556
2024-11-14 18:41:51 +01:00
Maverick Studer
97c23c367e RED-10471: PII.11.0 does not redact anymore 2024-11-14 18:41:50 +01:00
Kilian Schüttler
1b7c59d292 Merge branch 'feature/RED-9139' into 'master'
RED-9139: move document to its own module, add TableOfContents and TableOfContentsItem

Closes RED-9139

See merge request redactmanager/redaction-service!554
2024-11-14 16:50:42 +01:00
Kilian Schüttler
f9d939958f RED-9139: move document to its own module, add TableOfContents and TableOfContentsItem 2024-11-14 16:50:42 +01:00
Maverick Studer
41f824297c Merge branch 'feature/RED-10290' into 'master'
RED-10290: Improve SearchImplementation logic for dictionaries

Closes RED-10290

See merge request redactmanager/redaction-service!553
2024-11-11 12:10:58 +01:00
Maverick Studer
68f75f070c RED-10290: Improve SearchImplementation logic for dictionaries 2024-11-11 12:10:57 +01:00
Dominique Eifländer
4c19be01c6 Merge branch 'RED-10353-master' into 'master'
RED-10353: Fixed missing errorCode when rules are locked

Closes RED-10353

See merge request redactmanager/redaction-service!552
2024-11-11 11:43:48 +01:00
Dominique Eifländer
98ba463639 RED-10353: Fixed missing errorCode when rules are locked 2024-11-08 12:39:41 +01:00
Maverick Studer
e415234bf8 Merge branch 'feature/RED-10072' into 'master'
RED-10072: AI description field and toggle for entities

Closes RED-10072

See merge request redactmanager/redaction-service!539
2024-11-07 14:43:51 +01:00
Maverick Studer
7f96c7b51e RED-10072: AI description field and toggle for entities 2024-11-07 14:43:51 +01:00
Dominique Eifländer
a0d3c4cf86 Merge branch 'RED-10353-master' into 'master'
RED-10353: Added error code for file that causes the timeout

Closes RED-10353

See merge request redactmanager/redaction-service!550
2024-11-06 11:19:29 +01:00
Dominique Eifländer
d2a768d9f5 RED-10353: Added error code for file that causes the timeout 2024-11-06 11:05:40 +01:00
225 changed files with 141771 additions and 3480 deletions

View File

@@ -7,20 +7,21 @@ include:
ref: 'main'
file: 'ci-templates/gradle_java.yml'
deploy JavaDoc:
publish dependencies:
stage: deploy
tags:
- dind
script:
- echo "Building JavaDoc with gradle version ${BUILDVERSION}"
- echo "Publishing dependencies with gradle version ${BUILDVERSION}"
- gradle -Pversion=${BUILDVERSION} publish
- echo "BUILDVERSION=$(echo ${BUILDVERSION})" >> variables.env
artifacts:
reports:
dotenv: variables.env
dotenv: variables.env
rules:
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
- if: $CI_COMMIT_BRANCH =~ /^release/
- if: $CI_COMMIT_BRANCH =~ /^feature/
- if: $CI_COMMIT_TAG
generate JavaDoc:
@@ -42,7 +43,7 @@ pages:
stage: deploy
needs:
- generate JavaDoc
- deploy JavaDoc
- publish dependencies
- calculate minor version
pages:
path_prefix: "$BUILDVERSION"

View File

@@ -15,8 +15,13 @@ pmd {
isConsoleOutput = true
}
tasks.checkstyleMain {
exclude("**/data/**") // ignore generated proto files
}
tasks.pmdMain {
pmd.ruleSetFiles = files("${rootDir}/config/pmd/pmd.xml")
exclude("**/data/**") // ignore generated proto files
}
tasks.pmdTest {

View File

@@ -0,0 +1,35 @@
plugins {
id("com.iqser.red.service.java-conventions")
id("io.freefair.lombok") version "8.4"
}
description = "redaction-service-document"
val persistenceServiceVersion = "2.612.0-RED10072.1"
val layoutParserVersion = "newNode"
group = "com.knecon.fforesight"
dependencies {
implementation("com.iqser.red.service:persistence-service-internal-api-v1:${persistenceServiceVersion}")
api("com.google.protobuf:protobuf-java-util:4.28.3")
testImplementation("org.junit.jupiter:junit-jupiter-api:5.8.1")
testRuntimeOnly("org.junit.jupiter:junit-jupiter-engine:5.8.1")
}
publishing {
publications {
create<MavenPublication>(name) {
from(components["java"])
}
}
repositories {
maven {
url = uri("https://nexus.knecon.com/repository/red-platform-releases/")
credentials {
username = providers.gradleProperty("mavenUser").getOrNull();
password = providers.gradleProperty("mavenPassword").getOrNull();
}
}
}
}

View File

@@ -1,13 +1,12 @@
package com.iqser.red.service.redaction.v1.server.model.document;
package com.iqser.red.service.redaction.v1.server.data;
import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureProto.DocumentStructure;
import static com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure;
import java.io.Serializable;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPageProto.AllDocumentPages;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto.AllDocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureWrapper;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto.AllDocumentTextData;
import com.iqser.red.service.redaction.v1.server.data.DocumentPageProto.AllDocumentPages;
import com.iqser.red.service.redaction.v1.server.data.DocumentPositionDataProto.AllDocumentPositionData;
import com.iqser.red.service.redaction.v1.server.data.DocumentTextDataProto.AllDocumentTextData;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;

View File

@@ -0,0 +1,694 @@
// Generated by the protocol buffer compiler. DO NOT EDIT!
// NO CHECKED-IN PROTOBUF GENCODE
// source: DocumentStructure.proto
// Protobuf Java Version: 4.28.3
package com.iqser.red.service.redaction.v1.server.data;
public final class DocumentStructureProto {
private DocumentStructureProto() {}
static {
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(
com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
/* major= */ 4,
/* minor= */ 28,
/* patch= */ 3,
/* suffix= */ "",
DocumentStructureProto.class.getName());
}
public static void registerAllExtensions(
com.google.protobuf.ExtensionRegistryLite registry) {
}
public static void registerAllExtensions(
com.google.protobuf.ExtensionRegistry registry) {
registerAllExtensions(
(com.google.protobuf.ExtensionRegistryLite) registry);
}
public interface DocumentStructureOrBuilder extends
// @@protoc_insertion_point(interface_extends:DocumentStructure)
com.google.protobuf.MessageOrBuilder {
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
* @return Whether the root field is set.
*/
boolean hasRoot();
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
* @return The root.
*/
com.iqser.red.service.redaction.v1.server.data.EntryDataProto.EntryData getRoot();
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
*/
com.iqser.red.service.redaction.v1.server.data.EntryDataProto.EntryDataOrBuilder getRootOrBuilder();
}
/**
* Protobuf type {@code DocumentStructure}
*/
public static final class DocumentStructure extends
com.google.protobuf.GeneratedMessage implements
// @@protoc_insertion_point(message_implements:DocumentStructure)
DocumentStructureOrBuilder {
private static final long serialVersionUID = 0L;
static {
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(
com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
/* major= */ 4,
/* minor= */ 28,
/* patch= */ 3,
/* suffix= */ "",
DocumentStructure.class.getName());
}
// Use DocumentStructure.newBuilder() to construct.
private DocumentStructure(com.google.protobuf.GeneratedMessage.Builder<?> builder) {
super(builder);
}
private DocumentStructure() {
}
public static final com.google.protobuf.Descriptors.Descriptor
getDescriptor() {
return com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.internal_static_DocumentStructure_descriptor;
}
@java.lang.Override
protected com.google.protobuf.GeneratedMessage.FieldAccessorTable
internalGetFieldAccessorTable() {
return com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.internal_static_DocumentStructure_fieldAccessorTable
.ensureFieldAccessorsInitialized(
com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure.class, com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure.Builder.class);
}
private int bitField0_;
public static final int ROOT_FIELD_NUMBER = 1;
private com.iqser.red.service.redaction.v1.server.data.EntryDataProto.EntryData root_;
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
* @return Whether the root field is set.
*/
@java.lang.Override
public boolean hasRoot() {
return ((bitField0_ & 0x00000001) != 0);
}
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
* @return The root.
*/
@java.lang.Override
public com.iqser.red.service.redaction.v1.server.data.EntryDataProto.EntryData getRoot() {
return root_ == null ? com.iqser.red.service.redaction.v1.server.data.EntryDataProto.EntryData.getDefaultInstance() : root_;
}
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
*/
@java.lang.Override
public com.iqser.red.service.redaction.v1.server.data.EntryDataProto.EntryDataOrBuilder getRootOrBuilder() {
return root_ == null ? com.iqser.red.service.redaction.v1.server.data.EntryDataProto.EntryData.getDefaultInstance() : root_;
}
private byte memoizedIsInitialized = -1;
@java.lang.Override
public final boolean isInitialized() {
byte isInitialized = memoizedIsInitialized;
if (isInitialized == 1) return true;
if (isInitialized == 0) return false;
memoizedIsInitialized = 1;
return true;
}
@java.lang.Override
public void writeTo(com.google.protobuf.CodedOutputStream output)
throws java.io.IOException {
if (((bitField0_ & 0x00000001) != 0)) {
output.writeMessage(1, getRoot());
}
getUnknownFields().writeTo(output);
}
@java.lang.Override
public int getSerializedSize() {
int size = memoizedSize;
if (size != -1) return size;
size = 0;
if (((bitField0_ & 0x00000001) != 0)) {
size += com.google.protobuf.CodedOutputStream
.computeMessageSize(1, getRoot());
}
size += getUnknownFields().getSerializedSize();
memoizedSize = size;
return size;
}
@java.lang.Override
public boolean equals(final java.lang.Object obj) {
if (obj == this) {
return true;
}
if (!(obj instanceof com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure)) {
return super.equals(obj);
}
com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure other = (com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure) obj;
if (hasRoot() != other.hasRoot()) return false;
if (hasRoot()) {
if (!getRoot()
.equals(other.getRoot())) return false;
}
if (!getUnknownFields().equals(other.getUnknownFields())) return false;
return true;
}
@java.lang.Override
public int hashCode() {
if (memoizedHashCode != 0) {
return memoizedHashCode;
}
int hash = 41;
hash = (19 * hash) + getDescriptor().hashCode();
if (hasRoot()) {
hash = (37 * hash) + ROOT_FIELD_NUMBER;
hash = (53 * hash) + getRoot().hashCode();
}
hash = (29 * hash) + getUnknownFields().hashCode();
memoizedHashCode = hash;
return hash;
}
public static com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure parseFrom(
java.nio.ByteBuffer data)
throws com.google.protobuf.InvalidProtocolBufferException {
return PARSER.parseFrom(data);
}
public static com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure parseFrom(
java.nio.ByteBuffer data,
com.google.protobuf.ExtensionRegistryLite extensionRegistry)
throws com.google.protobuf.InvalidProtocolBufferException {
return PARSER.parseFrom(data, extensionRegistry);
}
public static com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure parseFrom(
com.google.protobuf.ByteString data)
throws com.google.protobuf.InvalidProtocolBufferException {
return PARSER.parseFrom(data);
}
public static com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure parseFrom(
com.google.protobuf.ByteString data,
com.google.protobuf.ExtensionRegistryLite extensionRegistry)
throws com.google.protobuf.InvalidProtocolBufferException {
return PARSER.parseFrom(data, extensionRegistry);
}
public static com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure parseFrom(byte[] data)
throws com.google.protobuf.InvalidProtocolBufferException {
return PARSER.parseFrom(data);
}
public static com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure parseFrom(
byte[] data,
com.google.protobuf.ExtensionRegistryLite extensionRegistry)
throws com.google.protobuf.InvalidProtocolBufferException {
return PARSER.parseFrom(data, extensionRegistry);
}
public static com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure parseFrom(java.io.InputStream input)
throws java.io.IOException {
return com.google.protobuf.GeneratedMessage
.parseWithIOException(PARSER, input);
}
public static com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure parseFrom(
java.io.InputStream input,
com.google.protobuf.ExtensionRegistryLite extensionRegistry)
throws java.io.IOException {
return com.google.protobuf.GeneratedMessage
.parseWithIOException(PARSER, input, extensionRegistry);
}
public static com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure parseDelimitedFrom(java.io.InputStream input)
throws java.io.IOException {
return com.google.protobuf.GeneratedMessage
.parseDelimitedWithIOException(PARSER, input);
}
public static com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure parseDelimitedFrom(
java.io.InputStream input,
com.google.protobuf.ExtensionRegistryLite extensionRegistry)
throws java.io.IOException {
return com.google.protobuf.GeneratedMessage
.parseDelimitedWithIOException(PARSER, input, extensionRegistry);
}
public static com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure parseFrom(
com.google.protobuf.CodedInputStream input)
throws java.io.IOException {
return com.google.protobuf.GeneratedMessage
.parseWithIOException(PARSER, input);
}
public static com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure parseFrom(
com.google.protobuf.CodedInputStream input,
com.google.protobuf.ExtensionRegistryLite extensionRegistry)
throws java.io.IOException {
return com.google.protobuf.GeneratedMessage
.parseWithIOException(PARSER, input, extensionRegistry);
}
@java.lang.Override
public Builder newBuilderForType() { return newBuilder(); }
public static Builder newBuilder() {
return DEFAULT_INSTANCE.toBuilder();
}
public static Builder newBuilder(com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure prototype) {
return DEFAULT_INSTANCE.toBuilder().mergeFrom(prototype);
}
@java.lang.Override
public Builder toBuilder() {
return this == DEFAULT_INSTANCE
? new Builder() : new Builder().mergeFrom(this);
}
@java.lang.Override
protected Builder newBuilderForType(
com.google.protobuf.GeneratedMessage.BuilderParent parent) {
Builder builder = new Builder(parent);
return builder;
}
/**
* Protobuf type {@code DocumentStructure}
*/
public static final class Builder extends
com.google.protobuf.GeneratedMessage.Builder<Builder> implements
// @@protoc_insertion_point(builder_implements:DocumentStructure)
com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructureOrBuilder {
public static final com.google.protobuf.Descriptors.Descriptor
getDescriptor() {
return com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.internal_static_DocumentStructure_descriptor;
}
@java.lang.Override
protected com.google.protobuf.GeneratedMessage.FieldAccessorTable
internalGetFieldAccessorTable() {
return com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.internal_static_DocumentStructure_fieldAccessorTable
.ensureFieldAccessorsInitialized(
com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure.class, com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure.Builder.class);
}
// Construct using com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure.newBuilder()
private Builder() {
maybeForceBuilderInitialization();
}
private Builder(
com.google.protobuf.GeneratedMessage.BuilderParent parent) {
super(parent);
maybeForceBuilderInitialization();
}
private void maybeForceBuilderInitialization() {
if (com.google.protobuf.GeneratedMessage
.alwaysUseFieldBuilders) {
getRootFieldBuilder();
}
}
@java.lang.Override
public Builder clear() {
super.clear();
bitField0_ = 0;
root_ = null;
if (rootBuilder_ != null) {
rootBuilder_.dispose();
rootBuilder_ = null;
}
return this;
}
@java.lang.Override
public com.google.protobuf.Descriptors.Descriptor
getDescriptorForType() {
return com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.internal_static_DocumentStructure_descriptor;
}
@java.lang.Override
public com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure getDefaultInstanceForType() {
return com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure.getDefaultInstance();
}
@java.lang.Override
public com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure build() {
com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure result = buildPartial();
if (!result.isInitialized()) {
throw newUninitializedMessageException(result);
}
return result;
}
@java.lang.Override
public com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure buildPartial() {
com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure result = new com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure(this);
if (bitField0_ != 0) { buildPartial0(result); }
onBuilt();
return result;
}
private void buildPartial0(com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure result) {
int from_bitField0_ = bitField0_;
int to_bitField0_ = 0;
if (((from_bitField0_ & 0x00000001) != 0)) {
result.root_ = rootBuilder_ == null
? root_
: rootBuilder_.build();
to_bitField0_ |= 0x00000001;
}
result.bitField0_ |= to_bitField0_;
}
@java.lang.Override
public Builder mergeFrom(com.google.protobuf.Message other) {
if (other instanceof com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure) {
return mergeFrom((com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure)other);
} else {
super.mergeFrom(other);
return this;
}
}
public Builder mergeFrom(com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure other) {
if (other == com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure.getDefaultInstance()) return this;
if (other.hasRoot()) {
mergeRoot(other.getRoot());
}
this.mergeUnknownFields(other.getUnknownFields());
onChanged();
return this;
}
@java.lang.Override
public final boolean isInitialized() {
return true;
}
@java.lang.Override
public Builder mergeFrom(
com.google.protobuf.CodedInputStream input,
com.google.protobuf.ExtensionRegistryLite extensionRegistry)
throws java.io.IOException {
if (extensionRegistry == null) {
throw new java.lang.NullPointerException();
}
try {
boolean done = false;
while (!done) {
int tag = input.readTag();
switch (tag) {
case 0:
done = true;
break;
case 10: {
input.readMessage(
getRootFieldBuilder().getBuilder(),
extensionRegistry);
bitField0_ |= 0x00000001;
break;
} // case 10
default: {
if (!super.parseUnknownField(input, extensionRegistry, tag)) {
done = true; // was an endgroup tag
}
break;
} // default:
} // switch (tag)
} // while (!done)
} catch (com.google.protobuf.InvalidProtocolBufferException e) {
throw e.unwrapIOException();
} finally {
onChanged();
} // finally
return this;
}
private int bitField0_;
private com.iqser.red.service.redaction.v1.server.data.EntryDataProto.EntryData root_;
private com.google.protobuf.SingleFieldBuilder<
com.iqser.red.service.redaction.v1.server.data.EntryDataProto.EntryData, com.iqser.red.service.redaction.v1.server.data.EntryDataProto.EntryData.Builder, com.iqser.red.service.redaction.v1.server.data.EntryDataProto.EntryDataOrBuilder> rootBuilder_;
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
* @return Whether the root field is set.
*/
public boolean hasRoot() {
return ((bitField0_ & 0x00000001) != 0);
}
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
* @return The root.
*/
public com.iqser.red.service.redaction.v1.server.data.EntryDataProto.EntryData getRoot() {
if (rootBuilder_ == null) {
return root_ == null ? com.iqser.red.service.redaction.v1.server.data.EntryDataProto.EntryData.getDefaultInstance() : root_;
} else {
return rootBuilder_.getMessage();
}
}
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
*/
public Builder setRoot(com.iqser.red.service.redaction.v1.server.data.EntryDataProto.EntryData value) {
if (rootBuilder_ == null) {
if (value == null) {
throw new NullPointerException();
}
root_ = value;
} else {
rootBuilder_.setMessage(value);
}
bitField0_ |= 0x00000001;
onChanged();
return this;
}
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
*/
public Builder setRoot(
com.iqser.red.service.redaction.v1.server.data.EntryDataProto.EntryData.Builder builderForValue) {
if (rootBuilder_ == null) {
root_ = builderForValue.build();
} else {
rootBuilder_.setMessage(builderForValue.build());
}
bitField0_ |= 0x00000001;
onChanged();
return this;
}
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
*/
public Builder mergeRoot(com.iqser.red.service.redaction.v1.server.data.EntryDataProto.EntryData value) {
if (rootBuilder_ == null) {
if (((bitField0_ & 0x00000001) != 0) &&
root_ != null &&
root_ != com.iqser.red.service.redaction.v1.server.data.EntryDataProto.EntryData.getDefaultInstance()) {
getRootBuilder().mergeFrom(value);
} else {
root_ = value;
}
} else {
rootBuilder_.mergeFrom(value);
}
if (root_ != null) {
bitField0_ |= 0x00000001;
onChanged();
}
return this;
}
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
*/
public Builder clearRoot() {
bitField0_ = (bitField0_ & ~0x00000001);
root_ = null;
if (rootBuilder_ != null) {
rootBuilder_.dispose();
rootBuilder_ = null;
}
onChanged();
return this;
}
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
*/
public com.iqser.red.service.redaction.v1.server.data.EntryDataProto.EntryData.Builder getRootBuilder() {
bitField0_ |= 0x00000001;
onChanged();
return getRootFieldBuilder().getBuilder();
}
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
*/
public com.iqser.red.service.redaction.v1.server.data.EntryDataProto.EntryDataOrBuilder getRootOrBuilder() {
if (rootBuilder_ != null) {
return rootBuilder_.getMessageOrBuilder();
} else {
return root_ == null ?
com.iqser.red.service.redaction.v1.server.data.EntryDataProto.EntryData.getDefaultInstance() : root_;
}
}
/**
* <pre>
* The root EntryData represents the Document.
* </pre>
*
* <code>.EntryData root = 1;</code>
*/
private com.google.protobuf.SingleFieldBuilder<
com.iqser.red.service.redaction.v1.server.data.EntryDataProto.EntryData, com.iqser.red.service.redaction.v1.server.data.EntryDataProto.EntryData.Builder, com.iqser.red.service.redaction.v1.server.data.EntryDataProto.EntryDataOrBuilder>
getRootFieldBuilder() {
if (rootBuilder_ == null) {
rootBuilder_ = new com.google.protobuf.SingleFieldBuilder<
com.iqser.red.service.redaction.v1.server.data.EntryDataProto.EntryData, com.iqser.red.service.redaction.v1.server.data.EntryDataProto.EntryData.Builder, com.iqser.red.service.redaction.v1.server.data.EntryDataProto.EntryDataOrBuilder>(
getRoot(),
getParentForChildren(),
isClean());
root_ = null;
}
return rootBuilder_;
}
// @@protoc_insertion_point(builder_scope:DocumentStructure)
}
// @@protoc_insertion_point(class_scope:DocumentStructure)
private static final com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure DEFAULT_INSTANCE;
static {
DEFAULT_INSTANCE = new com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure();
}
public static com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure getDefaultInstance() {
return DEFAULT_INSTANCE;
}
private static final com.google.protobuf.Parser<DocumentStructure>
PARSER = new com.google.protobuf.AbstractParser<DocumentStructure>() {
@java.lang.Override
public DocumentStructure parsePartialFrom(
com.google.protobuf.CodedInputStream input,
com.google.protobuf.ExtensionRegistryLite extensionRegistry)
throws com.google.protobuf.InvalidProtocolBufferException {
Builder builder = newBuilder();
try {
builder.mergeFrom(input, extensionRegistry);
} catch (com.google.protobuf.InvalidProtocolBufferException e) {
throw e.setUnfinishedMessage(builder.buildPartial());
} catch (com.google.protobuf.UninitializedMessageException e) {
throw e.asInvalidProtocolBufferException().setUnfinishedMessage(builder.buildPartial());
} catch (java.io.IOException e) {
throw new com.google.protobuf.InvalidProtocolBufferException(e)
.setUnfinishedMessage(builder.buildPartial());
}
return builder.buildPartial();
}
};
public static com.google.protobuf.Parser<DocumentStructure> parser() {
return PARSER;
}
@java.lang.Override
public com.google.protobuf.Parser<DocumentStructure> getParserForType() {
return PARSER;
}
@java.lang.Override
public com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure getDefaultInstanceForType() {
return DEFAULT_INSTANCE;
}
}
private static final com.google.protobuf.Descriptors.Descriptor
internal_static_DocumentStructure_descriptor;
private static final
com.google.protobuf.GeneratedMessage.FieldAccessorTable
internal_static_DocumentStructure_fieldAccessorTable;
public static com.google.protobuf.Descriptors.FileDescriptor
getDescriptor() {
return descriptor;
}
private static com.google.protobuf.Descriptors.FileDescriptor
descriptor;
static {
java.lang.String[] descriptorData = {
"\n\027DocumentStructure.proto\032\017EntryData.pro" +
"to\"-\n\021DocumentStructure\022\030\n\004root\030\001 \001(\0132\n." +
"EntryDataBH\n.com.iqser.red.service.redac" +
"tion.v1.server.dataB\026DocumentStructurePr" +
"otob\006proto3"
};
descriptor = com.google.protobuf.Descriptors.FileDescriptor
.internalBuildGeneratedFileFrom(descriptorData,
new com.google.protobuf.Descriptors.FileDescriptor[] {
com.iqser.red.service.redaction.v1.server.data.EntryDataProto.getDescriptor(),
});
internal_static_DocumentStructure_descriptor =
getDescriptor().getMessageTypes().get(0);
internal_static_DocumentStructure_fieldAccessorTable = new
com.google.protobuf.GeneratedMessage.FieldAccessorTable(
internal_static_DocumentStructure_descriptor,
new java.lang.String[] { "Root", });
descriptor.resolveAllFeaturesImmutable();
com.iqser.red.service.redaction.v1.server.data.EntryDataProto.getDescriptor();
}
// @@protoc_insertion_point(outer_class_scope)
}
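
A minimal sketch (not part of this merge request) of how the generated DocumentStructure API above is typically used: build a message whose root EntryData stands for the whole document, serialize it, and parse it back. Populating the root with EntryData.getDefaultInstance() is a placeholder for illustration only; real callers would fill in the actual entry fields.

import com.google.protobuf.InvalidProtocolBufferException;
import com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure;
import com.iqser.red.service.redaction.v1.server.data.EntryDataProto.EntryData;

public class DocumentStructureRoundTrip {

    public static void main(String[] args) throws InvalidProtocolBufferException {
        // Assemble a structure; the root EntryData represents the document.
        DocumentStructure structure = DocumentStructure.newBuilder()
                .setRoot(EntryData.getDefaultInstance()) // placeholder root for illustration
                .build();

        // Serialize to the wire format and parse it back with the generated parser.
        byte[] bytes = structure.toByteArray();
        DocumentStructure parsed = DocumentStructure.parseFrom(bytes);

        // hasRoot() reflects the explicit-presence bit tracked via bitField0_.
        System.out.println("root set: " + parsed.hasRoot());
    }
}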

View File

@@ -0,0 +1,115 @@
package com.iqser.red.service.redaction.v1.server.data;
import static com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure;
import static com.iqser.red.service.redaction.v1.server.data.EntryDataProto.EntryData;
import java.awt.geom.Rectangle2D;
import java.io.Serializable;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Stream;
import lombok.AllArgsConstructor;
import lombok.Getter;
@Getter
@AllArgsConstructor
public class DocumentStructureWrapper implements Serializable {
private final DocumentStructure documentStructure;
public static class TableProperties implements Serializable {
public static final String NUMBER_OF_ROWS = "numberOfRows";
public static final String NUMBER_OF_COLS = "numberOfCols";
}
public static class ImageProperties implements Serializable {
public static final String TRANSPARENT = "transparent";
public static final String IMAGE_TYPE = "imageType";
public static final String POSITION = "position";
public static final String ID = "id";
public static final String REPRESENTATION_HASH = "representationHash";
}
public static class TableCellProperties implements Serializable {
public static final String B_BOX = "bBox";
public static final String ROW = "row";
public static final String COL = "col";
public static final String HEADER = "header";
}
public static class DuplicateParagraphProperties implements Serializable {
public static final String UNSORTED_TEXTBLOCK_ID = "utbid";
}
public static final String RECTANGLE_DELIMITER = ";";
public static Rectangle2D parseRectangle2D(String bBox) {
List<Float> floats = Arrays.stream(bBox.split(RECTANGLE_DELIMITER))
.map(Float::parseFloat)
.toList();
return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
}
public static double[] parseRepresentationVector(String representationHash) {
String[] stringArray = representationHash.split("[,\\s]+");
double[] doubleArray = new double[stringArray.length];
for (int i = 0; i < stringArray.length; i++) {
doubleArray[i] = Double.parseDouble(stringArray[i]);
}
return doubleArray;
}
public EntryData get(List<Integer> tocId) {
if (tocId.isEmpty()) {
return documentStructure.getRoot();
}
EntryData entry = documentStructure.getRoot().getChildrenList()
.get(tocId.get(0));
for (int id : tocId.subList(1, tocId.size())) {
entry = entry.getChildrenList()
.get(id);
}
return entry;
}
public Stream<EntryData> streamAllEntries() {
return flatten(documentStructure.getRoot());
}
public String toString() {
return String.join("\n",
streamAllEntries().map(EntryData::toString)
.toList());
}
private static Stream<EntryData> flatten(EntryData entry) {
return Stream.concat(Stream.of(entry),
entry.getChildrenList()
.stream()
.flatMap(DocumentStructureWrapper::flatten));
}
}
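
A minimal usage sketch (not part of this merge request) for the new DocumentStructureWrapper: navigating by table-of-contents path, flattening the entry tree, and parsing a bounding box stored as four ";"-separated floats. The DocumentStructure instance is assumed to come from wherever the layout-parser output is loaded in the real service; getDefaultInstance() is used here only as a stand-in.

import java.awt.geom.Rectangle2D;
import java.util.List;

import com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure;
import com.iqser.red.service.redaction.v1.server.data.DocumentStructureWrapper;
import com.iqser.red.service.redaction.v1.server.data.EntryDataProto.EntryData;

public class DocumentStructureWrapperDemo {

    public static void main(String[] args) {
        DocumentStructure structure = DocumentStructure.getDefaultInstance(); // placeholder input
        DocumentStructureWrapper wrapper = new DocumentStructureWrapper(structure);

        // An empty table-of-contents path yields the root entry.
        EntryData root = wrapper.get(List.of());

        // Flatten the tree and count the root plus all descendants.
        long totalEntries = wrapper.streamAllEntries().count();

        // Parse a bounding box property of the form "x;y;width;height".
        Rectangle2D bBox = DocumentStructureWrapper.parseRectangle2D("10.0;20.0;100.0;40.0");

        System.out.printf("root=%s, entries=%d, bBox=%s%n",
                root.getClass().getSimpleName(), totalEntries, bBox);
    }
}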

View File

@@ -0,0 +1,176 @@
// Generated by the protocol buffer compiler. DO NOT EDIT!
// NO CHECKED-IN PROTOBUF GENCODE
// source: LayoutEngine.proto
// Protobuf Java Version: 4.28.3
package com.iqser.red.service.redaction.v1.server.data;
public final class LayoutEngineProto {
private LayoutEngineProto() {}
static {
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(
com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
/* major= */ 4,
/* minor= */ 28,
/* patch= */ 3,
/* suffix= */ "",
LayoutEngineProto.class.getName());
}
public static void registerAllExtensions(
com.google.protobuf.ExtensionRegistryLite registry) {
}
public static void registerAllExtensions(
com.google.protobuf.ExtensionRegistry registry) {
registerAllExtensions(
(com.google.protobuf.ExtensionRegistryLite) registry);
}
/**
* Protobuf enum {@code LayoutEngine}
*/
public enum LayoutEngine
implements com.google.protobuf.ProtocolMessageEnum {
/**
* <code>ALGORITHM = 0;</code>
*/
ALGORITHM(0),
/**
* <code>AI = 1;</code>
*/
AI(1),
/**
* <code>OUTLINE = 2;</code>
*/
OUTLINE(2),
UNRECOGNIZED(-1),
;
static {
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(
com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
/* major= */ 4,
/* minor= */ 28,
/* patch= */ 3,
/* suffix= */ "",
LayoutEngine.class.getName());
}
/**
* <code>ALGORITHM = 0;</code>
*/
public static final int ALGORITHM_VALUE = 0;
/**
* <code>AI = 1;</code>
*/
public static final int AI_VALUE = 1;
/**
* <code>OUTLINE = 2;</code>
*/
public static final int OUTLINE_VALUE = 2;
public final int getNumber() {
if (this == UNRECOGNIZED) {
throw new java.lang.IllegalArgumentException(
"Can't get the number of an unknown enum value.");
}
return value;
}
/**
* @param value The numeric wire value of the corresponding enum entry.
* @return The enum associated with the given numeric wire value.
* @deprecated Use {@link #forNumber(int)} instead.
*/
@java.lang.Deprecated
public static LayoutEngine valueOf(int value) {
return forNumber(value);
}
/**
* @param value The numeric wire value of the corresponding enum entry.
* @return The enum associated with the given numeric wire value.
*/
public static LayoutEngine forNumber(int value) {
switch (value) {
case 0: return ALGORITHM;
case 1: return AI;
case 2: return OUTLINE;
default: return null;
}
}
public static com.google.protobuf.Internal.EnumLiteMap<LayoutEngine>
internalGetValueMap() {
return internalValueMap;
}
private static final com.google.protobuf.Internal.EnumLiteMap<
LayoutEngine> internalValueMap =
new com.google.protobuf.Internal.EnumLiteMap<LayoutEngine>() {
public LayoutEngine findValueByNumber(int number) {
return LayoutEngine.forNumber(number);
}
};
public final com.google.protobuf.Descriptors.EnumValueDescriptor
getValueDescriptor() {
if (this == UNRECOGNIZED) {
throw new java.lang.IllegalStateException(
"Can't get the descriptor of an unrecognized enum value.");
}
return getDescriptor().getValues().get(ordinal());
}
public final com.google.protobuf.Descriptors.EnumDescriptor
getDescriptorForType() {
return getDescriptor();
}
public static final com.google.protobuf.Descriptors.EnumDescriptor
getDescriptor() {
return com.iqser.red.service.redaction.v1.server.data.LayoutEngineProto.getDescriptor().getEnumTypes().get(0);
}
private static final LayoutEngine[] VALUES = values();
public static LayoutEngine valueOf(
com.google.protobuf.Descriptors.EnumValueDescriptor desc) {
if (desc.getType() != getDescriptor()) {
throw new java.lang.IllegalArgumentException(
"EnumValueDescriptor is not for this type.");
}
if (desc.getIndex() == -1) {
return UNRECOGNIZED;
}
return VALUES[desc.getIndex()];
}
private final int value;
private LayoutEngine(int value) {
this.value = value;
}
// @@protoc_insertion_point(enum_scope:LayoutEngine)
}
public static com.google.protobuf.Descriptors.FileDescriptor
getDescriptor() {
return descriptor;
}
private static com.google.protobuf.Descriptors.FileDescriptor
descriptor;
static {
java.lang.String[] descriptorData = {
"\n\022LayoutEngine.proto*2\n\014LayoutEngine\022\r\n\t" +
"ALGORITHM\020\000\022\006\n\002AI\020\001\022\013\n\007OUTLINE\020\002BC\n.com." +
"iqser.red.service.redaction.v1.server.da" +
"taB\021LayoutEngineProtob\006proto3"
};
descriptor = com.google.protobuf.Descriptors.FileDescriptor
.internalBuildGeneratedFileFrom(descriptorData,
new com.google.protobuf.Descriptors.FileDescriptor[] {
});
descriptor.resolveAllFeaturesImmutable();
}
// @@protoc_insertion_point(outer_class_scope)
}
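
A minimal sketch (not part of this merge request) of the mapping between wire values and the generated LayoutEngine constants shown above, using only the generated forNumber/getNumber API.

import com.iqser.red.service.redaction.v1.server.data.LayoutEngineProto.LayoutEngine;

public class LayoutEngineDemo {

    public static void main(String[] args) {
        LayoutEngine engine = LayoutEngine.forNumber(1); // -> AI
        System.out.println(engine + " = " + engine.getNumber());

        // forNumber returns null for unknown wire values; UNRECOGNIZED is only
        // produced when parsing a message whose enum field carries an unknown value.
        System.out.println(LayoutEngine.forNumber(99)); // -> null
    }
}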

View File

@@ -0,0 +1,261 @@
// Generated by the protocol buffer compiler. DO NOT EDIT!
// NO CHECKED-IN PROTOBUF GENCODE
// source: NodeType.proto
// Protobuf Java Version: 4.28.3
package com.iqser.red.service.redaction.v1.server.data;
public final class NodeTypeProto {
private NodeTypeProto() {}
static {
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(
com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
/* major= */ 4,
/* minor= */ 28,
/* patch= */ 3,
/* suffix= */ "",
NodeTypeProto.class.getName());
}
public static void registerAllExtensions(
com.google.protobuf.ExtensionRegistryLite registry) {
}
public static void registerAllExtensions(
com.google.protobuf.ExtensionRegistry registry) {
registerAllExtensions(
(com.google.protobuf.ExtensionRegistryLite) registry);
}
/**
* Protobuf enum {@code NodeType}
*/
public enum NodeType
implements com.google.protobuf.ProtocolMessageEnum {
/**
* <code>DOCUMENT = 0;</code>
*/
DOCUMENT(0),
/**
* <code>SECTION = 1;</code>
*/
SECTION(1),
/**
* <code>SUPER_SECTION = 2;</code>
*/
SUPER_SECTION(2),
/**
* <code>HEADLINE = 3;</code>
*/
HEADLINE(3),
/**
* <code>PARAGRAPH = 4;</code>
*/
PARAGRAPH(4),
/**
* <code>TABLE = 5;</code>
*/
TABLE(5),
/**
* <code>TABLE_CELL = 6;</code>
*/
TABLE_CELL(6),
/**
* <code>IMAGE = 7;</code>
*/
IMAGE(7),
/**
* <code>HEADER = 8;</code>
*/
HEADER(8),
/**
* <code>FOOTER = 9;</code>
*/
FOOTER(9),
/**
* <code>TABLE_OF_CONTENTS = 10;</code>
*/
TABLE_OF_CONTENTS(10),
/**
* <code>TABLE_OF_CONTENTS_ITEM = 11;</code>
*/
TABLE_OF_CONTENTS_ITEM(11),
UNRECOGNIZED(-1),
;
static {
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(
com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
/* major= */ 4,
/* minor= */ 28,
/* patch= */ 3,
/* suffix= */ "",
NodeType.class.getName());
}
/**
* <code>DOCUMENT = 0;</code>
*/
public static final int DOCUMENT_VALUE = 0;
/**
* <code>SECTION = 1;</code>
*/
public static final int SECTION_VALUE = 1;
/**
* <code>SUPER_SECTION = 2;</code>
*/
public static final int SUPER_SECTION_VALUE = 2;
/**
* <code>HEADLINE = 3;</code>
*/
public static final int HEADLINE_VALUE = 3;
/**
* <code>PARAGRAPH = 4;</code>
*/
public static final int PARAGRAPH_VALUE = 4;
/**
* <code>TABLE = 5;</code>
*/
public static final int TABLE_VALUE = 5;
/**
* <code>TABLE_CELL = 6;</code>
*/
public static final int TABLE_CELL_VALUE = 6;
/**
* <code>IMAGE = 7;</code>
*/
public static final int IMAGE_VALUE = 7;
/**
* <code>HEADER = 8;</code>
*/
public static final int HEADER_VALUE = 8;
/**
* <code>FOOTER = 9;</code>
*/
public static final int FOOTER_VALUE = 9;
/**
* <code>TABLE_OF_CONTENTS = 10;</code>
*/
public static final int TABLE_OF_CONTENTS_VALUE = 10;
/**
* <code>TABLE_OF_CONTENTS_ITEM = 11;</code>
*/
public static final int TABLE_OF_CONTENTS_ITEM_VALUE = 11;
public final int getNumber() {
if (this == UNRECOGNIZED) {
throw new java.lang.IllegalArgumentException(
"Can't get the number of an unknown enum value.");
}
return value;
}
/**
* @param value The numeric wire value of the corresponding enum entry.
* @return The enum associated with the given numeric wire value.
* @deprecated Use {@link #forNumber(int)} instead.
*/
@java.lang.Deprecated
public static NodeType valueOf(int value) {
return forNumber(value);
}
/**
* @param value The numeric wire value of the corresponding enum entry.
* @return The enum associated with the given numeric wire value.
*/
public static NodeType forNumber(int value) {
switch (value) {
case 0: return DOCUMENT;
case 1: return SECTION;
case 2: return SUPER_SECTION;
case 3: return HEADLINE;
case 4: return PARAGRAPH;
case 5: return TABLE;
case 6: return TABLE_CELL;
case 7: return IMAGE;
case 8: return HEADER;
case 9: return FOOTER;
case 10: return TABLE_OF_CONTENTS;
case 11: return TABLE_OF_CONTENTS_ITEM;
default: return null;
}
}
public static com.google.protobuf.Internal.EnumLiteMap<NodeType>
internalGetValueMap() {
return internalValueMap;
}
private static final com.google.protobuf.Internal.EnumLiteMap<
NodeType> internalValueMap =
new com.google.protobuf.Internal.EnumLiteMap<NodeType>() {
public NodeType findValueByNumber(int number) {
return NodeType.forNumber(number);
}
};
public final com.google.protobuf.Descriptors.EnumValueDescriptor
getValueDescriptor() {
if (this == UNRECOGNIZED) {
throw new java.lang.IllegalStateException(
"Can't get the descriptor of an unrecognized enum value.");
}
return getDescriptor().getValues().get(ordinal());
}
public final com.google.protobuf.Descriptors.EnumDescriptor
getDescriptorForType() {
return getDescriptor();
}
public static final com.google.protobuf.Descriptors.EnumDescriptor
getDescriptor() {
return com.iqser.red.service.redaction.v1.server.data.NodeTypeProto.getDescriptor().getEnumTypes().get(0);
}
private static final NodeType[] VALUES = values();
public static NodeType valueOf(
com.google.protobuf.Descriptors.EnumValueDescriptor desc) {
if (desc.getType() != getDescriptor()) {
throw new java.lang.IllegalArgumentException(
"EnumValueDescriptor is not for this type.");
}
if (desc.getIndex() == -1) {
return UNRECOGNIZED;
}
return VALUES[desc.getIndex()];
}
private final int value;
private NodeType(int value) {
this.value = value;
}
// @@protoc_insertion_point(enum_scope:NodeType)
}
public static com.google.protobuf.Descriptors.FileDescriptor
getDescriptor() {
return descriptor;
}
private static com.google.protobuf.Descriptors.FileDescriptor
descriptor;
static {
java.lang.String[] descriptorData = {
"\n\016NodeType.proto*\306\001\n\010NodeType\022\014\n\010DOCUMEN" +
"T\020\000\022\013\n\007SECTION\020\001\022\021\n\rSUPER_SECTION\020\002\022\014\n\010H" +
"EADLINE\020\003\022\r\n\tPARAGRAPH\020\004\022\t\n\005TABLE\020\005\022\016\n\nT" +
"ABLE_CELL\020\006\022\t\n\005IMAGE\020\007\022\n\n\006HEADER\020\010\022\n\n\006FO" +
"OTER\020\t\022\025\n\021TABLE_OF_CONTENTS\020\n\022\032\n\026TABLE_O" +
"F_CONTENTS_ITEM\020\013B?\n.com.iqser.red.servi" +
"ce.redaction.v1.server.dataB\rNodeTypePro" +
"tob\006proto3"
};
descriptor = com.google.protobuf.Descriptors.FileDescriptor
.internalBuildGeneratedFileFrom(descriptorData,
new com.google.protobuf.Descriptors.FileDescriptor[] {
});
descriptor.resolveAllFeaturesImmutable();
}
// @@protoc_insertion_point(outer_class_scope)
}

View File

@@ -0,0 +1,606 @@
// Generated by the protocol buffer compiler. DO NOT EDIT!
// NO CHECKED-IN PROTOBUF GENCODE
// source: Range.proto
// Protobuf Java Version: 4.28.3
package com.iqser.red.service.redaction.v1.server.data;
public final class RangeProto {
private RangeProto() {}
static {
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(
com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
/* major= */ 4,
/* minor= */ 28,
/* patch= */ 3,
/* suffix= */ "",
RangeProto.class.getName());
}
public static void registerAllExtensions(
com.google.protobuf.ExtensionRegistryLite registry) {
}
public static void registerAllExtensions(
com.google.protobuf.ExtensionRegistry registry) {
registerAllExtensions(
(com.google.protobuf.ExtensionRegistryLite) registry);
}
public interface RangeOrBuilder extends
// @@protoc_insertion_point(interface_extends:Range)
com.google.protobuf.MessageOrBuilder {
/**
* <pre>
* A start index.
* </pre>
*
* <code>int32 start = 1;</code>
* @return The start.
*/
int getStart();
/**
* <pre>
* An end index.
* </pre>
*
* <code>int32 end = 2;</code>
* @return The end.
*/
int getEnd();
}
/**
* Protobuf type {@code Range}
*/
public static final class Range extends
com.google.protobuf.GeneratedMessage implements
// @@protoc_insertion_point(message_implements:Range)
RangeOrBuilder {
private static final long serialVersionUID = 0L;
static {
com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion(
com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC,
/* major= */ 4,
/* minor= */ 28,
/* patch= */ 3,
/* suffix= */ "",
Range.class.getName());
}
// Use Range.newBuilder() to construct.
private Range(com.google.protobuf.GeneratedMessage.Builder<?> builder) {
super(builder);
}
private Range() {
}
public static final com.google.protobuf.Descriptors.Descriptor
getDescriptor() {
return com.iqser.red.service.redaction.v1.server.data.RangeProto.internal_static_Range_descriptor;
}
@java.lang.Override
protected com.google.protobuf.GeneratedMessage.FieldAccessorTable
internalGetFieldAccessorTable() {
return com.iqser.red.service.redaction.v1.server.data.RangeProto.internal_static_Range_fieldAccessorTable
.ensureFieldAccessorsInitialized(
com.iqser.red.service.redaction.v1.server.data.RangeProto.Range.class, com.iqser.red.service.redaction.v1.server.data.RangeProto.Range.Builder.class);
}
public static final int START_FIELD_NUMBER = 1;
private int start_ = 0;
/**
* <pre>
* A start index.
* </pre>
*
* <code>int32 start = 1;</code>
* @return The start.
*/
@java.lang.Override
public int getStart() {
return start_;
}
public static final int END_FIELD_NUMBER = 2;
private int end_ = 0;
/**
* <pre>
* An end index.
* </pre>
*
* <code>int32 end = 2;</code>
* @return The end.
*/
@java.lang.Override
public int getEnd() {
return end_;
}
private byte memoizedIsInitialized = -1;
@java.lang.Override
public final boolean isInitialized() {
byte isInitialized = memoizedIsInitialized;
if (isInitialized == 1) return true;
if (isInitialized == 0) return false;
memoizedIsInitialized = 1;
return true;
}
@java.lang.Override
public void writeTo(com.google.protobuf.CodedOutputStream output)
throws java.io.IOException {
if (start_ != 0) {
output.writeInt32(1, start_);
}
if (end_ != 0) {
output.writeInt32(2, end_);
}
getUnknownFields().writeTo(output);
}
@java.lang.Override
public int getSerializedSize() {
int size = memoizedSize;
if (size != -1) return size;
size = 0;
if (start_ != 0) {
size += com.google.protobuf.CodedOutputStream
.computeInt32Size(1, start_);
}
if (end_ != 0) {
size += com.google.protobuf.CodedOutputStream
.computeInt32Size(2, end_);
}
size += getUnknownFields().getSerializedSize();
memoizedSize = size;
return size;
}
@java.lang.Override
public boolean equals(final java.lang.Object obj) {
if (obj == this) {
return true;
}
if (!(obj instanceof com.iqser.red.service.redaction.v1.server.data.RangeProto.Range)) {
return super.equals(obj);
}
com.iqser.red.service.redaction.v1.server.data.RangeProto.Range other = (com.iqser.red.service.redaction.v1.server.data.RangeProto.Range) obj;
if (getStart()
!= other.getStart()) return false;
if (getEnd()
!= other.getEnd()) return false;
if (!getUnknownFields().equals(other.getUnknownFields())) return false;
return true;
}
@java.lang.Override
public int hashCode() {
if (memoizedHashCode != 0) {
return memoizedHashCode;
}
int hash = 41;
hash = (19 * hash) + getDescriptor().hashCode();
hash = (37 * hash) + START_FIELD_NUMBER;
hash = (53 * hash) + getStart();
hash = (37 * hash) + END_FIELD_NUMBER;
hash = (53 * hash) + getEnd();
hash = (29 * hash) + getUnknownFields().hashCode();
memoizedHashCode = hash;
return hash;
}
public static com.iqser.red.service.redaction.v1.server.data.RangeProto.Range parseFrom(
java.nio.ByteBuffer data)
throws com.google.protobuf.InvalidProtocolBufferException {
return PARSER.parseFrom(data);
}
public static com.iqser.red.service.redaction.v1.server.data.RangeProto.Range parseFrom(
java.nio.ByteBuffer data,
com.google.protobuf.ExtensionRegistryLite extensionRegistry)
throws com.google.protobuf.InvalidProtocolBufferException {
return PARSER.parseFrom(data, extensionRegistry);
}
public static com.iqser.red.service.redaction.v1.server.data.RangeProto.Range parseFrom(
com.google.protobuf.ByteString data)
throws com.google.protobuf.InvalidProtocolBufferException {
return PARSER.parseFrom(data);
}
public static com.iqser.red.service.redaction.v1.server.data.RangeProto.Range parseFrom(
com.google.protobuf.ByteString data,
com.google.protobuf.ExtensionRegistryLite extensionRegistry)
throws com.google.protobuf.InvalidProtocolBufferException {
return PARSER.parseFrom(data, extensionRegistry);
}
public static com.iqser.red.service.redaction.v1.server.data.RangeProto.Range parseFrom(byte[] data)
throws com.google.protobuf.InvalidProtocolBufferException {
return PARSER.parseFrom(data);
}
public static com.iqser.red.service.redaction.v1.server.data.RangeProto.Range parseFrom(
byte[] data,
com.google.protobuf.ExtensionRegistryLite extensionRegistry)
throws com.google.protobuf.InvalidProtocolBufferException {
return PARSER.parseFrom(data, extensionRegistry);
}
public static com.iqser.red.service.redaction.v1.server.data.RangeProto.Range parseFrom(java.io.InputStream input)
throws java.io.IOException {
return com.google.protobuf.GeneratedMessage
.parseWithIOException(PARSER, input);
}
public static com.iqser.red.service.redaction.v1.server.data.RangeProto.Range parseFrom(
java.io.InputStream input,
com.google.protobuf.ExtensionRegistryLite extensionRegistry)
throws java.io.IOException {
return com.google.protobuf.GeneratedMessage
.parseWithIOException(PARSER, input, extensionRegistry);
}
public static com.iqser.red.service.redaction.v1.server.data.RangeProto.Range parseDelimitedFrom(java.io.InputStream input)
throws java.io.IOException {
return com.google.protobuf.GeneratedMessage
.parseDelimitedWithIOException(PARSER, input);
}
public static com.iqser.red.service.redaction.v1.server.data.RangeProto.Range parseDelimitedFrom(
java.io.InputStream input,
com.google.protobuf.ExtensionRegistryLite extensionRegistry)
throws java.io.IOException {
return com.google.protobuf.GeneratedMessage
.parseDelimitedWithIOException(PARSER, input, extensionRegistry);
}
public static com.iqser.red.service.redaction.v1.server.data.RangeProto.Range parseFrom(
com.google.protobuf.CodedInputStream input)
throws java.io.IOException {
return com.google.protobuf.GeneratedMessage
.parseWithIOException(PARSER, input);
}
public static com.iqser.red.service.redaction.v1.server.data.RangeProto.Range parseFrom(
com.google.protobuf.CodedInputStream input,
com.google.protobuf.ExtensionRegistryLite extensionRegistry)
throws java.io.IOException {
return com.google.protobuf.GeneratedMessage
.parseWithIOException(PARSER, input, extensionRegistry);
}
@java.lang.Override
public Builder newBuilderForType() { return newBuilder(); }
public static Builder newBuilder() {
return DEFAULT_INSTANCE.toBuilder();
}
public static Builder newBuilder(com.iqser.red.service.redaction.v1.server.data.RangeProto.Range prototype) {
return DEFAULT_INSTANCE.toBuilder().mergeFrom(prototype);
}
@java.lang.Override
public Builder toBuilder() {
return this == DEFAULT_INSTANCE
? new Builder() : new Builder().mergeFrom(this);
}
@java.lang.Override
protected Builder newBuilderForType(
com.google.protobuf.GeneratedMessage.BuilderParent parent) {
Builder builder = new Builder(parent);
return builder;
}
/**
* Protobuf type {@code Range}
*/
public static final class Builder extends
com.google.protobuf.GeneratedMessage.Builder<Builder> implements
// @@protoc_insertion_point(builder_implements:Range)
com.iqser.red.service.redaction.v1.server.data.RangeProto.RangeOrBuilder {
public static final com.google.protobuf.Descriptors.Descriptor
getDescriptor() {
return com.iqser.red.service.redaction.v1.server.data.RangeProto.internal_static_Range_descriptor;
}
@java.lang.Override
protected com.google.protobuf.GeneratedMessage.FieldAccessorTable
internalGetFieldAccessorTable() {
return com.iqser.red.service.redaction.v1.server.data.RangeProto.internal_static_Range_fieldAccessorTable
.ensureFieldAccessorsInitialized(
com.iqser.red.service.redaction.v1.server.data.RangeProto.Range.class, com.iqser.red.service.redaction.v1.server.data.RangeProto.Range.Builder.class);
}
// Construct using com.iqser.red.service.redaction.v1.server.data.RangeProto.Range.newBuilder()
private Builder() {
}
private Builder(
com.google.protobuf.GeneratedMessage.BuilderParent parent) {
super(parent);
}
@java.lang.Override
public Builder clear() {
super.clear();
bitField0_ = 0;
start_ = 0;
end_ = 0;
return this;
}
@java.lang.Override
public com.google.protobuf.Descriptors.Descriptor
getDescriptorForType() {
return com.iqser.red.service.redaction.v1.server.data.RangeProto.internal_static_Range_descriptor;
}
@java.lang.Override
public com.iqser.red.service.redaction.v1.server.data.RangeProto.Range getDefaultInstanceForType() {
return com.iqser.red.service.redaction.v1.server.data.RangeProto.Range.getDefaultInstance();
}
@java.lang.Override
public com.iqser.red.service.redaction.v1.server.data.RangeProto.Range build() {
com.iqser.red.service.redaction.v1.server.data.RangeProto.Range result = buildPartial();
if (!result.isInitialized()) {
throw newUninitializedMessageException(result);
}
return result;
}
@java.lang.Override
public com.iqser.red.service.redaction.v1.server.data.RangeProto.Range buildPartial() {
com.iqser.red.service.redaction.v1.server.data.RangeProto.Range result = new com.iqser.red.service.redaction.v1.server.data.RangeProto.Range(this);
if (bitField0_ != 0) { buildPartial0(result); }
onBuilt();
return result;
}
private void buildPartial0(com.iqser.red.service.redaction.v1.server.data.RangeProto.Range result) {
int from_bitField0_ = bitField0_;
if (((from_bitField0_ & 0x00000001) != 0)) {
result.start_ = start_;
}
if (((from_bitField0_ & 0x00000002) != 0)) {
result.end_ = end_;
}
}
@java.lang.Override
public Builder mergeFrom(com.google.protobuf.Message other) {
if (other instanceof com.iqser.red.service.redaction.v1.server.data.RangeProto.Range) {
return mergeFrom((com.iqser.red.service.redaction.v1.server.data.RangeProto.Range)other);
} else {
super.mergeFrom(other);
return this;
}
}
public Builder mergeFrom(com.iqser.red.service.redaction.v1.server.data.RangeProto.Range other) {
if (other == com.iqser.red.service.redaction.v1.server.data.RangeProto.Range.getDefaultInstance()) return this;
if (other.getStart() != 0) {
setStart(other.getStart());
}
if (other.getEnd() != 0) {
setEnd(other.getEnd());
}
this.mergeUnknownFields(other.getUnknownFields());
onChanged();
return this;
}
@java.lang.Override
public final boolean isInitialized() {
return true;
}
@java.lang.Override
public Builder mergeFrom(
com.google.protobuf.CodedInputStream input,
com.google.protobuf.ExtensionRegistryLite extensionRegistry)
throws java.io.IOException {
if (extensionRegistry == null) {
throw new java.lang.NullPointerException();
}
try {
boolean done = false;
while (!done) {
int tag = input.readTag();
switch (tag) {
case 0:
done = true;
break;
case 8: {
start_ = input.readInt32();
bitField0_ |= 0x00000001;
break;
} // case 8
case 16: {
end_ = input.readInt32();
bitField0_ |= 0x00000002;
break;
} // case 16
default: {
if (!super.parseUnknownField(input, extensionRegistry, tag)) {
done = true; // was an endgroup tag
}
break;
} // default:
} // switch (tag)
} // while (!done)
} catch (com.google.protobuf.InvalidProtocolBufferException e) {
throw e.unwrapIOException();
} finally {
onChanged();
} // finally
return this;
}
private int bitField0_;
private int start_ ;
/**
* <pre>
* A start index.
* </pre>
*
* <code>int32 start = 1;</code>
* @return The start.
*/
@java.lang.Override
public int getStart() {
return start_;
}
/**
* <pre>
* A start index.
* </pre>
*
* <code>int32 start = 1;</code>
* @param value The start to set.
* @return This builder for chaining.
*/
public Builder setStart(int value) {
start_ = value;
bitField0_ |= 0x00000001;
onChanged();
return this;
}
/**
* <pre>
* A start index.
* </pre>
*
* <code>int32 start = 1;</code>
* @return This builder for chaining.
*/
public Builder clearStart() {
bitField0_ = (bitField0_ & ~0x00000001);
start_ = 0;
onChanged();
return this;
}
private int end_ ;
/**
* <pre>
* An end index.
* </pre>
*
* <code>int32 end = 2;</code>
* @return The end.
*/
@java.lang.Override
public int getEnd() {
return end_;
}
/**
* <pre>
* An end index.
* </pre>
*
* <code>int32 end = 2;</code>
* @param value The end to set.
* @return This builder for chaining.
*/
public Builder setEnd(int value) {
end_ = value;
bitField0_ |= 0x00000002;
onChanged();
return this;
}
/**
* <pre>
* An end index.
* </pre>
*
* <code>int32 end = 2;</code>
* @return This builder for chaining.
*/
public Builder clearEnd() {
bitField0_ = (bitField0_ & ~0x00000002);
end_ = 0;
onChanged();
return this;
}
// @@protoc_insertion_point(builder_scope:Range)
}
// @@protoc_insertion_point(class_scope:Range)
private static final com.iqser.red.service.redaction.v1.server.data.RangeProto.Range DEFAULT_INSTANCE;
static {
DEFAULT_INSTANCE = new com.iqser.red.service.redaction.v1.server.data.RangeProto.Range();
}
public static com.iqser.red.service.redaction.v1.server.data.RangeProto.Range getDefaultInstance() {
return DEFAULT_INSTANCE;
}
private static final com.google.protobuf.Parser<Range>
PARSER = new com.google.protobuf.AbstractParser<Range>() {
@java.lang.Override
public Range parsePartialFrom(
com.google.protobuf.CodedInputStream input,
com.google.protobuf.ExtensionRegistryLite extensionRegistry)
throws com.google.protobuf.InvalidProtocolBufferException {
Builder builder = newBuilder();
try {
builder.mergeFrom(input, extensionRegistry);
} catch (com.google.protobuf.InvalidProtocolBufferException e) {
throw e.setUnfinishedMessage(builder.buildPartial());
} catch (com.google.protobuf.UninitializedMessageException e) {
throw e.asInvalidProtocolBufferException().setUnfinishedMessage(builder.buildPartial());
} catch (java.io.IOException e) {
throw new com.google.protobuf.InvalidProtocolBufferException(e)
.setUnfinishedMessage(builder.buildPartial());
}
return builder.buildPartial();
}
};
public static com.google.protobuf.Parser<Range> parser() {
return PARSER;
}
@java.lang.Override
public com.google.protobuf.Parser<Range> getParserForType() {
return PARSER;
}
@java.lang.Override
public com.iqser.red.service.redaction.v1.server.data.RangeProto.Range getDefaultInstanceForType() {
return DEFAULT_INSTANCE;
}
}
private static final com.google.protobuf.Descriptors.Descriptor
internal_static_Range_descriptor;
private static final
com.google.protobuf.GeneratedMessage.FieldAccessorTable
internal_static_Range_fieldAccessorTable;
public static com.google.protobuf.Descriptors.FileDescriptor
getDescriptor() {
return descriptor;
}
private static com.google.protobuf.Descriptors.FileDescriptor
descriptor;
static {
java.lang.String[] descriptorData = {
"\n\013Range.proto\"#\n\005Range\022\r\n\005start\030\001 \001(\005\022\013\n" +
"\003end\030\002 \001(\005B<\n.com.iqser.red.service.reda" +
"ction.v1.server.dataB\nRangeProtob\006proto3"
};
descriptor = com.google.protobuf.Descriptors.FileDescriptor
.internalBuildGeneratedFileFrom(descriptorData,
new com.google.protobuf.Descriptors.FileDescriptor[] {
});
internal_static_Range_descriptor =
getDescriptor().getMessageTypes().get(0);
internal_static_Range_fieldAccessorTable = new
com.google.protobuf.GeneratedMessage.FieldAccessorTable(
internal_static_Range_descriptor,
new java.lang.String[] { "Start", "End", });
descriptor.resolveAllFeaturesImmutable();
}
// @@protoc_insertion_point(outer_class_scope)
}
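Not part of the diff: a minimal usage sketch for the generated Range class above, assuming the standard protobuf-java surface (toByteArray, parseFrom(byte[])) that protoc emits alongside the stream overloads shown here.

import com.iqser.red.service.redaction.v1.server.data.RangeProto.Range;

public class RangeRoundTripExample {

    public static void main(String[] args) throws Exception {
        // Build an immutable Range via the generated builder.
        Range range = Range.newBuilder()
                .setStart(3)
                .setEnd(17)
                .build();

        // Serialize to the protobuf wire format and parse it back.
        byte[] wireBytes = range.toByteArray();
        Range parsed = Range.parseFrom(wireBytes);

        // start and end are plain int32 fields, so the round trip preserves them exactly.
        assert parsed.getStart() == 3 && parsed.getEnd() == 17;
    }
}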

View File

@ -0,0 +1,25 @@
package com.iqser.red.service.redaction.v1.server.data.old;
import java.io.Serializable;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
@Deprecated
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class DocumentPage implements Serializable {
int number;
int height;
int width;
int rotation;
}

View File

@ -0,0 +1,24 @@
package com.iqser.red.service.redaction.v1.server.data.old;
import java.io.Serializable;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
@Deprecated
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class DocumentPositionData implements Serializable {
Long id;
int[] stringIdxToPositionIdx;
float[][] positions;
}

View File

@ -0,0 +1,158 @@
package com.iqser.red.service.redaction.v1.server.data.old;
import java.awt.geom.Rectangle2D;
import java.io.Serializable;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Stream;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
@Deprecated
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class DocumentStructure implements Serializable {
EntryData root;
public static class TableProperties implements Serializable {
public static final String NUMBER_OF_ROWS = "numberOfRows";
public static final String NUMBER_OF_COLS = "numberOfCols";
}
public static class ImageProperties implements Serializable {
public static final String TRANSPARENT = "transparent";
public static final String IMAGE_TYPE = "imageType";
public static final String POSITION = "position";
public static final String ID = "id";
public static final String REPRESENTATION_HASH = "representationHash";
}
public static class TableCellProperties implements Serializable {
public static final String B_BOX = "bBox";
public static final String ROW = "row";
public static final String COL = "col";
public static final String HEADER = "header";
}
public static class DuplicateParagraphProperties implements Serializable {
public static final String UNSORTED_TEXTBLOCK_ID = "utbid";
}
public static final String RECTANGLE_DELIMITER = ";";
public static Rectangle2D parseRectangle2D(String bBox) {
List<Float> floats = Arrays.stream(bBox.split(RECTANGLE_DELIMITER))
.map(Float::parseFloat)
.toList();
return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
}
public static double[] parseRepresentationVector(String representationHash) {
String[] stringArray = representationHash.split("[,\\s]+");
double[] doubleArray = new double[stringArray.length];
for (int i = 0; i < stringArray.length; i++) {
doubleArray[i] = Double.parseDouble(stringArray[i]);
}
return doubleArray;
}
public EntryData get(List<Integer> tocId) {
if (tocId.isEmpty()) {
return root;
}
EntryData entry = root.children.get(tocId.get(0));
for (int id : tocId.subList(1, tocId.size())) {
entry = entry.children.get(id);
}
return entry;
}
public Stream<EntryData> streamAllEntries() {
return Stream.concat(Stream.of(root), root.children.stream())
.flatMap(DocumentStructure::flatten);
}
public String toString() {
return String.join("\n",
streamAllEntries().map(EntryData::toString)
.toList());
}
private static Stream<EntryData> flatten(EntryData entry) {
return Stream.concat(Stream.of(entry),
entry.children.stream()
.flatMap(DocumentStructure::flatten));
}
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public static class EntryData implements Serializable {
NodeType type;
int[] treeId;
Long[] atomicBlockIds;
Long[] pageNumbers;
Map<String, String> properties;
List<EntryData> children;
Set<LayoutEngine> engines;
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append("[");
for (int i : treeId) {
sb.append(i);
sb.append(",");
}
sb.delete(sb.length() - 1, sb.length());
sb.append("]: ");
sb.append(type);
sb.append(" atbs = ");
sb.append(atomicBlockIds.length);
return sb.toString();
}
}
}
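Not part of the diff: a minimal sketch of the string formats the two deprecated static helpers above expect, grounded in their implementations.

import java.awt.geom.Rectangle2D;

import com.iqser.red.service.redaction.v1.server.data.old.DocumentStructure;

public class DeprecatedDocumentStructureParsingExample {

    public static void main(String[] args) {
        // Bounding boxes are encoded as "x;y;width;height" (RECTANGLE_DELIMITER is ";").
        Rectangle2D bBox = DocumentStructure.parseRectangle2D("10.5;20.0;100.0;40.25");
        assert bBox.getWidth() == 100.0;

        // Representation vectors are comma- and/or whitespace-separated doubles.
        double[] vector = DocumentStructure.parseRepresentationVector("0.1, 0.2 0.3");
        assert vector.length == 3;
    }
}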

View File

@ -0,0 +1,28 @@
package com.iqser.red.service.redaction.v1.server.data.old;
import java.io.Serializable;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
@Deprecated
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class DocumentTextData implements Serializable {
Long id;
Long page;
String searchText;
int numberOnPage;
int start;
int end;
int[] lineBreaks;
}

View File

@ -0,0 +1,8 @@
package com.iqser.red.service.redaction.v1.server.data.old;
@Deprecated
public enum LayoutEngine {
ALGORITHM,
AI,
OUTLINE
}

View File

@ -0,0 +1,24 @@
package com.iqser.red.service.redaction.v1.server.data.old;
import java.io.Serializable;
import java.util.Locale;
@Deprecated
public enum NodeType implements Serializable {
DOCUMENT,
SECTION,
SUPER_SECTION,
HEADLINE,
PARAGRAPH,
TABLE,
TABLE_CELL,
IMAGE,
HEADER,
FOOTER;
public String toString() {
return this.name().charAt(0) + this.name().substring(1).toLowerCase(Locale.ROOT);
}
}

View File

@ -0,0 +1,199 @@
package com.iqser.red.service.redaction.v1.server.mapper;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.iqser.red.service.redaction.v1.server.data.DocumentData;
import com.iqser.red.service.redaction.v1.server.data.DocumentPageProto.AllDocumentPages;
import com.iqser.red.service.redaction.v1.server.data.DocumentPageProto.DocumentPage;
import com.iqser.red.service.redaction.v1.server.data.DocumentPositionDataProto.AllDocumentPositionData;
import com.iqser.red.service.redaction.v1.server.data.DocumentPositionDataProto.DocumentPositionData;
import com.iqser.red.service.redaction.v1.server.data.DocumentPositionDataProto.DocumentPositionData.Position;
import com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto.DocumentStructure;
import com.iqser.red.service.redaction.v1.server.data.DocumentStructureWrapper;
import com.iqser.red.service.redaction.v1.server.data.DocumentTextDataProto.AllDocumentTextData;
import com.iqser.red.service.redaction.v1.server.data.DocumentTextDataProto.DocumentTextData;
import com.iqser.red.service.redaction.v1.server.data.EntryDataProto.EntryData;
import com.iqser.red.service.redaction.v1.server.data.LayoutEngineProto;
import com.iqser.red.service.redaction.v1.server.data.NodeTypeProto;
import com.iqser.red.service.redaction.v1.server.data.RangeProto;
import com.iqser.red.service.redaction.v1.server.model.document.DocumentTree;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.DuplicatedParagraph;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Image;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.NodeType;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Table;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.TableCell;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.AtomicTextBlock;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
import lombok.experimental.UtilityClass;
@UtilityClass
public class DocumentDataMapper {
public DocumentData toDocumentData(Document document) {
List<DocumentTextData> documentTextData = document.streamTerminalTextBlocksInOrder()
.flatMap(textBlock -> textBlock.getAtomicTextBlocks()
.stream())
.distinct()
.map(DocumentDataMapper::toAtomicTextBlockData)
.toList();
AllDocumentTextData allDocumentTextData = AllDocumentTextData.newBuilder().addAllDocumentTextData(documentTextData).build();
List<DocumentPositionData> atomicPositionBlockData = document.streamTerminalTextBlocksInOrder()
.flatMap(textBlock -> textBlock.getAtomicTextBlocks()
.stream())
.distinct()
.map(DocumentDataMapper::toAtomicPositionBlockData)
.toList();
AllDocumentPositionData allDocumentPositionData = AllDocumentPositionData.newBuilder().addAllDocumentPositionData(atomicPositionBlockData).build();
List<DocumentPage> documentPageData = document.getPages()
.stream()
.sorted(Comparator.comparingInt(Page::getNumber))
.map(DocumentDataMapper::toPageData)
.toList();
AllDocumentPages allDocumentPages = AllDocumentPages.newBuilder().addAllDocumentPages(documentPageData).build();
DocumentStructureWrapper tableOfContentsData = toDocumentTreeData(document.getDocumentTree());
return DocumentData.builder()
.documentTextData(allDocumentTextData)
.documentPositionData(allDocumentPositionData)
.documentPages(allDocumentPages)
.documentStructureWrapper(tableOfContentsData)
.build();
}
private DocumentStructureWrapper toDocumentTreeData(DocumentTree documentTree) {
return new DocumentStructureWrapper(DocumentStructure.newBuilder().setRoot(toEntryData(documentTree.getRoot())).build());
}
private EntryData toEntryData(DocumentTree.Entry entry) {
List<Long> atomicTextBlocks;
if (entry.getNode().isLeaf()) {
atomicTextBlocks = toAtomicTextBlockIds(entry.getNode().getLeafTextBlock());
} else {
atomicTextBlocks = new ArrayList<>();
}
Map<String, String> properties = switch (entry.getType()) {
case TABLE -> PropertiesMapper.buildTableProperties((Table) entry.getNode());
case TABLE_CELL -> PropertiesMapper.buildTableCellProperties((TableCell) entry.getNode());
case IMAGE -> PropertiesMapper.buildImageProperties((Image) entry.getNode());
case PARAGRAPH ->
entry.getNode() instanceof DuplicatedParagraph duplicatedParagraph ? PropertiesMapper.buildDuplicateParagraphProperties(duplicatedParagraph) : new HashMap<>();
default -> new HashMap<>();
};
var documentBuilder = EntryData.newBuilder()
.addAllTreeId(entry.getTreeId())
.addAllChildren(entry.getChildren()
.stream()
.map(DocumentDataMapper::toEntryData)
.toList())
.setType(resolveType(entry.getType()))
.addAllAtomicBlockIds(atomicTextBlocks)
.addAllPageNumbers(entry.getNode().getPages()
.stream()
.map(Page::getNumber)
.map(Integer::longValue)
.toList())
.putAllProperties(properties);
if (entry.getNode() != null) {
documentBuilder.addAllEngines(entry.getNode().getEngines()
.stream()
.map(engine -> LayoutEngineProto.LayoutEngine.valueOf(engine.name()))
.toList());
} else {
documentBuilder.addAllEngines(new HashSet<>(Set.of(LayoutEngineProto.LayoutEngine.ALGORITHM)));
}
return documentBuilder.build();
}
private static NodeTypeProto.NodeType resolveType(NodeType type) {
return NodeTypeProto.NodeType.valueOf(type.name());
}
private List<Long> toAtomicTextBlockIds(TextBlock textBlock) {
return textBlock.getAtomicTextBlocks()
.stream()
.map(AtomicTextBlock::getId)
.toList();
}
private DocumentPage toPageData(Page p) {
return DocumentPage.newBuilder().setRotation(p.getRotation()).setHeight(p.getHeight()).setWidth(p.getWidth()).setNumber(p.getNumber()).build();
}
private DocumentTextData toAtomicTextBlockData(AtomicTextBlock atomicTextBlock) {
return DocumentTextData.newBuilder()
.setId(atomicTextBlock.getId())
.setPage(atomicTextBlock.getPage().getNumber().longValue())
.setSearchText(atomicTextBlock.getSearchText())
.setNumberOnPage(atomicTextBlock.getNumberOnPage())
.setStart(atomicTextBlock.getTextRange().start())
.setEnd(atomicTextBlock.getTextRange().end())
.addAllLineBreaks(atomicTextBlock.getLineBreaks())
.addAllItalicTextRanges(atomicTextBlock.getItalicTextRanges()
.stream()
.map(r -> RangeProto.Range.newBuilder().setStart(r.start()).setEnd(r.end()).build())
.toList())
.addAllBoldTextRanges(atomicTextBlock.getBoldTextRanges()
.stream()
.map(r -> RangeProto.Range.newBuilder().setStart(r.start()).setEnd(r.end()).build())
.toList())
.build();
}
private DocumentPositionData toAtomicPositionBlockData(AtomicTextBlock atomicTextBlock) {
return DocumentPositionData.newBuilder()
.setId(atomicTextBlock.getId())
.addAllPositions(toPositions(atomicTextBlock.getPositions()))
.addAllStringIdxToPositionIdx(atomicTextBlock.getStringIdxToPositionIdx())
.build();
}
private static List<Position> toPositions(List<Rectangle2D> rects) {
List<Position> positions = new ArrayList<>();
for (Rectangle2D rect : rects) {
positions.add(toPosition(rect));
}
return positions;
}
private static Position toPosition(Rectangle2D rect) {
return Position.newBuilder().addValue((float) rect.getMinX()).addValue((float) rect.getMinY()).addValue((float) rect.getWidth()).addValue((float) rect.getHeight()).build();
}
}
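Not part of the diff: a sketch of the position encoding used by toPosition above. The Position message shape (a repeated float field named value) is inferred from this mapper, not from the .proto file.

import java.awt.geom.Rectangle2D;

import com.iqser.red.service.redaction.v1.server.data.DocumentPositionDataProto.DocumentPositionData.Position;

public class PositionEncodingExample {

    public static void main(String[] args) {
        Rectangle2D rect = new Rectangle2D.Float(5f, 10f, 50f, 20f);

        // Mirrors DocumentDataMapper.toPosition: minX, minY, width, height as four repeated floats.
        Position position = Position.newBuilder()
                .addValue((float) rect.getMinX())
                .addValue((float) rect.getMinY())
                .addValue((float) rect.getWidth())
                .addValue((float) rect.getHeight())
                .build();

        // Readers are expected to consume the four values in this fixed order.
        assert position.getValueCount() == 4;
    }
}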

View File

@ -1,17 +1,18 @@
package com.iqser.red.service.redaction.v1.server.service.document;
package com.iqser.red.service.redaction.v1.server.mapper;
import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPageProto.DocumentPage;
import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto.AllDocumentPositionData;
import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto.AllDocumentTextData;
import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.EntryDataProto.EntryData;
import static com.iqser.red.service.redaction.v1.server.data.DocumentPageProto.DocumentPage;
import static com.iqser.red.service.redaction.v1.server.data.DocumentPositionDataProto.AllDocumentPositionData;
import static com.iqser.red.service.redaction.v1.server.data.DocumentTextDataProto.AllDocumentTextData;
import static com.iqser.red.service.redaction.v1.server.data.EntryDataProto.EntryData;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import com.iqser.red.service.redaction.v1.server.model.document.DocumentData;
import com.iqser.red.service.redaction.v1.server.data.DocumentData;
import com.iqser.red.service.redaction.v1.server.model.document.DocumentTree;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.DuplicatedParagraph;
@ -19,6 +20,7 @@ import com.iqser.red.service.redaction.v1.server.model.document.nodes.Footer;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Header;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Headline;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Image;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.LayoutEngine;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Paragraph;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Section;
@ -26,6 +28,8 @@ import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNo
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SuperSection;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Table;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.TableCell;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.TableOfContents;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.TableOfContentsItem;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.AtomicTextBlock;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlockCollector;
@ -41,11 +45,6 @@ public class DocumentGraphMapper {
DocumentTree documentTree = new DocumentTree(document);
Context context = new Context(documentData, documentTree);
context.pageData.addAll(documentData.getDocumentPages().getDocumentPagesList()
.stream()
.map(DocumentGraphMapper::buildPage)
.toList());
context.documentTree.getRoot().getChildren().addAll(buildEntries(documentData.getDocumentStructure().getRoot().getChildrenList(), context));
document.setDocumentTree(context.documentTree);
@ -70,20 +69,21 @@ public class DocumentGraphMapper {
SemanticNode node = switch (entryData.getType()) {
case SECTION -> buildSection(context);
case SUPER_SECTION -> buildSuperSection(context);
case PARAGRAPH -> buildParagraph(context, entryData.getProperties());
case PARAGRAPH -> buildParagraph(context, entryData.getPropertiesMap());
case HEADLINE -> buildHeadline(context);
case HEADER -> buildHeader(context);
case FOOTER -> buildFooter(context);
case TABLE -> buildTable(context, entryData.getProperties());
case TABLE_CELL -> buildTableCell(context, entryData.getProperties());
case IMAGE -> buildImage(context, entryData.getProperties(), entryData.getPageNumbersList());
case TABLE -> buildTable(context, entryData.getPropertiesMap());
case TABLE_CELL -> buildTableCell(context, entryData.getPropertiesMap());
case IMAGE -> buildImage(context, entryData.getPropertiesMap(), entryData.getPageNumbersList());
case TABLE_OF_CONTENTS -> buildTableOfContents(context);
case TABLE_OF_CONTENTS_ITEM -> buildTableOfContentsItem(context);
default -> throw new UnsupportedOperationException("Not yet implemented for type " + entryData.getType());
};
if (entryData.getAtomicBlockIdsCount() > 0) {
TextBlock textBlock = toTextBlock(entryData.getAtomicBlockIdsList(), context, node);
node.setLeafTextBlock(textBlock);
switch (entryData.getType()) {
case HEADER -> pages.forEach(page -> page.setHeader((Header) node));
case FOOTER -> pages.forEach(page -> page.setFooter((Footer) node));
@ -91,15 +91,30 @@ public class DocumentGraphMapper {
default -> textBlock.getAtomicTextBlocks()
.forEach(atb -> atb.getPage().getTextBlocksOnPage().add(atb));
}
}
List<Integer> treeId = entryData.getTreeIdList();
entryData.getEnginesList()
.forEach(node::addEngine);
node.setTreeId(treeId);
entryData.getEnginesList()
.stream()
.map(engine -> LayoutEngine.valueOf(engine.name()))
.forEach(node::addEngine);
newEntries.add(DocumentTree.Entry.builder().treeId(treeId).children(buildEntries(entryData.getChildrenList(), context)).node(node).build());
} return newEntries;
}
return newEntries;
}
private static TableOfContents buildTableOfContents(Context context) {
return TableOfContents.builder().documentTree(context.documentTree).build();
}
private static TableOfContentsItem buildTableOfContentsItem(Context context) {
return TableOfContentsItem.builder().documentTree(context.documentTree).build();
}
@ -208,14 +223,18 @@ public class DocumentGraphMapper {
Context(DocumentData documentData, DocumentTree documentTree) {
this.documentTree = documentTree;
this.pageData = new ArrayList<>();
this.pageData = documentData.getDocumentPages().getDocumentPagesList()
.stream()
.map(DocumentGraphMapper::buildPage)
.sorted(Comparator.comparingInt(Page::getNumber))
.toList();
this.documentTextData = documentData.getDocumentTextData();
this.documentPositionData = documentData.getDocumentPositionData();
}
private Page getPage(Long pageIndex) {
public Page getPage(Long pageIndex) {
Page page = pageData.get(Math.toIntExact(pageIndex) - 1);
assert page.getNumber() == Math.toIntExact(pageIndex);
@ -225,3 +244,5 @@ public class DocumentGraphMapper {
}
}

View File

@ -0,0 +1,152 @@
package com.iqser.red.service.redaction.v1.server.mapper;
import java.awt.geom.Rectangle2D;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import com.iqser.red.service.redaction.v1.server.data.DocumentStructureWrapper;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.DuplicatedParagraph;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Image;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.ImageType;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Table;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.TableCell;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.AtomicTextBlock;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
import lombok.experimental.UtilityClass;
@UtilityClass
public class PropertiesMapper {
public static Map<String, String> buildImageProperties(Image image) {
Map<String, String> properties = new HashMap<>();
properties.put(DocumentStructureWrapper.ImageProperties.IMAGE_TYPE, image.getImageType().name());
properties.put(DocumentStructureWrapper.ImageProperties.TRANSPARENT, String.valueOf(image.isTransparent()));
properties.put(DocumentStructureWrapper.ImageProperties.POSITION, toString(image.getPosition()));
properties.put(DocumentStructureWrapper.ImageProperties.ID, image.getId());
properties.put(DocumentStructureWrapper.ImageProperties.REPRESENTATION_HASH, image.getRepresentationHash());
return properties;
}
public static Map<String, String> buildTableCellProperties(TableCell tableCell) {
Map<String, String> properties = new HashMap<>();
properties.put(DocumentStructureWrapper.TableCellProperties.ROW, String.valueOf(tableCell.getRow()));
properties.put(DocumentStructureWrapper.TableCellProperties.COL, String.valueOf(tableCell.getCol()));
properties.put(DocumentStructureWrapper.TableCellProperties.HEADER, String.valueOf(tableCell.isHeader()));
if (tableCell.getPages().size() > 1 || tableCell.getBBox().keySet().size() > 1) {
throw new IllegalArgumentException("TableCell can only occur on a single page!");
}
String bBoxString = toString(tableCell.getBBox()
.get(tableCell.getPages()
.stream()
.findFirst()
.get()));
properties.put(DocumentStructureWrapper.TableCellProperties.B_BOX, bBoxString);
return properties;
}
public static Map<String, String> buildTableProperties(Table table) {
Map<String, String> properties = new HashMap<>();
properties.put(DocumentStructureWrapper.TableProperties.NUMBER_OF_ROWS, String.valueOf(table.getNumberOfRows()));
properties.put(DocumentStructureWrapper.TableProperties.NUMBER_OF_COLS, String.valueOf(table.getNumberOfCols()));
return properties;
}
public static void parseImageProperties(Map<String, String> properties, Image.ImageBuilder<?, ?> builder) {
builder.imageType(parseImageType(properties.get(DocumentStructureWrapper.ImageProperties.IMAGE_TYPE)));
builder.transparent(Boolean.parseBoolean(properties.get(DocumentStructureWrapper.ImageProperties.TRANSPARENT)));
builder.position(DocumentStructureWrapper.parseRectangle2D(properties.get(DocumentStructureWrapper.ImageProperties.POSITION)));
builder.id(properties.get(DocumentStructureWrapper.ImageProperties.ID));
}
public static void parseTableCellProperties(Map<String, String> properties, TableCell.TableCellBuilder<?, ?> builder) {
builder.row(Integer.parseInt(properties.get(DocumentStructureWrapper.TableCellProperties.ROW)));
builder.col(Integer.parseInt(properties.get(DocumentStructureWrapper.TableCellProperties.COL)));
builder.header(Boolean.parseBoolean(properties.get(DocumentStructureWrapper.TableCellProperties.HEADER)));
builder.bBox(DocumentStructureWrapper.parseRectangle2D(properties.get(DocumentStructureWrapper.TableCellProperties.B_BOX)));
}
public static void parseTableProperties(Map<String, String> properties, Table.TableBuilder builder) {
builder.numberOfRows(Integer.parseInt(properties.get(DocumentStructureWrapper.TableProperties.NUMBER_OF_ROWS)));
builder.numberOfCols(Integer.parseInt(properties.get(DocumentStructureWrapper.TableProperties.NUMBER_OF_COLS)));
}
public static Map<String, String> buildDuplicateParagraphProperties(DuplicatedParagraph duplicatedParagraph) {
Map<String, String> properties = new HashMap<>();
properties.put(DocumentStructureWrapper.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID,
Arrays.toString(toAtomicTextBlockIds(duplicatedParagraph.getUnsortedLeafTextBlock())));
return properties;
}
public static boolean isDuplicateParagraph(Map<String, String> properties) {
return properties.containsKey(DocumentStructureWrapper.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID);
}
public static List<Long> getUnsortedTextblockIds(Map<String, String> properties) {
return toLongList(properties.get(DocumentStructureWrapper.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID));
}
public static List<Long> toLongList(String ids) {
return Arrays.stream(ids.substring(1, ids.length() - 1).trim().split(","))
.map(Long::valueOf)
.toList();
}
private static ImageType parseImageType(String imageType) {
try {
return ImageType.valueOf(imageType.toUpperCase(Locale.ROOT));
} catch (IllegalArgumentException e) {
return ImageType.OTHER;
}
}
public static String toString(Rectangle2D rectangle2D) {
return String.format(Locale.US,
"%f%s%f%s%f%s%f",
rectangle2D.getX(),
DocumentStructureWrapper.RECTANGLE_DELIMITER,
rectangle2D.getY(),
DocumentStructureWrapper.RECTANGLE_DELIMITER,
rectangle2D.getWidth(),
DocumentStructureWrapper.RECTANGLE_DELIMITER,
rectangle2D.getHeight());
}
private static Long[] toAtomicTextBlockIds(TextBlock textBlock) {
return textBlock.getAtomicTextBlocks()
.stream()
.map(AtomicTextBlock::getId)
.toArray(Long[]::new);
}
}
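Not part of the diff: a sketch of the bounding-box round trip between PropertiesMapper.toString and parseRectangle2D, assuming DocumentStructureWrapper.parseRectangle2D mirrors the deprecated DocumentStructure.parseRectangle2D shown earlier in this diff.

import java.awt.geom.Rectangle2D;

import com.iqser.red.service.redaction.v1.server.data.DocumentStructureWrapper;
import com.iqser.red.service.redaction.v1.server.mapper.PropertiesMapper;

public class BBoxRoundTripExample {

    public static void main(String[] args) {
        Rectangle2D original = new Rectangle2D.Float(1.5f, 2.5f, 30f, 40f);

        // Locale.US formatting yields e.g. "1.500000;2.500000;30.000000;40.000000".
        String encoded = PropertiesMapper.toString(original);
        Rectangle2D decoded = DocumentStructureWrapper.parseRectangle2D(encoded);

        assert decoded.getWidth() == 30.0 && decoded.getHeight() == 40.0;
    }
}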

View File

@ -0,0 +1,116 @@
package com.iqser.red.service.redaction.v1.server.model.document;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Footer;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Header;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Headline;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Image;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Paragraph;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Section;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNode;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SuperSection;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Table;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.TableCell;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.TableOfContents;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.TableOfContentsItem;
public abstract class AbstractNodeVisitor implements NodeVisitor {
@Override
public void visit(Document document) {
defaultVisit(document);
}
@Override
public void visit(SuperSection superSection) {
defaultVisit(superSection);
}
@Override
public void visit(Section section) {
defaultVisit(section);
}
@Override
public void visit(Headline headline) {
defaultVisit(headline);
}
@Override
public void visit(Paragraph paragraph) {
defaultVisit(paragraph);
}
@Override
public void visit(Footer footer) {
defaultVisit(footer);
}
@Override
public void visit(Header header) {
defaultVisit(header);
}
@Override
public void visit(Image image) {
defaultVisit(image);
}
@Override
public void visit(Table table) {
defaultVisit(table);
}
@Override
public void visit(TableCell tableCell) {
defaultVisit(tableCell);
}
@Override
public void visit(TableOfContents toc) {
defaultVisit(toc);
}
@Override
public void visit(TableOfContentsItem toci) {
defaultVisit(toci);
}
public void visitNodeDefault(SemanticNode node) {
// No-op by default; subclasses override this to act on every visited node.
}
protected void defaultVisit(SemanticNode semanticNode) {
visitNodeDefault(semanticNode);
semanticNode.streamChildren()
.forEach(node -> node.accept(this));
}
}
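Not part of the diff: a sketch of a concrete visitor built on AbstractNodeVisitor above. Overriding a single visit method and delegating back to defaultVisit keeps the depth-first traversal intact.

import com.iqser.red.service.redaction.v1.server.model.document.AbstractNodeVisitor;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Table;

public class TableCountingVisitor extends AbstractNodeVisitor {

    private int tableCount;

    @Override
    public void visit(Table table) {
        tableCount++;
        // Continue the default traversal so nested table cells are still visited.
        defaultVisit(table);
    }

    public int getTableCount() {
        return tableCount;
    }
}

A caller would start the walk with document.accept(visitor), matching the accept(NodeVisitor) overrides added to the node classes in this diff.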

View File

@ -9,6 +9,8 @@ import java.util.List;
import java.util.Optional;
import java.util.stream.Stream;
import com.iqser.red.service.redaction.v1.server.model.document.entity.EntityType;
import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.GenericSemanticNode;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.NodeType;
@ -17,6 +19,8 @@ import com.iqser.red.service.redaction.v1.server.model.document.nodes.Table;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.TableCell;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlockCollector;
import com.iqser.red.service.redaction.v1.server.utils.EntityCreationUtility;
import com.iqser.red.service.redaction.v1.server.utils.EntityEnrichmentService;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
@ -35,7 +39,7 @@ public class DocumentTree {
public DocumentTree(Document document) {
root = Entry.builder().treeId(Collections.emptyList()).children(new LinkedList<>()).node(document).build();
this.root = Entry.builder().treeId(Collections.emptyList()).children(new LinkedList<>()).node(document).build();
}
@ -296,6 +300,22 @@ public class DocumentTree {
}
public Optional<Entry> findEntryById(List<Integer> treeId) {
if (treeId.isEmpty()) {
return Optional.of(root);
}
Entry entry = root;
for (int id : treeId) {
if (id < 0 || id >= entry.children.size()) {
return Optional.empty();
}
entry = entry.children.get(id);
}
return Optional.of(entry);
}
public Stream<Entry> mainEntries() {
return root.children.stream();
@ -342,6 +362,25 @@ public class DocumentTree {
}
public void addEntityToGraph(TextEntity entity) {
getRoot().getNode().addThisToEntityIfIntersects(entity);
TextBlock textBlock = entity.getDeepestFullyContainingNode().getTextBlock();
EntityEnrichmentService.enrichEntity(entity, textBlock);
EntityCreationUtility.addToPages(entity);
EntityCreationUtility.addEntityToNodeEntitySets(entity);
if (entity.getEntityType().equals(EntityType.TEMPORARY)) {
return;
}
entity.computeRelations();
entity.notifyEntityInserted();
}
@Builder
@Getter
@AllArgsConstructor

View File

@ -0,0 +1,32 @@
package com.iqser.red.service.redaction.v1.server.model.document;
import java.util.HashSet;
import java.util.Set;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNode;
import lombok.Getter;
public class IntersectingNodeVisitor extends AbstractNodeVisitor {
@Getter
private Set<SemanticNode> intersectingNodes;
private final TextRange textRange;
public IntersectingNodeVisitor(TextRange textRange) {
this.textRange = textRange;
this.intersectingNodes = new HashSet<>();
}
@Override
public void visitNodeDefault(SemanticNode node) {
if (textRange.intersects(node.getTextRange())) {
intersectingNodes.add(node);
}
}
}
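Not part of the diff: a sketch of how the visitor above would typically be driven. The TextRange instance is taken as a parameter because its construction is not shown in this diff.

import java.util.Set;

import com.iqser.red.service.redaction.v1.server.model.document.IntersectingNodeVisitor;
import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNode;

public class IntersectingNodesLookup {

    // Collects every semantic node whose text range overlaps the given range.
    public static Set<SemanticNode> findIntersecting(Document document, TextRange range) {
        IntersectingNodeVisitor visitor = new IntersectingNodeVisitor(range);
        document.accept(visitor); // Document.accept(NodeVisitor) is added elsewhere in this diff.
        return visitor.getIntersectingNodes();
    }
}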

View File

@ -0,0 +1,53 @@
package com.iqser.red.service.redaction.v1.server.model.document;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Footer;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Header;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Headline;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Image;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Paragraph;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Section;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SuperSection;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Table;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.TableCell;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.TableOfContents;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.TableOfContentsItem;
public interface NodeVisitor {
void visit(Document document);
void visit(SuperSection superSection);
void visit(Section section);
void visit(Headline headline);
void visit(Paragraph paragraph);
void visit(Footer footer);
void visit(Header header);
void visit(Image image);
void visit(Table table);
void visit(TableCell tableCell);
void visit(TableOfContents tableOfContents);
void visit(TableOfContentsItem tableOfContentsItem);
}

View File

@ -134,6 +134,12 @@ public class TextRange implements Comparable<TextRange> {
}
public boolean containsExclusive(int index) {
return start <= index && index < end;
}
/**
* Checks if this {@link TextRange} intersects with another {@link TextRange}.
*

View File

@ -0,0 +1,20 @@
package com.iqser.red.service.redaction.v1.server.model.document.entity;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
@Getter
@RequiredArgsConstructor
public abstract class AbstractRelation implements Relation {
protected final TextEntity a;
protected final TextEntity b;
@Override
public String toString() {
return this.getClass().getSimpleName() + "{" + "a=" + a + ", b=" + b + '}';
}
}

View File

@ -0,0 +1,18 @@
package com.iqser.red.service.redaction.v1.server.model.document.entity;
public class Containment extends Intersection {
public Containment(TextEntity container, TextEntity contained) {
super(container, contained);
}
public TextEntity getContainer() {
return a;
}
public TextEntity getContained() {
return b;
}
}

View File

@ -0,0 +1,25 @@
package com.iqser.red.service.redaction.v1.server.model.document.entity;
public interface EntityEventListener {
/**
* Invoked when an entity is inserted.
*
* @param entity The entity that was inserted.
*/
void onEntityInserted(IEntity entity);
/**
* Invoked when an entity is updated.
*
* @param entity The entity that was updated.
*/
void onEntityUpdated(IEntity entity);
/**
* Invoked when an entity is removed.
*
* @param entity The entity that was removed.
*/
void onEntityRemoved(IEntity entity);
}
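Not part of the diff: a minimal listener implementation for the interface above; an entity would register it via IEntity.addEntityEventListener before the notify* defaults fire.

import java.util.ArrayList;
import java.util.List;

import com.iqser.red.service.redaction.v1.server.model.document.entity.EntityEventListener;
import com.iqser.red.service.redaction.v1.server.model.document.entity.IEntity;

public class RecordingEntityEventListener implements EntityEventListener {

    private final List<String> events = new ArrayList<>();

    @Override
    public void onEntityInserted(IEntity entity) {
        events.add("inserted: " + entity);
    }

    @Override
    public void onEntityUpdated(IEntity entity) {
        events.add("updated: " + entity);
    }

    @Override
    public void onEntityRemoved(IEntity entity) {
        events.add("removed: " + entity);
    }

    public List<String> getEvents() {
        return events;
    }
}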

View File

@ -0,0 +1,10 @@
package com.iqser.red.service.redaction.v1.server.model.document.entity;
public class Equality extends Containment {
public Equality(TextEntity a, TextEntity b) {
super(a, b);
}
}

View File

@ -6,7 +6,6 @@ import java.util.PriorityQueue;
import java.util.Set;
import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
import com.iqser.red.service.redaction.v1.server.model.drools.RuleIdentifier;
import lombok.NonNull;
@ -95,6 +94,7 @@ public interface IEntity {
*/
// Don't use default accessor pattern (e.g. isApplied()), as it might lead to errors in drools due to property-specific optimization of the drools planner.
default boolean applied() {
if (this.getMatchedRule().isHigherPriorityThanManual()) {
return getMatchedRule().isApplied();
}
@ -120,6 +120,7 @@ public interface IEntity {
* @return True if ignored, false otherwise.
*/
default boolean ignored() {
if (this.getMatchedRule().isHigherPriorityThanManual()) {
return getMatchedRule().isIgnored();
}
@ -135,10 +136,11 @@ public interface IEntity {
* @return True if removed, false otherwise.
*/
default boolean removed() {
if (this.getMatchedRule().isHigherPriorityThanManual()) {
return getMatchedRule().isRemoved();
}
return getManualOverwrite().getRemoved()
if (this.getMatchedRule().isHigherPriorityThanManual()) {
return getMatchedRule().isRemoved();
}
return getManualOverwrite().getRemoved()
.orElse(getMatchedRule().isRemoved());
}
@ -149,6 +151,7 @@ public interface IEntity {
* @return True if resized, false otherwise.
*/
default boolean resized() {
if (this.getMatchedRule().isHigherPriorityThanManual()) {
return getMatchedRule().isRemoved();
}
@ -336,7 +339,9 @@ public interface IEntity {
*/
default void addMatchedRule(MatchedRule matchedRule) {
boolean wasValid = valid();
getMatchedRuleList().add(matchedRule);
handleStateChange(wasValid);
}
@ -350,7 +355,53 @@ public interface IEntity {
if (getMatchedRuleList().equals(matchedRules)) {
return;
}
boolean wasValid = valid();
getMatchedRuleList().addAll(matchedRules);
handleStateChange(wasValid);
}
void addEntityEventListener(EntityEventListener listener);
void removeEntityEventListener(EntityEventListener listener);
default void notifyEntityInserted() {
for (EntityEventListener listener : getEntityEventListeners()) {
listener.onEntityInserted(this);
}
}
default void notifyEntityUpdated() {
for (EntityEventListener listener : getEntityEventListeners()) {
listener.onEntityUpdated(this);
}
}
default void notifyEntityRemoved() {
for (EntityEventListener listener : getEntityEventListeners()) {
listener.onEntityRemoved(this);
}
}
Collection<EntityEventListener> getEntityEventListeners();
default void handleStateChange(boolean wasValid) {
if (valid() == wasValid) {
return;
}
if (!removed()) {
notifyEntityUpdated();
} else {
notifyEntityRemoved();
}
}
@ -384,15 +435,9 @@ public interface IEntity {
*
* @return The built reason string.
*/
default String buildReasonWithManualChangeDescriptions() {
default String buildReason() {
if (getManualOverwrite().getDescriptions().isEmpty()) {
return getMatchedRule().getReason();
}
if (getMatchedRule().getReason().isEmpty()) {
return String.join(", ", getManualOverwrite().getDescriptions());
}
return getMatchedRule().getReason() + ", " + String.join(", ", getManualOverwrite().getDescriptions());
return getMatchedRule().getReason();
}

View File

@ -0,0 +1,10 @@
package com.iqser.red.service.redaction.v1.server.model.document.entity;
public class Intersection extends AbstractRelation {
public Intersection(TextEntity a, TextEntity b) {
super(a, b);
}
}

View File

@ -1,10 +1,8 @@
package com.iqser.red.service.redaction.v1.server.model.document.entity;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import com.iqser.red.service.persistence.service.v1.api.shared.model.annotations.entitymapped.BaseAnnotation;
@ -14,7 +12,6 @@ import com.iqser.red.service.persistence.service.v1.api.shared.model.annotations
import com.iqser.red.service.persistence.service.v1.api.shared.model.annotations.entitymapped.ManualRecategorization;
import com.iqser.red.service.persistence.service.v1.api.shared.model.annotations.entitymapped.ManualRedactionEntry;
import com.iqser.red.service.persistence.service.v1.api.shared.model.annotations.entitymapped.ManualResizeRedaction;
import com.iqser.red.service.redaction.v1.server.model.RectangleWithPage;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
@ -26,18 +23,9 @@ import lombok.experimental.FieldDefaults;
@FieldDefaults(level = AccessLevel.PRIVATE)
public class ManualChangeOverwrite {
private static final Map<Class<? extends BaseAnnotation>, String> MANUAL_CHANGE_DESCRIPTIONS = Map.of(//
ManualRedactionEntry.class, "created by manual change", //
ManualLegalBasisChange.class, "legal basis was manually changed", //
ManualResizeRedaction.class, "resized by manual override", //
ManualForceRedaction.class, "forced by manual override", //
IdRemoval.class, "removed by manual override", //
ManualRecategorization.class, "recategorized by manual override");
@Builder.Default
List<BaseAnnotation> manualChanges = new LinkedList<>();
boolean changed;
List<String> descriptions;
String type;
String legalBasis;
String section;
@ -63,6 +51,7 @@ public class ManualChangeOverwrite {
this.manualChanges = new LinkedList<>();
}
public ManualChangeOverwrite(EntityType entityType, String section) {
this(entityType);
@ -95,8 +84,6 @@ public class ManualChangeOverwrite {
private void updateFields(List<BaseAnnotation> sortedManualChanges) {
descriptions = new LinkedList<>();
for (BaseAnnotation manualChange : sortedManualChanges) {
// ManualRedactionEntries are created prior to rule execution in analysis service.
@ -151,8 +138,6 @@ public class ManualChangeOverwrite {
legalBasis = recategorization.getLegalBasis();
}
}
descriptions.add(MANUAL_CHANGE_DESCRIPTIONS.get(manualChange.getClass()));
}
changed = false;
}
@ -245,13 +230,6 @@ public class ManualChangeOverwrite {
}
public List<String> getDescriptions() {
calculateCurrentOverride();
return descriptions == null ? Collections.emptyList() : descriptions;
}
public Optional<List<RectangleWithPage>> getPositions() {
calculateCurrentOverride();

View File

@ -5,9 +5,6 @@ import java.util.List;
import java.util.Objects;
import java.util.Set;
import com.iqser.red.service.redaction.v1.server.model.drools.RuleIdentifier;
import com.iqser.red.service.redaction.v1.server.model.drools.RuleType;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
@ -30,7 +27,7 @@ public final class MatchedRule implements Comparable<MatchedRule> {
public static final RuleType IMPORTED_TYPE = RuleType.fromString("IMP");
public static final RuleType MANUAL_TYPE = RuleType.fromString("MAN");
public static final RuleType DICTIONARY_TYPE = RuleType.fromString("DICT");
private static final List<RuleType> RULE_TYPE_PRIORITIES = List.of(FINAL_TYPE, ELIMINATION_RULE_TYPE, IMPORTED_TYPE, MANUAL_TYPE, DICTIONARY_TYPE);
private static final List<RuleType> RULE_TYPE_PRIORITIES = List.of(FINAL_TYPE, ELIMINATION_RULE_TYPE, MANUAL_TYPE, IMPORTED_TYPE, DICTIONARY_TYPE);
RuleIdentifier ruleIdentifier;
@Builder.Default
@ -57,11 +54,14 @@ public final class MatchedRule implements Comparable<MatchedRule> {
return MatchedRule.builder().ruleIdentifier(RuleIdentifier.empty()).build();
}
public boolean isHigherPriorityThanManual() {
return (-1 < RULE_TYPE_PRIORITIES.indexOf(this.ruleIdentifier.type())) &&
(RULE_TYPE_PRIORITIES.indexOf(this.ruleIdentifier.type()) < RULE_TYPE_PRIORITIES.indexOf(MANUAL_TYPE));
return (-1 < RULE_TYPE_PRIORITIES.indexOf(this.ruleIdentifier.type())) && (RULE_TYPE_PRIORITIES.indexOf(this.ruleIdentifier.type()) < RULE_TYPE_PRIORITIES.indexOf(
MANUAL_TYPE));
}
/**
* Returns a modified instance of {@link MatchedRule} based on its applied status.
* If the rule has been applied, it returns a new {@link MatchedRule} instance that retains all properties of the original

View File

@ -1,4 +1,4 @@
package com.iqser.red.service.redaction.v1.server.model;
package com.iqser.red.service.redaction.v1.server.model.document.entity;
import java.awt.geom.Rectangle2D;

View File

@ -0,0 +1,10 @@
package com.iqser.red.service.redaction.v1.server.model.document.entity;
public interface Relation {
TextEntity getA();
TextEntity getB();
}

View File

@ -1,4 +1,4 @@
package com.iqser.red.service.redaction.v1.server.model.drools;
package com.iqser.red.service.redaction.v1.server.model.document.entity;
import java.util.Objects;

View File

@ -1,4 +1,4 @@
package com.iqser.red.service.redaction.v1.server.model.drools;
package com.iqser.red.service.redaction.v1.server.model.document.entity;
import java.util.regex.Pattern;

View File

@ -4,6 +4,7 @@ import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
@ -11,7 +12,10 @@ import java.util.Map;
import java.util.PriorityQueue;
import java.util.Set;
import org.apache.commons.collections4.map.HashedMap;
import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.Engine;
import com.iqser.red.service.persistence.service.v1.api.shared.model.annotations.entitymapped.BaseAnnotation;
import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNode;
@ -43,13 +47,14 @@ public class TextEntity implements IEntity {
TextRange textRange;
@Builder.Default
List<TextRange> duplicateTextRanges = new ArrayList<>();
Set<TextRange> duplicateTextRanges = new HashSet<>();
String type; // TODO: make final once ManualChangesApplicationService::recategorize is deleted
final EntityType entityType;
@Builder.Default
final PriorityQueue<MatchedRule> matchedRuleList = new PriorityQueue<>();
final ManualChangeOverwrite manualOverwrite;
@Builder.Default
final ManualChangeOverwrite manualOverwrite = new ManualChangeOverwrite();
boolean dictionaryEntry;
boolean dossierDictionaryEntry;
@ -68,6 +73,12 @@ public class TextEntity implements IEntity {
List<SemanticNode> intersectingNodes = new LinkedList<>();
SemanticNode deepestFullyContainingNode;
@Builder.Default
Map<TextEntity, Set<Relation>> relations = new HashMap<>();
@Builder.Default
Collection<EntityEventListener> entityEventListeners = new ArrayList<>();
public static TextEntity initialEntityNode(TextRange textRange, String type, EntityType entityType, SemanticNode node) {
@ -158,12 +169,15 @@ public class TextEntity implements IEntity {
public void removeFromGraph() {
remove("FINAL.0.0", "removed completely");
intersectingNodes.forEach(node -> node.getEntities().remove(this));
pages.forEach(page -> page.getEntities().remove(this));
intersectingNodes = new LinkedList<>();
relations.keySet()
.forEach(entity -> entity.getRelations().remove(this));
relations = new HashedMap<>();
deepestFullyContainingNode = null;
pages = new HashSet<>();
remove("FINAL.0.0", "removed completely");
}
@ -201,22 +215,20 @@ public class TextEntity implements IEntity {
return textEntity.contains(this);
}
public boolean contains(TextEntity textEntity) {
if (this.textRange.contains(textEntity.getTextRange())) {
return true;
}
List<TextRange> textEntityDuplicateRanges = textEntity.getDuplicateTextRanges();
// use optimized indexed loops for extra performance boost
for (int i = 0, duplicateTextRangesSize = duplicateTextRanges.size(); i < duplicateTextRangesSize; i++) {
TextRange duplicateTextRange = duplicateTextRanges.get(i);
Set<TextRange> textEntityDuplicateRanges = textEntity.getDuplicateTextRanges();
for (TextRange duplicateTextRange : this.duplicateTextRanges) {
if (duplicateTextRange.contains(textEntity.getTextRange())) {
return true;
}
for (int j = 0, textEntityDuplicateRangesSize = textEntityDuplicateRanges.size(); j < textEntityDuplicateRangesSize; j++) {
TextRange otherRange = textEntityDuplicateRanges.get(j);
for (TextRange otherRange : textEntityDuplicateRanges) {
if (duplicateTextRange.contains(otherRange)) {
return true;
}
@ -227,6 +239,7 @@ public class TextEntity implements IEntity {
}
public boolean intersects(TextEntity textEntity) {
return this.textRange.intersects(textEntity.getTextRange()) //
@ -251,6 +264,20 @@ public class TextEntity implements IEntity {
}
public void addManualChange(BaseAnnotation manualChange) {
manualOverwrite.addChange(manualChange);
notifyEntityUpdated();
}
public void addManualChanges(List<BaseAnnotation> manualChanges) {
manualOverwrite.addChanges(manualChanges);
notifyEntityUpdated();
}
public boolean matchesAnnotationId(String manualRedactionId) {
return getPositionsOnPagePerPage().stream()
@ -311,4 +338,42 @@ public class TextEntity implements IEntity {
.orElse(getMatchedRule().isWriteValueWithLineBreaks() ? getValueWithLineBreaks() : value);
}
@Override
public void addEntityEventListener(EntityEventListener listener) {
entityEventListeners.add(listener);
}
@Override
public void removeEntityEventListener(EntityEventListener listener) {
entityEventListeners.remove(listener);
}
public void computeRelations() {
for (TextEntity textEntity : this.getDeepestFullyContainingNode().getEntities()) {
if (this.intersects(textEntity) && !this.equals(textEntity) && !textEntity.getEntityType().equals(EntityType.TEMPORARY)) {
if (textEntity.getTextRange().equals(this.getTextRange())) {
textEntity.getRelations().computeIfAbsent(this, k -> new HashSet<>()).add(new Equality(this, textEntity));
this.getRelations().computeIfAbsent(textEntity, k -> new HashSet<>()).add(new Equality(textEntity, this));
} else if (textEntity.containedBy(this)) {
textEntity.getRelations().computeIfAbsent(this, k -> new HashSet<>()).add(new Intersection(textEntity, this));
this.getRelations().computeIfAbsent(textEntity, k -> new HashSet<>()).add(new Containment(this, textEntity));
} else if (this.containedBy(textEntity)) {
textEntity.getRelations().computeIfAbsent(this, k -> new HashSet<>()).add(new Containment(textEntity, this));
this.getRelations().computeIfAbsent(textEntity, k -> new HashSet<>()).add(new Intersection(this, textEntity));
} else {
textEntity.getRelations().computeIfAbsent(this, k -> new HashSet<>()).add(new Intersection(textEntity, this));
this.getRelations().computeIfAbsent(textEntity, k -> new HashSet<>()).add(new Intersection(this, textEntity));
}
}
}
}
}

View File

@ -9,7 +9,6 @@ import java.util.Set;
import com.iqser.red.service.redaction.v1.server.model.document.DocumentTree;
import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto.LayoutEngine;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;

View File

@ -9,8 +9,8 @@ import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.iqser.red.service.redaction.v1.server.model.document.ConsecutiveTextBlockCollector;
import com.iqser.red.service.redaction.v1.server.model.document.DocumentTree;
import com.iqser.red.service.redaction.v1.server.model.document.NodeVisitor;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
import lombok.AccessLevel;
@ -39,7 +39,6 @@ public class Document extends AbstractSemanticNode {
@Builder.Default
static final SectionIdentifier sectionIdentifier = SectionIdentifier.document();
@Override
public NodeType getType() {
@ -169,4 +168,11 @@ public class Document extends AbstractSemanticNode {
return bBox;
}
@Override
public void accept(NodeVisitor visitor) {
visitor.visit(this);
}
}

View File

@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.model.document.nodes;
import com.iqser.red.service.redaction.v1.server.model.document.NodeVisitor;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
import lombok.AccessLevel;
@ -57,4 +58,11 @@ public class Footer extends AbstractSemanticNode {
return getTreeId() + ": " + NodeType.FOOTER + ": " + leafTextBlock.buildSummary();
}
@Override
public void accept(NodeVisitor visitor) {
visitor.visit(this);
}
}

View File

@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.model.document.nodes;
import com.iqser.red.service.redaction.v1.server.model.document.NodeVisitor;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
import lombok.AccessLevel;
@ -60,4 +61,11 @@ public class Header extends AbstractSemanticNode {
return getTreeId() + ": " + NodeType.HEADER + ": " + leafTextBlock.buildSummary();
}
@Override
public void accept(NodeVisitor visitor) {
visitor.visit(this);
}
}

View File

@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.model.document.nodes;
import com.iqser.red.service.redaction.v1.server.model.document.NodeVisitor;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.AtomicTextBlock;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
@ -99,4 +100,11 @@ public class Headline extends AbstractSemanticNode {
.isPresent();
}
@Override
public void accept(NodeVisitor visitor) {
visitor.visit(this);
}
}

View File

@ -1,6 +1,8 @@
package com.iqser.red.service.redaction.v1.server.model.document.nodes;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Locale;
@ -8,7 +10,9 @@ import java.util.Map;
import java.util.PriorityQueue;
import java.util.Set;
import com.iqser.red.service.redaction.v1.server.model.document.NodeVisitor;
import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
import com.iqser.red.service.redaction.v1.server.model.document.entity.EntityEventListener;
import com.iqser.red.service.redaction.v1.server.model.document.entity.IEntity;
import com.iqser.red.service.redaction.v1.server.model.document.entity.ManualChangeOverwrite;
import com.iqser.red.service.redaction.v1.server.model.document.entity.MatchedRule;
@ -24,8 +28,7 @@ import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
/**
*
Represents an image within the document.
* Represents an image within the document.
*/
@Data
@SuperBuilder
@ -37,6 +40,7 @@ public class Image extends AbstractSemanticNode implements IEntity {
String id;
String representationHash;
TextBlock leafTextBlock;
ImageType imageType;
@ -51,6 +55,9 @@ public class Image extends AbstractSemanticNode implements IEntity {
Page page;
@Builder.Default
Collection<EntityEventListener> entityEventListeners = new ArrayList<>();
@Override
public NodeType getType() {
@ -73,6 +80,18 @@ public class Image extends AbstractSemanticNode implements IEntity {
}
public boolean isFullPageImage() {
return imageType.equals(ImageType.OCR) || getArea() >= 0.5 * page.getArea();
}
private double getArea() {
return position.getWidth() * position.getHeight();
}
@Override
public TextRange getTextRange() {
@ -87,17 +106,33 @@ public class Image extends AbstractSemanticNode implements IEntity {
}
@Override
public void addEntityEventListener(EntityEventListener listener) {
entityEventListeners.add(listener);
}
@Override
public void removeEntityEventListener(EntityEventListener listener) {
entityEventListeners.remove(listener);
}
@Override
public String type() {
return getManualOverwrite().getType().orElse(imageType.toString().toLowerCase(Locale.ENGLISH));
return getManualOverwrite().getType()
.orElse(imageType.toString().toLowerCase(Locale.ENGLISH));
}
@Override
public String toString() {
return getTreeId() + ": " + getValue() + " " + position;
return getTreeId() + ": " + getValue() + " [%.2f,%.2f,%.2f,%.2f]".formatted(position.getX(), position.getY(), position.getWidth(), position.getHeight());
}
@ -148,4 +183,18 @@ public class Image extends AbstractSemanticNode implements IEntity {
return (area / calculatedIntersection) > containmentThreshold;
}
@Override
public void accept(NodeVisitor visitor) {
visitor.visit(this);
}
@Override
public boolean isLeaf() {
return true;
}
}

View File

@ -0,0 +1,7 @@
package com.iqser.red.service.redaction.v1.server.model.document.nodes;
public enum LayoutEngine {
ALGORITHM,
AI,
OUTLINE
}

View File

@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.model.document.nodes;
import com.iqser.red.service.redaction.v1.server.model.document.NodeVisitor;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
import lombok.AccessLevel;
@ -49,4 +50,11 @@ public class Paragraph extends AbstractSemanticNode {
return getTreeId() + ": " + NodeType.PARAGRAPH + ": " + leafTextBlock.buildSummary();
}
@Override
public void accept(NodeVisitor visitor) {
visitor.visit(this);
}
}

View File

@ -1,5 +1,7 @@
package com.iqser.red.service.redaction.v1.server.model.document.nodes;
import com.iqser.red.service.redaction.v1.server.model.document.NodeVisitor;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
@ -89,4 +91,11 @@ public class Section extends AbstractSemanticNode {
return streamAllSubNodesOfType(NodeType.HEADLINE).anyMatch(h -> h.containsStringIgnoreCase(value));
}
@Override
public void accept(NodeVisitor visitor) {
visitor.visit(this);
}
}

View File

@ -20,7 +20,7 @@ import lombok.experimental.FieldDefaults;
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class SectionIdentifier {
public static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?");
public static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d{1,2})(?:[\\s.,;](\\d{1,2}))?(?:[\\s.,;](\\d{1,2}))?(?:[\\s.,;](\\d{1,2}))?");
public static Pattern alphanumericIdentifierPattern = Pattern.compile("^[\\s]?[A-Za-z][\\s.,;]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?");
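The tightened numerical pattern above caps each identifier at two digits and only accepts another group after an explicit separator, so trailing text such as a colon ends the match. A small runnable check (the pattern string is copied verbatim from the declaration above; the demo class itself is hypothetical):

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class NumericalIdentifierDemo {

    public static void main(String[] args) {
        Pattern p = Pattern.compile("^[\\s]?(\\d{1,2})(?:[\\s.,;](\\d{1,2}))?(?:[\\s.,;](\\d{1,2}))?(?:[\\s.,;](\\d{1,2}))?");
        Matcher m = p.matcher("1.1.2: Headline");
        if (m.find()) {
            // Groups 1-3 capture "1", "1", "2"; group 4 stays null because ':' is not a separator.
            System.out.println(m.group(1) + " " + m.group(2) + " " + m.group(3) + " " + m.group(4));
        }
    }
}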
protected enum Format {

View File

@ -14,18 +14,17 @@ import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.iqser.red.service.redaction.v1.server.model.document.ConsecutiveTextBlockCollector;
import com.iqser.red.service.redaction.v1.server.model.document.DocumentTree;
import com.iqser.red.service.redaction.v1.server.model.document.NodeVisitor;
import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
import com.iqser.red.service.redaction.v1.server.model.document.entity.IEntity;
import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.AtomicTextBlock;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.ConsecutiveTextBlockCollector;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlockCollector;
import com.iqser.red.service.redaction.v1.server.service.document.NodeVisitor;
import com.iqser.red.service.redaction.v1.server.utils.RectangleTransformations;
import com.iqser.red.service.redaction.v1.server.utils.RedactionSearchUtility;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto.LayoutEngine;
public interface SemanticNode {
@ -819,16 +818,12 @@ public interface SemanticNode {
/**
* Accepts a {@link NodeVisitor} and initiates a depth-first traversal of the semantic tree rooted at this node.
* The visitor's {@link NodeVisitor#visit(SemanticNode)} method is invoked for each node encountered during the traversal.
* The visitor's {@link NodeVisitor#visit} method is invoked for each node encountered during the traversal.
*
* @param visitor The {@link NodeVisitor} to accept and apply during the traversal.
* @see NodeVisitor
*/
default void accept(NodeVisitor visitor) {
visitor.visit(this);
streamChildren().forEach(childNode -> childNode.accept(visitor));
}
void accept(NodeVisitor visitor);
/**

View File

@ -1,5 +1,7 @@
package com.iqser.red.service.redaction.v1.server.model.document.nodes;
import com.iqser.red.service.redaction.v1.server.model.document.NodeVisitor;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
@ -89,4 +91,11 @@ public class SuperSection extends AbstractSemanticNode {
return streamAllSubNodesOfType(NodeType.HEADLINE).anyMatch(h -> h.containsStringIgnoreCase(value));
}
@Override
public void accept(NodeVisitor visitor) {
visitor.visit(this);
}
}

View File

@ -14,10 +14,10 @@ import java.util.stream.IntStream;
import java.util.stream.Stream;
import com.iqser.red.service.redaction.v1.server.model.document.DocumentTree;
import com.iqser.red.service.redaction.v1.server.model.document.NodeVisitor;
import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlockCollector;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto.LayoutEngine;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
@ -419,4 +419,11 @@ public class Table implements SemanticNode {
return treeId.toString() + ": " + NodeType.TABLE + ": #cols: " + numberOfCols + ", #rows: " + numberOfRows + ", " + this.getTextBlock().buildSummary();
}
@Override
public void accept(NodeVisitor visitor) {
visitor.visit(this);
}
}

View File

@ -4,6 +4,7 @@ import java.awt.geom.Rectangle2D;
import java.util.HashMap;
import java.util.Map;
import com.iqser.red.service.redaction.v1.server.model.document.NodeVisitor;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlockCollector;
@ -80,4 +81,11 @@ public class TableCell extends AbstractSemanticNode {
return getTreeId() + ": " + NodeType.TABLE_CELL + ": " + this.getTextBlock().buildSummary();
}
@Override
public void accept(NodeVisitor visitor) {
visitor.visit(this);
}
}

View File

@ -0,0 +1,47 @@
package com.iqser.red.service.redaction.v1.server.model.document.nodes;
import com.iqser.red.service.redaction.v1.server.model.document.NodeVisitor;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
@Data
@SuperBuilder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(callSuper = true)
public class TableOfContents extends AbstractSemanticNode {
@Override
public NodeType getType() {
return NodeType.TABLE_OF_CONTENTS;
}
public Headline getHeadline() {
return streamChildrenOfType(NodeType.HEADLINE).map(node -> (Headline) node)
.findFirst()
.orElseGet(() -> getParent().getHeadline());
}
@Override
public void accept(NodeVisitor visitor) {
visitor.visit(this);
}
@Override
public String toString() {
return getTreeId() + ": " + NodeType.TABLE_OF_CONTENTS + ": " + getTextBlock().buildSummary();
}
}

View File

@ -0,0 +1,57 @@
package com.iqser.red.service.redaction.v1.server.model.document.nodes;
import com.iqser.red.service.redaction.v1.server.model.document.NodeVisitor;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
@Data
@SuperBuilder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(callSuper = true)
public class TableOfContentsItem extends AbstractSemanticNode {
TextBlock leafTextBlock;
@Override
public NodeType getType() {
return NodeType.TABLE_OF_CONTENTS_ITEM;
}
@Override
public boolean isLeaf() {
return true;
}
@Override
public void accept(NodeVisitor visitor) {
visitor.visit(this);
}
@Override
public TextBlock getTextBlock() {
return leafTextBlock;
}
@Override
public String toString() {
return getTreeId() + ": " + NodeType.TABLE_OF_CONTENTS_ITEM + ": " + leafTextBlock.buildSummary();
}
}

View File

@ -1,6 +1,6 @@
package com.iqser.red.service.redaction.v1.server.model.document.textblock;
import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto.DocumentPositionData;
import static com.iqser.red.service.redaction.v1.server.data.DocumentPositionDataProto.DocumentPositionData;
import static java.lang.String.format;
import java.awt.geom.Rectangle2D;
@ -16,12 +16,12 @@ import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import com.iqser.red.service.redaction.v1.server.data.DocumentPositionDataProto.DocumentPositionData.Position;
import com.iqser.red.service.redaction.v1.server.data.DocumentTextDataProto.DocumentTextData;
import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNode;
import com.iqser.red.service.redaction.v1.server.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto.DocumentPositionData.Position;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto.DocumentTextData;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
@ -29,6 +29,7 @@ import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.NonNull;
import lombok.experimental.FieldDefaults;
@Data
@ -37,21 +38,33 @@ import lombok.experimental.FieldDefaults;
@FieldDefaults(level = AccessLevel.PRIVATE)
public class AtomicTextBlock implements TextBlock {
@NonNull
Long id;
@NonNull
Integer numberOnPage;
@NonNull
Page page;
//string coordinates
@NonNull
TextRange textRange;
@NonNull
String searchText;
@NonNull
List<Integer> lineBreaks;
@NonNull
List<TextRange> italicTextRanges;
@NonNull
List<TextRange> boldTextRanges;
SoftReference<String> searchTextLowerCaseCache;
SoftReference<List<String>> wordsCache;
//position coordinates
@NonNull
List<Integer> stringIdxToPositionIdx;
@Getter
@NonNull
List<Rectangle2D> positions;
@EqualsAndHashCode.Exclude
@ -77,6 +90,8 @@ public class AtomicTextBlock implements TextBlock {
.stringIdxToPositionIdx(Collections.emptyList())
.positions(Collections.emptyList())
.parent(parent)
.boldTextRanges(Collections.emptyList())
.italicTextRanges(Collections.emptyList())
.build();
}
@ -92,6 +107,14 @@ public class AtomicTextBlock implements TextBlock {
.lineBreaks(atomicTextBlockData.getLineBreaksList())
.stringIdxToPositionIdx(atomicPositionBlockData.getStringIdxToPositionIdxList())
.positions(toRectangle2DList(atomicPositionBlockData.getPositionsList()))
.italicTextRanges(atomicTextBlockData.getItalicTextRangesList()
.stream()
.map(r -> new TextRange(r.getStart(), r.getEnd()))
.toList())
.boldTextRanges(atomicTextBlockData.getBoldTextRangesList()
.stream()
.map(r -> new TextRange(r.getStart(), r.getEnd()))
.toList())
.parent(parent)
.build();
}

View File

@ -18,6 +18,7 @@ import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page;
import lombok.AccessLevel;
import lombok.Data;
import lombok.NonNull;
import lombok.experimental.FieldDefaults;
@Data
@ -162,6 +163,26 @@ public class ConcatenatedTextBlock implements TextBlock {
}
@Override
public List<TextRange> getItalicTextRanges() {
return getAtomicTextBlocks().stream()
.flatMap(atomicTextBlock -> atomicTextBlock.getItalicTextRanges()
.stream())
.toList();
}
@Override
public List<TextRange> getBoldTextRanges() {
return getAtomicTextBlocks().stream()
.flatMap(atomicTextBlock -> atomicTextBlock.getBoldTextRanges()
.stream())
.toList();
}
@Override
public Rectangle2D getPosition(int stringIdx) {
@ -279,6 +300,7 @@ public class ConcatenatedTextBlock implements TextBlock {
}
@NonNull
@Override
public String toString() {

View File

@ -1,4 +1,4 @@
package com.iqser.red.service.redaction.v1.server.model.document;
package com.iqser.red.service.redaction.v1.server.model.document.textblock;
import java.util.LinkedList;
import java.util.List;
@ -10,9 +10,6 @@ import java.util.function.Supplier;
import java.util.stream.Collector;
import java.util.stream.Stream;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.ConcatenatedTextBlock;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
import lombok.NoArgsConstructor;
@NoArgsConstructor

View File

@ -55,6 +55,12 @@ public interface TextBlock extends CharSequence {
String subSequenceWithLineBreaks(TextRange textRange);
List<TextRange> getItalicTextRanges();
List<TextRange> getBoldTextRanges();
int numberOfLines();

View File

@ -1,23 +1,27 @@
package com.iqser.red.service.redaction.v1.server.service.document;
package com.iqser.red.service.redaction.v1.server.utils;
import java.util.List;
import java.util.Set;
import com.iqser.red.service.redaction.v1.server.model.document.IntersectingNodeVisitor;
import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNode;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
import lombok.experimental.UtilityClass;
@UtilityClass
public class EntityCreationUtility {
public static void checkIfBothStartAndEndAreEmpty(String start, String end) {
public void checkIfBothStartAndEndAreEmpty(String start, String end) {
checkIfBothStartAndEndAreEmpty(List.of(start), List.of(end));
}
public static <T> void checkIfBothStartAndEndAreEmpty(List<T> start, List<T> end) {
public <T> void checkIfBothStartAndEndAreEmpty(List<T> start, List<T> end) {
if ((start == null || start.isEmpty()) && (end == null || end.isEmpty())) {
throw new IllegalArgumentException("Start and end values are empty!");
@ -25,7 +29,7 @@ public class EntityCreationUtility {
}
public static int truncateEndIfLineBreakIsBetween(int end, int expandedEnd, TextBlock textBlock) {
public int truncateEndIfLineBreakIsBetween(int end, int expandedEnd, TextBlock textBlock) {
if (textBlock.getNextLinebreak(end) < expandedEnd) {
return end;
@ -34,7 +38,7 @@ public class EntityCreationUtility {
}
public static Set<SemanticNode> findIntersectingSubNodes(SemanticNode initialIntersectingNode, TextRange textRange) {
public Set<SemanticNode> findIntersectingSubNodes(SemanticNode initialIntersectingNode, TextRange textRange) {
IntersectingNodeVisitor visitor = new IntersectingNodeVisitor(textRange);
@ -46,7 +50,7 @@ public class EntityCreationUtility {
}
public static void addToPages(TextEntity entity) {
public void addToPages(TextEntity entity) {
Set<Page> pages = entity.getDeepestFullyContainingNode().getPages(entity.getTextRange());
entity.getPages().addAll(pages);
@ -54,14 +58,14 @@ public class EntityCreationUtility {
}
public static void addEntityToNodeEntitySets(TextEntity entity) {
public void addEntityToNodeEntitySets(TextEntity entity) {
entity.getIntersectingNodes()
.forEach(node -> node.getEntities().add(entity));
}
public static boolean allEntitiesIntersectAndHaveSameTypes(List<TextEntity> entitiesToMerge) {
public boolean allEntitiesIntersectAndHaveSameTypes(List<TextEntity> entitiesToMerge) {
if (entitiesToMerge.isEmpty()) {
return true;
@ -79,7 +83,7 @@ public class EntityCreationUtility {
}
public static TextRange toLineAfterTextRange(TextBlock textBlock, TextRange textRange) {
public TextRange toLineAfterTextRange(TextBlock textBlock, TextRange textRange) {
if (textBlock.getTextRange().end() == textRange.end()) {
return new TextRange(textRange.end(), textRange.end());

View File

@ -1,39 +1,36 @@
package com.iqser.red.service.redaction.v1.server.service.document;
package com.iqser.red.service.redaction.v1.server.utils;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
import com.iqser.red.service.redaction.v1.server.RedactionServiceSettings;
import lombok.RequiredArgsConstructor;
import lombok.AccessLevel;
import lombok.experimental.FieldDefaults;
import lombok.experimental.UtilityClass;
@Service
@RequiredArgsConstructor
@UtilityClass
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class EntityEnrichmentService {
private final RedactionServiceSettings redactionServiceSettings;
int SURROUNDING_WORDS_OFFSET_WINDOW = 100;
int NUMBER_OF_SURROUNDING_WORDS = 3;
public void enrichEntity(TextEntity entity, TextBlock textBlock) {
entity.setValue(textBlock.subSequence(entity.getTextRange()).toString());
entity.setTextAfter(findTextAfter(entity.getTextRange().end(), textBlock));
entity.setTextBefore(findTextBefore(entity.getTextRange().start(), textBlock));
}
private String findTextAfter(int index, TextBlock textBlock) {
int endOffset = Math.min(index + redactionServiceSettings.getSurroundingWordsOffsetWindow(), textBlock.getTextRange().end());
int endOffset = Math.min(index + SURROUNDING_WORDS_OFFSET_WINDOW, textBlock.getTextRange().end());
String textAfter = textBlock.subSequence(index, endOffset).toString();
if (!textAfter.isBlank()) {
List<String> wordsAfter = splitToWordsAndRemoveEmptyWords(textAfter);
int numberOfWordsAfter = Math.min(wordsAfter.size(), redactionServiceSettings.getNumberOfSurroundingWords());
int numberOfWordsAfter = Math.min(wordsAfter.size(), NUMBER_OF_SURROUNDING_WORDS);
if (!wordsAfter.isEmpty()) {
return concatWordsAfter(wordsAfter.subList(0, numberOfWordsAfter), textAfter.startsWith(" "));
}
@ -41,14 +38,12 @@ public class EntityEnrichmentService {
return "";
}
private String findTextBefore(int index, TextBlock textBlock) {
int offsetBefore = Math.max(index - redactionServiceSettings.getSurroundingWordsOffsetWindow(), textBlock.getTextRange().start());
int offsetBefore = Math.max(index - SURROUNDING_WORDS_OFFSET_WINDOW, textBlock.getTextRange().start());
String textBefore = textBlock.subSequence(offsetBefore, index).toString();
if (!textBefore.isBlank()) {
List<String> wordsBefore = splitToWordsAndRemoveEmptyWords(textBefore);
int numberOfWordsBefore = Math.min(wordsBefore.size(), redactionServiceSettings.getNumberOfSurroundingWords());
int numberOfWordsBefore = Math.min(wordsBefore.size(), NUMBER_OF_SURROUNDING_WORDS);
if (!wordsBefore.isEmpty()) {
return concatWordsBefore(wordsBefore.subList(wordsBefore.size() - numberOfWordsBefore, wordsBefore.size()), textBefore.endsWith(" "));
}
@ -56,36 +51,26 @@ public class EntityEnrichmentService {
return "";
}
private static List<String> splitToWordsAndRemoveEmptyWords(String textAfter) {
return Arrays.stream(textAfter.split(" "))
private List<String> splitToWordsAndRemoveEmptyWords(String text) {
return Arrays.stream(text.split(" "))
.filter(word -> !Objects.equals("", word))
.toList();
}
private static String concatWordsBefore(List<String> words, boolean endWithSpace) {
private String concatWordsBefore(List<String> words, boolean endWithSpace) {
StringBuilder sb = new StringBuilder();
for (String word : words) {
sb.append(word).append(" ");
}
String result = sb.toString().trim();
return endWithSpace ? result + " " : result;
}
private static String concatWordsAfter(List<String> words, boolean startWithSpace) {
private String concatWordsAfter(List<String> words, boolean startWithSpace) {
StringBuilder sb = new StringBuilder();
for (String word : words) {
sb.append(word).append(" ");
}
String result = sb.toString().trim();
return startWithSpace ? " " + result : result;
}
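The windowing above looks at most SURROUNDING_WORDS_OFFSET_WINDOW characters away from the entity and then keeps NUMBER_OF_SURROUNDING_WORDS words. A self-contained sketch of the same idea for the text after an entity (the demo class and sample sentence are invented; it mirrors findTextAfter only):

import java.util.Arrays;
import java.util.List;

public class SurroundingWordsDemo {

    static final int SURROUNDING_WORDS_OFFSET_WINDOW = 100;
    static final int NUMBER_OF_SURROUNDING_WORDS = 3;

    public static void main(String[] args) {
        String text = "The patient John Doe was treated at the clinic in Berlin last week";
        int entityEnd = text.indexOf("Doe") + "Doe".length();

        // Take at most 100 characters after the entity, then keep the first three non-empty words.
        String after = text.substring(entityEnd, Math.min(entityEnd + SURROUNDING_WORDS_OFFSET_WINDOW, text.length()));
        List<String> words = Arrays.stream(after.split(" ")).filter(w -> !w.isEmpty()).toList();
        String textAfter = String.join(" ", words.subList(0, Math.min(words.size(), NUMBER_OF_SURROUNDING_WORDS)));
        System.out.println(textAfter); // prints: was treated at
    }
}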

View File

@ -313,7 +313,7 @@ public class RedactionSearchUtility {
while (matcher.find()) {
boundaries.add(new TextRange(matcher.start(group) + textBlock.getTextRange().start(), matcher.end(group) + textBlock.getTextRange().start()));
}
}catch (StackOverflowError stackOverflowError){
} catch (StackOverflowError stackOverflowError) {
log.warn("Stackoverflow error for pattern {} in text: {}", pattern.pattern(), textBlock);
}
return boundaries;
@ -322,7 +322,6 @@ public class RedactionSearchUtility {
private static List<TextRange> getTextRangesByPatternWithLineBreaks(TextBlock textBlock, int group, Pattern pattern) {
String searchTextWithLineBreaks = textBlock.searchTextWithLineBreaks();
Matcher matcher = pattern.matcher(searchTextWithLineBreaks);
List<TextRange> boundaries = new LinkedList<>();
@ -330,7 +329,7 @@ public class RedactionSearchUtility {
while (matcher.find()) {
boundaries.add(new TextRange(matcher.start(group) + textBlock.getTextRange().start(), matcher.end(group) + textBlock.getTextRange().start()));
}
}catch (StackOverflowError stackOverflowError){
} catch (StackOverflowError stackOverflowError) {
log.warn("Stackoverflow error for pattern {} in text with linebreaks: {}", pattern.pattern(), searchTextWithLineBreaks);
}
return boundaries;

View File

@ -0,0 +1,25 @@
syntax = "proto3";
option java_outer_classname = "DocumentPageProto";
option java_package = "com.iqser.red.service.redaction.v1.server.data";
message AllDocumentPages {
repeated DocumentPage documentPages = 1;
}
message DocumentPage {
// The page number, starting with 1.
int32 number = 1;
// The page height in PDF user units.
int32 height = 2;
// The page width in PDF user units.
int32 width = 3;
// The page rotation as specified by the PDF.
int32 rotation = 4;
}

View File

@ -0,0 +1,28 @@
syntax = "proto3";
option java_outer_classname = "DocumentPositionDataProto";
option java_package = "com.iqser.red.service.redaction.v1.server.data";
message AllDocumentPositionData {
repeated DocumentPositionData documentPositionData = 1;
}
message DocumentPositionData {
// Identifier of the text block.
int64 id = 1;
// For each string coordinate in the search text of the text block, the array contains an entry relating the string coordinate to the position coordinate.
// This is required because the text and position coordinates are not identical.
repeated int32 stringIdxToPositionIdx = 2;
// The bounding box for each glyph as a rectangle. This matrix is of size (n,4), where n is the number of glyphs in the text block.
// The second dimension specifies the rectangle as x, y, width, height, where x, y is the lower-left corner.
// In order to access this information, the stringIdxToPositionIdx array must be used to transform the coordinates.
repeated Position positions = 3;
// Definition of a BoundingBox that contains x, y, width, and height.
message Position {
repeated float value = 1;
}
}
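A hedged sketch of the lookup these comments describe, resolving the glyph rectangle behind a string offset; it assumes the standard protobuf Java accessors generated for DocumentPositionData, and the helper class and method names are invented:

import java.awt.geom.Rectangle2D;
import com.iqser.red.service.redaction.v1.server.data.DocumentPositionDataProto.DocumentPositionData;

public class GlyphBoxLookup {

    // Resolves the bounding box of the glyph behind a given string offset.
    static Rectangle2D glyphBox(DocumentPositionData data, int stringIdx) {
        int positionIdx = data.getStringIdxToPositionIdx(stringIdx); // string offset -> glyph index
        DocumentPositionData.Position p = data.getPositions(positionIdx);
        // Each Position carries four floats: x, y, width, height (x, y = lower left corner).
        return new Rectangle2D.Float(p.getValue(0), p.getValue(1), p.getValue(2), p.getValue(3));
    }
}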

View File

@ -0,0 +1,12 @@
syntax = "proto3";
option java_outer_classname = "DocumentStructureProto";
option java_package = "com.iqser.red.service.redaction.v1.server.data";
import "EntryData.proto";
message DocumentStructure {
// The root EntryData represents the Document.
EntryData root = 1;
}

View File

@ -0,0 +1,40 @@
syntax = "proto3";
import "Range.proto";
option java_outer_classname = "DocumentTextDataProto";
option java_package = "com.iqser.red.service.redaction.v1.server.data";
message AllDocumentTextData {
repeated DocumentTextData documentTextData = 1;
}
message DocumentTextData {
// Identifier of the text block.
int64 id = 1;
// The page the text block occurs on.
int64 page = 2;
// The text of the text block.
string searchText = 3;
// Each text block is assigned a number on a page, starting from 0.
int32 numberOnPage = 4;
// The text blocks are ordered; this number represents the start of the text block as a string offset.
int32 start = 5;
// The text blocks are ordered; this number represents the end of the text block as a string offset.
int32 end = 6;
// The line breaks in the text of this semantic node, as string offsets (exclusive end). At the end of each semantic node there is an implicit line break.
repeated int32 lineBreaks = 7;
// The text ranges where the text is italic
repeated Range italicTextRanges = 8;
// The text ranges where the text is bold
repeated Range boldTextRanges = 9;
}

View File

@ -0,0 +1,30 @@
syntax = "proto3";
import "LayoutEngine.proto";
import "NodeType.proto";
option java_outer_classname = "EntryDataProto";
option java_package = "com.iqser.red.service.redaction.v1.server.data";
message EntryData {
// Type of the semantic node.
NodeType type = 1;
// Specifies the position in the parsed tree structure.
repeated int32 treeId = 2;
// Specifies the text block IDs associated with this semantic node.
repeated int64 atomicBlockIds = 3;
// Specifies the pages this semantic node appears on.
repeated int64 pageNumbers = 4;
// Some semantic nodes carry additional information; it is stored in this map.
map<string, string> properties = 5;
// All child Entries of this Entry.
repeated EntryData children = 6;
// Describes the origin of the semantic node.
repeated LayoutEngine engines = 7;
}
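Since EntryData nests its children recursively, a depth-first walk reconstructs the parsed tree. A hedged sketch, assuming the standard protobuf Java accessors generated for EntryData (the walker class is invented):

import com.iqser.red.service.redaction.v1.server.data.EntryDataProto.EntryData;

public class EntryDataWalker {

    // Prints each node's type and its text block ids, indented by tree depth.
    static void walk(EntryData entry, int depth) {
        System.out.println("  ".repeat(depth) + entry.getType() + " blocks=" + entry.getAtomicBlockIdsList());
        for (EntryData child : entry.getChildrenList()) {
            walk(child, depth + 1);
        }
    }
}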

View File

@ -0,0 +1,10 @@
syntax = "proto3";
option java_outer_classname = "LayoutEngineProto";
option java_package = "com.iqser.red.service.redaction.v1.server.data";
enum LayoutEngine {
ALGORITHM = 0;
AI = 1;
OUTLINE = 2;
}

View File

@ -0,0 +1,19 @@
syntax = "proto3";
option java_outer_classname = "NodeTypeProto";
option java_package = "com.iqser.red.service.redaction.v1.server.data";
enum NodeType {
DOCUMENT = 0;
SECTION = 1;
SUPER_SECTION = 2;
HEADLINE = 3;
PARAGRAPH = 4;
TABLE = 5;
TABLE_CELL = 6;
IMAGE = 7;
HEADER = 8;
FOOTER = 9;
TABLE_OF_CONTENTS = 10;
TABLE_OF_CONTENTS_ITEM = 11;
}

View File

@ -0,0 +1,14 @@
syntax = "proto3";
option java_outer_classname = "RangeProto";
option java_package = "com.iqser.red.service.redaction.v1.server.data";
message Range {
// A start index.
int32 start = 1;
// An end index.
int32 end = 2;
}

View File

@ -0,0 +1,26 @@
#!/bin/bash
# Minimum required protoc version
MIN_VERSION="28.3"
# Get the installed protoc version
INSTALLED_VERSION=$(protoc --version | awk '{print $2}')
# Returns success (0) when version $1 is strictly lower than version $2
version_lt() {
[ "$1" != "$2" ] && [ "$(printf '%s\n' "$1" "$2" | sort -V | head -n1)" = "$1" ]
}
# Check if protoc is installed and meets the minimum version
if ! command -v protoc &> /dev/null; then
echo "Error: protoc is not installed. Please install version $MIN_VERSION or later."
exit 1
fi
if version_lt "$INSTALLED_VERSION" "$MIN_VERSION"; then
echo "Error: protoc version $INSTALLED_VERSION is too old. Please upgrade to version $MIN_VERSION or later."
exit 1
fi
# Generate Java files from proto files
protoc --java_out=../java ./*.proto

View File

@ -0,0 +1,33 @@
package com.iqser.red.service.redaction.v1.server.model.document.nodes;
import static org.junit.jupiter.api.Assertions.assertEquals;
import org.junit.jupiter.api.Test;
import com.iqser.red.service.redaction.v1.server.data.LayoutEngineProto;
public class LayoutEngineMappingTest {
@Test
public void assertAllValuesMatch() {
for (LayoutEngine value : LayoutEngine.values()) {
var engine = LayoutEngineProto.LayoutEngine.valueOf(value.name());
assertEquals(engine.name(), value.name());
}
}
@Test
public void assertAllValuesMatchReverse() {
for (LayoutEngineProto.LayoutEngine value : LayoutEngineProto.LayoutEngine.values()) {
if (value.equals(LayoutEngineProto.LayoutEngine.UNRECOGNIZED)) {
continue;
}
var engine = LayoutEngine.valueOf(value.name());
assertEquals(engine.name(), value.name());
}
}
}

View File

@ -0,0 +1,33 @@
package com.iqser.red.service.redaction.v1.server.model.document.nodes;
import static org.junit.jupiter.api.Assertions.assertEquals;
import org.junit.jupiter.api.Test;
import com.iqser.red.service.redaction.v1.server.data.NodeTypeProto;
public class NodeTypeMappingTest {
@Test
public void assertAllValuesMatch() {
for (NodeType value : NodeType.values()) {
var engine = NodeTypeProto.NodeType.valueOf(value.name());
assertEquals(engine.name(), value.name());
}
}
@Test
public void assertAllValuesMatchReverse() {
for (NodeTypeProto.NodeType value : NodeTypeProto.NodeType.values()) {
if (value.equals(NodeTypeProto.NodeType.UNRECOGNIZED)) {
continue;
}
var engine = NodeType.valueOf(value.name());
assertEquals(engine.name(), value.name());
}
}
}

View File

@ -0,0 +1,144 @@
package com.iqser.red.service.redaction.v1.server.model.document.nodes;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.List;
import org.junit.jupiter.api.Test;
class SectionIdentifierTest {
@Test
void testSectionIdentifier() {
SectionIdentifier identifier = SectionIdentifier.fromSearchText("1.1.2: Headline");
assertEquals(SectionIdentifier.Format.NUMERICAL, identifier.getFormat());
assertEquals(3, identifier.level());
assertEquals(List.of(1, 1, 2), identifier.getIdentifiers());
SectionIdentifier child = SectionIdentifier.asChildOf(identifier);
assertTrue(child.isChildOf(identifier));
SectionIdentifier parent = SectionIdentifier.fromSearchText("1.1: Headline");
assertTrue(parent.isParentOf(identifier));
}
@Test
void testSectionIdentifier2() {
SectionIdentifier identifier = SectionIdentifier.fromSearchText("A.1.2: Headline");
assertEquals(SectionIdentifier.Format.ALPHANUMERIC, identifier.getFormat());
assertEquals(3, identifier.level());
assertEquals(List.of(1, 1, 2), identifier.getIdentifiers());
}
@Test
void testSectionIdentifier3() {
SectionIdentifier identifier = SectionIdentifier.fromSearchText("D.1.2: Headline");
assertEquals(SectionIdentifier.Format.ALPHANUMERIC, identifier.getFormat());
assertEquals(3, identifier.level());
assertEquals(List.of(4, 1, 2), identifier.getIdentifiers());
}
@Test
void testSectionIdentifier4() {
SectionIdentifier identifier = SectionIdentifier.fromSearchText("4.1.2.4: Headline");
assertEquals(SectionIdentifier.Format.NUMERICAL, identifier.getFormat());
assertEquals(4, identifier.level());
assertEquals(List.of(4, 1, 2, 4), identifier.getIdentifiers());
}
@Test
void testSectionIdentifier5() {
SectionIdentifier identifier = SectionIdentifier.fromSearchText("D.1.2.4.5: Headline");
assertEquals(SectionIdentifier.Format.ALPHANUMERIC, identifier.getFormat());
assertEquals(4, identifier.level());
assertEquals(List.of(4, 1, 2, 4), identifier.getIdentifiers());
}
@Test
void testSectionIdentifier6() {
SectionIdentifier identifier = SectionIdentifier.fromSearchText("d.1.2.4.5: Headline");
assertEquals(SectionIdentifier.Format.ALPHANUMERIC, identifier.getFormat());
assertEquals(4, identifier.level());
assertEquals(List.of(4, 1, 2, 4), identifier.getIdentifiers());
}
@Test
void testSectionIdentifier7() {
SectionIdentifier identifier = SectionIdentifier.fromSearchText("4.1.2.4.5: Headline");
assertEquals(SectionIdentifier.Format.NUMERICAL, identifier.getFormat());
assertEquals(4, identifier.level());
assertEquals(List.of(4, 1, 2, 4), identifier.getIdentifiers());
}
@Test
void testFalsePositive111() {
SectionIdentifier identifier = SectionIdentifier.fromSearchText("111: Headline");
assertEquals(SectionIdentifier.Format.NUMERICAL, identifier.getFormat());
assertEquals(1, identifier.level());
}
@Test
public void testParentOf() {
var headline = SectionIdentifier.fromSearchText("1 Did you ever hear the tragedy of Darth Plagueis The Wise?");
var headline1 = SectionIdentifier.fromSearchText("1.0 I thought not. Its not a story the Jedi would tell you.");
var headline2 = SectionIdentifier.fromSearchText("1.1 Its a Sith legend. Darth Plagueis was a Dark Lord of the Sith, ");
var headline3 = SectionIdentifier.fromSearchText("1.2.3 so powerful and so wise he could use the Force to influence the midichlorians to create life…");
var headline4 = SectionIdentifier.fromSearchText("1.2.3.4 He had such a knowledge of the dark side that he could even keep the ones he cared about from dying.");
var headline5 = SectionIdentifier.fromSearchText("1.2.3.4.5 The dark side of the Force is a pathway to many abilities some consider to be unnatural.");
var headline6 = SectionIdentifier.fromSearchText("2.0 He became so powerful…");
var headline7 = SectionIdentifier.fromSearchText("10000.0 the only thing he was afraid of was losing his power,");
var headline8 = SectionIdentifier.fromSearchText("A.0 which eventually, of course, he did.");
var headline9 = SectionIdentifier.fromSearchText("Unfortunately, he taught his apprentice everything he knew, then his apprentice killed him in his sleep.");
var headline10 = SectionIdentifier.fromSearchText("2.1.2 Ironic.");
var headline11 = SectionIdentifier.fromSearchText("2.He could save others from death,");
var headline12 = SectionIdentifier.fromSearchText(" 2. but not himself.");
var paragraph1 = SectionIdentifier.asChildOf(headline);
assertTrue(paragraph1.isChildOf(headline));
assertTrue(headline.isParentOf(paragraph1));
assertFalse(paragraph1.isParentOf(headline));
assertFalse(headline.isParentOf(headline1));
assertTrue(headline.isParentOf(headline2));
assertTrue(headline.isParentOf(headline3));
assertTrue(headline.isParentOf(headline4));
assertTrue(headline.isParentOf(headline5));
assertTrue(headline1.isParentOf(headline2));
assertFalse(headline1.isParentOf(headline1));
assertTrue(headline3.isParentOf(headline4));
assertFalse(headline4.isParentOf(headline5));
assertFalse(headline2.isParentOf(headline3));
assertFalse(headline2.isParentOf(headline4));
assertTrue(headline1.isParentOf(headline3));
assertTrue(headline1.isParentOf(headline4));
assertFalse(headline1.isParentOf(headline6));
assertFalse(headline1.isParentOf(headline7));
assertFalse(headline8.isParentOf(headline1));
assertFalse(headline8.isParentOf(headline2));
assertFalse(headline8.isParentOf(headline3));
assertFalse(headline8.isParentOf(headline4));
assertFalse(headline9.isParentOf(headline9));
assertTrue(headline10.isChildOf(headline11));
assertTrue(headline10.isChildOf(headline12));
}
}

View File

@ -4,7 +4,7 @@ plugins {
}
description = "redaction-service-api-v1"
val persistenceServiceVersion = "2.587.0"
val persistenceServiceVersion = "2.631.0"
dependencies {
implementation("org.springframework:spring-web:6.0.12")

View File

@ -12,12 +12,12 @@ plugins {
description = "redaction-service-server-v1"
val layoutParserVersion = "0.181.0"
val layoutParserVersion = "0.193.0"
val jacksonVersion = "2.15.2"
val droolsVersion = "9.44.0.Final"
val pdfBoxVersion = "3.0.0"
val persistenceServiceVersion = "2.592.0-RED10260.0"
val llmServiceVersion = "1.11.0"
val persistenceServiceVersion = "2.641.0"
val llmServiceVersion = "1.20.0-RED10072.2"
val springBootStarterVersion = "3.1.5"
val springCloudVersion = "4.0.4"
val testContainersVersion = "1.19.7"
@ -31,15 +31,10 @@ configurations {
}
}
configurations.all {
resolutionStrategy {
force("com.google.protobuf:protobuf-java:4.27.1")
}
}
dependencies {
implementation(project(":redaction-service-api-v1")) { exclude(group = "com.iqser.red.service", module = "persistence-service-internal-api-v1") }
implementation(project(":document"))
implementation("com.iqser.red.service:persistence-service-internal-api-v1:${persistenceServiceVersion}") { exclude(group = "org.springframework.boot") }
implementation("com.iqser.red.service:persistence-service-shared-mongo-v1:${persistenceServiceVersion}")
{
@ -61,13 +56,14 @@ dependencies {
implementation("com.fasterxml.jackson.module:jackson-module-afterburner:${jacksonVersion}")
implementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:${jacksonVersion}")
implementation("org.ahocorasick:ahocorasick:0.6.3")
implementation("org.ahocorasick:ahocorasick:0.9.0")
implementation("com.hankcs:aho-corasick-double-array-trie:1.2.2")
implementation("com.github.roklenarcic:aho-corasick:1.2")
implementation("org.javassist:javassist:3.29.2-GA")
implementation("org.drools:drools-engine:${droolsVersion}")
implementation("org.drools:drools-mvel:${droolsVersion}")
implementation("org.kie:kie-spring:7.74.1.Final")
implementation("com.google.protobuf:protobuf-java:4.27.1")
implementation("org.locationtech.jts:jts-core:1.19.0")
@ -81,6 +77,9 @@ dependencies {
implementation("org.apache.tomcat:tomcat-websocket:${tomcatVersion}")
implementation("org.apache.tomcat.embed:tomcat-embed-core:${tomcatVersion}")
implementation("org.liquibase:liquibase-core:4.29.2") // Needed to be set explicit, otherwise spring dependency management sets it to 4.20.0
implementation("org.liquibase.ext:liquibase-mongodb:4.29.2")
implementation("net.logstash.logback:logstash-logback-encoder:7.4")
api("ch.qos.logback:logback-classic")
@ -103,6 +102,10 @@ dependencies {
group = "com.iqser.red.service",
module = "persistence-service-shared-api-v1"
)
exclude(
group = "com.knecon.fforesight",
module = "document"
)
}
testImplementation("com.pdftron:PDFNet:10.11.0")
}
@ -184,13 +187,19 @@ tasks.register("generateJavaDoc", Javadoc::class) {
dependsOn("compileJava")
dependsOn("delombok")
classpath = project.sourceSets["main"].runtimeClasspath
source = fileTree("${buildDir}/generated/sources/delombok/java/main") {
val documentFiles = fileTree("${project(":document").layout.buildDirectory.get()}/generated/sources/delombok/java/main") {
include(droolsImports)
}
destinationDir = file(project.findProperty("javadocDestinationDir")?.toString() ?: "")
val mainFiles = fileTree("${layout.buildDirectory.get()}/generated/sources/delombok/java/main") {
include(droolsImports)
}
source = documentFiles + mainFiles
setDestinationDir(file(project.findProperty("javadocDestinationDir")?.toString() ?: ""))
options.memberLevel = JavadocMemberLevel.PUBLIC
(options as StandardJavadocDocletOptions).apply {
title = "API Documentation for Redaction Service ${project.version}"
}
}

View File

@ -10,11 +10,13 @@ import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.reflections.Reflections;
import org.reflections.scanners.Scanners;
import org.reflections.util.ConfigurationBuilder;
import org.reflections.util.FilterBuilder;
import com.iqser.red.service.redaction.v1.server.model.dictionary.SearchImplementation;
@ -25,6 +27,8 @@ import lombok.extern.slf4j.Slf4j;
public class DeprecatedElementsFinder {
public static final String PACKAGE_NAME = "com.iqser.red.service.redaction.v1.server";
public static final Pattern DATA_PACKAGE = Pattern.compile(".*/data/.*");
private Set<Method> deprecatedMethods;
@Getter
private Map<String, String> deprecatedMethodsSignaturesMap;
@ -43,7 +47,10 @@ public class DeprecatedElementsFinder {
Reflections reflections = new Reflections(new ConfigurationBuilder().forPackage(PACKAGE_NAME)
.setExpandSuperTypes(true)
.setScanners(Scanners.MethodsAnnotated, Scanners.TypesAnnotated, Scanners.SubTypes));
.setScanners(Scanners.MethodsAnnotated, Scanners.TypesAnnotated, Scanners.SubTypes)
.filterInputsBy(new FilterBuilder().includePackage(PACKAGE_NAME).excludePackage(PACKAGE_NAME + ".data")
// Exclude the generated proto data package
));
deprecatedMethods = reflections.get(Scanners.MethodsAnnotated.with(Deprecated.class).as(Method.class));

View File

@ -28,6 +28,8 @@ public class RedactionServiceSettings {
private boolean priorityMode;
private long firstLevelDictionaryCacheMaximumSize = 1000;
private long dictionaryCacheMaximumSize = 100;
private int dictionaryCacheExpireAfterAccessDays = 3;

View File

@ -82,4 +82,14 @@ public class NerEntities {
LLM_NER
}
public static com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.Engine mapToPrimaryEngine(NerEntities.Engine nerEntityEngine) {
return switch (nerEntityEngine) {
case NER, CLOUD_NER -> com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.Engine.NER;
case LLM_NER -> com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.Engine.LLM_NER;
};
}
}

View File

@ -2,6 +2,7 @@ package com.iqser.red.service.redaction.v1.server.model;
import static com.iqser.red.service.redaction.v1.server.service.NotFoundImportedEntitiesService.IMPORTED_REDACTION_TYPE;
import java.util.Collection;
import java.util.List;
import java.util.Optional;
import java.util.PriorityQueue;
@ -15,10 +16,12 @@ import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog
import com.iqser.red.service.persistence.service.v1.api.shared.model.annotations.entitymapped.ManualRedactionEntry;
import com.iqser.red.service.persistence.service.v1.api.shared.model.annotations.entitymapped.ManualResizeRedaction;
import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
import com.iqser.red.service.redaction.v1.server.model.document.entity.EntityEventListener;
import com.iqser.red.service.redaction.v1.server.model.document.entity.EntityType;
import com.iqser.red.service.redaction.v1.server.model.document.entity.IEntity;
import com.iqser.red.service.redaction.v1.server.model.document.entity.ManualChangeOverwrite;
import com.iqser.red.service.redaction.v1.server.model.document.entity.MatchedRule;
import com.iqser.red.service.redaction.v1.server.model.document.entity.RectangleWithPage;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
@ -51,7 +54,8 @@ public class PrecursorEntity implements IEntity {
@Builder.Default
PriorityQueue<MatchedRule> matchedRuleList = new PriorityQueue<>();
ManualChangeOverwrite manualOverwrite;
@Builder.Default
ManualChangeOverwrite manualOverwrite = new ManualChangeOverwrite();
public static PrecursorEntity fromManualRedactionEntry(ManualRedactionEntry manualRedactionEntry, boolean hint) {
@ -125,6 +129,7 @@ public class PrecursorEntity implements IEntity {
.id(importedRedaction.getId())
.value(value)
.entityPosition(rectangleWithPages)
.ruleIdentifier("IMP.0.0")
.reason(Optional.ofNullable(importedRedaction.getReason())
.orElse(""))
.legalBasis(Optional.ofNullable(importedRedaction.getLegalBasis())
@ -196,6 +201,28 @@ public class PrecursorEntity implements IEntity {
}
@Override
public void addEntityEventListener(EntityEventListener listener) {
throw new UnsupportedOperationException("PrecursorEntity does not support entityEventListeners");
}
@Override
public void removeEntityEventListener(EntityEventListener listener) {
throw new UnsupportedOperationException("PrecursorEntity does not support entityEventListeners");
}
@Override
public Collection<EntityEventListener> getEntityEventListeners() {
throw new UnsupportedOperationException("PrecursorEntity does not support entityEventListeners");
}
private static EntityType getEntityType(EntryType entryType) {
switch (entryType) {

View File

@ -3,7 +3,7 @@ package com.iqser.red.service.redaction.v1.server.model.component;
import java.util.Collection;
import java.util.List;
import com.iqser.red.service.redaction.v1.server.model.drools.RuleIdentifier;
import com.iqser.red.service.redaction.v1.server.model.document.entity.RuleIdentifier;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;

View File

@ -0,0 +1,130 @@
package com.iqser.red.service.redaction.v1.server.model.dictionary;
import java.util.*;
import java.util.stream.Stream;
import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
public abstract class AbstractDictionarySearch implements DictionarySearch {
protected final Map<String, List<DictionaryIdentifierWithKeyword>> keyWordToIdentifiersMap;
public AbstractDictionarySearch(Map<String, List<DictionaryIdentifierWithKeyword>> keyWordToIdentifiersMap) {
this.keyWordToIdentifiersMap = keyWordToIdentifiersMap;
}
@Override
public Stream<MatchTextRange> getBoundaries(CharSequence text) {
TextContext textContext = new TextContext(text);
return getMatchTextRangeStream(textContext);
}
@Override
public Stream<MatchTextRange> getBoundaries(CharSequence text, TextRange region) {
CharSequence subText = text.subSequence(region.start(), region.end());
TextContext textContext = new TextContext(subText, region.start());
return getMatchTextRangeStream(textContext);
}
@Override
public Stream<MatchTextRange> getBoundaries(TextBlock textBlock) {
return getBoundaries(textBlock, textBlock.getTextRange());
}
@Override
public Stream<MatchPosition> getMatches(String text) {
TextContext textContext = new TextContext(text);
List<MatchPosition> matches = new ArrayList<>();
parseText(textContext.getLowerText(), (begin, end, value) -> addMatchPositionsForHit(textContext, matches, new Hit(begin, end, value)));
return matches.stream();
}
private Stream<MatchTextRange> getMatchTextRangeStream(TextContext textContext) {
List<MatchTextRange> matches = new ArrayList<>();
parseText(textContext.getLowerText(), (begin, end, value) -> addMatchesForHit(textContext, matches, new Hit(begin, end, value)));
return matches.stream();
}
protected abstract void parseText(CharSequence text, HitHandler handler);
protected void addMatchesForHit(TextContext textContext, List<MatchTextRange> matches, Hit hit) {
int start = textContext.getStart(hit.begin);
int end = textContext.getEnd(hit.end);
String matchedText = textContext.getMatchedText(hit.begin, hit.end);
List<DictionaryIdentifierWithKeyword> idWithKeywords = hit.value;
for (DictionaryIdentifierWithKeyword idkw : idWithKeywords) {
if (idkw.identifier().caseSensitive()) {
if (matchedText.equals(idkw.keyword())) {
matches.add(new MatchTextRange(idkw.identifier(), new TextRange(start, end)));
}
} else {
matches.add(new MatchTextRange(idkw.identifier(), new TextRange(start, end)));
}
}
}
protected void addMatchPositionsForHit(TextContext textContext, List<MatchPosition> matches, Hit hit) {
int start = textContext.getStart(hit.begin);
int end = textContext.getEnd(hit.end);
String matchedText = textContext.getMatchedText(hit.begin, hit.end);
List<DictionaryIdentifierWithKeyword> idWithKeywords = hit.value;
for (DictionaryIdentifierWithKeyword idkw : idWithKeywords) {
MatchPosition matchPosition = new MatchPosition(idkw.identifier(), start, end);
if (idkw.identifier().caseSensitive()) {
if (matchedText.equals(idkw.keyword())) {
matches.add(matchPosition);
}
} else {
matches.add(matchPosition);
}
}
}
protected interface HitHandler {
void handle(int begin, int end, List<DictionaryIdentifierWithKeyword> value);
}
protected static class Hit {
final int begin;
final int end;
final List<DictionaryIdentifierWithKeyword> value;
Hit(int begin, int end, List<DictionaryIdentifierWithKeyword> value) {
this.begin = begin;
this.end = end;
this.value = value;
}
}
}
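Concrete implementations only have to supply parseText and report hits through the HitHandler; the base class then maps the hit offsets back to document coordinates and applies the case-sensitivity check. A minimal sketch using a naive indexOf scan instead of Aho-Corasick (the class is invented, assumed to live in the same package, and assumes keywords are stored lower-cased with exclusive end offsets, matching how the listener-based implementations report hits):

import java.util.List;
import java.util.Map;

public class NaiveDictionarySearch extends AbstractDictionarySearch {

    public NaiveDictionarySearch(Map<String, List<DictionaryIdentifierWithKeyword>> keyWordToIdentifiersMap) {
        super(keyWordToIdentifiersMap);
    }

    @Override
    protected void parseText(CharSequence text, HitHandler handler) {
        String haystack = text.toString();
        // Report every occurrence of every keyword; overlapping hits are allowed.
        keyWordToIdentifiersMap.forEach((keyword, identifiers) -> {
            int idx = haystack.indexOf(keyword);
            while (idx >= 0) {
                handler.handle(idx, idx + keyword.length(), identifiers);
                idx = haystack.indexOf(keyword, idx + 1);
            }
        });
    }
}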

View File

@ -0,0 +1,32 @@
package com.iqser.red.service.redaction.v1.server.model.dictionary;
import java.util.List;
import java.util.Map;
import com.roklenarcic.util.strings.AhoCorasickMap;
import com.roklenarcic.util.strings.MapMatchListener;
import com.roklenarcic.util.strings.StringMap;
public class AhoCorasickMapDictionarySearch extends AbstractDictionarySearch {
private final StringMap<List<DictionaryIdentifierWithKeyword>> map;
public AhoCorasickMapDictionarySearch(Map<String, List<DictionaryIdentifierWithKeyword>> keyWordToIdentifiersMap) {
super(keyWordToIdentifiersMap);
map = new AhoCorasickMap<>(keyWordToIdentifiersMap.keySet(), keyWordToIdentifiersMap.values(), false);
}
@Override
protected void parseText(CharSequence text, HitHandler handler) {
MapMatchListener<List<DictionaryIdentifierWithKeyword>> listener = (haystack, startPosition, endPosition, value) -> {
handler.handle(startPosition, endPosition, value);
return true;
};
map.match(text.toString(), listener);
}
}

View File

@ -2,20 +2,16 @@ package com.iqser.red.service.redaction.v1.server.model.dictionary;
import static java.lang.String.format;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils;
import com.iqser.red.service.dictionarymerge.commons.DictionaryEntry;
import com.iqser.red.service.redaction.v1.server.model.document.entity.EntityType;
import com.iqser.red.service.redaction.v1.server.model.document.entity.MatchedRule;
import com.iqser.red.service.redaction.v1.server.model.document.entity.Relation;
import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity;
import com.iqser.red.service.redaction.v1.server.utils.Patterns;
import com.iqser.red.service.redaction.v1.server.utils.exception.NotFoundException;
@ -29,31 +25,77 @@ import lombok.Getter;
@Data
public class Dictionary {
@Getter
private List<DictionaryModel> dictionaryModels;
// todo: dossier and dossier template level DictionaryModels override each other
// at the moment there are no problems because they always have the same rank / hint information
// but it should be changed so that the localAccessMap contains all models
private Map<String, DictionaryModel> localAccessMap = new HashMap<>();
private final Map<String, Map<Level, DictionaryModel>> localAccessMap = new HashMap<>();
@Getter
private DictionaryVersion version;
private final DictionaryVersion version;
private final DictionarySearch dictionarySearch;
public enum Level {
DOSSIER_TEMPLATE,
DOSSIER
}
public Dictionary(List<DictionaryModel> dictionaryModels, DictionaryVersion version) {
Dictionary(List<DictionaryModel> dictionaryModels, DictionaryVersion version, DictionarySearch dictionarySearch) {
this.dictionaryModels = dictionaryModels;
this.dictionaryModels.forEach(dm -> localAccessMap.put(dm.getType(), dm));
dictionaryModels.forEach(dm -> localAccessMap.put(dm.getType(), Map.of(getLevel(dm.isDossierDictionary()), dm)));
this.version = version;
this.dictionarySearch = dictionarySearch;
}
public boolean containsType(String type) {
Map<Level, DictionaryModel> levelMap = localAccessMap.get(type);
return !(levelMap == null || levelMap.isEmpty());
}
private Level getLevel(boolean isDossierDictionary) {
return isDossierDictionary ? Level.DOSSIER : Level.DOSSIER_TEMPLATE;
}
/**
* Determines the default level for a given type based on the levels present.
* If both levels are present, it defaults to {@code Level.DOSSIER}.
*
* @param type The type to determine the default level for.
* @return The default {@link Level} for the specified type.
* @throws NotFoundException If the type is not found in the dictionary.
*/
private Level getDefaultLevel(String type) {
Map<Level, DictionaryModel> levelMap = localAccessMap.get(type);
if (levelMap == null || levelMap.isEmpty()) {
throw new NotFoundException("Type: " + type + " is not found");
}
if (levelMap.containsKey(Level.DOSSIER)) {
return Level.DOSSIER;
} else {
// Use whatever level is present
return levelMap.keySet()
.iterator().next();
}
}
public int getDictionaryRank(String type, Level level) {
if (!localAccessMap.containsKey(type)) {
return 0;
}
DictionaryModel model = localAccessMap.get(type)
.get(level);
return model != null ? model.getRank() : 0;
}
public int getDictionaryRank(String type) {
if (!localAccessMap.containsKey(type)) {
return 0;
}
return localAccessMap.get(type).getRank();
return getDictionaryRank(type, getDefaultLevel(type));
}
@ -64,11 +106,21 @@ public class Dictionary {
*/
public boolean hasLocalEntries() {
return dictionaryModels.stream()
return getDictionaryModels().stream()
.anyMatch(dm -> !dm.getLocalEntriesWithMatchedRules().isEmpty());
}
public List<DictionaryModel> getDictionaryModels() {
return localAccessMap.values()
.stream()
.flatMap(levelDictionaryModelMap -> levelDictionaryModelMap.values()
.stream())
.toList();
}
public Set<String> getTypes() {
return localAccessMap.keySet();
@ -76,56 +128,144 @@ public class Dictionary {
/**
* Retrieves the {@link DictionaryModel} of a specified type.
* Retrieves the {@link DictionaryModel} of a specified type and level.
*
* @param type The type of dictionary model to retrieve.
* @return The {@link DictionaryModel} of the specified type.
* @throws NotFoundException If the specified type is not found in the dictionary.
* @param type The type of dictionary model to retrieve.
* @param level The level of the dictionary model to retrieve.
* @return The {@link DictionaryModel} of the specified type and level.
* @throws NotFoundException If the specified type or level is not found in the dictionary.
*/
public DictionaryModel getType(String type) {
public DictionaryModel getType(String type, Level level) {
DictionaryModel model = localAccessMap.get(type);
if (model == null) {
throw new NotFoundException("Type: " + type + " is not found");
Map<Level, DictionaryModel> levelMap = localAccessMap.get(type);
if (levelMap == null || !levelMap.containsKey(level)) {
throw new NotFoundException("Type: " + type + " with level: " + level + " is not found");
}
return model;
return levelMap.get(level);
}
/**
* Checks if the dictionary of a specific type is considered a hint.
* Retrieves the {@link DictionaryModel} of a specified type at the default level.
*
* @param type The type of dictionary model to retrieve.
* @return The {@link DictionaryModel} of the specified type at the default level.
* @throws NotFoundException If the specified type is not found in the dictionary.
*/
public DictionaryModel getType(String type) {
return getType(type, getDefaultLevel(type));
}
/**
* Checks if the dictionary of a specific type and level is considered a hint.
*
* @param type The type of dictionary to check.
* @param level The level of the dictionary to check.
* @return true if the dictionary model is marked as a hint, false otherwise.
*/
public boolean isHint(String type, Level level) {
DictionaryModel model = localAccessMap.get(type)
.get(level);
return model != null && model.isHint();
}
/**
* Checks if the dictionary of a specific type is considered a hint at the default level.
*
* @param type The type of dictionary to check.
* @return true if the dictionary model is marked as a hint, false otherwise.
*/
public boolean isHint(String type) {
- DictionaryModel model = localAccessMap.get(type);
- if (model != null) {
- return model.isHint();
- }
- return false;
+ return isHint(type, getDefaultLevel(type));
}
/**
- * Checks if the dictionary of a specific type is case-insensitive.
+ * Checks if the dictionary of a specific type and level is case-insensitive.
*
* @param type The type of dictionary to check.
* @param level The level of the dictionary to check.
* @return true if the dictionary is case-insensitive, false otherwise.
*/
public boolean isCaseInsensitiveDictionary(String type, Level level) {
DictionaryModel dictionaryModel = localAccessMap.get(type)
.get(level);
return dictionaryModel != null && dictionaryModel.isCaseInsensitive();
}
/**
* Checks if the dictionary of a specific type is case-insensitive at the default level.
*
* @param type The type of dictionary to check.
* @return true if the dictionary is case-insensitive, false otherwise.
*/
public boolean isCaseInsensitiveDictionary(String type) {
- DictionaryModel dictionaryModel = localAccessMap.get(type);
- if (dictionaryModel != null) {
- return dictionaryModel.isCaseInsensitive();
- }
- return false;
+ return isCaseInsensitiveDictionary(type, getDefaultLevel(type));
}
/**
- * Adds a local dictionary entry of a specific type.
+ * Adds a local dictionary entry of a specific type and level.
*
* @param type The type of dictionary to add the entry to.
* @param value The value of the entry.
* @param matchedRules A collection of {@link MatchedRule} associated with the entry.
* @param alsoAddLastname Indicates whether to also add the lastname separately as an entry.
* @param level The level of the dictionary where the entry should be added.
* @throws IllegalArgumentException If the specified type does not exist within the dictionary, if the type
* does not have any local entries defined, or if the provided value is
* blank. This ensures that only valid, non-empty entries
* are added to the dictionary.
*/
private void addLocalDictionaryEntry(String type, String value, Collection<MatchedRule> matchedRules, boolean alsoAddLastname, Level level) {
if (value.isBlank()) {
return;
}
Map<Level, DictionaryModel> levelMap = localAccessMap.get(type);
if (levelMap == null || !levelMap.containsKey(level)) {
throw new IllegalArgumentException(format("DictionaryModel of type %s with level %s does not exist", type, level));
}
DictionaryModel dictionaryModel = levelMap.get(level);
if (dictionaryModel.getLocalEntriesWithMatchedRules() == null) {
throw new IllegalArgumentException(format("DictionaryModel of type %s has no local Entries", type));
}
if (StringUtils.isEmpty(value)) {
throw new IllegalArgumentException(format("%s is not a valid dictionary entry", value));
}
boolean isCaseInsensitive = dictionaryModel.isCaseInsensitive();
Set<MatchedRule> matchedRulesSet = new HashSet<>(matchedRules);
String cleanedValue = value;
if (isCaseInsensitive) {
cleanedValue = cleanedValue.toLowerCase(Locale.US);
}
dictionaryModel.getLocalEntriesWithMatchedRules()
.merge(cleanedValue.trim(),
matchedRulesSet,
(set1, set2) -> Stream.concat(set1.stream(), set2.stream())
.collect(Collectors.toSet()));
if (alsoAddLastname) {
String lastname = cleanedValue.split(" ")[0];
dictionaryModel.getLocalEntriesWithMatchedRules()
.merge(lastname,
matchedRulesSet,
(set1, set2) -> Stream.concat(set1.stream(), set2.stream())
.collect(Collectors.toSet()));
}
}
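The level-aware helper above lower-cases values for case-insensitive models, trims them, unions the MatchedRule set with any set already stored for that value, and with alsoAddLastname also indexes the first token on its own. A standalone sketch of that accumulation behaviour, with plain strings standing in for MatchedRule (class name and rule ids are illustrative, not part of the service):

import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;

public class LocalEntryMergeDemo {

    public static void main(String[] args) {
        Map<String, Set<String>> localEntries = new HashMap<>(); // normalised value -> rule ids
        addEntry(localEntries, "Doe John ", Set.of("RULE-1"), true);
        addEntry(localEntries, "doe john", Set.of("RULE-2"), false);
        // prints {doe=[RULE-1], doe john=[RULE-1, RULE-2]} (iteration order may vary)
        System.out.println(localEntries);
    }

    // Mirrors the normalisation and Map.merge steps for a case-insensitive model.
    private static void addEntry(Map<String, Set<String>> localEntries, String value, Set<String> rules, boolean alsoAddLastname) {
        if (value.isBlank()) {
            return; // blank values are silently ignored
        }
        String cleanedValue = value.toLowerCase(Locale.US);
        localEntries.merge(cleanedValue.trim(), rules,
                (set1, set2) -> Stream.concat(set1.stream(), set2.stream()).collect(Collectors.toSet()));
        if (alsoAddLastname) {
            String lastname = cleanedValue.split(" ")[0]; // first token is treated as the last name
            localEntries.merge(lastname, rules,
                    (set1, set2) -> Stream.concat(set1.stream(), set2.stream()).collect(Collectors.toSet()));
        }
    }
}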
/**
* Adds a local dictionary entry of a specific type at the default level.
*
* @param type The type of dictionary to add the entry to.
* @param value The value of the entry.
@@ -138,40 +278,7 @@ public class Dictionary {
*/
private void addLocalDictionaryEntry(String type, String value, Collection<MatchedRule> matchedRules, boolean alsoAddLastname) {
- if (value.isBlank()) {
- return;
- }
- if (localAccessMap.get(type) == null) {
- throw new IllegalArgumentException(format("DictionaryModel of type %s does not exist", type));
- }
- if (localAccessMap.get(type).getLocalEntriesWithMatchedRules() == null) {
- throw new IllegalArgumentException(format("DictionaryModel of type %s has no local Entries", type));
- }
- if (StringUtils.isEmpty(value)) {
- throw new IllegalArgumentException(format("%s is not a valid dictionary entry", value));
- }
- boolean isCaseInsensitive = localAccessMap.get(type).isCaseInsensitive();
- Set<MatchedRule> matchedRulesSet = new HashSet<>(matchedRules);
- String cleanedValue = value;
- if (isCaseInsensitive) {
- cleanedValue = cleanedValue.toLowerCase(Locale.US);
- }
- localAccessMap.get(type)
- .getLocalEntriesWithMatchedRules()
- .merge(cleanedValue.trim(),
- matchedRulesSet,
- (set1, set2) -> Stream.concat(set1.stream(), set2.stream())
- .collect(Collectors.toSet()));
- if (alsoAddLastname) {
- String lastname = cleanedValue.split(" ")[0];
- localAccessMap.get(type)
- .getLocalEntriesWithMatchedRules()
- .merge(lastname,
- matchedRulesSet,
- (set1, set2) -> Stream.concat(set1.stream(), set2.stream())
- .collect(Collectors.toSet()));
- }
+ addLocalDictionaryEntry(type, value, matchedRules, alsoAddLastname, getDefaultLevel(type));
}
@@ -179,10 +286,22 @@ public class Dictionary {
* Recommends a text entity for inclusion in every dictionary model without separating the last name.
*
* @param textEntity The {@link TextEntity} to be recommended.
* @param level The level of the dictionary where the recommendation should be added.
*/
public void recommendEverywhere(TextEntity textEntity, Level level) {
addLocalDictionaryEntry(textEntity.type(), textEntity.getValue(), textEntity.getMatchedRuleList(), false, level);
}
/**
* Recommends a text entity for inclusion in every dictionary model without separating the last name at the default level.
*
* @param textEntity The {@link TextEntity} to be recommended.
*/
public void recommendEverywhere(TextEntity textEntity) {
- addLocalDictionaryEntry(textEntity.type(), textEntity.getValue(), textEntity.getMatchedRuleList(), false);
+ recommendEverywhere(textEntity, getDefaultLevel(textEntity.type()));
}
@ -190,10 +309,22 @@ public class Dictionary {
* Recommends a text entity for inclusion in every dictionary model with the last name added separately.
*
* @param textEntity The {@link TextEntity} to be recommended.
* @param level The level of the dictionary where the recommendation should be added.
*/
public void recommendEverywhereWithLastNameSeparately(TextEntity textEntity, Level level) {
addLocalDictionaryEntry(textEntity.type(), textEntity.getValue(), textEntity.getMatchedRuleList(), true, level);
}
/**
* Recommends a text entity for inclusion in every dictionary model with the last name added separately at the default level.
*
* @param textEntity The {@link TextEntity} to be recommended.
*/
public void recommendEverywhereWithLastNameSeparately(TextEntity textEntity) {
- addLocalDictionaryEntry(textEntity.type(), textEntity.getValue(), textEntity.getMatchedRuleList(), true);
+ recommendEverywhereWithLastNameSeparately(textEntity, getDefaultLevel(textEntity.type()));
}
@ -201,11 +332,22 @@ public class Dictionary {
* Adds multiple author names contained within a text entity as recommendations in the dictionary.
*
* @param textEntity The {@link TextEntity} containing author names to be added.
* @param level The level of the dictionary where the recommendations should be added.
*/
public void addMultipleAuthorsAsRecommendation(TextEntity textEntity, Level level) {
splitIntoAuthorNames(textEntity).forEach(authorName -> addLocalDictionaryEntry(textEntity.type(), authorName, textEntity.getMatchedRuleList(), true, level));
}
/**
* Adds multiple author names contained within a text entity as recommendations in the dictionary at the default level.
*
* @param textEntity The {@link TextEntity} containing author names to be added.
*/
public void addMultipleAuthorsAsRecommendation(TextEntity textEntity) {
- splitIntoAuthorNames(textEntity).forEach(authorName -> addLocalDictionaryEntry(textEntity.type(), authorName, textEntity.getMatchedRuleList(), true));
+ addMultipleAuthorsAsRecommendation(textEntity, getDefaultLevel(textEntity.type()));
}
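The single-argument overloads above all funnel through getDefaultLevel, which prefers Level.DOSSIER when that level is present and otherwise uses whichever level the type has. A minimal usage sketch of the level-aware surface; it is not service code, it assumes the class sits in the same package as Dictionary and Level, that the instance comes from DictionaryFactory#create, and that a "PERSON" type exists:

// Sketch only, not service code: exercises the level-aware API introduced above.
package com.iqser.red.service.redaction.v1.server.model.dictionary;

class LevelAwareLookupSketch {

    static void inspect(Dictionary dictionary) {
        if (!dictionary.containsType("PERSON")) {
            return;
        }
        // Explicit level: throws NotFoundException if "PERSON" has no model at that level.
        DictionaryModel dossierModel = dictionary.getType("PERSON", Level.DOSSIER);
        // Default level: DOSSIER when present, otherwise the single level the type has.
        DictionaryModel defaultModel = dictionary.getType("PERSON");
        int templateRank = dictionary.getDictionaryRank("PERSON", Level.DOSSIER_TEMPLATE); // 0 if that level is missing
        boolean hint = dictionary.isHint("PERSON");                                        // default-level lookup
    }
}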


@@ -0,0 +1,90 @@
package com.iqser.red.service.redaction.v1.server.model.dictionary;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import org.springframework.stereotype.Service;
import com.iqser.red.service.dictionarymerge.commons.DictionaryEntry;
import com.iqser.red.service.dictionarymerge.commons.DictionaryEntryModel;
import com.iqser.red.service.redaction.v1.server.model.document.entity.EntityType;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
@Service
@RequiredArgsConstructor
public class DictionaryFactory {
@SneakyThrows
public Dictionary create(List<DictionaryModel> dictionaryModels, DictionaryVersion dictionaryVersion) {
Map<String, List<DictionaryIdentifierWithKeyword>> keyWordToIdentifiersMap = computeStringIdentifiersMap(dictionaryModels);
DictionarySearch dictionarySearch = getDictionarySearch(keyWordToIdentifiersMap);
return new Dictionary(dictionaryModels, dictionaryVersion, dictionarySearch);
}
private static DictionarySearch getDictionarySearch(Map<String, List<DictionaryIdentifierWithKeyword>> keyWordToIdentifiersMap) {
// a more sophisticated selection of the dictionarySearch could be done here
// but as we do not currently need to fine-tune, we use the all-rounder solution, which is the AhoCorasickMapDictionarySearch
// based on this repository https://github.com/RokLenarcic/AhoCorasick
// This is an outline of how a more complex dictionarySearch decision could be made:
// if (!redactionServiceSettings.isPriorityMode() && keyWordToIdentifiersMap.keySet().size() < 50_000) {
// dictionarySearch = new DoubleArrayTrieDictionarySearch(keyWordToIdentifiersMap);
// } else {
// dictionarySearch = new AhoCorasickMapDictionarySearch(keyWordToIdentifiersMap);
// }
return new AhoCorasickMapDictionarySearch(keyWordToIdentifiersMap);
}
protected static Map<String, List<DictionaryIdentifierWithKeyword>> computeStringIdentifiersMap(List<DictionaryModel> dictionaryModels) {
Map<String, List<DictionaryIdentifierWithKeyword>> stringToIdentifiersMap = new HashMap<>();
for (DictionaryModel model : dictionaryModels) {
// Add entries for different entity types
addEntriesToMap(stringToIdentifiersMap, model, model.isHint() ? EntityType.HINT : EntityType.ENTITY, model.getEntries(), false);
addEntriesToMap(stringToIdentifiersMap, model, EntityType.FALSE_POSITIVE, model.getFalsePositives(), false);
addEntriesToMap(stringToIdentifiersMap, model, EntityType.FALSE_RECOMMENDATION, model.getFalseRecommendations(), false);
if (model.isDossierDictionary()) {
addEntriesToMap(stringToIdentifiersMap, model, EntityType.DICTIONARY_REMOVAL, model.getEntries(), true);
}
}
return stringToIdentifiersMap;
}
private static void addEntriesToMap(Map<String, List<DictionaryIdentifierWithKeyword>> stringToIdentifiersMap,
DictionaryModel model,
EntityType entityType,
Set<DictionaryEntryModel> entries,
boolean isDeleted) {
DictionaryIdentifier identifier = new DictionaryIdentifier(model.getType(), entityType, model.isDossierDictionary(), !model.isCaseInsensitive());
List<String> values = entries.stream()
.filter(entry -> entry.isDeleted() == isDeleted)
.map(DictionaryEntry::getValue)
.toList();
for (String value : values) {
DictionaryIdentifierWithKeyword idWithKeyword = new DictionaryIdentifierWithKeyword(identifier, value);
String key = value.toLowerCase(Locale.ROOT);
stringToIdentifiersMap.computeIfAbsent(key, k -> new ArrayList<>()).add(idWithKeyword);
}
}
}
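The factory keys every searchable keyword in lower case (Locale.ROOT) while DictionaryIdentifierWithKeyword keeps the original value, so a single key can collect identifiers from several dictionary models. A hand-built sketch of that map shape; it is not service code, the "SUBSTANCE" type and "Aspirin" value are illustrative, and the DictionaryIdentifierWithKeyword constructor order follows its use in addEntriesToMap above:

// Sketch only, not service code: builds one entry of the keyword map by hand.
package com.iqser.red.service.redaction.v1.server.model.dictionary;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import com.iqser.red.service.redaction.v1.server.model.document.entity.EntityType;

class KeywordMapSketch {

    static Map<String, List<DictionaryIdentifierWithKeyword>> example() {
        Map<String, List<DictionaryIdentifierWithKeyword>> keyWordToIdentifiersMap = new HashMap<>();
        DictionaryIdentifier substance = new DictionaryIdentifier("SUBSTANCE", EntityType.ENTITY, false, true);
        // Key is lower-cased; the keyword keeps its original casing.
        keyWordToIdentifiersMap.computeIfAbsent("aspirin", k -> new ArrayList<>())
                .add(new DictionaryIdentifierWithKeyword(substance, "Aspirin"));
        return keyWordToIdentifiersMap;
    }
}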


@@ -0,0 +1,8 @@
package com.iqser.red.service.redaction.v1.server.model.dictionary;
import com.iqser.red.service.redaction.v1.server.model.document.entity.EntityType;
public record DictionaryIdentifier(String type, EntityType entityType, boolean dossierDictionaryEntry, boolean caseSensitive) {
}


@@ -0,0 +1,51 @@
package com.iqser.red.service.redaction.v1.server.model.dictionary;
import org.ahocorasick.trie.PayloadEmit;
import org.ahocorasick.trie.PayloadTrie;
import java.util.Collection;
public final class DictionaryIdentifierTrie {
private final PayloadTrie<DictionaryIdentifier> trie;
private DictionaryIdentifierTrie(PayloadTrie<DictionaryIdentifier> trie) {
this.trie = trie;
}
public static class DictionaryIdentifierTrieBuilder {
private final PayloadTrie.PayloadTrieBuilder<DictionaryIdentifier> builder;
public DictionaryIdentifierTrieBuilder() {
this.builder = PayloadTrie.builder();
}
public DictionaryIdentifierTrieBuilder ignoreCase() {
builder.ignoreCase();
return this;
}
public DictionaryIdentifierTrieBuilder addKeyword(String keyword, DictionaryIdentifier payload) {
builder.addKeyword(keyword, payload);
return this;
}
public DictionaryIdentifierTrieBuilder addKeywords(Collection<String> keywords, DictionaryIdentifier payload) {
for (String keyword : keywords) {
builder.addKeyword(keyword, payload);
}
return this;
}
public DictionaryIdentifierTrie build() {
return new DictionaryIdentifierTrie(builder.build());
}
}
public Collection<PayloadEmit<DictionaryIdentifier>> parseText(CharSequence text) {
return trie.parseText(text);
}
public boolean containsMatch(CharSequence text) {
return trie.containsMatch(text);
}
}
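A usage sketch for the wrapper above; it is not service code, the "PERSON" type and the keywords are illustrative, and it assumes the PayloadEmit accessors (getKeyword, getStart, getEnd, getPayload) provided by the underlying org.ahocorasick library:

// Sketch only, not service code: builds a small trie and scans a text with it.
package com.iqser.red.service.redaction.v1.server.model.dictionary;

import java.util.List;

import org.ahocorasick.trie.PayloadEmit;

import com.iqser.red.service.redaction.v1.server.model.document.entity.EntityType;

class DictionaryIdentifierTrieSketch {

    public static void main(String[] args) {
        DictionaryIdentifier person = new DictionaryIdentifier("PERSON", EntityType.ENTITY, false, false);
        DictionaryIdentifierTrie trie = new DictionaryIdentifierTrie.DictionaryIdentifierTrieBuilder()
                .ignoreCase()                                    // match regardless of case
                .addKeywords(List.of("doe", "john doe"), person)
                .build();
        for (PayloadEmit<DictionaryIdentifier> emit : trie.parseText("Report drafted by John Doe")) {
            // Each emit carries the matched keyword, its offsets and the identifier payload.
            System.out.println(emit.getKeyword() + " [" + emit.getStart() + ", " + emit.getEnd() + "] -> " + emit.getPayload().type());
        }
    }
}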

Some files were not shown because too many files have changed in this diff.