Merge branch 'master' of ssh://git.iqser.com:2222/red/redaction-service

This commit is contained in:
cschabert 2021-09-22 16:11:54 +02:00
commit d9b78643fb
95 changed files with 2798 additions and 2247 deletions

View File

@ -5,7 +5,7 @@
<parent> <parent>
<groupId>com.atlassian.bamboo</groupId> <groupId>com.atlassian.bamboo</groupId>
<artifactId>bamboo-specs-parent</artifactId> <artifactId>bamboo-specs-parent</artifactId>
<version>7.1.2</version> <version>7.2.2</version>
<relativePath/> <relativePath/>
</parent> </parent>

View File

@ -13,6 +13,6 @@ RUN apt-get update \
wget cabextract xfonts-utils fonts-liberation \ wget cabextract xfonts-utils fonts-liberation \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*
RUN curl http://ftp.br.debian.org/debian/pool/contrib/m/msttcorefonts/ttf-mscorefonts-installer_3.7_all.deb -o /tmp/ttf-mscorefonts-installer_3.7_all.deb \ RUN curl http://ftp.br.debian.org/debian/pool/contrib/m/msttcorefonts/ttf-mscorefonts-installer_3.8_all.deb -o /tmp/ttf-mscorefonts-installer_3.8_all.deb \
&& dpkg -i /tmp/ttf-mscorefonts-installer_3.7_all.deb \ && dpkg -i /tmp/ttf-mscorefonts-installer_3.8_all.deb \
&& rm /tmp/ttf-mscorefonts-installer_3.7_all.deb \ && rm /tmp/ttf-mscorefonts-installer_3.8_all.deb \

View File

@ -5,7 +5,7 @@
<parent> <parent>
<artifactId>platform-dependency</artifactId> <artifactId>platform-dependency</artifactId>
<groupId>com.iqser.red</groupId> <groupId>com.iqser.red</groupId>
<version>1.1.2</version> <version>1.1.3</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
@ -32,7 +32,7 @@
<dependency> <dependency>
<groupId>com.iqser.red</groupId> <groupId>com.iqser.red</groupId>
<artifactId>platform-commons-dependency</artifactId> <artifactId>platform-commons-dependency</artifactId>
<version>1.3.1</version> <version>1.3.6</version>
<scope>import</scope> <scope>import</scope>
<type>pom</type> <type>pom</type>
</dependency> </dependency>

View File

@ -19,14 +19,8 @@
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.iqser.red.service</groupId> <groupId>com.iqser.red.service</groupId>
<artifactId>configuration-service-api-v1</artifactId> <artifactId>persistence-service-api-v1</artifactId>
<version>2.7.0</version> <version>0.4.0</version>
<exclusions>
<exclusion>
<groupId>com.iqser.red.service</groupId>
<artifactId>file-management-service-api-v1</artifactId>
</exclusion>
</exclusions>
</dependency> </dependency>
</dependencies> </dependencies>
</project> </project>

View File

@ -6,6 +6,10 @@ import lombok.Data;
import lombok.NoArgsConstructor; import lombok.NoArgsConstructor;
import java.time.OffsetDateTime; import java.time.OffsetDateTime;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
@Data @Data
@Builder @Builder
@ -13,12 +17,20 @@ import java.time.OffsetDateTime;
@AllArgsConstructor @AllArgsConstructor
public class AnalyzeRequest { public class AnalyzeRequest {
private String projectId; private String dossierId;
private String fileId; private String fileId;
private String ruleSetId; private String dossierTemplateId;
private boolean reanalyseOnlyIfPossible; private boolean reanalyseOnlyIfPossible;
private ManualRedactions manualRedactions; private ManualRedactions manualRedactions;
private OffsetDateTime lastProcessed; private OffsetDateTime lastProcessed;
@Builder.Default
private Set<Integer> excludedPages = new HashSet<>();
@Builder.Default
private Set<Integer> sectionsToReanalyse = new HashSet<>();
@Builder.Default
private List<FileAttribute> fileAttributes = new ArrayList<>();
} }

View File

@ -11,20 +11,20 @@ import lombok.NoArgsConstructor;
@AllArgsConstructor @AllArgsConstructor
public class AnalyzeResult { public class AnalyzeResult {
private String projectId; private String dossierId;
private String fileId; private String fileId;
private long duration; private long duration;
private int numberOfPages; private int numberOfPages;
private boolean hasHints;
private boolean hasRequests;
private boolean hasRedactions;
private boolean hasImages;
private boolean hasUpdates; private boolean hasUpdates;
private long dictionaryVersion; private long dictionaryVersion;
private long dossierDictionaryVersion; private long dossierDictionaryVersion;
private long rulesVersion; private long rulesVersion;
private long legalBasisVersion; private long legalBasisVersion;
private boolean wasReanalyzed;
private int analysisVersion;
} }

View File

@ -11,6 +11,7 @@ import lombok.NoArgsConstructor;
@AllArgsConstructor @AllArgsConstructor
public class AnnotateRequest { public class AnnotateRequest {
private String projectId; private String dossierId;
private String dossierTemplateId;
private String fileId; private String fileId;
} }

View File

@ -0,0 +1,15 @@
package com.iqser.red.service.redaction.v1.model;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Describes a single argument by its name and its {@link ArgumentType}.
 *
 * <p>Lombok generates the accessors, {@code equals}/{@code hashCode}/{@code toString}
 * ({@code @Data}) as well as a no-argument and an all-arguments constructor.
 * The all-arguments constructor takes the fields in declaration order:
 * {@code (name, type)}.
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
public class Argument {
// Name of the argument.
private String name;
// Kind of value the argument holds.
private ArgumentType type;
}

View File

@ -0,0 +1,7 @@
package com.iqser.red.service.redaction.v1.model;
/**
 * Kinds of value an {@code Argument} can declare through its {@code type} field.
 *
 * <p>The declaration order is kept stable so that ordinals do not change.
 */
public enum ArgumentType {
    INTEGER,
    BOOLEAN,
    STRING,
    FILE_ATTRIBUTE,
    REGEX,
    TYPE,
    RULE_NUMBER,
    LEGAL_BASIS,
    REFERENCE_TYPE
}

View File

@ -0,0 +1,18 @@
package com.iqser.red.service.redaction.v1.model;
import java.time.OffsetDateTime;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * A single change record: a {@link ChangeType} paired with a timestamp.
 *
 * <p>Lombok generates accessors, {@code equals}/{@code hashCode}/{@code toString},
 * a builder, and both a no-argument and an all-arguments constructor
 * ({@code (type, dateTime)}).
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
public class Change {
// Kind of change.
private ChangeType type;
// Timestamp attached to the change; presumably when it happened — TODO confirm with the producer.
private OffsetDateTime dateTime;
}

View File

@ -1,5 +1,5 @@
package com.iqser.red.service.redaction.v1.model; package com.iqser.red.service.redaction.v1.model;
public enum ChangeType { public enum ChangeType {
ADDED, REMOVED ADDED, REMOVED, CHANGED
} }

View File

@ -0,0 +1,5 @@
package com.iqser.red.service.redaction.v1.model;
/**
 * Enumerates the engines known to the redaction model.
 *
 * <p>The declaration order is kept stable so that ordinals do not change.
 */
public enum Engine {
    DICTIONARY,
    NER,
    RULE
}

View File

@ -0,0 +1,19 @@
package com.iqser.red.service.redaction.v1.model;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Plain data holder for one file attribute, made of four string fields.
 *
 * <p>Lombok generates accessors, {@code equals}/{@code hashCode}/{@code toString},
 * a builder, and both a no-argument and an all-arguments constructor
 * ({@code (id, label, placeholder, value)}).
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class FileAttribute {
// Identifier of the attribute.
private String id;
// Label of the attribute; presumably the human-readable name — TODO confirm.
private String label;
// Placeholder of the attribute; NOTE(review): how it is substituted is not visible here — confirm with the consumer.
private String placeholder;
// Value of the attribute.
private String value;
}

View File

@ -5,6 +5,8 @@ import lombok.Builder;
import lombok.Data; import lombok.Data;
import lombok.NoArgsConstructor; import lombok.NoArgsConstructor;
import java.time.OffsetDateTime;
@Data @Data
@Builder @Builder
@AllArgsConstructor @AllArgsConstructor
@ -16,4 +18,8 @@ public class IdRemoval {
private Status status; private Status status;
private boolean removeFromDictionary; private boolean removeFromDictionary;
private OffsetDateTime requestDate;
private OffsetDateTime processedDate;
private OffsetDateTime softDeletedTime;
} }

View File

@ -5,6 +5,8 @@ import lombok.Builder;
import lombok.Data; import lombok.Data;
import lombok.NoArgsConstructor; import lombok.NoArgsConstructor;
import java.time.OffsetDateTime;
@Data @Data
@Builder @Builder
@AllArgsConstructor @AllArgsConstructor
@ -16,4 +18,8 @@ public class ManualForceRedact {
private Status status; private Status status;
private String legalBasis; private String legalBasis;
private OffsetDateTime requestDate;
private OffsetDateTime processedDate;
private OffsetDateTime softDeletedTime;
} }

View File

@ -0,0 +1,25 @@
package com.iqser.red.service.redaction.v1.model;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.time.OffsetDateTime;
/**
 * Data holder for a manual image recategorization.
 *
 * <p>Lombok generates accessors, {@code equals}/{@code hashCode}/{@code toString},
 * a builder, and both a no-argument and an all-arguments constructor. The
 * all-arguments constructor takes the fields in declaration order.
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
public class ManualImageRecategorization {
// Identifier; presumably of the image/annotation being recategorized — TODO confirm.
private String id;
// User associated with the recategorization; presumably the requester — TODO confirm.
private String user;
// Status of the recategorization.
private Status status;
// Type value carried by the recategorization; presumably the new image type — TODO confirm.
private String type;
// Timestamps; NOTE(review): exact lifecycle semantics are not visible here — confirm with the producer.
private OffsetDateTime requestDate;
private OffsetDateTime processedDate;
private OffsetDateTime softDeletedTime;
}

View File

@ -0,0 +1,25 @@
package com.iqser.red.service.redaction.v1.model;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.time.OffsetDateTime;
/**
 * Data holder for a manual legal basis change.
 *
 * <p>Lombok generates accessors, {@code equals}/{@code hashCode}/{@code toString},
 * a builder, and both a no-argument and an all-arguments constructor. The
 * all-arguments constructor takes the fields in declaration order.
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
public class ManualLegalBasisChange {
// Identifier; presumably of the annotation whose legal basis is changed — TODO confirm.
private String id;
// User associated with the change; presumably the requester — TODO confirm.
private String user;
// Status of the change.
private Status status;
// Legal basis value carried by the change; presumably the new legal basis — TODO confirm.
private String legalBasis;
// Timestamps; NOTE(review): exact lifecycle semantics are not visible here — confirm with the producer.
private OffsetDateTime requestDate;
private OffsetDateTime processedDate;
private OffsetDateTime softDeletedTime;
}

View File

@ -5,6 +5,7 @@ import lombok.Builder;
import lombok.Data; import lombok.Data;
import lombok.NoArgsConstructor; import lombok.NoArgsConstructor;
import java.time.OffsetDateTime;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
@ -22,11 +23,12 @@ public class ManualRedactionEntry {
private String legalBasis; private String legalBasis;
private List<Rectangle> positions = new ArrayList<>(); private List<Rectangle> positions = new ArrayList<>();
private Status status; private Status status;
private boolean addToDictionary; private boolean addToDictionary;
private String section;
private int sectionNumber;
private boolean addToDossierDictionary; private boolean addToDossierDictionary;
private OffsetDateTime requestDate;
private OffsetDateTime processedDate;
private OffsetDateTime softDeletedTime;
} }

View File

@ -1,5 +1,5 @@
package com.iqser.red.service.redaction.v1.model; package com.iqser.red.service.redaction.v1.model;
public enum ManualRedactionType { public enum ManualRedactionType {
ADD, REMOVE, FORCE_REDACT ADD, REMOVE, FORCE_REDACT, RECATEGORIZE, LEGAL_BASIS_CHANGE
} }

View File

@ -26,6 +26,12 @@ public class ManualRedactions {
@Builder.Default @Builder.Default
private Set<ManualRedactionEntry> entriesToAdd = new HashSet<>(); private Set<ManualRedactionEntry> entriesToAdd = new HashSet<>();
@Builder.Default
private Set<ManualImageRecategorization> imageRecategorizations = new HashSet<>();
@Builder.Default
private Set<ManualLegalBasisChange> manualLegalBasisChanges = new HashSet<>();
@Builder.Default @Builder.Default
private Map<String, List<Comment>> comments = new HashMap<>(); private Map<String, List<Comment>> comments = new HashMap<>();

View File

@ -1,47 +0,0 @@
package com.iqser.red.service.redaction.v1.model;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.util.ArrayList;
import java.util.List;
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class RedactionChangeLogEntry {
private String id;
private String type;
private String value;
private String reason;
private int matchedRule;
private String legalBasis;
private boolean redacted;
private boolean isHint;
private boolean isRecommendation;
private String section;
private float[] color;
@Builder.Default
private List<Rectangle> positions = new ArrayList<>();
private int sectionNumber;
private boolean manual;
private Status status;
private ManualRedactionType manualRedactionType;
private boolean isDictionaryEntry;
private String textBefore;
private String textAfter;
@Builder.Default
private List<Comment> comments = new ArrayList<>();
private ChangeType changeType;
private boolean isDossierDictionaryEntry;
}

View File

@ -1,27 +1,29 @@
package com.iqser.red.service.redaction.v1.model; package com.iqser.red.service.redaction.v1.model;
import com.iqser.red.service.configuration.v1.api.model.LegalBasisMapping;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import lombok.Data; import lombok.Data;
import java.util.List; import java.util.List;
import com.iqser.red.service.persistence.service.v1.api.model.data.configuration.LegalBasis;
@Data @Data
@AllArgsConstructor @AllArgsConstructor
public class RedactionLog { public class RedactionLog {
private List<RedactionLogEntry> redactionLogEntry;
private List<LegalBasisMapping> legalBasis;
private String ruleSetId; /**
* Version 0 Redaction Logs have manual redactions merged inside them
* Version 1 Redaction Logs only contain system ( rule/dictionary ) redactions. Manual Redactions are merged in at runtime.
*/
private long analysisVersion;
private List<RedactionLogEntry> redactionLogEntry;
private List<LegalBasis> legalBasis;
private long dictionaryVersion = -1; private long dictionaryVersion = -1;
private long rulesVersion = -1;
private long dossierDictionaryVersion = -1; private long dossierDictionaryVersion = -1;
private long rulesVersion = -1;
private long legalBasisVersion = -1; private long legalBasisVersion = -1;
} }

View File

@ -0,0 +1,17 @@
package com.iqser.red.service.redaction.v1.model;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Pairs a {@link RedactionLog} with a flag telling whether it has changes.
 *
 * <p>Lombok generates accessors, {@code equals}/{@code hashCode}/{@code toString},
 * a builder, and both a no-argument and an all-arguments constructor
 * ({@code (redactionLog, hasChanges)}).
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
public class RedactionLogChanges {
// The redaction log this result refers to.
private RedactionLog redactionLog;
// Change flag. NOTE: for a primitive boolean named "hasChanges" Lombok generates the getter isHasChanges().
private boolean hasChanges;
}

View File

@ -7,13 +7,15 @@ import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor; import lombok.NoArgsConstructor;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Set;
@Data @Data
@Builder @Builder
@NoArgsConstructor @NoArgsConstructor
@AllArgsConstructor @AllArgsConstructor
@EqualsAndHashCode(of = "id") @EqualsAndHashCode
public class RedactionLogEntry { public class RedactionLogEntry {
private String id; private String id;
@ -34,6 +36,7 @@ public class RedactionLogEntry {
private boolean manual; private boolean manual;
private Status status; private Status status;
private ManualRedactionType manualRedactionType; private ManualRedactionType manualRedactionType;
private String manualRedactionUserId;
private boolean isDictionaryEntry; private boolean isDictionaryEntry;
private String textBefore; private String textBefore;
@ -46,7 +49,23 @@ public class RedactionLogEntry {
private int endOffset; private int endOffset;
private boolean isImage; private boolean isImage;
private boolean imageHasTransparency;
private boolean isDossierDictionaryEntry; private boolean isDossierDictionaryEntry;
private boolean excluded;
private String recategorizationType;
private String legalBasisChangeValue;
@EqualsAndHashCode.Exclude
@Builder.Default
private List<Change> changes = new ArrayList<>();
private Set<Engine> engines= new HashSet<>();
private Set<String> reference = new HashSet<>();
} }

View File

@ -1,5 +1,8 @@
package com.iqser.red.service.redaction.v1.model; package com.iqser.red.service.redaction.v1.model;
import java.util.HashSet;
import java.util.Set;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import lombok.Builder; import lombok.Builder;
import lombok.Data; import lombok.Data;
@ -11,8 +14,10 @@ import lombok.NoArgsConstructor;
@AllArgsConstructor @AllArgsConstructor
public class RedactionRequest { public class RedactionRequest {
private String projectId; private String dossierId;
private String fileId; private String fileId;
private String ruleSetId; private String dossierTemplateId;
private ManualRedactions manualRedactions; private ManualRedactions manualRedactions;
@Builder.Default
private Set<Integer> excludedPages = new HashSet<>();
} }

View File

@ -0,0 +1,14 @@
package com.iqser.red.service.redaction.v1.model;
import lombok.Data;
import java.util.ArrayList;
import java.util.List;
/**
 * Model made of two lists of {@link RuleElement}s: "when" clauses and "then" conditions.
 *
 * <p>Both lists are initialized to empty, mutable {@link ArrayList}s, so a freshly
 * constructed instance never exposes {@code null} collections. Lombok ({@code @Data})
 * generates accessors and {@code equals}/{@code hashCode}/{@code toString}.
 */
@Data
public class RuleBuilderModel {
// Elements of the "when" part.
private List<RuleElement> whenClauses = new ArrayList<>();
// Elements of the "then" part.
private List<RuleElement> thenConditions = new ArrayList<>();
}

View File

@ -8,15 +8,11 @@ import java.util.ArrayList;
import java.util.List; import java.util.List;
@Data @Data
@AllArgsConstructor
@NoArgsConstructor @NoArgsConstructor
public class RedactionChangeLog { @AllArgsConstructor
public class RuleElement {
private List<RedactionChangeLogEntry> redactionLogEntry = new ArrayList<>(); private String conditionName;
private List<Argument> arguments = new ArrayList<>();
private long dictionaryVersion = -1;
private long rulesVersion = -1;
private String ruleSetId;
} }

View File

@ -30,4 +30,5 @@ public class SectionRectangle {
private int numberOfParts; private int numberOfParts;
private List<CellRectangle> tableCells; private List<CellRectangle> tableCells;
} }

View File

@ -0,0 +1,18 @@
package com.iqser.red.service.redaction.v1.model;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Request object carrying a dossier id and a file id.
 *
 * <p>Lombok generates accessors, {@code equals}/{@code hashCode}/{@code toString},
 * a builder, and both a no-argument and an all-arguments constructor
 * ({@code (dossierId, fileId)}).
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class StructureAnalyzeRequest {
// Identifier of the dossier.
private String dossierId;
// Identifier of the file.
private String fileId;
}

View File

@ -8,12 +8,6 @@ import org.springframework.web.bind.annotation.RequestBody;
public interface RedactionResource { public interface RedactionResource {
String SERVICE_NAME = "redaction-service-v1";
String RULE_SET_PARAMETER_NAME = "ruleSetId";
String RULE_SET_PATH_VARIABLE = "/{" + RULE_SET_PARAMETER_NAME + "}";
@PostMapping(value = "/annotate", produces = MediaType.APPLICATION_JSON_VALUE, consumes = MediaType.APPLICATION_JSON_VALUE) @PostMapping(value = "/annotate", produces = MediaType.APPLICATION_JSON_VALUE, consumes = MediaType.APPLICATION_JSON_VALUE)
AnnotateResponse annotate(@RequestBody AnnotateRequest annotateRequest); AnnotateResponse annotate(@RequestBody AnnotateRequest annotateRequest);
@ -26,10 +20,10 @@ public interface RedactionResource {
@PostMapping(value = "/debug/htmlTables", produces = MediaType.APPLICATION_JSON_VALUE, consumes = MediaType.APPLICATION_JSON_VALUE) @PostMapping(value = "/debug/htmlTables", produces = MediaType.APPLICATION_JSON_VALUE, consumes = MediaType.APPLICATION_JSON_VALUE)
RedactionResult htmlTables(@RequestBody RedactionRequest redactionRequest); RedactionResult htmlTables(@RequestBody RedactionRequest redactionRequest);
@PostMapping(value = "/rules/update" + RULE_SET_PATH_VARIABLE, consumes = MediaType.APPLICATION_JSON_VALUE)
void updateRules(@PathVariable(RULE_SET_PARAMETER_NAME) String ruleSetId);
@PostMapping(value = "/rules/test", consumes = MediaType.APPLICATION_JSON_VALUE) @PostMapping(value = "/rules/test", consumes = MediaType.APPLICATION_JSON_VALUE)
void testRules(@RequestBody String rules); void testRules(@RequestBody String rules);
@PostMapping(value = "/redaction-log/preview", consumes = MediaType.APPLICATION_JSON_VALUE)
RedactionLog getRedactionLog(@RequestBody RedactionRequest redactionRequest);
} }

View File

@ -0,0 +1,12 @@
package com.iqser.red.service.redaction.v1.resources;
import com.iqser.red.service.redaction.v1.model.RuleBuilderModel;
import org.springframework.http.MediaType;
import org.springframework.web.bind.annotation.PostMapping;
/**
 * REST resource contract exposing the rule builder model.
 */
public interface RuleBuilderResource {
/**
 * Mapped to {@code POST /rule-builder-model}; the response is produced as JSON.
 *
 * @return the {@link RuleBuilderModel}
 */
@PostMapping(value = "/rule-builder-model", produces = MediaType.APPLICATION_JSON_VALUE)
RuleBuilderModel getRuleBuilderModel();
}

View File

@ -21,21 +21,6 @@
<artifactId>redaction-service-api-v1</artifactId> <artifactId>redaction-service-api-v1</artifactId>
<version>${project.version}</version> <version>${project.version}</version>
</dependency> </dependency>
<dependency>
<groupId>com.iqser.red.service</groupId>
<artifactId>file-management-service-api-v1</artifactId>
<version>2.7.4</version>
<exclusions>
<exclusion>
<groupId>com.iqser.red.service</groupId>
<artifactId>redaction-service-api-v1</artifactId>
</exclusion>
<exclusion>
<groupId>com.iqser.red.service</groupId>
<artifactId>configuration-service-api-v1</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency> <dependency>
<groupId>org.drools</groupId> <groupId>org.drools</groupId>
<artifactId>drools-core</artifactId> <artifactId>drools-core</artifactId>

View File

@ -1,19 +1,14 @@
package com.iqser.red.service.redaction.v1.server.classification.model; package com.iqser.red.service.redaction.v1.server.classification.model;
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry; import java.util.ArrayList;
import java.util.List;
import com.iqser.red.service.redaction.v1.model.SectionGrid; import com.iqser.red.service.redaction.v1.model.SectionGrid;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryVersion; import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryVersion;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
import lombok.Data; import lombok.Data;
import lombok.NoArgsConstructor; import lombok.NoArgsConstructor;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
@Data @Data
@NoArgsConstructor @NoArgsConstructor
public class Document { public class Document {
@ -23,20 +18,14 @@ public class Document {
private List<Header> headers = new ArrayList<>(); private List<Header> headers = new ArrayList<>();
private List<Footer> footers = new ArrayList<>(); private List<Footer> footers = new ArrayList<>();
private List<UnclassifiedText> unclassifiedTexts = new ArrayList<>(); private List<UnclassifiedText> unclassifiedTexts = new ArrayList<>();
private Map<Integer, List<Entity>> entities = new HashMap<>();
private FloatFrequencyCounter textHeightCounter = new FloatFrequencyCounter(); private FloatFrequencyCounter textHeightCounter = new FloatFrequencyCounter();
private FloatFrequencyCounter fontSizeCounter = new FloatFrequencyCounter(); private FloatFrequencyCounter fontSizeCounter = new FloatFrequencyCounter();
private StringFrequencyCounter fontCounter = new StringFrequencyCounter(); private StringFrequencyCounter fontCounter = new StringFrequencyCounter();
private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter(); private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter();
private boolean headlines; private boolean headlines;
private List<RedactionLogEntry> redactionLogEntities = new ArrayList<>();
private SectionGrid sectionGrid = new SectionGrid(); private SectionGrid sectionGrid = new SectionGrid();
private DictionaryVersion dictionaryVersion; private DictionaryVersion dictionaryVersion;
private long rulesVersion; private long rulesVersion;
private List<SectionText> sectionText = new ArrayList<>();
private Map<Integer, Set<Image>> images = new HashMap<>();
} }

View File

@ -0,0 +1,6 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
/**
 * Orientation tag with three values: none, left and right.
 *
 * <p>The declaration order is kept stable so that ordinals do not change.
 */
public enum Orientation {
    NONE,
    LEFT,
    RIGHT
}

View File

@ -32,6 +32,7 @@ public class TextBlock extends AbstractTextContainer {
private String classification; private String classification;
public TextBlock(float minX, float maxX, float minY, float maxY, List<TextPositionSequence> sequences, int rotation) { public TextBlock(float minX, float maxX, float minY, float maxY, List<TextPositionSequence> sequences, int rotation) {
this.minX = minX; this.minX = minX;
this.maxX = maxX; this.maxX = maxX;

View File

@ -1,6 +1,7 @@
package com.iqser.red.service.redaction.v1.server.classification.service; package com.iqser.red.service.redaction.v1.server.classification.service;
import com.iqser.red.service.redaction.v1.server.classification.model.FloatFrequencyCounter; import com.iqser.red.service.redaction.v1.server.classification.model.FloatFrequencyCounter;
import com.iqser.red.service.redaction.v1.server.classification.model.Orientation;
import com.iqser.red.service.redaction.v1.server.classification.model.Page; import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.model.StringFrequencyCounter; import com.iqser.red.service.redaction.v1.server.classification.model.StringFrequencyCounter;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
@ -11,16 +12,21 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Iterator;
import java.util.List; import java.util.List;
@Service @Service
@SuppressWarnings("all") @SuppressWarnings("all")
public class BlockificationService { public class BlockificationService {
public Page blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) { static final float THRESHOLD = 1f;
public Page blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines,
List<Ruling> verticalRulingLines) {
List<TextPositionSequence> chunkWords = new ArrayList<>(); List<TextPositionSequence> chunkWords = new ArrayList<>();
List<AbstractTextContainer> chunkBlockList1 = new ArrayList<>(); List<AbstractTextContainer> chunkBlockList1 = new ArrayList<>();
@ -28,21 +34,46 @@ public class BlockificationService {
float minX = 1000, maxX = 0, minY = 1000, maxY = 0; float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
TextPositionSequence prev = null; TextPositionSequence prev = null;
boolean wasSplitted = false;
Float splitX1 = null;
for (TextPositionSequence word : textPositions) { for (TextPositionSequence word : textPositions) {
boolean lineSeparation = minY - word.getY2() > word.getHeight() * 1.25; boolean lineSeparation = minY - word.getY2() > word.getHeight() * 1.25;
boolean startFromTop = word.getY1() > maxY + word.getHeight(); boolean startFromTop = word.getY1() > maxY + word.getHeight();
boolean splitByX = prev != null && maxX + 50 < word.getX1() && prev.getY1() == word.getY1();
boolean newLineAfterSplit = prev != null && word.getY1() != prev.getY1() && wasSplitted && splitX1 != word.getX1();
boolean splittedByRuling = word.getRotation() == 0 && isSplittedByRuling(maxX, minY, word.getX1(), word.getY1(), verticalRulingLines) || word
.getRotation() == 0 && isSplittedByRuling(minX, minY, word.getX1(), word.getY2(), horizontalRulingLines) || word
.getRotation() == 90 && isSplittedByRuling(maxX, minY, word.getX1(), word.getY1(), horizontalRulingLines) || word
.getRotation() == 90 && isSplittedByRuling(minX, minY, word.getX1(), word.getY2(), verticalRulingLines);
if (prev != null && (lineSeparation || startFromTop || word.getRotation() == 0 && isSplittedByRuling(maxX, minY, word if (prev != null && (lineSeparation || startFromTop || splitByX || newLineAfterSplit || splittedByRuling)) {
.getX1(), word.getY1(), verticalRulingLines) || word.getRotation() == 0 && isSplittedByRuling(minX, minY, word
.getX1(), word.getY2(), horizontalRulingLines) || word.getRotation() == 90 && isSplittedByRuling(maxX, minY, word Orientation prevOrientation = null;
.getX1(), word.getY1(), horizontalRulingLines) || word.getRotation() == 90 && isSplittedByRuling(minX, minY, word if(!chunkBlockList1.isEmpty()) {
.getX1(), word.getY2(), verticalRulingLines))) { prevOrientation = chunkBlockList1.get(chunkBlockList1.size() - 1).getOrientation();
}
TextBlock cb1 = buildTextBlock(chunkWords); TextBlock cb1 = buildTextBlock(chunkWords);
chunkBlockList1.add(cb1); chunkBlockList1.add(cb1);
chunkWords = new ArrayList<>(); chunkWords = new ArrayList<>();
if (splitByX && !splittedByRuling) {
wasSplitted = true;
cb1.setOrientation(Orientation.LEFT);
splitX1 = word.getX1();
} else
if (newLineAfterSplit && !splittedByRuling) {
wasSplitted = false;
cb1.setOrientation(Orientation.RIGHT);
splitX1 = null;
} else
if(prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !splittedByRuling)){
cb1.setOrientation(Orientation.LEFT);
}
minX = 1000; minX = 1000;
maxX = 0; maxX = 0;
minY = 1000; minY = 1000;
@ -72,9 +103,62 @@ public class BlockificationService {
chunkBlockList1.add(cb1); chunkBlockList1.add(cb1);
} }
Iterator<AbstractTextContainer> itty = chunkBlockList1.iterator();
TextBlock previousLeft = null;
TextBlock previousRight = null;
while (itty.hasNext()) {
TextBlock block = (TextBlock) itty.next();
if(previousLeft != null && block.getOrientation().equals(Orientation.LEFT)){
if (previousLeft.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousLeft.getMinY()){
previousLeft.add(block);
itty.remove();
continue;
}
}
if(previousRight != null && block.getOrientation().equals(Orientation.RIGHT)){
if (previousRight.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousRight.getMinY()){
previousRight.add(block);
itty.remove();
continue;
}
}
if (block.getOrientation().equals(Orientation.LEFT)) {
previousLeft = block;
} else if (block.getOrientation().equals(Orientation.RIGHT)) {
previousRight = block;
}
}
itty = chunkBlockList1.iterator();
TextBlock previous = null;
while (itty.hasNext()) {
TextBlock block = (TextBlock) itty.next();
if(previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(), previous
.getMaxY())||
previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous
.getMaxY())){
previous.add(block);
itty.remove();
continue;
}
previous = block;
}
return new Page(chunkBlockList1); return new Page(chunkBlockList1);
} }
private boolean equalsWithThreshold(float f1, float f2){
return Math.abs(f1 - f2) < THRESHOLD;
}
private TextBlock buildTextBlock(List<TextPositionSequence> wordBlockList) { private TextBlock buildTextBlock(List<TextPositionSequence> wordBlockList) {
@ -117,7 +201,8 @@ public class BlockificationService {
} }
private boolean isSplittedByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines) { private boolean isSplittedByRuling(float previousX2, float previousY1, float currentX1, float currentY1,
List<Ruling> rulingLines) {
for (Ruling ruling : rulingLines) { for (Ruling ruling : rulingLines) {
if (ruling.intersectsLine(previousX2, previousY1, currentX1, currentY1)) { if (ruling.intersectsLine(previousX2, previousY1, currentX1, currentY1)) {
@ -128,7 +213,8 @@ public class BlockificationService {
} }
public Rectangle calculateBodyTextFrame(List<Page> pages, FloatFrequencyCounter documentFontSizeCounter, boolean landscape) { public Rectangle calculateBodyTextFrame(List<Page> pages, FloatFrequencyCounter documentFontSizeCounter,
boolean landscape) {
float minX = 10000; float minX = 10000;
float maxX = -100; float maxX = -100;

View File

@ -1,8 +1,9 @@
package com.iqser.red.service.redaction.v1.server.client; package com.iqser.red.service.redaction.v1.server.client;
import com.iqser.red.service.configuration.v1.api.resource.DictionaryResource;
import org.springframework.cloud.openfeign.FeignClient; import org.springframework.cloud.openfeign.FeignClient;
@FeignClient(name = "DictionaryResource", url = "${configuration-service.url}") import com.iqser.red.service.persistence.service.v1.api.resources.DictionaryResource;
@FeignClient(name = "DictionaryResource", url = "${persistence-service.url}")
public interface DictionaryClient extends DictionaryResource { public interface DictionaryClient extends DictionaryResource {
} }

View File

@ -0,0 +1,19 @@
package com.iqser.red.service.redaction.v1.server.client;
import java.util.List;
import java.util.Map;
import org.springframework.cloud.openfeign.FeignClient;
import org.springframework.http.MediaType;
import org.springframework.web.bind.annotation.PostMapping;
import com.iqser.red.service.redaction.v1.server.client.model.EntityRecogintionEntity;
import com.iqser.red.service.redaction.v1.server.client.model.EntityRecognitionRequest;
import com.iqser.red.service.redaction.v1.server.client.model.NerEntities;
/**
 * Feign client for the entity-recognition service. The base URL is resolved from the
 * {@code entity-recognition-service.url} property.
 */
@FeignClient(name = "EntityRecognitionClient", url = "${entity-recognition-service.url}")
public interface EntityRecognitionClient {
/**
 * Issues a POST to {@code /find_authors}. The mapping declares JSON for both
 * {@code consumes} and {@code produces}.
 *
 * @param entityRecognitionRequest the request payload (no explicit {@code @RequestBody};
 *        presumably serialized as the request body by the Feign contract - confirm)
 * @return the {@link NerEntities} deserialized from the response
 */
@PostMapping(value = "/find_authors", produces = MediaType.APPLICATION_JSON_VALUE, consumes = MediaType.APPLICATION_JSON_VALUE)
NerEntities findAuthors(EntityRecognitionRequest entityRecognitionRequest);
}

View File

@ -1,9 +1,10 @@
package com.iqser.red.service.redaction.v1.server.client; package com.iqser.red.service.redaction.v1.server.client;
import com.iqser.red.service.file.management.v1.api.resources.FileStatusProcessingUpdateResource;
import org.springframework.cloud.openfeign.FeignClient; import org.springframework.cloud.openfeign.FeignClient;
@FeignClient(name = "FileStatusProcessingUpdateResource", url = "${file-management-service.url}") import com.iqser.red.service.persistence.service.v1.api.resources.FileStatusProcessingUpdateResource;
@FeignClient(name = "FileStatusProcessingUpdateResource", url = "${persistence-service.url}")
public interface FileStatusProcessingUpdateClient extends FileStatusProcessingUpdateResource { public interface FileStatusProcessingUpdateClient extends FileStatusProcessingUpdateResource {
} }

View File

@ -1,8 +1,9 @@
package com.iqser.red.service.redaction.v1.server.client; package com.iqser.red.service.redaction.v1.server.client;
import com.iqser.red.service.configuration.v1.api.resource.LegalBasisMappingResource;
import org.springframework.cloud.openfeign.FeignClient; import org.springframework.cloud.openfeign.FeignClient;
@FeignClient(name = "LegalBasisMappingResource", url = "${configuration-service.url}") import com.iqser.red.service.persistence.service.v1.api.resources.LegalBasisMappingResource;
@FeignClient(name = "LegalBasisMappingResource", url = "${persistence-service.url}")
public interface LegalBasisClient extends LegalBasisMappingResource { public interface LegalBasisClient extends LegalBasisMappingResource {
} }

View File

@ -1,8 +1,9 @@
package com.iqser.red.service.redaction.v1.server.client; package com.iqser.red.service.redaction.v1.server.client;
import com.iqser.red.service.configuration.v1.api.resource.RulesResource;
import org.springframework.cloud.openfeign.FeignClient; import org.springframework.cloud.openfeign.FeignClient;
@FeignClient(name = "RulesResource", url = "${configuration-service.url}") import com.iqser.red.service.persistence.service.v1.api.resources.RulesResource;
@FeignClient(name = "RulesResource", url = "${persistence-service.url}")
public interface RulesClient extends RulesResource { public interface RulesClient extends RulesResource {
} }

View File

@ -0,0 +1,19 @@
package com.iqser.red.service.redaction.v1.server.client.model;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Data holder for one entity in the entity-recognition client model.
 * Lombok generates accessors, a builder, and both the no-args and all-args constructors;
 * the all-args constructor takes the fields in declaration order.
 *
 * NOTE(review): the class name is misspelled ("Recogintion"); renaming it would affect
 * every usage, so it is left as is.
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
public class EntityRecogintionEntity {
// Presumably the matched text of the entity - TODO confirm against the service contract.
private String value;
// Presumably offsets into the text of the analysed section; whether endOffset is
// inclusive or exclusive is not visible here - TODO confirm.
private int startOffset;
private int endOffset;
// Entity type label; the set of possible values is not visible here.
private String type;
}

View File

@ -0,0 +1,18 @@
package com.iqser.red.service.redaction.v1.server.client.model;
import java.util.List;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Request payload type accepted by {@code EntityRecognitionClient.findAuthors}.
 * Lombok generates accessors, a builder, and both the no-args and all-args constructors.
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
public class EntityRecognitionRequest {
// The sections to analyse. No default is assigned, so this is null unless set.
private List<EntityRecognitionSection> data;
}

View File

@ -0,0 +1,20 @@
package com.iqser.red.service.redaction.v1.server.client.model;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Holds recognized entities grouped under an integer key.
 * Lombok generates accessors, a builder, and both the no-args and all-args constructors.
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
public class EntityRecognitionResult {
// Starts as an empty mutable map when built without an explicit value.
// The key is presumably the section number - TODO confirm.
@Builder.Default
private Map<Integer, List<EntityRecogintionEntity>> entities = new HashMap<>();
}

View File

@ -0,0 +1,16 @@
package com.iqser.red.service.redaction.v1.server.client.model;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * One section of text, identified by its number; element type of
 * {@code EntityRecognitionRequest.data}.
 * Lombok generates accessors, a builder, and both the no-args and all-args constructors;
 * the all-args constructor takes {@code sectionNumber} first, then {@code text}.
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
public class EntityRecognitionSection {
// Number identifying the section within its document.
private int sectionNumber;
// The section's text content.
private String text;
}

View File

@ -0,0 +1,21 @@
package com.iqser.red.service.redaction.v1.server.client.model;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Return type of {@code EntityRecognitionClient.findAuthors}: recognized entities grouped
 * under an integer key.
 * Lombok generates accessors, a builder, and both the no-args and all-args constructors.
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
public class NerEntities {
// Starts as an empty mutable map when built without an explicit value.
// The key is presumably the section number - TODO confirm.
@Builder.Default
private Map<Integer, List<EntityRecogintionEntity>> result = new HashMap<>();
}

View File

@ -1,10 +1,7 @@
package com.iqser.red.service.redaction.v1.server.controller; package com.iqser.red.service.redaction.v1.server.controller;
import com.iqser.red.service.file.management.v1.api.model.FileType; import com.iqser.red.service.persistence.service.v1.api.model.FileType;
import com.iqser.red.service.redaction.v1.model.AnnotateRequest; import com.iqser.red.service.redaction.v1.model.*;
import com.iqser.red.service.redaction.v1.model.AnnotateResponse;
import com.iqser.red.service.redaction.v1.model.RedactionRequest;
import com.iqser.red.service.redaction.v1.model.RedactionResult;
import com.iqser.red.service.redaction.v1.resources.RedactionResource; import com.iqser.red.service.redaction.v1.resources.RedactionResource;
import com.iqser.red.service.redaction.v1.server.classification.model.Document; import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Page; import com.iqser.red.service.redaction.v1.server.classification.model.Page;
@ -12,6 +9,7 @@ import com.iqser.red.service.redaction.v1.server.exception.RedactionException;
import com.iqser.red.service.redaction.v1.server.redaction.service.AnnotationService; import com.iqser.red.service.redaction.v1.server.redaction.service.AnnotationService;
import com.iqser.red.service.redaction.v1.server.redaction.service.DictionaryService; import com.iqser.red.service.redaction.v1.server.redaction.service.DictionaryService;
import com.iqser.red.service.redaction.v1.server.redaction.service.DroolsExecutionService; import com.iqser.red.service.redaction.v1.server.redaction.service.DroolsExecutionService;
import com.iqser.red.service.redaction.v1.server.redaction.service.RedactionLogMergeService;
import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService; import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService; import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
@ -21,7 +19,6 @@ import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.io.MemoryUsageSetting; import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.RequestBody; import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RestController; import org.springframework.web.bind.annotation.RestController;
@ -39,18 +36,18 @@ public class RedactionController implements RedactionResource {
private final AnnotationService annotationService; private final AnnotationService annotationService;
private final PdfSegmentationService pdfSegmentationService; private final PdfSegmentationService pdfSegmentationService;
private final RedactionStorageService redactionStorageService; private final RedactionStorageService redactionStorageService;
private final RedactionLogMergeService redactionLogMergeService;
public AnnotateResponse annotate(@RequestBody AnnotateRequest annotateRequest) { public AnnotateResponse annotate(@RequestBody AnnotateRequest annotateRequest) {
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(annotateRequest.getProjectId(), annotateRequest.getFileId(), FileType.ORIGIN)); var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(annotateRequest.getDossierId(), annotateRequest.getFileId(), FileType.ORIGIN));
var redactionLog = redactionStorageService.getRedactionLog(annotateRequest.getProjectId(), annotateRequest.getFileId()); var redactionLog = redactionStorageService.getRedactionLog(annotateRequest.getDossierId(), annotateRequest.getFileId());
var sectionsGrid = redactionStorageService.getSectionGrid(annotateRequest.getProjectId(), annotateRequest.getFileId()); var sectionsGrid = redactionStorageService.getSectionGrid(annotateRequest.getDossierId(), annotateRequest.getFileId());
try (PDDocument pdDocument = PDDocument.load(storedObjectStream, MemoryUsageSetting.setupTempFileOnly())) { try (PDDocument pdDocument = PDDocument.load(storedObjectStream, MemoryUsageSetting.setupTempFileOnly())) {
pdDocument.setAllSecurityToBeRemoved(true); pdDocument.setAllSecurityToBeRemoved(true);
dictionaryService.updateDictionary(redactionLog.getRuleSetId(), annotateRequest.getProjectId()); dictionaryService.updateDictionary(annotateRequest.getDossierTemplateId(), annotateRequest.getDossierId());
annotationService.annotate(pdDocument, redactionLog, sectionsGrid); annotationService.annotate(pdDocument, redactionLog, sectionsGrid);
try (ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream()) { try (ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream()) {
@ -66,11 +63,11 @@ public class RedactionController implements RedactionResource {
@Override @Override
public RedactionResult classify(@RequestBody RedactionRequest redactionRequest) { public RedactionResult classify(@RequestBody RedactionRequest redactionRequest) {
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN)); var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getDossierId(), redactionRequest.getFileId(), FileType.ORIGIN));
try { try {
Document classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream); Document classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream);
storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN)); storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getDossierId(), redactionRequest.getFileId(), FileType.ORIGIN));
try (PDDocument pdDocument = PDDocument.load(storedObjectStream)) { try (PDDocument pdDocument = PDDocument.load(storedObjectStream)) {
pdDocument.setAllSecurityToBeRemoved(true); pdDocument.setAllSecurityToBeRemoved(true);
@ -91,11 +88,11 @@ public class RedactionController implements RedactionResource {
@Override @Override
public RedactionResult sections(@RequestBody RedactionRequest redactionRequest) { public RedactionResult sections(@RequestBody RedactionRequest redactionRequest) {
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN)); var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getDossierId(), redactionRequest.getFileId(), FileType.ORIGIN));
try { try {
Document classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream); Document classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream);
storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN)); storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getDossierId(), redactionRequest.getFileId(), FileType.ORIGIN));
try (PDDocument pdDocument = PDDocument.load(storedObjectStream)) { try (PDDocument pdDocument = PDDocument.load(storedObjectStream)) {
pdDocument.setAllSecurityToBeRemoved(true); pdDocument.setAllSecurityToBeRemoved(true);
@ -120,7 +117,7 @@ public class RedactionController implements RedactionResource {
Document classifiedDoc; Document classifiedDoc;
try { try {
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN)); var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getDossierId(), redactionRequest.getFileId(), FileType.ORIGIN));
classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream, true); classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream, true);
} catch (Exception e) { } catch (Exception e) {
throw new RedactionException(e); throw new RedactionException(e);
@ -141,20 +138,28 @@ public class RedactionController implements RedactionResource {
} }
@Override
public void updateRules(@PathVariable(RULE_SET_PARAMETER_NAME) String ruleSetId) {
droolsExecutionService.updateRules(ruleSetId);
}
@Override @Override
public void testRules(@RequestBody String rules) { public void testRules(@RequestBody String rules) {
droolsExecutionService.testRules(rules); droolsExecutionService.testRules(rules);
} }
/**
 * Loads the stored redaction log for the requested file after refreshing the dictionary
 * for the request's dossier template and dossier.
 *
 * A log whose analysis version is 0 is returned unchanged; any other log is passed
 * through {@code redactionLogMergeService} together with the request's manual redactions
 * and excluded pages.
 *
 * @param redactionRequest identifies the dossier template, dossier and file, and carries
 *        the manual redactions and excluded pages used for merging
 * @return the stored log (analysis version 0) or the merged log
 */
@Override
public RedactionLog getRedactionLog(RedactionRequest redactionRequest) {

    log.info("Requested preview for: {}", redactionRequest);

    var dossierTemplateId = redactionRequest.getDossierTemplateId();
    var dossierId = redactionRequest.getDossierId();

    dictionaryService.updateDictionary(dossierTemplateId, dossierId);

    var storedLog = redactionStorageService.getRedactionLog(dossierId, redactionRequest.getFileId());
    log.info("Loaded redaction log with computationalVersion: {}", storedLog.getAnalysisVersion());

    if (storedLog.getAnalysisVersion() != 0) {
        return redactionLogMergeService.mergeRedactionLogData(
                storedLog,
                dossierTemplateId,
                redactionRequest.getManualRedactions(),
                redactionRequest.getExcludedPages());
    }

    // Logs with analysis version 0 are old and are handed back without merging.
    return storedLog;
}
private RedactionResult convert(PDDocument document, int numberOfPages) throws IOException { private RedactionResult convert(PDDocument document, int numberOfPages) throws IOException {

View File

@ -0,0 +1,21 @@
package com.iqser.red.service.redaction.v1.server.controller;
import com.iqser.red.service.redaction.v1.model.RuleBuilderModel;
import com.iqser.red.service.redaction.v1.resources.RuleBuilderResource;
import com.iqser.red.service.redaction.v1.server.redaction.rulebuilder.RuleBuilderModelService;
import lombok.RequiredArgsConstructor;
import org.springframework.web.bind.annotation.RestController;
/**
 * REST controller implementing {@link RuleBuilderResource}; every call is delegated to
 * {@link RuleBuilderModelService}. The service is injected through the constructor that
 * Lombok generates for the final field.
 */
@RestController
@RequiredArgsConstructor
public class RuleBuilderController implements RuleBuilderResource {
private final RuleBuilderModelService ruleBuilderModelService;
/**
 * @return the rule builder model exactly as produced by the service
 */
@Override
public RuleBuilderModel getRuleBuilderModel() {
return ruleBuilderModelService.getRuleBuilderModel();
}
}

View File

@ -46,6 +46,17 @@ public class PDFAreaTextStripper extends PDFTextStripperByArea {
startIndex = i; startIndex = i;
} }
if (textPositions.get(i).getRotation() == 0 && i > 0 && textPositions.get(i).getX() > textPositions.get(i - 1).getEndX() + 1) {
List<TextPosition> sublist = textPositions.subList(startIndex, i);
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0)
.getUnicode()
.equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
}
startIndex = i;
}
if (i > 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i) if (i > 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i)
.getUnicode() .getUnicode()
.equals("\u00A0")) && i <= textPositions.size() - 2) { .equals("\u00A0")) && i <= textPositions.size() - 2) {

View File

@ -189,18 +189,16 @@ public class PDFLinesTextStripper extends PDFTextStripper {
COSName objectName = (COSName) arguments.get(0); COSName objectName = (COSName) arguments.get(0);
PDXObject xobject = getResources().getXObject(objectName); PDXObject xobject = getResources().getXObject(objectName);
if (xobject instanceof PDImageXObject) { if (xobject instanceof PDImageXObject) {
PDImageXObject pdfImage = (PDImageXObject) xobject; PDImageXObject image = (PDImageXObject)xobject;
Matrix ctmNew = getGraphicsState().getCurrentTransformationMatrix();
Rectangle2D imageBounds = calculateImagePosition(pdfImage); Rectangle2D rect = new Rectangle2D.Float(ctmNew.getTranslateX(), ctmNew.getTranslateY(), ctmNew.getScaleX(), ctmNew.getScaleY());
Rectangle2D rect = new Rectangle2D.Float((float) imageBounds.getX(), (float) imageBounds.getY(), (float) imageBounds
.getWidth(), (float) imageBounds.getHeight());
// Memory Hack - sofReference kills me // Memory Hack - sofReference kills me
FieldUtils.writeField(pdfImage, "cachedImageSubsampling", -1, true); FieldUtils.writeField(image, "cachedImageSubsampling", -1, true);
if (rect.getHeight() > 2 && rect.getWidth() > 2) { if (rect.getHeight() > 2 && rect.getWidth() > 2) {
this.images.add(new PdfImage(pdfImage.getImage(), rect, pageNumber)); this.images.add(new PdfImage(image.getImage(), rect, pageNumber, image.getImage().getColorModel().hasAlpha()));
} }
} }
} catch (Exception e) { } catch (Exception e) {
@ -209,21 +207,6 @@ public class PDFLinesTextStripper extends PDFTextStripper {
} }
private Rectangle2D calculateImagePosition(PDImageXObject pdfImage) throws IOException {
Matrix ctm = getGraphicsState().getCurrentTransformationMatrix();
Rectangle2D imageBounds = pdfImage.getImage().getRaster().getBounds();
AffineTransform imageTransform = new AffineTransform(ctm.createAffineTransform());
imageTransform.scale(1.0 / pdfImage.getWidth(), -1.0 / pdfImage.getHeight());
imageTransform.translate(0, -pdfImage.getHeight());
AffineTransform pageTransform = new AffineTransform();
pageTransform.concatenate(imageTransform);
return pageTransform.createTransformedShape(imageBounds).getBounds2D();
}
private float floatValue(COSBase value) { private float floatValue(COSBase value) {
@ -300,6 +283,18 @@ public class PDFLinesTextStripper extends PDFTextStripper {
startIndex = i; startIndex = i;
} }
if (textPositions.get(i).getRotation() == 0 && i > 0 && textPositions.get(i).getX() > textPositions.get(i - 1).getEndX() + 1) {
List<TextPosition> sublist = textPositions.subList(startIndex, i);
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0)
.getUnicode()
.equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
}
startIndex = i;
}
if (i > 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i) if (i > 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i)
.getUnicode() .getUnicode()
.equals("\u00A0")) && i <= textPositions.size() - 2) { .equals("\u00A0")) && i <= textPositions.size() - 2) {

View File

@ -21,7 +21,7 @@ public class MessagingConfiguration {
return QueueBuilder.durable(REDACTION_QUEUE) return QueueBuilder.durable(REDACTION_QUEUE)
.withArgument("x-dead-letter-exchange", "") .withArgument("x-dead-letter-exchange", "")
.withArgument("x-dead-letter-routing-key", REDACTION_QUEUE) .withArgument("x-dead-letter-routing-key", REDACTION_DQL)
.maxPriority(2) .maxPriority(2)
.build(); .build();
} }

View File

@ -4,10 +4,14 @@ import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.redaction.v1.model.AnalyzeRequest; import com.iqser.red.service.redaction.v1.model.AnalyzeRequest;
import com.iqser.red.service.redaction.v1.model.AnalyzeResult; import com.iqser.red.service.redaction.v1.model.AnalyzeResult;
import com.iqser.red.service.redaction.v1.model.StructureAnalyzeRequest;
import com.iqser.red.service.redaction.v1.server.client.FileStatusProcessingUpdateClient; import com.iqser.red.service.redaction.v1.server.client.FileStatusProcessingUpdateClient;
import com.iqser.red.service.redaction.v1.server.redaction.service.ReanalyzeService; import com.iqser.red.service.redaction.v1.server.redaction.service.AnalyzeService;
import com.iqser.red.service.redaction.v1.server.redaction.service.NerAnalyserService;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.springframework.amqp.rabbit.annotation.RabbitHandler; import org.springframework.amqp.rabbit.annotation.RabbitHandler;
import org.springframework.amqp.rabbit.annotation.RabbitListener; import org.springframework.amqp.rabbit.annotation.RabbitListener;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
@ -21,8 +25,10 @@ import static com.iqser.red.service.redaction.v1.server.queue.MessagingConfigura
public class RedactionMessageReceiver { public class RedactionMessageReceiver {
private final ObjectMapper objectMapper; private final ObjectMapper objectMapper;
private final ReanalyzeService reanalyzeService; private final AnalyzeService analyzeService;
private final FileStatusProcessingUpdateClient fileStatusProcessingUpdateClient; private final FileStatusProcessingUpdateClient fileStatusProcessingUpdateClient;
private final NerAnalyserService nerAnalyserService;
@RabbitHandler @RabbitHandler
@RabbitListener(queues = REDACTION_QUEUE) @RabbitListener(queues = REDACTION_QUEUE)
@ -32,15 +38,25 @@ public class RedactionMessageReceiver {
log.info("Processing analyze request: {}", analyzeRequest); log.info("Processing analyze request: {}", analyzeRequest);
AnalyzeResult result; AnalyzeResult result;
if (analyzeRequest.isReanalyseOnlyIfPossible()) { if (analyzeRequest.isReanalyseOnlyIfPossible()) {
result = reanalyzeService.reanalyze(analyzeRequest); result = analyzeService.reanalyze(analyzeRequest);
log.info("Successfully reanalyzed dossier {} file {} took: {}", analyzeRequest.getDossierId(), analyzeRequest
.getFileId(), result.getDuration());
} else { } else {
result = reanalyzeService.analyze(analyzeRequest); // TODO Seperate stucture analysis by other queue
} analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(analyzeRequest.getDossierId(), analyzeRequest.getFileId()));
log.info("Successfully analyzed {}", analyzeRequest);
fileStatusProcessingUpdateClient.analysisSuccessful(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), result); // TODO NerEntities should be computed and stored in entity-recognition-service, should be triggered by a seperate queue after structure analysis
nerAnalyserService.computeNerEntities(analyzeRequest.getDossierId(), analyzeRequest.getFileId());
result = analyzeService.analyze(analyzeRequest);
log.info("Successfully analyzed dossier {} file {} took: {}", analyzeRequest.getDossierId(), analyzeRequest.getFileId(), result
.getDuration());
} }
fileStatusProcessingUpdateClient.analysisSuccessful(analyzeRequest.getDossierId(), analyzeRequest.getFileId(), result);
}
@RabbitHandler @RabbitHandler
@RabbitListener(queues = REDACTION_DQL) @RabbitListener(queues = REDACTION_DQL)
public void receiveAnalyzeRequestDQL(String in) throws JsonProcessingException { public void receiveAnalyzeRequestDQL(String in) throws JsonProcessingException {
@ -48,7 +64,7 @@ public class RedactionMessageReceiver {
var analyzeRequest = objectMapper.readValue(in, AnalyzeRequest.class); var analyzeRequest = objectMapper.readValue(in, AnalyzeRequest.class);
log.info("Failed to process analyze request: {}", analyzeRequest); log.info("Failed to process analyze request: {}", analyzeRequest);
fileStatusProcessingUpdateClient.analysisFailed(analyzeRequest.getProjectId(), analyzeRequest.getFileId()); fileStatusProcessingUpdateClient.analysisFailed(analyzeRequest.getDossierId(), analyzeRequest.getFileId());
} }
} }

View File

@ -1,7 +1,6 @@
package com.iqser.red.service.redaction.v1.server.redaction.model; package com.iqser.red.service.redaction.v1.server.redaction.model;
import com.iqser.red.service.configuration.v1.api.model.DictionaryEntry;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import lombok.Data; import lombok.Data;
@ -9,6 +8,8 @@ import java.io.Serializable;
import java.util.Set; import java.util.Set;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import com.iqser.red.service.persistence.service.v1.api.model.data.configuration.DictionaryEntry;
@Data @Data
@AllArgsConstructor @AllArgsConstructor
public class DictionaryModel implements Serializable { public class DictionaryModel implements Serializable {

View File

@ -10,7 +10,7 @@ import java.util.Map;
@Data @Data
public class DictionaryRepresentation { public class DictionaryRepresentation {
private String ruleSetId; private String dossierTemplateId;
private long dictionaryVersion = -1; private long dictionaryVersion = -1;
private List<DictionaryModel> dictionary = new ArrayList<>(); private List<DictionaryModel> dictionary = new ArrayList<>();
private float[] defaultColor; private float[] defaultColor;

View File

@ -11,6 +11,6 @@ import lombok.NoArgsConstructor;
@AllArgsConstructor @AllArgsConstructor
public class DictionaryVersion { public class DictionaryVersion {
long rulesetVersion; long dossierTemplateVersion;
long dossierVersion; long dossierVersion;
} }

View File

@ -1,16 +1,19 @@
package com.iqser.red.service.redaction.v1.server.redaction.model; package com.iqser.red.service.redaction.v1.server.redaction.model;
import com.iqser.red.service.redaction.v1.model.Engine;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import lombok.Data; import lombok.Data;
import lombok.EqualsAndHashCode; import lombok.EqualsAndHashCode;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Set;
@Data @Data
@EqualsAndHashCode(onlyExplicitlyIncluded = true) @EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class Entity { public class Entity implements ReasonHolder {
private final String word; private final String word;
private final String type; private final String type;
@ -39,8 +42,15 @@ public class Entity {
private boolean isDossierDictionaryEntry; private boolean isDossierDictionaryEntry;
private Set<Engine> engines = new HashSet<>();
public Entity(String word, String type, boolean redaction, String redactionReason, List<EntityPositionSequence> positionSequences, String headline, int matchedRule, int sectionNumber, String legalBasis, boolean isDictionaryEntry, String textBefore, String textAfter, Integer start, Integer end, boolean isDossierDictionaryEntry) { private Set<Entity> references = new HashSet<>();
public Entity(String word, String type, boolean redaction, String redactionReason,
List<EntityPositionSequence> positionSequences, String headline, int matchedRule, int sectionNumber,
String legalBasis, boolean isDictionaryEntry, String textBefore, String textAfter, Integer start,
Integer end, boolean isDossierDictionaryEntry, Set<Engine> engines, Set<Entity> references) {
this.word = word; this.word = word;
this.type = type; this.type = type;
@ -57,10 +67,13 @@ public class Entity {
this.start = start; this.start = start;
this.end = end; this.end = end;
this.isDossierDictionaryEntry = isDossierDictionaryEntry; this.isDossierDictionaryEntry = isDossierDictionaryEntry;
this.engines = engines;
this.references = references;
} }
public Entity(String word, String type, Integer start, Integer end, String headline, int sectionNumber, boolean isDictionaryEntry, boolean isDossierDictionaryEntry) { public Entity(String word, String type, Integer start, Integer end, String headline, int sectionNumber,
boolean isDictionaryEntry, boolean isDossierDictionaryEntry, Engine engine) {
this.word = word; this.word = word;
this.type = type; this.type = type;
@ -70,6 +83,8 @@ public class Entity {
this.sectionNumber = sectionNumber; this.sectionNumber = sectionNumber;
this.isDictionaryEntry = isDictionaryEntry; this.isDictionaryEntry = isDictionaryEntry;
this.isDossierDictionaryEntry = isDossierDictionaryEntry; this.isDossierDictionaryEntry = isDossierDictionaryEntry;
this.engines.add(engine);
} }
} }

View File

@ -9,7 +9,7 @@ import lombok.NoArgsConstructor;
@Builder @Builder
@NoArgsConstructor @NoArgsConstructor
@AllArgsConstructor @AllArgsConstructor
public class Image { public class Image implements ReasonHolder {
private String type; private String type;
private RedRectangle2D position; private RedRectangle2D position;
@ -20,5 +20,6 @@ public class Image {
private int sectionNumber; private int sectionNumber;
private String section; private String section;
private int page; private int page;
private boolean hasTransparency;
} }

View File

@ -0,0 +1,23 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
@Data
@AllArgsConstructor
public class PageEntities {
@Builder.Default
private Map<Integer, List<Entity>> entitiesPerPage = new HashMap<>();
@Builder.Default
private Map<Integer, Set<Image>> imagesPerPage = new HashMap<>();
}

View File

@ -18,14 +18,16 @@ public class PdfImage {
private RedRectangle2D position; private RedRectangle2D position;
private ImageType imageType; private ImageType imageType;
private boolean isAppendedToParagraph; private boolean isAppendedToParagraph;
private boolean hasTransparency;
@NonNull @NonNull
private int page; private int page;
public PdfImage(BufferedImage image, Rectangle2D position, int page) { public PdfImage(BufferedImage image, Rectangle2D position, int page, boolean hasTransparency) {
this.image = image; this.image = image;
this.position = new RedRectangle2D(position.getX(), position.getY(), position.getWidth(), position.getHeight()); this.position = new RedRectangle2D(position.getX(), position.getY(), position.getWidth(), position.getHeight());
this.page = page; this.page = page;
this.hasTransparency = hasTransparency;
} }
} }

View File

@ -0,0 +1,14 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
public interface ReasonHolder {
String getRedactionReason();
void setRedactionReason(String reason);
boolean isRedaction();
void setRedaction(boolean value);
}

View File

@ -1,5 +1,8 @@
package com.iqser.red.service.redaction.v1.server.redaction.model; package com.iqser.red.service.redaction.v1.server.redaction.model;
import com.iqser.red.service.redaction.v1.model.ArgumentType;
import com.iqser.red.service.redaction.v1.model.Engine;
import com.iqser.red.service.redaction.v1.model.FileAttribute;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils; import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
import com.iqser.red.service.redaction.v1.server.redaction.utils.Patterns; import com.iqser.red.service.redaction.v1.server.redaction.utils.Patterns;
@ -8,11 +11,11 @@ import lombok.Data;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import java.util.Collection; import java.lang.annotation.ElementType;
import java.util.HashMap; import java.lang.annotation.Retention;
import java.util.HashSet; import java.lang.annotation.RetentionPolicy;
import java.util.Map; import java.lang.annotation.Target;
import java.util.Set; import java.util.*;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import java.util.stream.Collectors; import java.util.stream.Collectors;
@ -52,8 +55,49 @@ public class Section {
@Builder.Default @Builder.Default
private Set<Image> images = new HashSet<>(); private Set<Image> images = new HashSet<>();
@Builder.Default
private List<FileAttribute> fileAttributes = new ArrayList<>();
public boolean rowEquals(String headerName, String value) {
/**
 * Rule condition: tells whether at least one file attribute has the given id
 * and exactly (case-sensitively) the given value.
 *
 * @param id    id the attribute must have
 * @param value value the attribute must equal
 * @return {@code true} if such an attribute exists; {@code false} if none
 *         matches or {@code fileAttributes} is {@code null}
 */
@WhenCondition
public boolean fileAttributeByIdEquals(@Argument(ArgumentType.FILE_ATTRIBUTE) String id,
        @Argument(ArgumentType.STRING) String value) {

    if (fileAttributes == null) {
        return false;
    }
    return fileAttributes.stream()
            .anyMatch(attribute -> id.equals(attribute.getId()) && value.equals(attribute.getValue()));
}
/**
 * Rule condition: tells whether at least one file attribute has the given
 * placeholder and exactly (case-sensitively) the given value.
 *
 * @param placeholder placeholder the attribute must have
 * @param value       value the attribute must equal
 * @return {@code true} if such an attribute exists; {@code false} if none
 *         matches or {@code fileAttributes} is {@code null}
 */
@WhenCondition
public boolean fileAttributeByPlaceholderEquals(@Argument(ArgumentType.FILE_ATTRIBUTE) String placeholder,
        @Argument(ArgumentType.STRING) String value) {

    if (fileAttributes == null) {
        return false;
    }
    return fileAttributes.stream()
            .anyMatch(attribute -> placeholder.equals(attribute.getPlaceholder()) && value.equals(attribute.getValue()));
}
/**
 * Rule condition: tells whether at least one file attribute has the given
 * label and exactly (case-sensitively) the given value.
 *
 * @param label label the attribute must have
 * @param value value the attribute must equal
 * @return {@code true} if such an attribute exists; {@code false} if none
 *         matches or {@code fileAttributes} is {@code null}
 */
@WhenCondition
public boolean fileAttributeByLabelEquals(@Argument(ArgumentType.FILE_ATTRIBUTE) String label,
        @Argument(ArgumentType.STRING) String value) {

    if (fileAttributes == null) {
        return false;
    }
    return fileAttributes.stream()
            .anyMatch(attribute -> label.equals(attribute.getLabel()) && value.equals(attribute.getValue()));
}
/**
 * Rule condition: tells whether at least one file attribute has the given id
 * and a value equal to {@code value} when letter case is ignored. The id
 * itself is still compared case-sensitively.
 *
 * @param id    id the attribute must have
 * @param value value the attribute must equal, ignoring case
 * @return {@code true} if such an attribute exists; {@code false} if none
 *         matches or {@code fileAttributes} is {@code null}
 */
@WhenCondition
public boolean fileAttributeByIdEqualsIgnoreCase(@Argument(ArgumentType.FILE_ATTRIBUTE) String id,
        @Argument(ArgumentType.STRING) String value) {

    if (fileAttributes == null) {
        return false;
    }
    return fileAttributes.stream()
            .anyMatch(attribute -> id.equals(attribute.getId()) && value.equalsIgnoreCase(attribute.getValue()));
}
/**
 * Rule condition: tells whether at least one file attribute has the given
 * placeholder and a value equal to {@code value} when letter case is ignored.
 * The placeholder itself is still compared case-sensitively.
 *
 * @param placeholder placeholder the attribute must have
 * @param value       value the attribute must equal, ignoring case
 * @return {@code true} if such an attribute exists; {@code false} if none
 *         matches or {@code fileAttributes} is {@code null}
 */
@WhenCondition
public boolean fileAttributeByPlaceholderEqualsIgnoreCase(@Argument(ArgumentType.FILE_ATTRIBUTE) String placeholder,
        @Argument(ArgumentType.STRING) String value) {

    if (fileAttributes == null) {
        return false;
    }
    return fileAttributes.stream()
            .anyMatch(attribute -> placeholder.equals(attribute.getPlaceholder()) && value.equalsIgnoreCase(attribute.getValue()));
}
/**
 * Rule condition: tells whether at least one file attribute has the given
 * label and a value equal to {@code value} when letter case is ignored. The
 * label itself is still compared case-sensitively.
 *
 * @param label label the attribute must have
 * @param value value the attribute must equal, ignoring case
 * @return {@code true} if such an attribute exists; {@code false} if none
 *         matches or {@code fileAttributes} is {@code null}
 */
@WhenCondition
public boolean fileAttributeByLabelEqualsIgnoreCase(@Argument(ArgumentType.FILE_ATTRIBUTE) String label,
        @Argument(ArgumentType.STRING) String value) {

    if (fileAttributes == null) {
        return false;
    }
    return fileAttributes.stream()
            .anyMatch(attribute -> label.equals(attribute.getLabel()) && value.equalsIgnoreCase(attribute.getValue()));
}
@WhenCondition
public boolean rowEquals(@Argument(ArgumentType.STRING) String headerName,
@Argument(ArgumentType.STRING) String value) {
String cleanHeaderName = headerName.replaceAll("\n", "").replaceAll(" ", "").replaceAll("-", ""); String cleanHeaderName = headerName.replaceAll("\n", "").replaceAll(" ", "").replaceAll("-", "");
@ -62,33 +106,36 @@ public class Section {
.equals(value); .equals(value);
} }
@WhenCondition
public boolean hasTableHeader(String headerName) { public boolean hasTableHeader(@Argument(ArgumentType.STRING) String headerName) {
String cleanHeaderName = headerName.replaceAll("\n", "").replaceAll(" ", "").replaceAll("-", ""); String cleanHeaderName = headerName.replaceAll("\n", "").replaceAll(" ", "").replaceAll("-", "");
return tabularData != null && tabularData.containsKey(cleanHeaderName); return tabularData != null && tabularData.containsKey(cleanHeaderName);
} }
@WhenCondition
public boolean matchesType(String type) { public boolean matchesType(@Argument(ArgumentType.TYPE) String type) {
return entities.stream().anyMatch(entity -> entity.getType().equals(type)); return entities.stream().anyMatch(entity -> entity.getType().equals(type));
} }
@WhenCondition
public boolean matchesImageType(String type) { public boolean matchesImageType(@Argument(ArgumentType.TYPE) String type) {
return images.stream().anyMatch(image -> image.getType().equals(type)); return images.stream().anyMatch(image -> image.getType().equals(type));
} }
@WhenCondition
public boolean headlineContainsWord(String word) { public boolean headlineContainsWord(@Argument(ArgumentType.STRING) String word) {
return StringUtils.containsIgnoreCase(headline, word); return StringUtils.containsIgnoreCase(headline, word);
} }
@ThenAction
public void expandByRegEx(String type, String pattern, boolean patternCaseInsensitive, int group) { public void expandByRegEx(@Argument(ArgumentType.TYPE) String type,
@Argument(ArgumentType.REGEX) String pattern,
@Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive,
@Argument(ArgumentType.INTEGER) int group) {
Pattern compiledPattern = Patterns.getCompiledPattern(pattern, patternCaseInsensitive); Pattern compiledPattern = Patterns.getCompiledPattern(pattern, patternCaseInsensitive);
@ -115,8 +162,11 @@ public class Section {
EntitySearchUtils.removeEntitiesContainedInLarger(entities); EntitySearchUtils.removeEntitiesContainedInLarger(entities);
} }
@ThenAction
public void redactImage(String type, int ruleNumber, String reason, String legalBasis) { public void redactImage(@Argument(ArgumentType.TYPE) String type,
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
@Argument(ArgumentType.STRING) String reason,
@Argument(ArgumentType.LEGAL_BASIS) String legalBasis) {
images.forEach(image -> { images.forEach(image -> {
if (image.getType().equals(type)) { if (image.getType().equals(type)) {
@ -128,8 +178,11 @@ public class Section {
}); });
} }
@ThenAction
public void redact(String type, int ruleNumber, String reason, String legalBasis) { public void redact(@Argument(ArgumentType.TYPE) String type,
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
@Argument(ArgumentType.STRING) String reason,
@Argument(ArgumentType.LEGAL_BASIS) String legalBasis) {
boolean hasRecommendationDictionary = dictionaryTypes.contains(RECOMMENDATION_PREFIX + type); boolean hasRecommendationDictionary = dictionaryTypes.contains(RECOMMENDATION_PREFIX + type);
@ -144,8 +197,10 @@ public class Section {
}); });
} }
@ThenAction
public void redactNotImage(String type, int ruleNumber, String reason) { public void redactNotImage(@Argument(ArgumentType.TYPE) String type,
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
@Argument(ArgumentType.STRING) String reason) {
images.forEach(image -> { images.forEach(image -> {
if (image.getType().equals(type)) { if (image.getType().equals(type)) {
@ -156,8 +211,10 @@ public class Section {
}); });
} }
@ThenAction
public void redactNot(String type, int ruleNumber, String reason) { public void redactNot(@Argument(ArgumentType.TYPE) String type,
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
@Argument(ArgumentType.STRING) String reason) {
boolean hasRecommendationDictionary = dictionaryTypes.contains(RECOMMENDATION_PREFIX + type); boolean hasRecommendationDictionary = dictionaryTypes.contains(RECOMMENDATION_PREFIX + type);
@ -172,8 +229,35 @@ public class Section {
} }
public void expandToHintAnnotationByRegEx(String type, String pattern, boolean patternCaseInsensitive, int group, @ThenAction
String asType) { public void redactNotAndReference(@Argument(ArgumentType.TYPE) String type,
@Argument(ArgumentType.REFERENCE_TYPE) String referenceType,
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
@Argument(ArgumentType.STRING) String reason) {
boolean hasRecommendationDictionary = dictionaryTypes.contains(RECOMMENDATION_PREFIX + type);
Set<Entity> references = entities.stream().filter(entity -> entity.getType().equals(referenceType)).collect(Collectors.toSet());
entities.forEach(entity -> {
if (entity.getType().equals(type) || hasRecommendationDictionary && entity.getType()
.equals(RECOMMENDATION_PREFIX + type)) {
entity.setRedaction(false);
entity.setMatchedRule(ruleNumber);
entity.setRedactionReason(reason);
entity.setReferences(references);
}
});
}
@ThenAction
public void expandToHintAnnotationByRegEx(@Argument(ArgumentType.TYPE) String type,
@Argument(ArgumentType.STRING) String pattern,
@Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive,
@Argument(ArgumentType.INTEGER) int group,
@Argument(ArgumentType.TYPE) String asType) {
Pattern compiledPattern = Patterns.getCompiledPattern(pattern, patternCaseInsensitive); Pattern compiledPattern = Patterns.getCompiledPattern(pattern, patternCaseInsensitive);
@ -198,8 +282,11 @@ public class Section {
EntitySearchUtils.removeEntitiesContainedInLarger(entities); EntitySearchUtils.removeEntitiesContainedInLarger(entities);
} }
@ThenAction
public void addHintAnnotationByRegEx(String pattern, boolean patternCaseInsensitive, int group, String asType) { public void addHintAnnotationByRegEx(@Argument(ArgumentType.REGEX) String pattern,
@Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive,
@Argument(ArgumentType.INTEGER) int group,
@Argument(ArgumentType.TYPE) String asType) {
Pattern compiledPattern = Patterns.getCompiledPattern(pattern, patternCaseInsensitive); Pattern compiledPattern = Patterns.getCompiledPattern(pattern, patternCaseInsensitive);
@ -214,8 +301,12 @@ public class Section {
} }
} }
@ThenAction
public void redactIfPrecededBy(String prefix, String type, int ruleNumber, String reason, String legalBasis) { public void redactIfPrecededBy(@Argument(ArgumentType.STRING) String prefix,
@Argument(ArgumentType.TYPE) String type,
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
@Argument(ArgumentType.STRING) String reason,
@Argument(ArgumentType.LEGAL_BASIS) String legalBasis) {
entities.forEach(entity -> { entities.forEach(entity -> {
if (entity.getType().equals(type) && searchText.indexOf(prefix + entity.getWord()) != 1) { if (entity.getType().equals(type) && searchText.indexOf(prefix + entity.getWord()) != 1) {
@ -227,23 +318,32 @@ public class Section {
}); });
} }
@ThenAction
public void addHintAnnotation(String value, String asType) { public void addHintAnnotation(@Argument(ArgumentType.STRING) String value,
@Argument(ArgumentType.TYPE) String asType) {
Set<Entity> found = findEntities(value.trim(), asType, true, false, 0, null, null); Set<Entity> found = findEntities(value.trim(), asType, true, false, 0, null, null);
EntitySearchUtils.addEntitiesIgnoreRank(entities, found); EntitySearchUtils.addEntitiesIgnoreRank(entities, found);
} }
@ThenAction
public void addRedaction(String value, String asType, int ruleNumber, String reason, String legalBasis) { public void addRedaction(@Argument(ArgumentType.STRING) String value,
@Argument(ArgumentType.TYPE) String asType,
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
@Argument(ArgumentType.STRING) String reason,
@Argument(ArgumentType.LEGAL_BASIS) String legalBasis) {
Set<Entity> found = findEntities(value.trim(), asType, true, true, ruleNumber, reason, legalBasis); Set<Entity> found = findEntities(value.trim(), asType, true, true, ruleNumber, reason, legalBasis);
EntitySearchUtils.addEntitiesIgnoreRank(entities, found); EntitySearchUtils.addEntitiesIgnoreRank(entities, found);
} }
@ThenAction
public void redactLineAfter(String start, String asType, int ruleNumber, boolean redactEverywhere, String reason, public void redactLineAfter(@Argument(ArgumentType.STRING) String start,
String legalBasis) { @Argument(ArgumentType.TYPE) String asType,
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
@Argument(ArgumentType.BOOLEAN) boolean redactEverywhere,
@Argument(ArgumentType.STRING) String reason,
@Argument(ArgumentType.LEGAL_BASIS) String legalBasis) {
String[] values = StringUtils.substringsBetween(text, start, "\n"); String[] values = StringUtils.substringsBetween(text, start, "\n");
@ -261,8 +361,9 @@ public class Section {
} }
} }
@ThenAction
public void recommendLineAfter(String start, String asType) { public void recommendLineAfter(@Argument(ArgumentType.STRING) String start,
@Argument(ArgumentType.TYPE) String asType) {
String[] values = StringUtils.substringsBetween(text, start, "\n"); String[] values = StringUtils.substringsBetween(text, start, "\n");
@ -285,9 +386,14 @@ public class Section {
} }
} }
@ThenAction
public void redactByRegEx(String pattern, boolean patternCaseInsensitive, int group, String asType, int ruleNumber, public void redactByRegEx(@Argument(ArgumentType.REGEX) String pattern,
String reason, String legalBasis) { @Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive,
@Argument(ArgumentType.INTEGER) int group,
@Argument(ArgumentType.TYPE) String asType,
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
@Argument(ArgumentType.STRING) String reason,
@Argument(ArgumentType.LEGAL_BASIS) String legalBasis) {
Pattern compiledPattern = Patterns.getCompiledPattern(pattern, patternCaseInsensitive); Pattern compiledPattern = Patterns.getCompiledPattern(pattern, patternCaseInsensitive);
@ -302,8 +408,11 @@ public class Section {
} }
} }
@ThenAction
public void addRecommendationByRegEx(String pattern, boolean patternCaseInsensitive, int group, String asType) { public void addRecommendationByRegEx(@Argument(ArgumentType.REGEX) String pattern,
@Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive,
@Argument(ArgumentType.INTEGER) int group,
@Argument(ArgumentType.TYPE) String asType) {
Pattern compiledPattern = Patterns.getCompiledPattern(pattern, patternCaseInsensitive); Pattern compiledPattern = Patterns.getCompiledPattern(pattern, patternCaseInsensitive);
@ -317,9 +426,14 @@ public class Section {
} }
} }
@ThenAction
public void redactAndRecommendByRegEx(String pattern, boolean patternCaseInsensitive, int group, String asType, public void redactAndRecommendByRegEx(@Argument(ArgumentType.REGEX) String pattern,
int ruleNumber, String reason, String legalBasis) { @Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive,
@Argument(ArgumentType.INTEGER) int group,
@Argument(ArgumentType.TYPE) String asType,
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
@Argument(ArgumentType.STRING) String reason,
@Argument(ArgumentType.LEGAL_BASIS) String legalBasis) {
Pattern compiledPattern = Patterns.getCompiledPattern(pattern, patternCaseInsensitive); Pattern compiledPattern = Patterns.getCompiledPattern(pattern, patternCaseInsensitive);
@ -334,9 +448,14 @@ public class Section {
} }
} }
@ThenAction
public void redactBetween(String start, String stop, String asType, int ruleNumber, boolean redactEverywhere, public void redactBetween(@Argument(ArgumentType.STRING) String start,
String reason, String legalBasis) { @Argument(ArgumentType.STRING) String stop,
@Argument(ArgumentType.TYPE) String asType,
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
@Argument(ArgumentType.BOOLEAN) boolean redactEverywhere,
@Argument(ArgumentType.STRING) String reason,
@Argument(ArgumentType.LEGAL_BASIS) String legalBasis) {
String[] values = StringUtils.substringsBetween(searchText, start, stop); String[] values = StringUtils.substringsBetween(searchText, start, stop);
@ -355,9 +474,14 @@ public class Section {
} }
} }
@ThenAction
public void redactLinesBetween(String start, String stop, String asType, int ruleNumber, boolean redactEverywhere, public void redactLinesBetween(@Argument(ArgumentType.STRING) String start,
String reason, String legalBasis) { @Argument(ArgumentType.STRING) String stop,
@Argument(ArgumentType.TYPE) String asType,
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
@Argument(ArgumentType.BOOLEAN) boolean redactEverywhere,
@Argument(ArgumentType.STRING) String reason,
@Argument(ArgumentType.LEGAL_BASIS) String legalBasis) {
String[] values = StringUtils.substringsBetween(text, start, stop); String[] values = StringUtils.substringsBetween(text, start, stop);
@ -384,34 +508,48 @@ public class Section {
} }
} }
@ThenAction
public void highlightCell(String cellHeader, int ruleNumber, String type) { public void highlightCell(@Argument(ArgumentType.STRING) String cellHeader,
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
@Argument(ArgumentType.TYPE) String type) {
annotateCell(cellHeader, ruleNumber, type, false, false, null, null); annotateCell(cellHeader, ruleNumber, type, false, false, null, null);
} }
@ThenAction
public void redactCell(String cellHeader, int ruleNumber, String type, boolean addAsRecommendations, String reason, public void redactCell(@Argument(ArgumentType.STRING) String cellHeader,
String legalBasis) { @Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
@Argument(ArgumentType.TYPE) String type,
@Argument(ArgumentType.BOOLEAN) boolean addAsRecommendations,
@Argument(ArgumentType.STRING) String reason,
@Argument(ArgumentType.LEGAL_BASIS) String legalBasis) {
annotateCell(cellHeader, ruleNumber, type, true, addAsRecommendations, reason, legalBasis); annotateCell(cellHeader, ruleNumber, type, true, addAsRecommendations, reason, legalBasis);
} }
@ThenAction
public void redactNotCell(String cellHeader, int ruleNumber, String type, boolean addAsRecommendations, public void redactNotCell(@Argument(ArgumentType.STRING) String cellHeader,
String reason) { @Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
@Argument(ArgumentType.TYPE) String type,
@Argument(ArgumentType.BOOLEAN) boolean addAsRecommendations,
@Argument(ArgumentType.STRING) String reason) {
annotateCell(cellHeader, ruleNumber, type, false, addAsRecommendations, reason, null); annotateCell(cellHeader, ruleNumber, type, false, addAsRecommendations, reason, null);
} }
private Set<Entity> findEntities(String value, String asType, boolean caseInsensitive, boolean redacted, private Set<Entity> findEntities(@Argument(ArgumentType.STRING) String value,
int ruleNumber, String reason, String legalBasis) { @Argument(ArgumentType.TYPE) String asType,
@Argument(ArgumentType.BOOLEAN) boolean caseInsensitive,
@Argument(ArgumentType.BOOLEAN) boolean redacted,
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
@Argument(ArgumentType.STRING) String reason,
@Argument(ArgumentType.LEGAL_BASIS) String legalBasis) {
String text = caseInsensitive ? searchText.toLowerCase() : searchText; String text = caseInsensitive ? searchText.toLowerCase() : searchText;
String searchValue = caseInsensitive ? value.toLowerCase() : value; String searchValue = caseInsensitive ? value.toLowerCase() : value;
Set<Entity> found = EntitySearchUtils.find(text, Set.of(searchValue), asType, headline, sectionNumber, true, false); Set<Entity> found = EntitySearchUtils.find(text, Set.of(searchValue), asType, headline, sectionNumber, false, false, Engine.RULE);
found.forEach(entity -> { found.forEach(entity -> {
if (redacted) { if (redacted) {
@ -437,7 +575,7 @@ public class Section {
} else { } else {
String word = value.toString(); String word = value.toString();
Entity entity = new Entity(word, type, value.getRowSpanStart(), value.getRowSpanStart() + word.length(), headline, sectionNumber, false, false); Entity entity = new Entity(word, type, value.getRowSpanStart(), value.getRowSpanStart() + word.length(), headline, sectionNumber, false, false, Engine.RULE);
entity.setRedaction(redact); entity.setRedaction(redact);
entity.setMatchedRule(ruleNumber); entity.setMatchedRule(ruleNumber);
entity.setRedactionReason(reason); entity.setRedactionReason(reason);
@ -475,6 +613,25 @@ public class Section {
} }
} }
/**
 * Marker for methods that may be used as a rule condition. It is retained at
 * runtime so that RuleBuilderModelService can find the annotated methods via
 * reflection and list them as "when" clauses.
 */
@Target(ElementType.METHOD)
@Retention(RetentionPolicy.RUNTIME)
public @interface WhenCondition {

}
/**
 * Marker for methods that may be used as a rule action. It is retained at
 * runtime so that RuleBuilderModelService can find the annotated methods via
 * reflection and list them as "then" elements.
 */
@Target(ElementType.METHOD)
@Retention(RetentionPolicy.RUNTIME)
public @interface ThenAction {

}
/**
 * Declares which {@link ArgumentType} a rule-method parameter expects. It is
 * retained at runtime; RuleBuilderModelService reads {@link #value()} via
 * reflection when it describes the arguments of a rule element.
 */
@Target(ElementType.PARAMETER)
@Retention(RetentionPolicy.RUNTIME)
public @interface Argument {

    /**
     * @return the argument type of the annotated parameter;
     *         {@link ArgumentType#STRING} when not given explicitly
     */
    ArgumentType value() default ArgumentType.STRING;

}
} }

View File

@ -0,0 +1,36 @@
package com.iqser.red.service.redaction.v1.server.redaction.rulebuilder;
import com.iqser.red.service.redaction.v1.model.Argument;
import com.iqser.red.service.redaction.v1.model.ArgumentType;
import com.iqser.red.service.redaction.v1.model.RuleBuilderModel;
import com.iqser.red.service.redaction.v1.model.RuleElement;
import com.iqser.red.service.redaction.v1.server.redaction.model.Section;
import org.springframework.stereotype.Service;
import java.lang.reflect.Method;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
/**
 * Builds the {@link RuleBuilderModel} by reflecting over the methods declared
 * on {@link Section}: methods annotated with {@link Section.WhenCondition}
 * become "when" clauses, methods annotated with {@link Section.ThenAction}
 * become "then" elements.
 */
@Service
public class RuleBuilderModelService {

    /**
     * Creates a fresh model describing every annotated rule method of
     * {@link Section} together with its arguments.
     *
     * @return the populated rule builder model
     */
    public RuleBuilderModel getRuleBuilderModel() {

        var whenConditions = Arrays.stream(Section.class.getDeclaredMethods())
                .filter(m -> m.isAnnotationPresent(Section.WhenCondition.class))
                .collect(Collectors.toList());
        var thenActions = Arrays.stream(Section.class.getDeclaredMethods())
                .filter(m -> m.isAnnotationPresent(Section.ThenAction.class))
                .collect(Collectors.toList());

        RuleBuilderModel ruleBuilderModel = new RuleBuilderModel();
        ruleBuilderModel.setWhenClauses(whenConditions.stream()
                .map(c -> new RuleElement(c.getName(), toArguments(c)))
                .collect(Collectors.toList()));
        ruleBuilderModel.setThenConditions(thenActions.stream()
                .map(c -> new RuleElement(c.getName(), toArguments(c)))
                .collect(Collectors.toList()));
        return ruleBuilderModel;
    }


    /**
     * Describes the parameters of a rule method.
     *
     * @param c the rule method to inspect
     * @return one {@link Argument} per parameter, in declaration order
     */
    private List<Argument> toArguments(Method c) {

        // NOTE(review): Parameter#getName only yields the source-level name when the
        // class was compiled with -parameters; otherwise it is synthetic (arg0, arg1, ...).
        // Confirm the build configuration.
        return Arrays.stream(c.getParameters())
                .map(parameter -> {
                    var annotation = parameter.getAnnotation(Section.Argument.class);
                    // getAnnotation returns null for an unannotated parameter; fall back to the
                    // annotation's own declared default instead of failing with a NullPointerException.
                    var argumentType = annotation != null ? annotation.value() : ArgumentType.STRING;
                    return new Argument(parameter.getName(), argumentType);
                })
                .collect(Collectors.toList());
    }

}

View File

@ -1,49 +0,0 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import com.iqser.red.service.redaction.v1.model.AnalyzeResult;
import com.iqser.red.service.redaction.v1.model.RedactionChangeLog;
import com.iqser.red.service.redaction.v1.model.RedactionLog;
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
import org.springframework.stereotype.Service;
/**
 * Derives the summary flags of an {@link AnalyzeResult} from a redaction log
 * and the corresponding change log.
 */
@Service
public class AnalyzeResponseService {

    /**
     * Assembles the {@link AnalyzeResult} for one analysed file.
     *
     * @param projectId          project id copied into the result
     * @param fileId             file id copied into the result
     * @param duration           duration value copied into the result
     * @param pageCount          number of pages copied into the result
     * @param redactionLog       log whose entries and version fields are evaluated
     * @param redactionChangeLog change log used for the update flag; may be {@code null}
     * @return the populated result
     */
    public AnalyzeResult createAnalyzeResponse(String projectId, String fileId, long duration, int pageCount, RedactionLog redactionLog, RedactionChangeLog redactionChangeLog) {

        var entries = redactionLog.getRedactionLogEntry();

        boolean hasHints = entries.stream().anyMatch(RedactionLogEntry::isHint);
        boolean hasRequests = entries.stream().anyMatch(AnalyzeResponseService::isRequestedManualEntry);
        boolean hasRedactions = entries.stream().anyMatch(AnalyzeResponseService::countsAsRedaction);
        boolean hasImages = entries.stream().anyMatch(AnalyzeResponseService::isImageHint);

        // Any change-log entry other than a "false_positive" counts as an update.
        boolean hasUpdates = false;
        if (redactionChangeLog != null) {
            var changes = redactionChangeLog.getRedactionLogEntry();
            hasUpdates = changes != null
                    && !changes.isEmpty()
                    && changes.stream().anyMatch(change -> !change.getType().equals("false_positive"));
        }

        return AnalyzeResult.builder()
                .projectId(projectId)
                .fileId(fileId)
                .duration(duration)
                .numberOfPages(pageCount)
                .hasHints(hasHints)
                .hasRedactions(hasRedactions)
                .hasRequests(hasRequests)
                .hasImages(hasImages)
                .hasUpdates(hasUpdates)
                .rulesVersion(redactionLog.getRulesVersion())
                .dictionaryVersion(redactionLog.getDictionaryVersion())
                .legalBasisVersion(redactionLog.getLegalBasisVersion())
                .dossierDictionaryVersion(redactionLog.getDossierDictionaryVersion())
                .build();
    }


    /** A manual entry whose status is REQUESTED. */
    private static boolean isRequestedManualEntry(RedactionLogEntry entry) {

        return entry.isManual()
                && entry.getStatus().equals(com.iqser.red.service.redaction.v1.model.Status.REQUESTED);
    }


    /** Either a redacted non-manual entry, or a manual entry whose status is APPROVED. */
    private static boolean countsAsRedaction(RedactionLogEntry entry) {

        if (entry.isRedacted() && !entry.isManual()) {
            return true;
        }
        return entry.isManual()
                && entry.getStatus().equals(com.iqser.red.service.redaction.v1.model.Status.APPROVED);
    }


    /** A hint entry of type "image". */
    private static boolean isImageHint(RedactionLogEntry entry) {

        return entry.isHint() && entry.getType().equals("image");
    }

}

View File

@ -0,0 +1,283 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.kie.api.runtime.KieContainer;
import org.springframework.stereotype.Service;
import org.springframework.web.bind.annotation.RequestBody;
import com.iqser.red.service.persistence.service.v1.api.model.FileType;
import com.iqser.red.service.redaction.v1.model.AnalyzeRequest;
import com.iqser.red.service.redaction.v1.model.AnalyzeResult;
import com.iqser.red.service.redaction.v1.model.IdRemoval;
import com.iqser.red.service.redaction.v1.model.ManualForceRedact;
import com.iqser.red.service.redaction.v1.model.ManualImageRecategorization;
import com.iqser.red.service.redaction.v1.model.ManualLegalBasisChange;
import com.iqser.red.service.redaction.v1.model.ManualRedactions;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.iqser.red.service.redaction.v1.model.RedactionLog;
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
import com.iqser.red.service.redaction.v1.model.StructureAnalyzeRequest;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.SectionText;
import com.iqser.red.service.redaction.v1.server.classification.model.Text;
import com.iqser.red.service.redaction.v1.server.client.LegalBasisClient;
import com.iqser.red.service.redaction.v1.server.exception.RedactionException;
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncrement;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryVersion;
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
import com.iqser.red.service.redaction.v1.server.redaction.model.PageEntities;
import com.iqser.red.service.redaction.v1.server.redaction.model.RedRectangle2D;
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService;
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@RequiredArgsConstructor
public class AnalyzeService {
private final DictionaryService dictionaryService;
private final DroolsExecutionService droolsExecutionService;
private final EntityRedactionService entityRedactionService;
private final RedactionLogCreatorService redactionLogCreatorService;
private final RedactionStorageService redactionStorageService;
private final PdfSegmentationService pdfSegmentationService;
private final RedactionChangeLogService redactionChangeLogService;
private final LegalBasisClient legalBasisClient;
private final RedactionServiceSettings redactionServiceSettings;
private final SectionTextBuilderService sectionTextBuilderService;
private final SectionGridCreatorService sectionGridCreatorService;
private final NerAnalyserService nerAnalyserService;
/**
 * Parses the stored ORIGIN file of the request, builds the section texts and
 * the section grid from the parsed document, and stores both (as TEXT and
 * SECTION_GRID) under the request's dossier and file id.
 *
 * @param analyzeRequest identifies the dossier and file to analyse
 * @throws RedactionException wrapping any exception raised while loading or
 *                            parsing the stored file
 */
public void analyzeDocumentStructure(StructureAnalyzeRequest analyzeRequest) {

    final long startTime = System.currentTimeMillis();

    Document classifiedDoc;
    int pageCount;
    try {
        var originStorageId = RedactionStorageService.StorageIdUtils.getStorageId(analyzeRequest.getDossierId(), analyzeRequest.getFileId(), FileType.ORIGIN);
        var storedObjectStream = redactionStorageService.getStoredObject(originStorageId);
        classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream);
        pageCount = classifiedDoc.getPages().size();
    } catch (Exception e) {
        throw new RedactionException(e);
    }

    List<SectionText> sectionTexts = sectionTextBuilderService.buildSectionText(classifiedDoc);
    // Called before the section grid is read from the document below.
    sectionGridCreatorService.createSectionGrid(classifiedDoc, pageCount);
    Text text = new Text(pageCount, sectionTexts);

    var dossierId = analyzeRequest.getDossierId();
    var fileId = analyzeRequest.getFileId();
    redactionStorageService.storeObject(dossierId, fileId, FileType.TEXT, text);
    redactionStorageService.storeObject(dossierId, fileId, FileType.SECTION_GRID, classifiedDoc.getSectionGrid());

    long elapsed = System.currentTimeMillis() - startTime;
    log.info("Document structure analysis successful, took: {}", elapsed);
}
/**
 * Runs a full analysis of the file addressed by the request: loads the stored
 * text (and NER entities, computing them first when entity recognition is
 * enabled and none are stored), refreshes dictionary and rules, searches all
 * sections for entities and builds a new redaction log from the findings.
 *
 * @param analyzeRequest identifies dossier, file and dossier template
 * @return the result produced by {@code finalizeAnalysis} for the new log
 */
public AnalyzeResult analyze(AnalyzeRequest analyzeRequest) {

    final long startTime = System.currentTimeMillis();

    final var dossierId = analyzeRequest.getDossierId();
    final var fileId = analyzeRequest.getFileId();
    final var dossierTemplateId = analyzeRequest.getDossierTemplateId();

    var text = redactionStorageService.getText(dossierId, fileId);

    var nerEntities = redactionStorageService.getNerEntities(dossierId, fileId);
    if (redactionServiceSettings.isEnableEntityRecognition() && nerEntities == null) {
        // Nothing stored yet: compute the entities, then load what was stored.
        nerAnalyserService.computeNerEntities(dossierId, fileId);
        nerEntities = redactionStorageService.getNerEntities(dossierId, fileId);
    }

    dictionaryService.updateDictionary(dossierTemplateId, dossierId);
    KieContainer kieContainer = droolsExecutionService.updateRules(dossierTemplateId);
    long rulesVersion = droolsExecutionService.getRulesVersion(dossierTemplateId);

    Dictionary dictionary = dictionaryService.getDeepCopyDictionary(dossierTemplateId, dossierId);
    PageEntities pageEntities = entityRedactionService.findEntities(dictionary, text.getSectionTexts(), kieContainer, analyzeRequest, nerEntities);
    dictionaryService.updateExternalDictionary(dictionary, dossierTemplateId);

    List<RedactionLogEntry> redactionLogEntries = redactionLogCreatorService.createRedactionLog(pageEntities, text.getNumberOfPages(), dossierTemplateId);
    var legalBasis = legalBasisClient.getLegalBasisMapping(dossierTemplateId);

    var redactionLog = new RedactionLog(
            redactionServiceSettings.getAnalysisVersion(),
            redactionLogEntries,
            legalBasis,
            dictionary.getVersion().getDossierTemplateVersion(),
            dictionary.getVersion().getDossierVersion(),
            rulesVersion,
            legalBasisClient.getVersion(dossierTemplateId));

    return finalizeAnalysis(analyzeRequest, startTime, redactionLog, text, dictionary.getVersion(), false);
}
/**
 * Re-analyses only the sections of a file that need it and merges the result
 * into the stored redaction log.
 * <p>
 * Falls back to a full {@link #analyze(AnalyzeRequest)} when no redaction log
 * or no text is stored, or when the stored text reports zero pages. Otherwise
 * the sections come from the request if it names any, else from
 * {@code findSectionsToReanalyse}. When no section is selected, the stored log
 * is finalized unchanged.
 *
 * @param analyzeRequest identifies dossier, file and dossier template, and
 *                       optionally the sections to reanalyse
 * @return the result produced by {@code finalizeAnalysis} (or by the full
 *         analysis in the fallback case)
 */
// NOTE(review): @RequestBody on a service-layer parameter looks like a leftover from a
// controller signature — confirm whether it is needed here.
@SneakyThrows
public AnalyzeResult reanalyze(@RequestBody AnalyzeRequest analyzeRequest) {
long startTime = System.currentTimeMillis();
var redactionLog = redactionStorageService.getRedactionLog(analyzeRequest.getDossierId(), analyzeRequest.getFileId());
var text = redactionStorageService.getText(analyzeRequest.getDossierId(), analyzeRequest.getFileId());
// not yet ready for reanalysis
if (redactionLog == null || text == null || text.getNumberOfPages() == 0) {
return analyze(analyzeRequest);
}
// Dictionary changes relative to the dictionary versions recorded in the stored log.
DictionaryIncrement dictionaryIncrement = dictionaryService.getDictionaryIncrements(analyzeRequest.getDossierTemplateId(), new DictionaryVersion(redactionLog
.getDictionaryVersion(), redactionLog.getDossierDictionaryVersion()), analyzeRequest.getDossierId());
// Sections named in the request take precedence; otherwise they are derived.
// getSectionsToReanalyse() is dereferenced without a null check.
Set<Integer> sectionsToReanalyse = !analyzeRequest.getSectionsToReanalyse()
.isEmpty() ? analyzeRequest.getSectionsToReanalyse() : findSectionsToReanalyse(dictionaryIncrement, redactionLog, text, analyzeRequest);
// Nothing to redo: finalize the stored log as it is.
if (sectionsToReanalyse.isEmpty()) {
return finalizeAnalysis(analyzeRequest, startTime, redactionLog, text, dictionaryIncrement.getDictionaryVersion(), true);
}
var nerEntities = redactionStorageService.getNerEntities(analyzeRequest.getDossierId(), analyzeRequest.getFileId());
// Compute NER entities first when recognition is enabled and none are stored.
if(redactionServiceSettings.isEnableEntityRecognition() && nerEntities == null){
nerAnalyserService.computeNerEntities(analyzeRequest.getDossierId(), analyzeRequest.getFileId());
nerEntities = redactionStorageService.getNerEntities(analyzeRequest.getDossierId(), analyzeRequest.getFileId());
}
// Restrict the entity search to the selected sections.
List<SectionText> reanalysisSections = text.getSectionTexts()
.stream()
.filter(sectionText -> sectionsToReanalyse.contains(sectionText.getSectionNumber()))
.collect(Collectors.toList());
KieContainer kieContainer = droolsExecutionService.updateRules(analyzeRequest.getDossierTemplateId());
Dictionary dictionary = dictionaryService.getDeepCopyDictionary(analyzeRequest.getDossierTemplateId(), analyzeRequest
.getDossierId());
PageEntities pageEntities = entityRedactionService.findEntities(dictionary, reanalysisSections, kieContainer, analyzeRequest, nerEntities);
var newRedactionLogEntries = redactionLogCreatorService.createRedactionLog(pageEntities, text.getNumberOfPages(), analyzeRequest
.getDossierTemplateId());
// Replace the stored entries of the reanalysed sections with the fresh ones (the log is mutated in place).
redactionLog.getRedactionLogEntry().removeIf(entry -> sectionsToReanalyse.contains(entry.getSectionNumber()));
redactionLog.getRedactionLogEntry().addAll(newRedactionLogEntries);
return finalizeAnalysis(analyzeRequest, startTime, redactionLog, text, dictionaryIncrement.getDictionaryVersion(), true);
}
/**
 * Determines which sections of an already analysed file have to be analysed again.
 * <p>
 * A section is selected when it holds a redaction log entry that is manual or whose id is referenced by
 * one of the manual modifications of the request, or when its text contains one of the values of the
 * dictionary increment.
 *
 * @param dictionaryIncrement supplies the dictionary values that are searched for in every section text
 * @param redactionLog        redaction log whose entries are inspected
 * @param text                text of the file, split into sections
 * @param analyzeRequest      the request; supplies the manual redactions and is written to the log
 * @return the numbers of the sections that must be reanalysed, possibly empty
 */
private Set<Integer> findSectionsToReanalyse(DictionaryIncrement dictionaryIncrement, RedactionLog redactionLog,
                                             Text text, AnalyzeRequest analyzeRequest) {
    long start = System.currentTimeMillis();
    Set<String> relevantManuallyModifiedAnnotationIds = getRelevantManuallyModifiedAnnotationIds(analyzeRequest.getManualRedactions());

    Set<Integer> sectionsToReanalyse = new HashSet<>();

    // Sections touched by manual entries or by manually modified annotations.
    for (RedactionLogEntry entry : redactionLog.getRedactionLogEntry()) {
        if (entry.isManual() || relevantManuallyModifiedAnnotationIds.contains(entry.getId())) {
            sectionsToReanalyse.add(entry.getSectionNumber());
        }
    }

    // Sections whose text contains at least one value of the dictionary increment.
    for (SectionText sectionText : text.getSectionTexts()) {
        if (EntitySearchUtils.sectionContainsAny(sectionText.getText(), dictionaryIncrement.getValues())) {
            sectionsToReanalyse.add(sectionText.getSectionNumber());
        }
    }

    log.info("Should reanalyze {} sections for request: {}, took: {}", sectionsToReanalyse.size(), analyzeRequest, System.currentTimeMillis() - start);
    return sectionsToReanalyse;
}
/**
 * Completes an analysis or reanalysis run: updates and stores the redaction log and builds the result.
 * <p>
 * The given dictionary versions are written to the redaction log, entries with a position on an excluded
 * page are flagged as excluded, the changes are computed by {@code redactionChangeLogService} and the
 * redaction log returned by that computation is stored as {@code FileType.REDACTION_LOG}.
 *
 * @param analyzeRequest    the request; supplies dossier id, file id and the excluded pages
 * @param startTime         start of the run in milliseconds, used to compute the reported duration
 * @param redactionLog      the redaction log to finalize; it is modified by this method
 * @param text              the text of the file; supplies the number of pages for the result
 * @param dictionaryVersion dictionary versions to record in the redaction log
 * @param isReanalysis      reported as {@code wasReanalyzed} in the result
 * @return the result describing this run
 */
private AnalyzeResult finalizeAnalysis(AnalyzeRequest analyzeRequest, long startTime,
                                       RedactionLog redactionLog, Text text, DictionaryVersion dictionaryVersion,
                                       boolean isReanalysis) {
    redactionLog.setDictionaryVersion(dictionaryVersion.getDossierTemplateVersion());
    redactionLog.setDossierDictionaryVersion(dictionaryVersion.getDossierVersion());

    excludeExcludedPages(redactionLog, analyzeRequest.getExcludedPages());

    var redactionLogChange = redactionChangeLogService.computeChanges(analyzeRequest.getDossierId(), analyzeRequest.getFileId(), redactionLog);
    // Store the log returned by the change computation, not the instance passed in.
    redactionStorageService.storeObject(analyzeRequest.getDossierId(), analyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLogChange
            .getRedactionLog());

    long duration = System.currentTimeMillis() - startTime;

    return AnalyzeResult.builder()
            .dossierId(analyzeRequest.getDossierId())
            .fileId(analyzeRequest.getFileId())
            .duration(duration)
            .numberOfPages(text.getNumberOfPages())
            .hasUpdates(redactionLogChange.isHasChanges())
            .analysisVersion(redactionServiceSettings.getAnalysisVersion())
            .rulesVersion(redactionLog.getRulesVersion())
            .dictionaryVersion(redactionLog.getDictionaryVersion())
            .legalBasisVersion(redactionLog.getLegalBasisVersion())
            .dossierDictionaryVersion(redactionLog.getDossierDictionaryVersion())
            .wasReanalyzed(isReanalysis)
            .build();
}
/**
 * Collects the ids of all annotations referenced by manual legal basis changes, image recategorizations,
 * removals and force redacts.
 * <p>
 * A missing (null) list of any of these kinds is treated as empty instead of causing a
 * {@link NullPointerException}.
 *
 * @param manualRedactions the manual redactions of the request, may be {@code null}
 * @return the collected ids; an empty set when {@code manualRedactions} is {@code null}
 */
private Set<String> getRelevantManuallyModifiedAnnotationIds(ManualRedactions manualRedactions) {
    if (manualRedactions == null) {
        return new HashSet<>();
    }

    // Stream.ofNullable yields an empty stream for a null list, so each kind is optional.
    return Stream.of(
                    Stream.ofNullable(manualRedactions.getManualLegalBasisChanges())
                            .flatMap(changes -> changes.stream())
                            .map(ManualLegalBasisChange::getId),
                    Stream.ofNullable(manualRedactions.getImageRecategorizations())
                            .flatMap(recategorizations -> recategorizations.stream())
                            .map(ManualImageRecategorization::getId),
                    Stream.ofNullable(manualRedactions.getIdsToRemove())
                            .flatMap(removals -> removals.stream())
                            .map(IdRemoval::getId),
                    Stream.ofNullable(manualRedactions.getForceRedacts())
                            .flatMap(forceRedacts -> forceRedacts.stream())
                            .map(ManualForceRedact::getId))
            .flatMap(ids -> ids)
            .collect(Collectors.toSet());
}
/**
 * Builds an {@link Image} from a redaction log entry.
 * <p>
 * Only the first position of the entry is used; it supplies both the rectangle and the page.
 *
 * @param entry the redaction log entry to convert; its position list must not be empty
 * @return an image carrying the type, section data and transparency flag of the entry
 */
public Image convert(RedactionLogEntry entry) {
    Rectangle firstPosition = entry.getPositions().get(0);
    var topLeft = firstPosition.getTopLeft();
    RedRectangle2D bounds = new RedRectangle2D(topLeft.getX(), topLeft.getY(), firstPosition.getWidth(), firstPosition.getHeight());

    return Image.builder()
            .type(entry.getType())
            .position(bounds)
            .sectionNumber(entry.getSectionNumber())
            .section(entry.getSection())
            .page(firstPosition.getPage())
            .hasTransparency(entry.isImageHasTransparency())
            .build();
}
/**
 * Flags every redaction log entry that has at least one position on an excluded page.
 *
 * @param redactionLog  the redaction log whose entries are updated in place
 * @param excludedPages page numbers to exclude; {@code null} or empty leaves the log untouched
 */
private void excludeExcludedPages(RedactionLog redactionLog, Set<Integer> excludedPages) {
    if (excludedPages == null || excludedPages.isEmpty()) {
        return;
    }

    for (RedactionLogEntry logEntry : redactionLog.getRedactionLogEntry()) {
        for (Rectangle position : logEntry.getPositions()) {
            if (excludedPages.contains(position.getPage())) {
                logEntry.setExcluded(true);
            }
        }
    }
}
}

View File

@ -1,19 +1,10 @@
package com.iqser.red.service.redaction.v1.server.redaction.service; package com.iqser.red.service.redaction.v1.server.redaction.service;
import static com.iqser.red.service.configuration.v1.api.resource.DictionaryResource.GLOBAL_DOSSIER; import com.iqser.red.service.persistence.service.v1.api.model.data.configuration.Colors;
import com.iqser.red.service.persistence.service.v1.api.model.data.configuration.DictionaryEntry;
import com.iqser.red.service.configuration.v1.api.model.Colors;
import com.iqser.red.service.configuration.v1.api.model.DictionaryEntry;
import com.iqser.red.service.configuration.v1.api.model.TypeResponse;
import com.iqser.red.service.configuration.v1.api.model.TypeResult;
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient; import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary; import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncrement; import com.iqser.red.service.redaction.v1.server.redaction.model.*;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncrementValue;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryModel;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryRepresentation;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryVersion;
import feign.FeignException; import feign.FeignException;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
@ -25,6 +16,7 @@ import java.awt.Color;
import java.util.*; import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
@Slf4j @Slf4j
@Service @Service
@RequiredArgsConstructor @RequiredArgsConstructor
@ -32,37 +24,38 @@ public class DictionaryService {
private final DictionaryClient dictionaryClient; private final DictionaryClient dictionaryClient;
private final Map<String, DictionaryRepresentation> dictionariesByRuleSets = new HashMap<>(); private final Map<String, DictionaryRepresentation> dictionariesByDossierTemplate = new HashMap<>();
private final Map<String, DictionaryRepresentation> dictionariesByDossier = new HashMap<>(); private final Map<String, DictionaryRepresentation> dictionariesByDossier = new HashMap<>();
public DictionaryVersion updateDictionary(String ruleSetId, String dossierId) { public DictionaryVersion updateDictionary(String dossierTemplateId, String dossierId) {
long rulesetDictionaryVersion = dictionaryClient.getVersion(ruleSetId, GLOBAL_DOSSIER); log.info("Updating dictionary data for dossierTemplate {} and dossier {}", dossierTemplateId, dossierId);
var rulesetDictionary = dictionariesByRuleSets.get(ruleSetId); long dossierTemplateDictionaryVersion = dictionaryClient.getVersion(dossierTemplateId);
if (rulesetDictionary == null || rulesetDictionaryVersion > rulesetDictionary.getDictionaryVersion()) { var dossierTemplateDictionary = dictionariesByDossierTemplate.get(dossierTemplateId);
updateDictionaryEntry(ruleSetId, rulesetDictionaryVersion, GLOBAL_DOSSIER); if (dossierTemplateDictionary == null || dossierTemplateDictionaryVersion > dossierTemplateDictionary.getDictionaryVersion()) {
updateDictionaryEntry(dossierTemplateId, dossierTemplateDictionaryVersion, null);
} }
long dossierDictionaryVersion = dictionaryClient.getVersion(ruleSetId, dossierId); long dossierDictionaryVersion = dictionaryClient.getVersionForDossier(dossierId);
var dossierDictionary = dictionariesByDossier.get(dossierId); var dossierDictionary = dictionariesByDossier.get(dossierId);
if (dossierDictionary == null || dossierDictionaryVersion > dossierDictionary.getDictionaryVersion()) { if (dossierDictionary == null || dossierDictionaryVersion > dossierDictionary.getDictionaryVersion()) {
updateDictionaryEntry(ruleSetId, dossierDictionaryVersion, dossierId); updateDictionaryEntry(dossierTemplateId, dossierDictionaryVersion, dossierId);
} }
return DictionaryVersion.builder().rulesetVersion(rulesetDictionaryVersion).dossierVersion(dossierDictionaryVersion).build(); return DictionaryVersion.builder().dossierTemplateVersion(dossierTemplateDictionaryVersion).dossierVersion(dossierDictionaryVersion).build();
} }
public DictionaryIncrement getDictionaryIncrements(String ruleSetId, DictionaryVersion fromVersion, String dossierId) { public DictionaryIncrement getDictionaryIncrements(String dossierTemplateId, DictionaryVersion fromVersion, String dossierId) {
DictionaryVersion version = updateDictionary(ruleSetId, dossierId); DictionaryVersion version = updateDictionary(dossierTemplateId, dossierId);
Set<DictionaryIncrementValue> newValues = new HashSet<>(); Set<DictionaryIncrementValue> newValues = new HashSet<>();
List<DictionaryModel> dictionaryModels = dictionariesByRuleSets.get(ruleSetId).getDictionary(); List<DictionaryModel> dictionaryModels = dictionariesByDossierTemplate.get(dossierTemplateId).getDictionary();
dictionaryModels.forEach(dictionaryModel -> { dictionaryModels.forEach(dictionaryModel -> {
dictionaryModel.getEntries().forEach(dictionaryEntry -> { dictionaryModel.getEntries().forEach(dictionaryEntry -> {
if (dictionaryEntry.getVersion() > fromVersion.getRulesetVersion()) { if (dictionaryEntry.getVersion() > fromVersion.getDossierTemplateVersion()) {
newValues.add(new DictionaryIncrementValue(dictionaryEntry.getValue(), dictionaryModel.isCaseInsensitive())); newValues.add(new DictionaryIncrementValue(dictionaryEntry.getValue(), dictionaryModel.isCaseInsensitive()));
} }
}); });
@ -83,35 +76,35 @@ public class DictionaryService {
} }
private void updateDictionaryEntry(String ruleSetId, long version, String dossierId) { private void updateDictionaryEntry(String dossierTemplateId, long version, String dossierId) {
try { try {
DictionaryRepresentation dictionaryRepresentation = new DictionaryRepresentation(); DictionaryRepresentation dictionaryRepresentation = new DictionaryRepresentation();
TypeResponse typeResponse = dictionaryClient.getAllTypes(ruleSetId, dossierId); var typeResponse = dossierId == null ? dictionaryClient.getAllTypesForDossierTemplate(dossierTemplateId) : dictionaryClient.getAllTypesForDossier(dossierId);
if (typeResponse != null && CollectionUtils.isNotEmpty(typeResponse.getTypes())) { if (typeResponse != null && CollectionUtils.isNotEmpty(typeResponse)) {
List<DictionaryModel> dictionary = typeResponse.getTypes() List<DictionaryModel> dictionary = typeResponse
.stream() .stream()
.map(t -> new DictionaryModel(t.getType(), t.getRank(), convertColor(t.getHexColor()), t.isCaseInsensitive(), t .map(t -> new DictionaryModel(t.getType(), t.getRank(), convertColor(t.getHexColor()), t.isCaseInsensitive(), t
.isHint(), t.isRecommendation(), convertEntries(t, dossierId), new HashSet<>(),dossierId.equals(GLOBAL_DOSSIER) ? false : true)) .isHint(), t.isRecommendation(), convertEntries(t.getId()), new HashSet<>(), dossierId != null))
.sorted(Comparator.comparingInt(DictionaryModel::getRank).reversed()) .sorted(Comparator.comparingInt(DictionaryModel::getRank).reversed())
.collect(Collectors.toList()); .collect(Collectors.toList());
dictionary.forEach(dm -> dictionaryRepresentation.getLocalAccessMap().put(dm.getType(), dm)); dictionary.forEach(dm -> dictionaryRepresentation.getLocalAccessMap().put(dm.getType(), dm));
Colors colors = dictionaryClient.getColors(ruleSetId); Colors colors = dictionaryClient.getColors(dossierTemplateId);
dictionaryRepresentation.setDefaultColor(convertColor(colors.getDefaultColor())); dictionaryRepresentation.setDefaultColor(convertColor(colors.getDefaultColor()));
dictionaryRepresentation.setRequestAddColor(convertColor(colors.getRequestAdd())); dictionaryRepresentation.setRequestAddColor(convertColor(colors.getRequestAdd()));
dictionaryRepresentation.setRequestRemoveColor(convertColor(colors.getRequestRemove())); dictionaryRepresentation.setRequestRemoveColor(convertColor(colors.getRequestRemove()));
dictionaryRepresentation.setNotRedactedColor(convertColor(colors.getNotRedacted())); dictionaryRepresentation.setNotRedactedColor(convertColor(colors.getNotRedacted()));
dictionaryRepresentation.setRuleSetId(ruleSetId); dictionaryRepresentation.setDossierTemplateId(dossierTemplateId);
dictionaryRepresentation.setDictionaryVersion(version); dictionaryRepresentation.setDictionaryVersion(version);
dictionaryRepresentation.setDictionary(dictionary); dictionaryRepresentation.setDictionary(dictionary);
if(dossierId.equals(GLOBAL_DOSSIER)) { if(dossierId == null) {
dictionariesByRuleSets.put(ruleSetId, dictionaryRepresentation); dictionariesByDossierTemplate.put(dossierTemplateId, dictionaryRepresentation);
} else { } else {
dictionariesByDossier.put(dossierId, dictionaryRepresentation); dictionariesByDossier.put(dossierId, dictionaryRepresentation);
} }
@ -123,26 +116,28 @@ public class DictionaryService {
} }
public void updateExternalDictionary(Dictionary dictionary, String ruleSetId) { public void updateExternalDictionary(Dictionary dictionary, String dossierTemplateId) {
dictionary.getDictionaryModels().forEach(dm -> { dictionary.getDictionaryModels().forEach(dm -> {
if (dm.isRecommendation() && !dm.getLocalEntries().isEmpty()) { if (dm.isRecommendation() && !dm.getLocalEntries().isEmpty()) {
dictionaryClient.addEntries(dm.getType(), ruleSetId, new ArrayList<>(dm.getLocalEntries()), false, GLOBAL_DOSSIER); dictionaryClient.addEntries(dm.getType(), new ArrayList<>(dm.getLocalEntries()), false);
long externalVersion = dictionaryClient.getVersion(ruleSetId, GLOBAL_DOSSIER); long externalVersion = dictionaryClient.getVersion(dossierTemplateId);
if (externalVersion == dictionary.getVersion().getRulesetVersion() + 1) { if (externalVersion == dictionary.getVersion().getDossierTemplateVersion() + 1) {
dictionary.getVersion().setRulesetVersion(externalVersion); dictionary.getVersion().setDossierTemplateVersion(externalVersion);
} }
} }
}); });
} }
private Set<DictionaryEntry> convertEntries(TypeResult t, String dossierId) { private Set<DictionaryEntry> convertEntries(String typeId) {
Set<DictionaryEntry> entries = new HashSet<>(dictionaryClient.getDictionaryForType(t.getType(), t.getRuleSetId(), dossierId) var type = dictionaryClient.getDictionaryForType(typeId);
Set<DictionaryEntry> entries = new HashSet<>(type
.getEntries()); .getEntries());
if (t.isCaseInsensitive()) { if (type.isCaseInsensitive()) {
entries.forEach(entry -> entry.setValue(entry.getValue().toLowerCase(Locale.ROOT))); entries.forEach(entry -> entry.setValue(entry.getValue().toLowerCase(Locale.ROOT)));
} }
return entries; return entries;
@ -156,9 +151,9 @@ public class DictionaryService {
} }
public boolean isCaseInsensitiveDictionary(String type, String ruleSetId) { public boolean isCaseInsensitiveDictionary(String type, String dossierTemplateId) {
DictionaryModel dictionaryModel = dictionariesByRuleSets.get(ruleSetId).getLocalAccessMap().get(type); DictionaryModel dictionaryModel = dictionariesByDossierTemplate.get(dossierTemplateId).getLocalAccessMap().get(type);
if (dictionaryModel != null) { if (dictionaryModel != null) {
return dictionaryModel.isCaseInsensitive(); return dictionaryModel.isCaseInsensitive();
} }
@ -166,19 +161,19 @@ public class DictionaryService {
} }
public float[] getColor(String type, String ruleSetId) { public float[] getColor(String type, String dossierTemplateId) {
DictionaryModel model = dictionariesByRuleSets.get(ruleSetId).getLocalAccessMap().get(type); DictionaryModel model = dictionariesByDossierTemplate.get(dossierTemplateId).getLocalAccessMap().get(type);
if (model != null) { if (model != null) {
return model.getColor(); return model.getColor();
} }
return dictionariesByRuleSets.get(ruleSetId).getDefaultColor(); return dictionariesByDossierTemplate.get(dossierTemplateId).getDefaultColor();
} }
public boolean isHint(String type, String ruleSetId) { public boolean isHint(String type, String dossierTemplateId) {
DictionaryModel model = dictionariesByRuleSets.get(ruleSetId).getLocalAccessMap().get(type); DictionaryModel model = dictionariesByDossierTemplate.get(dossierTemplateId).getLocalAccessMap().get(type);
if (model != null) { if (model != null) {
return model.isHint(); return model.isHint();
} }
@ -186,9 +181,9 @@ public class DictionaryService {
} }
public boolean isRecommendation(String type, String ruleSetId) { public boolean isRecommendation(String type, String dossierTemplateId) {
DictionaryModel model = dictionariesByRuleSets.get(ruleSetId).getLocalAccessMap().get(type); DictionaryModel model = dictionariesByDossierTemplate.get(dossierTemplateId).getLocalAccessMap().get(type);
if (model != null) { if (model != null) {
return model.isRecommendation(); return model.isRecommendation();
} }
@ -196,12 +191,12 @@ public class DictionaryService {
} }
public Dictionary getDeepCopyDictionary(String ruleSetId, String dossierId) { public Dictionary getDeepCopyDictionary(String dossierTemplateId, String dossierId) {
List<DictionaryModel> copy = new ArrayList<>(); List<DictionaryModel> copy = new ArrayList<>();
var rulesetRepresentation = dictionariesByRuleSets.get(ruleSetId); var dossierTemplateRepresentation = dictionariesByDossierTemplate.get(dossierTemplateId);
rulesetRepresentation.getDictionary().forEach(dm -> { dossierTemplateRepresentation.getDictionary().forEach(dm -> {
copy.add(SerializationUtils.clone(dm)); copy.add(SerializationUtils.clone(dm));
}); });
@ -215,25 +210,25 @@ public class DictionaryService {
dossierDictionaryVersion = dossierRepresentation.getDictionaryVersion(); dossierDictionaryVersion = dossierRepresentation.getDictionaryVersion();
} }
return new Dictionary(copy, DictionaryVersion.builder().rulesetVersion(rulesetRepresentation.getDictionaryVersion()).dossierVersion(dossierDictionaryVersion).build()); return new Dictionary(copy.stream().sorted(Comparator.comparingInt(DictionaryModel::getRank).reversed()).collect(Collectors.toList()), DictionaryVersion.builder().dossierTemplateVersion(dossierTemplateRepresentation.getDictionaryVersion()).dossierVersion(dossierDictionaryVersion).build());
} }
public float[] getRequestRemoveColor(String ruleSetId) { public float[] getRequestRemoveColor(String dossierTemplateId) {
return dictionariesByRuleSets.get(ruleSetId).getRequestAddColor(); return dictionariesByDossierTemplate.get(dossierTemplateId).getRequestRemoveColor();
} }
public float[] getNotRedactedColor(String ruleSetId) { public float[] getNotRedactedColor(String dossierTemplateId) {
return dictionariesByRuleSets.get(ruleSetId).getNotRedactedColor(); return dictionariesByDossierTemplate.get(dossierTemplateId).getNotRedactedColor();
} }
public float[] getRequestAddColor(String ruleSetId) { public float[] getRequestAddColor(String dossierTemplateId) {
return dictionariesByRuleSets.get(ruleSetId).getRequestAddColor(); return dictionariesByDossierTemplate.get(dossierTemplateId).getRequestAddColor();
} }
} }

View File

@ -1,6 +1,5 @@
package com.iqser.red.service.redaction.v1.server.redaction.service; package com.iqser.red.service.redaction.v1.server.redaction.service;
import com.iqser.red.service.configuration.v1.api.model.RulesResponse;
import com.iqser.red.service.redaction.v1.server.client.RulesClient; import com.iqser.red.service.redaction.v1.server.client.RulesClient;
import com.iqser.red.service.redaction.v1.server.exception.RulesValidationException; import com.iqser.red.service.redaction.v1.server.exception.RulesValidationException;
import com.iqser.red.service.redaction.v1.server.redaction.model.Section; import com.iqser.red.service.redaction.v1.server.redaction.model.Section;
@ -28,14 +27,14 @@ public class DroolsExecutionService {
private final Map<String, KieContainer> kieContainers = new HashMap<>(); private final Map<String, KieContainer> kieContainers = new HashMap<>();
private final Map<String, Long> rulesVersionPerRuleSetId = new HashMap<>(); private final Map<String, Long> rulesVersionPerDossierTemplateId = new HashMap<>();
public KieContainer getKieContainer(String ruleSetId) { public KieContainer getKieContainer(String dossierTemplateId) {
KieContainer container = kieContainers.get(ruleSetId); KieContainer container = kieContainers.get(dossierTemplateId);
if (container == null) { if (container == null) {
return createOrUpdateKieContainer(ruleSetId); return createOrUpdateKieContainer(dossierTemplateId);
} else { } else {
return container; return container;
} }
@ -55,43 +54,43 @@ public class DroolsExecutionService {
} }
public KieContainer updateRules(String ruleSetId) { public KieContainer updateRules(String dossierTemplateId) {
long version = rulesClient.getVersion(ruleSetId); long version = rulesClient.getVersion(dossierTemplateId);
Long rulesVersion = rulesVersionPerRuleSetId.get(ruleSetId); Long rulesVersion = rulesVersionPerDossierTemplateId.get(dossierTemplateId);
if (rulesVersion == null) { if (rulesVersion == null) {
rulesVersion = -1L; rulesVersion = -1L;
} }
if (version > rulesVersion.longValue()) { if (version > rulesVersion) {
rulesVersionPerRuleSetId.put(ruleSetId, version); rulesVersionPerDossierTemplateId.put(dossierTemplateId, version);
return createOrUpdateKieContainer(ruleSetId); return createOrUpdateKieContainer(dossierTemplateId);
} }
return getKieContainer(ruleSetId); return getKieContainer(dossierTemplateId);
} }
private KieContainer createOrUpdateKieContainer(String ruleSetId) { private KieContainer createOrUpdateKieContainer(String dossierTemplateId) {
try { try {
RulesResponse rules = rulesClient.getRules(ruleSetId); var rules = rulesClient.getRules(dossierTemplateId);
if (rules == null || StringUtils.isEmpty(rules.getRules())) { if (rules == null || StringUtils.isEmpty(rules.getValue())) {
throw new RuntimeException("Rules cannot be empty."); throw new RuntimeException("Rules cannot be empty.");
} }
KieServices kieServices = KieServices.Factory.get(); KieServices kieServices = KieServices.Factory.get();
KieModule kieModule = getKieModule(ruleSetId, rules.getRules(), kieServices); KieModule kieModule = getKieModule(dossierTemplateId, rules.getValue(), kieServices);
var container = kieContainers.get(ruleSetId); var container = kieContainers.get(dossierTemplateId);
if (container != null) { if (container != null) {
container.updateToVersion(kieModule.getReleaseId()); container.updateToVersion(kieModule.getReleaseId());
return container; return container;
} }
container = kieServices.newKieContainer(kieModule.getReleaseId()); container = kieServices.newKieContainer(kieModule.getReleaseId());
kieContainers.put(ruleSetId, container); kieContainers.put(dossierTemplateId, container);
return container; return container;
} catch (Exception e) { } catch (Exception e) {
throw new RulesValidationException("Could not update rules: " + e.getMessage(), e); throw new RulesValidationException("Could not update rules: " + e.getMessage(), e);
@ -100,11 +99,11 @@ public class DroolsExecutionService {
} }
private KieModule getKieModule(String ruleSetId, String rules, KieServices kieServices) { private KieModule getKieModule(String dossierTemplateId, String rules, KieServices kieServices) {
KieFileSystem kieFileSystem = kieServices.newKieFileSystem(); KieFileSystem kieFileSystem = kieServices.newKieFileSystem();
InputStream input = new ByteArrayInputStream(rules.getBytes(StandardCharsets.UTF_8)); InputStream input = new ByteArrayInputStream(rules.getBytes(StandardCharsets.UTF_8));
kieFileSystem.write("src/main/resources/drools/rules" + ruleSetId + ".drl", kieServices.getResources() kieFileSystem.write("src/main/resources/drools/rules" + dossierTemplateId + ".drl", kieServices.getResources()
.newInputStreamResource(input)); .newInputStreamResource(input));
KieBuilder kieBuilder = kieServices.newKieBuilder(kieFileSystem); KieBuilder kieBuilder = kieServices.newKieBuilder(kieFileSystem);
kieBuilder.buildAll(); kieBuilder.buildAll();
@ -122,13 +121,13 @@ public class DroolsExecutionService {
} }
public long getRulesVersion(String ruleSetId) { public long getRulesVersion(String dossierTemplateId) {
Long rulesVersion = rulesVersionPerRuleSetId.get(ruleSetId); Long rulesVersion = rulesVersionPerDossierTemplateId.get(dossierTemplateId);
if (rulesVersion == null) { if (rulesVersion == null) {
return -1; return -1;
} }
return rulesVersion.longValue(); return rulesVersion;
} }
} }

View File

@ -1,64 +1,143 @@
package com.iqser.red.service.redaction.v1.server.redaction.service; package com.iqser.red.service.redaction.v1.server.redaction.service;
import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry; import java.util.ArrayList;
import com.iqser.red.service.redaction.v1.model.ManualRedactions; import java.util.HashMap;
import com.iqser.red.service.redaction.v1.model.Point; import java.util.HashSet;
import com.iqser.red.service.redaction.v1.model.Rectangle; import java.util.List;
import com.iqser.red.service.redaction.v1.model.SectionArea; import java.util.Map;
import com.iqser.red.service.redaction.v1.server.classification.model.*; import java.util.Set;
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary; import java.util.stream.Collectors;
import com.iqser.red.service.redaction.v1.server.redaction.model.*; import java.util.stream.Stream;
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; import org.apache.commons.codec.binary.Base64;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.kie.api.runtime.KieContainer; import org.kie.api.runtime.KieContainer;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import java.util.*; import com.iqser.red.service.redaction.v1.model.AnalyzeRequest;
import java.util.concurrent.atomic.AtomicInteger; import com.iqser.red.service.redaction.v1.model.Engine;
import java.util.stream.Collectors; import com.iqser.red.service.redaction.v1.model.ManualImageRecategorization;
import java.util.stream.Stream; import com.iqser.red.service.redaction.v1.model.Status;
import com.iqser.red.service.redaction.v1.server.classification.model.SectionText;
import com.iqser.red.service.redaction.v1.server.client.model.NerEntities;
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryModel;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
import com.iqser.red.service.redaction.v1.server.redaction.model.PageEntities;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import com.iqser.red.service.redaction.v1.server.redaction.model.Section;
import com.iqser.red.service.redaction.v1.server.redaction.model.SectionSearchableTextPair;
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@Slf4j @Slf4j
@Service @Service
@RequiredArgsConstructor @RequiredArgsConstructor
public class EntityRedactionService { public class EntityRedactionService {
private final DictionaryService dictionaryService; private final RedactionServiceSettings redactionServiceSettings;
private final DroolsExecutionService droolsExecutionService; private final DroolsExecutionService droolsExecutionService;
private final SurroundingWordsService surroundingWordsService; private final SurroundingWordsService surroundingWordsService;
public void processDocument(Document classifiedDoc, String ruleSetId, ManualRedactions manualRedactions, String dossierId) { public PageEntities findEntities(Dictionary dictionary, List<SectionText> sectionTexts, KieContainer kieContainer,
AnalyzeRequest analyzeRequest, NerEntities nerEntities) {
dictionaryService.updateDictionary(ruleSetId, dossierId); Map<Integer, Set<Image>> imagesPerPage = new HashMap<>();
KieContainer container = droolsExecutionService.updateRules(ruleSetId); Set<Entity> entities = findEntities(sectionTexts, dictionary, kieContainer, analyzeRequest, false, null, imagesPerPage, nerEntities);
long rulesVersion = droolsExecutionService.getRulesVersion(ruleSetId);
Dictionary dictionary = dictionaryService.getDeepCopyDictionary(ruleSetId, dossierId);
Set<Entity> documentEntities = new HashSet<>(findEntities(classifiedDoc, container, manualRedactions, dictionary, false, null));
if (dictionary.hasLocalEntries()) { if (dictionary.hasLocalEntries()) {
Map<Integer, Set<Entity>> hintsPerSectionNumber = getHintsPerSection(entities, dictionary);
Set<Entity> foundByLocal = findEntities(sectionTexts, dictionary, kieContainer, analyzeRequest, true, hintsPerSectionNumber, imagesPerPage, nerEntities);
EntitySearchUtils.addEntitiesWithHigherRank(entities, foundByLocal, dictionary);
EntitySearchUtils.removeEntitiesContainedInLarger(entities);
}
Map<Integer, Set<Entity>> hintsPerSectionNumber = new HashMap<>(); Map<Integer, List<Entity>> entitiesPerPage = convertToEnititesPerPage(entities);
documentEntities.stream().forEach(entity -> { return new PageEntities(entitiesPerPage, imagesPerPage);
if (dictionary.isHint(entity.getType()) && entity.isDictionaryEntry()) { }
hintsPerSectionNumber.computeIfAbsent(entity.getSectionNumber(), (x) -> new HashSet<>())
.add(entity);
public Set<Entity> findEntities(List<SectionText> reanalysisSections, Dictionary dictionary,
KieContainer kieContainer, AnalyzeRequest analyzeRequest, boolean local,
Map<Integer, Set<Entity>> hintsPerSectionNumber,
Map<Integer, Set<Image>> imagesPerPage, NerEntities nerEntities) {
List<SectionSearchableTextPair> sectionSearchableTextPairs = new ArrayList<>();
for (SectionText reanalysisSection : reanalysisSections) {
Set<Entity> entities = findEntities(reanalysisSection.getSearchableText(), reanalysisSection.getHeadline(), reanalysisSection
.getSectionNumber(), dictionary, local, nerEntities, reanalysisSection.getCellStarts());
if (reanalysisSection.getCellStarts() != null && !reanalysisSection.getCellStarts().isEmpty()) {
surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary, reanalysisSection
.getCellStarts());
} else {
surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary);
}
if (!local && reanalysisSection.getImages() != null && !reanalysisSection.getImages()
.isEmpty() && analyzeRequest.getManualRedactions() != null && analyzeRequest.getManualRedactions()
.getImageRecategorizations() != null) {
for (Image image : reanalysisSection.getImages()) {
String imageId = IdBuilder.buildId(image.getPosition(), image.getPage());
for (ManualImageRecategorization imageRecategorization : analyzeRequest.getManualRedactions()
.getImageRecategorizations()) {
if (imageRecategorization.getStatus().equals(Status.APPROVED) && imageRecategorization.getId()
.equals(imageId)) {
image.setType(imageRecategorization.getType());
}
}
}
}
sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder()
.isLocal(false)
.dictionaryTypes(dictionary.getTypes())
.entities(hintsPerSectionNumber != null && hintsPerSectionNumber.containsKey(reanalysisSection.getSectionNumber()) ? Stream
.concat(entities.stream(), hintsPerSectionNumber.get(reanalysisSection.getSectionNumber())
.stream())
.collect(Collectors.toSet()) : entities)
.text(reanalysisSection.getSearchableText().getAsStringWithLinebreaks())
.searchText(reanalysisSection.getSearchableText().toString())
.headline(reanalysisSection.getHeadline())
.sectionNumber(reanalysisSection.getSectionNumber())
.tabularData(reanalysisSection.getTabularData())
.searchableText(reanalysisSection.getSearchableText())
.dictionary(dictionary)
.images(reanalysisSection.getImages())
.fileAttributes(analyzeRequest.getFileAttributes())
.build(), reanalysisSection.getSearchableText()));
}
Set<Entity> entities = new HashSet<>();
sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> {
Section analysedSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair.getSection());
EntitySearchUtils.removeEntitiesContainedInLarger(analysedSection.getEntities());
entities.addAll(analysedSection.getEntities());
if (!local) {
for (Image image : analysedSection.getImages()) {
imagesPerPage.computeIfAbsent(image.getPage(), (a) -> new HashSet<>()).add(image);
}
addLocalValuesToDictionary(analysedSection, dictionary);
} }
}); });
Set<Entity> foundByLocal = findEntities(classifiedDoc, container, manualRedactions, dictionary, true, hintsPerSectionNumber); return entities;
EntitySearchUtils.addEntitiesWithHigherRank(documentEntities, foundByLocal, dictionary);
EntitySearchUtils.removeEntitiesContainedInLarger(documentEntities);
} }
for (Entity entity : documentEntities) {
private Map<Integer, List<Entity>> convertToEnititesPerPage(Set<Entity> entities) {
Map<Integer, List<Entity>> entitiesPerPage = new HashMap<>();
for (Entity entity : entities) {
Map<Integer, List<EntityPositionSequence>> sequenceOnPage = new HashMap<>(); Map<Integer, List<EntityPositionSequence>> sequenceOnPage = new HashMap<>();
for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) { for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) {
sequenceOnPage.computeIfAbsent(entityPositionSequence.getPageNumber(), (x) -> new ArrayList<>()) sequenceOnPage.computeIfAbsent(entityPositionSequence.getPageNumber(), (x) -> new ArrayList<>())
@ -66,80 +145,40 @@ public class EntityRedactionService {
} }
for (Map.Entry<Integer, List<EntityPositionSequence>> entry : sequenceOnPage.entrySet()) { for (Map.Entry<Integer, List<EntityPositionSequence>> entry : sequenceOnPage.entrySet()) {
classifiedDoc.getEntities() entitiesPerPage.computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>())
.computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>())
.add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry .add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry
.getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber(), entity .getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber(), entity
.getLegalBasis(), entity.isDictionaryEntry(), entity.getTextBefore(), entity.getTextAfter(), entity .getLegalBasis(), entity.isDictionaryEntry(), entity.getTextBefore(), entity.getTextAfter(), entity
.getStart(), entity.getEnd(), entity.isDossierDictionaryEntry())); .getStart(), entity.getEnd(), entity.isDossierDictionaryEntry(), entity.getEngines(), entity.getReferences()));
} }
} }
return entitiesPerPage;
dictionaryService.updateExternalDictionary(dictionary, ruleSetId);
classifiedDoc.setDictionaryVersion(dictionary.getVersion());
classifiedDoc.setRulesVersion(rulesVersion);
} }
private Set<Entity> findEntities(Document classifiedDoc, KieContainer kieContainer, private Map<Integer, Set<Entity>> getHintsPerSection(Set<Entity> entities, Dictionary dictionary) {
ManualRedactions manualRedactions, Dictionary dictionary, boolean local,
Map<Integer, Set<Entity>> hintsPerSectionNumber) {
Set<Entity> documentEntities = new HashSet<>(); Map<Integer, Set<Entity>> hintsPerSectionNumber = new HashMap<>();
entities.stream().forEach(entity -> {
AtomicInteger sectionNumber = new AtomicInteger(1); if (dictionary.isHint(entity.getType()) && entity.isDictionaryEntry()) {
List<SectionSearchableTextPair> sectionSearchableTextPairs = new ArrayList<>(); hintsPerSectionNumber.computeIfAbsent(entity.getSectionNumber(), (x) -> new HashSet<>()).add(entity);
for (Paragraph paragraph : classifiedDoc.getParagraphs()) {
List<Table> tables = paragraph.getTables();
for (Table table : tables) {
if (table.getColCount() == 2) {
sectionSearchableTextPairs.addAll(processTableAsOneText(classifiedDoc, table, manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber));
} else {
sectionSearchableTextPairs.addAll(processTablePerRow(classifiedDoc, table, manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber));
} }
sectionNumber.incrementAndGet(); });
} return hintsPerSectionNumber;
sectionSearchableTextPairs.add(processText(classifiedDoc, paragraph.getSearchableText(), paragraph.getTextBlocks(), paragraph
.getHeadline(), manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber, paragraph
.getImages()));
sectionNumber.incrementAndGet();
} }
for (Header header : classifiedDoc.getHeaders()) {
sectionSearchableTextPairs.add(processText(classifiedDoc, header.getSearchableText(), header.getTextBlocks(), "Header", manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber, new ArrayList<>()));
sectionNumber.incrementAndGet();
}
for (Footer footer : classifiedDoc.getFooters()) { private void addLocalValuesToDictionary(Section analysedSection, Dictionary dictionary) {
sectionSearchableTextPairs.add(processText(classifiedDoc, footer.getSearchableText(), footer.getTextBlocks(), "Footer", manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber, new ArrayList<>()));
sectionNumber.incrementAndGet();
}
for (UnclassifiedText unclassifiedText : classifiedDoc.getUnclassifiedTexts()) { analysedSection.getLocalDictionaryAdds().keySet().forEach(key -> {
sectionSearchableTextPairs.add(processText(classifiedDoc, unclassifiedText.getSearchableText(), unclassifiedText
.getTextBlocks(), "", manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber, new ArrayList<>()));
sectionNumber.incrementAndGet();
}
sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> {
Section analysedRowSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair.getSection());
documentEntities.addAll(analysedRowSection.getEntities());
for (Image image : analysedRowSection.getImages()) {
classifiedDoc.getImages().computeIfAbsent(image.getPage(), (a) -> new HashSet<>()).add(image);
}
analysedRowSection.getLocalDictionaryAdds().keySet().forEach(key -> {
if (dictionary.isRecommendation(key)) { if (dictionary.isRecommendation(key)) {
analysedRowSection.getLocalDictionaryAdds().get(key).forEach(value -> { analysedSection.getLocalDictionaryAdds().get(key).forEach(value -> {
if (!dictionary.containsValue(key, value)) { if (!dictionary.containsValue(key, value)) {
dictionary.getLocalAccessMap().get(key).getLocalEntries().add(value); dictionary.getLocalAccessMap().get(key).getLocalEntries().add(value);
} }
}); });
} else { } else {
analysedRowSection.getLocalDictionaryAdds().get(key).forEach(value -> { analysedSection.getLocalDictionaryAdds().get(key).forEach(value -> {
if (dictionary.getLocalAccessMap().get(key) == null) { if (dictionary.getLocalAccessMap().get(key) == null) {
log.warn("Dictionary {} is null", key); log.warn("Dictionary {} is null", key);
@ -153,214 +192,12 @@ public class EntityRedactionService {
}); });
} }
}); });
});
return documentEntities;
} }
private List<SectionSearchableTextPair> processTablePerRow(Document classifiedDoc, Table table, private Set<Entity> findEntities(SearchableText searchableText, String headline, int sectionNumber,
ManualRedactions manualRedactions, Dictionary dictionary, boolean local, NerEntities nerEntities,
AtomicInteger sectionNumber, Dictionary dictionary, List<Integer> cellstarts) {
boolean local,
Map<Integer, Set<Entity>> hintsPerSectionNumber) {
List<SectionSearchableTextPair> sectionSearchableTextPairs = new ArrayList<>();
for (List<Cell> row : table.getRows()) {
SearchableText searchableRow = new SearchableText();
Map<String, CellValue> tabularData = new HashMap<>();
int start = 0;
List<Integer> cellStarts = new ArrayList<>();
SectionText sectionText = new SectionText();
for (Cell cell : row) {
if (CollectionUtils.isEmpty(cell.getTextBlocks())) {
continue;
}
SectionArea sectionArea = new SectionArea(new Point((float) cell.getX(), (float) cell.getY()), (float) cell
.getWidth(), (float) cell.getHeight(), cell.getTextBlocks()
.get(0)
.getSequences()
.get(0)
.getPage());
sectionText.getSectionAreas().add(sectionArea);
sectionText.getTextBlocks().addAll(cell.getTextBlocks());
addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber.intValue());
int cellStart = start;
if (!cell.isHeaderCell()) {
cell.getHeaderCells().forEach(headerCell -> {
StringBuilder headerBuilder = new StringBuilder();
headerCell.getTextBlocks().forEach(textBlock -> headerBuilder.append(textBlock.getText()));
String headerName = headerBuilder.toString()
.replaceAll("\n", "")
.replaceAll(" ", "")
.replaceAll("-", "");
sectionArea.setHeader(headerName);
tabularData.put(headerName, new CellValue(cell.getTextBlocks(), cellStart));
});
}
for (TextBlock textBlock : cell.getTextBlocks()) {
// TODO avoid cell overlap merging.
searchableRow.addAll(textBlock.getSequences());
}
cellStarts.add(cellStart);
start = start + cell.toString().trim().length() + 1;
}
Set<Entity> rowEntities = findEntities(searchableRow, table.getHeadline(), sectionNumber.intValue(), dictionary, local);
surroundingWordsService.addSurroundingText(rowEntities, searchableRow, dictionary, cellStarts);
sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder()
.isLocal(local)
.dictionaryTypes(dictionary.getTypes())
.entities(hintsPerSectionNumber != null && hintsPerSectionNumber.containsKey(sectionNumber.intValue()) ? Stream
.concat(rowEntities.stream(), hintsPerSectionNumber.get(sectionNumber.intValue()).stream())
.collect(Collectors.toSet()) : rowEntities)
.text(searchableRow.getAsStringWithLinebreaks())
.searchText(searchableRow.toString())
.headline(table.getHeadline())
.sectionNumber(sectionNumber.intValue())
.tabularData(tabularData)
.searchableText(searchableRow)
.dictionary(dictionary)
.build(), searchableRow));
if (!local) {
sectionText.setText(searchableRow.toString());
sectionText.setHeadline(table.getHeadline());
sectionText.setSectionNumber(sectionNumber.intValue());
sectionText.setTable(true);
sectionText.setTabularData(tabularData);
sectionText.setCellStarts(cellStarts);
classifiedDoc.getSectionText().add(sectionText);
}
sectionNumber.incrementAndGet();
}
return sectionSearchableTextPairs;
}
/**
 * Builds a single rule-engine section from the text of every non-empty cell of a table.
 *
 * <p>All text sequences of the table are concatenated into one {@link SearchableText}, entities are
 * searched in that text and, if hints exist for the current section number, merged into the entity set.
 * When {@code local} is {@code false} the collected section text (areas, text blocks, headline, section
 * number) is additionally appended to {@code classifiedDoc.getSectionText()}.
 *
 * @param classifiedDoc         document that receives the section text when {@code local} is {@code false}
 * @param table                 table whose rows and cells are read
 * @param manualRedactions      manual redactions handed to {@code addSectionToManualRedactions} for every cell
 * @param sectionNumber         counter holding the current section number; only read here, never incremented
 * @param dictionary            dictionary used for the entity search and stored in the section
 * @param local                 passed through to the entity search and to the section's {@code isLocal} flag
 * @param hintsPerSectionNumber optional hints per section number; may be {@code null}
 * @return a mutable list that contains exactly one pair for the whole table
 */
private List<SectionSearchableTextPair> processTableAsOneText(Document classifiedDoc, Table table,
        ManualRedactions manualRedactions,
        AtomicInteger sectionNumber, Dictionary dictionary,
        boolean local,
        Map<Integer, Set<Entity>> hintsPerSectionNumber) {

    final String tableHeadline = table.getHeadline();
    final int currentSection = sectionNumber.intValue();

    SearchableText tableText = new SearchableText();
    SectionText storedSection = new SectionText();

    for (List<Cell> cellsOfRow : table.getRows()) {
        for (Cell current : cellsOfRow) {
            List<TextBlock> cellBlocks = current.getTextBlocks();
            if (CollectionUtils.isEmpty(cellBlocks)) {
                continue;
            }

            if (!local) {
                // The page of the area is taken from the first sequence of the first text block of the cell.
                SectionArea cellArea = new SectionArea(
                        new Point((float) current.getX(), (float) current.getY()),
                        (float) current.getWidth(),
                        (float) current.getHeight(),
                        cellBlocks.get(0).getSequences().get(0).getPage());
                storedSection.getTextBlocks().addAll(cellBlocks);
                storedSection.getSectionAreas().add(cellArea);
            }

            for (TextBlock block : cellBlocks) {
                tableText.addAll(block.getSequences());
            }

            addSectionToManualRedactions(cellBlocks, manualRedactions, tableHeadline, currentSection);
        }
    }

    Set<Entity> tableEntities = findEntities(tableText, tableHeadline, currentSection, dictionary, local);
    surroundingWordsService.addSurroundingText(tableEntities, tableText, dictionary);

    // Merge hints of this section into a new set; otherwise the found entities are used as they are.
    Set<Entity> sectionEntities;
    if (hintsPerSectionNumber != null && hintsPerSectionNumber.containsKey(currentSection)) {
        sectionEntities = Stream.concat(tableEntities.stream(), hintsPerSectionNumber.get(currentSection).stream())
                .collect(Collectors.toSet());
    } else {
        sectionEntities = tableEntities;
    }

    Section ruleSection = Section.builder()
            .isLocal(local)
            .dictionaryTypes(dictionary.getTypes())
            .entities(sectionEntities)
            .text(tableText.getAsStringWithLinebreaks())
            .searchText(tableText.toString())
            .headline(tableHeadline)
            .sectionNumber(currentSection)
            .searchableText(tableText)
            .dictionary(dictionary)
            .build();

    List<SectionSearchableTextPair> result = new ArrayList<>();
    result.add(new SectionSearchableTextPair(ruleSection, tableText));

    if (!local) {
        storedSection.setText(tableText.toString());
        storedSection.setHeadline(tableHeadline);
        storedSection.setSectionNumber(currentSection);
        storedSection.setTable(true);
        classifiedDoc.getSectionText().add(storedSection);
    }

    return result;
}
/**
 * Builds the rule-engine section for one block of running text (the caller supplies text, blocks,
 * headline and images).
 *
 * <p>When {@code local} is {@code false}, a {@link SectionText} describing the areas, text, headline,
 * section number, images and text blocks is appended to {@code classifiedDoc.getSectionText()}.
 * Afterwards the manual redactions are matched against the text blocks, entities are searched in the
 * text and combined with the hints of the current section number (if any).
 *
 * @param classifiedDoc         document that receives the section text when {@code local} is {@code false}
 * @param searchableText        text to search for entities
 * @param paragraphTextBlocks   text blocks that make up the text
 * @param headline              headline stored in the section
 * @param manualRedactions      manual redactions handed to {@code addSectionToManualRedactions}
 * @param sectionNumber         counter holding the current section number; only read here
 * @param dictionary            dictionary used for the entity search and stored in the section
 * @param local                 passed through to the entity search and to the section's {@code isLocal} flag
 * @param hintsPerSectionNumber optional hints per section number; may be {@code null}
 * @param images                images of the text; converted via {@code convert} for the section
 * @return the built section paired with its searchable text
 */
private SectionSearchableTextPair processText(Document classifiedDoc, SearchableText searchableText,
        List<TextBlock> paragraphTextBlocks, String headline,
        ManualRedactions manualRedactions, AtomicInteger sectionNumber,
        Dictionary dictionary, boolean local,
        Map<Integer, Set<Entity>> hintsPerSectionNumber,
        List<PdfImage> images) {

    final int currentSection = sectionNumber.intValue();

    if (!local) {
        SectionText storedSection = new SectionText();
        paragraphTextBlocks.forEach(block -> storedSection.getSectionAreas()
                .add(new SectionArea(new Point(block.getMinX(), block.getMinY()), block.getWidth(), block.getHeight(), block
                        .getPage())));
        storedSection.setText(searchableText.toString());
        storedSection.setHeadline(headline);
        storedSection.setSectionNumber(currentSection);
        storedSection.setTable(false);
        // Converted separately from the section images below so both sides hold their own Image instances.
        storedSection.setImages(images.stream()
                .map(pdfImage -> convert(pdfImage, currentSection, headline))
                .collect(Collectors.toSet()));
        storedSection.setTextBlocks(paragraphTextBlocks);
        classifiedDoc.getSectionText().add(storedSection);
    }

    addSectionToManualRedactions(paragraphTextBlocks, manualRedactions, headline, currentSection);

    Set<Entity> foundEntities = findEntities(searchableText, headline, currentSection, dictionary, local);
    surroundingWordsService.addSurroundingText(foundEntities, searchableText, dictionary);

    // Merge hints of this section into a new set; otherwise the found entities are used as they are.
    Set<Entity> sectionEntities;
    if (hintsPerSectionNumber != null && hintsPerSectionNumber.containsKey(currentSection)) {
        sectionEntities = Stream.concat(foundEntities.stream(), hintsPerSectionNumber.get(currentSection).stream())
                .collect(Collectors.toSet());
    } else {
        sectionEntities = foundEntities;
    }

    Section ruleSection = Section.builder()
            .isLocal(local)
            .dictionaryTypes(dictionary.getTypes())
            .entities(sectionEntities)
            .text(searchableText.getAsStringWithLinebreaks())
            .searchText(searchableText.toString())
            .headline(headline)
            .sectionNumber(currentSection)
            .searchableText(searchableText)
            .dictionary(dictionary)
            .images(images.stream()
                    .map(pdfImage -> convert(pdfImage, currentSection, headline))
                    .collect(Collectors.toSet()))
            .build();

    return new SectionSearchableTextPair(ruleSection, searchableText);
}
public Set<Entity> findEntities(SearchableText searchableText, String headline, int sectionNumber,
Dictionary dictionary, boolean local) {
Set<Entity> found = new HashSet<>(); Set<Entity> found = new HashSet<>();
String searchableString = searchableText.toString(); String searchableString = searchableText.toString();
@ -371,47 +208,52 @@ public class EntityRedactionService {
String lowercaseInputString = searchableString.toLowerCase(); String lowercaseInputString = searchableString.toLowerCase();
for (DictionaryModel model : dictionary.getDictionaryModels()) { for (DictionaryModel model : dictionary.getDictionaryModels()) {
if (model.isCaseInsensitive()) { if (model.isCaseInsensitive()) {
found.addAll(EntitySearchUtils.find(lowercaseInputString, model.getValues(local), model.getType(), headline, sectionNumber, local, model.isDossierDictionary())); EntitySearchUtils.addOrAddEngine(found, EntitySearchUtils.find(lowercaseInputString, model.getValues(local), model
.getType(), headline, sectionNumber, !local, model.isDossierDictionary(), Engine.DICTIONARY));
} else { } else {
found.addAll(EntitySearchUtils.find(searchableString, model.getValues(local), model.getType(), headline, sectionNumber, local, model.isDossierDictionary())); EntitySearchUtils.addOrAddEngine(found, EntitySearchUtils.find(searchableString, model.getValues(local), model
.getType(), headline, sectionNumber, !local, model.isDossierDictionary(), Engine.DICTIONARY));
} }
} }
if (!local) {
Map<String, Set<String>> nerValuesPerType = getNerValues(sectionNumber, nerEntities, cellstarts);
nerValuesPerType.entrySet().forEach(entry -> {
EntitySearchUtils.addOrAddEngine(found, EntitySearchUtils.find(searchableString, entry.getValue(), entry
.getKey(), headline, sectionNumber, false, false, Engine.NER));
});
}
return EntitySearchUtils.clearAndFindPositions(found, searchableText, dictionary); return EntitySearchUtils.clearAndFindPositions(found, searchableText, dictionary);
} }
private void addSectionToManualRedactions(List<TextBlock> textBlocks, ManualRedactions manualRedactions, private Map<String, Set<String>> getNerValues(int sectionNumber, NerEntities nerEntities,
String section, int sectionNumber) { List<Integer> cellstarts) {
if (manualRedactions == null || manualRedactions.getEntriesToAdd().isEmpty()) { Map<String, Set<String>> nerValuesPerType = new HashMap<>();
return;
}
for (TextBlock textBlock : textBlocks) { if (redactionServiceSettings.isEnableEntityRecognition() && nerEntities.getResult()
for (ManualRedactionEntry manualRedactionEntry : manualRedactions.getEntriesToAdd()) { .containsKey(sectionNumber)) {
for (Rectangle rectangle : manualRedactionEntry.getPositions()) { nerEntities.getResult().get(sectionNumber).forEach(res -> {
if (textBlock.contains(rectangle)) { if (cellstarts == null || cellstarts.isEmpty()) {
manualRedactionEntry.setSection(section); nerValuesPerType.computeIfAbsent(res.getType(), (a) -> new HashSet<>())
manualRedactionEntry.setSectionNumber(sectionNumber); .add(new String(Base64.decodeBase64(res.getValue().getBytes())));
} else {
boolean intersectsCellStart = false;
for (Integer cellStart : cellstarts) {
if (res.getStartOffset() < cellStart && cellStart < res.getEndOffset()) {
intersectsCellStart = true;
} }
} }
if (!intersectsCellStart) {
nerValuesPerType.computeIfAbsent(res.getType(), (a) -> new HashSet<>())
.add(new String(Base64.decodeBase64(res.getValue().getBytes())));
} }
} }
});
} }
return nerValuesPerType;
private Image convert(PdfImage pdfImage, int sectionNumber, String headline) {
return Image.builder()
.type(pdfImage.getImageType().equals(ImageType.OTHER) ? "image" : pdfImage.getImageType()
.name()
.toLowerCase(Locale.ROOT))
.position(pdfImage.getPosition())
.sectionNumber(sectionNumber)
.section(headline)
.page(pdfImage.getPage())
.build();
} }
} }

View File

@ -0,0 +1,51 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import java.nio.charset.StandardCharsets;
import java.util.stream.Collectors;
import org.apache.commons.codec.binary.Base64;
import org.springframework.stereotype.Service;
import com.iqser.red.service.persistence.service.v1.api.model.FileType;
import com.iqser.red.service.redaction.v1.server.client.EntityRecognitionClient;
import com.iqser.red.service.redaction.v1.server.client.model.EntityRecognitionRequest;
import com.iqser.red.service.redaction.v1.server.client.model.EntityRecognitionSection;
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
/**
 * Sends the stored section texts of a file to the entity recognition client and stores the response
 * as {@code FileType.NER_ENTITIES}.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class NerAnalyserService {

    private final RedactionStorageService redactionStorageService;
    private final EntityRecognitionClient entityRecognitionClient;
    private final RedactionServiceSettings redactionServiceSettings;

    /**
     * Computes and stores the NER entities for the given file.
     *
     * <p>Does nothing when entity recognition is disabled in the service settings. Otherwise the stored
     * text of the file is loaded, every section text is sent Base64-encoded together with its section
     * number to the recognition client, and the response is stored for the file.
     *
     * @param dossierId id of the dossier the file belongs to
     * @param fileId    id of the file whose stored text is analysed
     */
    public void computeNerEntities(String dossierId, String fileId) {

        if (!redactionServiceSettings.isEnableEntityRecognition()) {
            return;
        }

        var text = redactionStorageService.getText(dossierId, fileId);

        long start = System.currentTimeMillis();

        var nerRequest = EntityRecognitionRequest.builder()
                .data(text.getSectionTexts()
                        .stream()
                        .map(sectionText -> new EntityRecognitionSection(sectionText.getSectionNumber(), encodeBase64(sectionText
                                .getText())))
                        .collect(Collectors.toList()))
                .build();

        var nerResponse = entityRecognitionClient.findAuthors(nerRequest);

        log.info("Computing NER entities took: {} ms for dossierId {} and fileId {}", System.currentTimeMillis() - start, dossierId, fileId);

        redactionStorageService.storeObject(dossierId, fileId, FileType.NER_ENTITIES, nerResponse);
    }


    /**
     * Base64-encodes the UTF-8 bytes of the given text.
     *
     * <p>The charset is fixed explicitly so the payload does not depend on the JVM's default charset.
     */
    private static String encodeBase64(String value) {

        return new String(Base64.encodeBase64(value.getBytes(StandardCharsets.UTF_8)), StandardCharsets.UTF_8);
    }

}

View File

@ -1,296 +0,0 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import com.iqser.red.service.file.management.v1.api.model.FileType;
import com.iqser.red.service.redaction.v1.model.*;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.SectionText;
import com.iqser.red.service.redaction.v1.server.classification.model.Text;
import com.iqser.red.service.redaction.v1.server.client.LegalBasisClient;
import com.iqser.red.service.redaction.v1.server.exception.RedactionException;
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
import com.iqser.red.service.redaction.v1.server.redaction.model.*;
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.kie.api.runtime.KieContainer;
import org.springframework.stereotype.Service;
import org.springframework.web.bind.annotation.RequestBody;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
@Slf4j
@Service
@RequiredArgsConstructor
public class ReanalyzeService {
private final DictionaryService dictionaryService;
private final DroolsExecutionService droolsExecutionService;
private final SurroundingWordsService surroundingWordsService;
private final EntityRedactionService entityRedactionService;
private final RedactionLogCreatorService redactionLogCreatorService;
private final RedactionStorageService redactionStorageService;
private final PdfSegmentationService pdfSegmentationService;
private final RedactionChangeLogService redactionChangeLogService;
private final AnalyzeResponseService analyzeResponseService;
private final LegalBasisClient legalBasisClient;
/**
 * Runs a full analysis of the original file referenced by the request.
 *
 * <p>The stored original is parsed into a classified document, entities are processed, and a
 * redaction log is created from the result. The change log is created and stored first; then the
 * redaction log, the section text and the section grid are stored for the file.
 *
 * @param analyzeRequest request naming the project, file, rule set and manual redactions
 * @return the analyze response built from the redaction log, change log, page count and duration
 * @throws RedactionException if loading or parsing the stored original fails
 */
public AnalyzeResult analyze(AnalyzeRequest analyzeRequest) {

    final long startedAt = System.currentTimeMillis();

    Document classifiedDoc;
    int pageCount;
    try {
        var originStorageId = RedactionStorageService.StorageIdUtils.getStorageId(analyzeRequest.getProjectId(), analyzeRequest
                .getFileId(), FileType.ORIGIN);
        var originObject = redactionStorageService.getStoredObject(originStorageId);
        classifiedDoc = pdfSegmentationService.parseDocument(originObject);
        pageCount = classifiedDoc.getPages().size();
    } catch (Exception e) {
        throw new RedactionException(e);
    }

    var projectId = analyzeRequest.getProjectId();
    var fileId = analyzeRequest.getFileId();
    var ruleSetId = analyzeRequest.getRuleSetId();
    var manualRedactions = analyzeRequest.getManualRedactions();

    log.info("Document structure analysis successful, starting redaction analysis...");

    entityRedactionService.processDocument(classifiedDoc, ruleSetId, manualRedactions, projectId);
    redactionLogCreatorService.createRedactionLog(classifiedDoc, pageCount, manualRedactions, ruleSetId);

    log.info("Redaction analysis successful...");

    var legalBasis = legalBasisClient.getLegalBasisMapping(ruleSetId);
    var dictionaryVersion = classifiedDoc.getDictionaryVersion();

    var redactionLog = new RedactionLog(
            classifiedDoc.getRedactionLogEntities(),
            legalBasis,
            ruleSetId,
            dictionaryVersion.getRulesetVersion(),
            classifiedDoc.getRulesVersion(),
            dictionaryVersion.getDossierVersion(),
            legalBasisClient.getVersion(ruleSetId));

    log.info("Analyzed with rules {} and dictionary {} for ruleSet: {}", classifiedDoc.getRulesVersion(), dictionaryVersion, ruleSetId);

    // Create the change log first - this only matters when migrating files that were analyzed via the
    // old process, where we do not want to lose change log data.
    var changeLog = redactionChangeLogService.createAndStoreChangeLog(projectId, fileId, redactionLog);

    // Store the redaction log, the section text and the section grid.
    redactionStorageService.storeObject(projectId, fileId, FileType.REDACTION_LOG, redactionLog);
    redactionStorageService.storeObject(projectId, fileId, FileType.TEXT, new Text(pageCount, classifiedDoc.getSectionText()));
    redactionStorageService.storeObject(projectId, fileId, FileType.SECTION_GRID, classifiedDoc.getSectionGrid());

    long duration = System.currentTimeMillis() - startedAt;

    return analyzeResponseService.createAnalyzeResponse(projectId, fileId, duration, pageCount, redactionLog, changeLog);
}
@SneakyThrows
public AnalyzeResult reanalyze(@RequestBody AnalyzeRequest analyzeRequest) {
long startTime = System.currentTimeMillis();
var redactionLog = redactionStorageService.getRedactionLog(analyzeRequest.getProjectId(), analyzeRequest.getFileId());
var text = redactionStorageService.getText(analyzeRequest.getProjectId(), analyzeRequest.getFileId());
// not yet ready for reanalysis
if (redactionLog == null || text == null || text.getNumberOfPages() == 0) {
return analyze(analyzeRequest);
}
DictionaryIncrement dictionaryIncrement = dictionaryService.getDictionaryIncrements(analyzeRequest.getRuleSetId(), new DictionaryVersion(redactionLog
.getDictionaryVersion(), redactionLog.getDossierDictionaryVersion()), analyzeRequest.getProjectId());
Set<String> manualForceAndRemoveIds = getForceAndRemoveIds(analyzeRequest.getManualRedactions());
Map<String, List<Comment>> comments = null;
Set<ManualRedactionEntry> manualAdds = null;
if (analyzeRequest.getManualRedactions() != null) {
// TODO comments will be removed from redactionLog, so we ignore this first.
comments = analyzeRequest.getManualRedactions().getComments();
manualAdds = analyzeRequest.getManualRedactions().getEntriesToAdd();
}
Set<Integer> sectionsToReanalyse = new HashSet<>();
Map<Integer, Set<Image>> imageEntries = new HashMap<>();
for (RedactionLogEntry entry : redactionLog.getRedactionLogEntry()) {
if (entry.isManual() || manualForceAndRemoveIds.contains(entry.getId())) {
sectionsToReanalyse.add(entry.getSectionNumber());
}
if (entry.isImage() || entry.getType().equals("image")) {
imageEntries.computeIfAbsent(entry.getSectionNumber(), x -> new HashSet<>()).add(convert(entry));
}
}
for (SectionText sectionText : text.getSectionTexts()) {
if (EntitySearchUtils.sectionContainsAny(sectionText.getText(), dictionaryIncrement.getValues())) {
sectionsToReanalyse.add(sectionText.getSectionNumber());
}
if (manualAdds != null) {
for (SectionArea sectionArea : sectionText.getSectionAreas()) {
for (ManualRedactionEntry manualAdd : manualAdds) {
for (Rectangle manualPosition : manualAdd.getPositions()) {
if (sectionArea.contains(manualPosition)) {
manualAdd.setSection(sectionText.getHeadline());
manualAdd.setSectionNumber(sectionText.getSectionNumber());
}
}
}
}
}
}
log.info("Should reanalyze {} sections for request: {}", sectionsToReanalyse.size(), analyzeRequest);
if (sectionsToReanalyse.isEmpty() && (manualAdds == null || manualAdds.isEmpty())) {
return finalizeAnalysis(analyzeRequest, startTime, redactionLog, text, dictionaryIncrement);
}
List<SectionText> reanalysisSections = new ArrayList<>();
for (SectionText sectionText : text.getSectionTexts()) {
if (sectionsToReanalyse.contains(sectionText.getSectionNumber())) {
reanalysisSections.add(sectionText);
}
}
//--
KieContainer kieContainer = droolsExecutionService.updateRules(analyzeRequest.getRuleSetId());
Dictionary dictionary = dictionaryService.getDeepCopyDictionary(analyzeRequest.getRuleSetId(), analyzeRequest.getProjectId());
List<SectionSearchableTextPair> sectionSearchableTextPairs = new ArrayList<>();
for (SectionText reanalysisSection : reanalysisSections) {
Set<Entity> entities = entityRedactionService.findEntities(reanalysisSection.getSearchableText(), reanalysisSection
.getHeadline(), reanalysisSection.getSectionNumber(), dictionary, false);
if (reanalysisSection.getCellStarts() != null) {
surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary, reanalysisSection
.getCellStarts());
} else {
surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary);
}
sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder()
.isLocal(false)
.dictionaryTypes(dictionary.getTypes())
.entities(entities)
.text(reanalysisSection.getSearchableText().getAsStringWithLinebreaks())
.searchText(reanalysisSection.getSearchableText().toString())
.headline(reanalysisSection.getHeadline())
.sectionNumber(reanalysisSection.getSectionNumber())
.tabularData(reanalysisSection.getTabularData())
.searchableText(reanalysisSection.getSearchableText())
.dictionary(dictionary)
.images(reanalysisSection.getImages())
.build(), reanalysisSection.getSearchableText()));
}
Set<Entity> entities = new HashSet<>();
Map<Integer, Set<Image>> imagesPerPage = new HashMap<>();
sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> {
Section analysedRowSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair.getSection());
entities.addAll(analysedRowSection.getEntities());
EntitySearchUtils.removeEntitiesContainedInLarger(entities);
for (Image image : analysedRowSection.getImages()) {
imagesPerPage.computeIfAbsent(image.getPage(), (a) -> new HashSet<>()).add(image);
}
});
Map<Integer, List<Entity>> entitiesPerPage = new HashMap<>();
for (Entity entity : entities) {
Map<Integer, List<EntityPositionSequence>> sequenceOnPage = new HashMap<>();
for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) {
sequenceOnPage.computeIfAbsent(entityPositionSequence.getPageNumber(), (x) -> new ArrayList<>())
.add(entityPositionSequence);
}
for (Map.Entry<Integer, List<EntityPositionSequence>> entry : sequenceOnPage.entrySet()) {
entitiesPerPage.computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>())
.add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry
.getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber(), entity
.getLegalBasis(), entity.isDictionaryEntry(), entity.getTextBefore(), entity.getTextAfter(), entity
.getStart(), entity.getEnd(), entity.isDossierDictionaryEntry()));
}
}
List<RedactionLogEntry> newRedactionLogEntries = new ArrayList<>();
for (int page = 1; page <= text.getNumberOfPages(); page++) {
if (entitiesPerPage.get(page) != null) {
newRedactionLogEntries.addAll(redactionLogCreatorService.addEntries(entitiesPerPage, analyzeRequest.getManualRedactions(), page, analyzeRequest
.getRuleSetId()));
}
if (imagesPerPage.get(page) != null) {
newRedactionLogEntries.addAll(redactionLogCreatorService.addImageEntries(imagesPerPage, analyzeRequest.getManualRedactions(), page, analyzeRequest
.getRuleSetId()));
}
newRedactionLogEntries.addAll(redactionLogCreatorService.addManualAddEntries(manualAdds, comments, page, analyzeRequest
.getRuleSetId()));
}
redactionLog.getRedactionLogEntry()
.removeIf(entry -> sectionsToReanalyse.contains(entry.getSectionNumber()));
redactionLog.getRedactionLogEntry().addAll(newRedactionLogEntries);
return finalizeAnalysis(analyzeRequest, startTime, redactionLog, text, dictionaryIncrement);
}
/**
 * Completes an analysis run: stamps the redaction log with the dictionary versions, creates the
 * change log, persists the redaction log and builds the response.
 *
 * @param analyzeRequest      request that triggered the analysis; supplies project id and file id
 * @param startTime           start timestamp, presumably taken from
 *                            {@link System#currentTimeMillis()} by the caller; used to compute the duration
 * @param redactionLog        redaction log to finalize; it is modified and stored
 * @param text                analyzed text, used for its number of pages
 * @param dictionaryIncrement increment whose dictionary version is written into the redaction log
 * @return the response produced by {@code analyzeResponseService}
 */
private AnalyzeResult finalizeAnalysis(AnalyzeRequest analyzeRequest, long startTime,
        RedactionLog redactionLog, Text text,
        DictionaryIncrement dictionaryIncrement) {

    String projectId = analyzeRequest.getProjectId();
    String fileId = analyzeRequest.getFileId();

    redactionLog.setDictionaryVersion(dictionaryIncrement.getDictionaryVersion().getRulesetVersion());
    redactionLog.setDossierDictionaryVersion(dictionaryIncrement.getDictionaryVersion().getDossierVersion());

    // Keep this order: the change log is created before the new redaction log is stored.
    // NOTE(review): presumably the change log is diffed against the previously stored log, so
    // storing first would compare the log with itself - confirm before reordering.
    var changeLog = redactionChangeLogService.createAndStoreChangeLog(projectId, fileId, redactionLog);
    redactionStorageService.storeObject(projectId, fileId, FileType.REDACTION_LOG, redactionLog);

    long duration = System.currentTimeMillis() - startTime;

    return analyzeResponseService.createAnalyzeResponse(projectId, fileId, duration, text.getNumberOfPages(),
            redactionLog, changeLog);
}
/**
 * Collects the ids of all entries that are targeted by a manual removal or a manual force-redact.
 *
 * @param manualRedactions manual redactions of the request, may be {@code null}
 * @return ids of all id removals and force redacts; empty if {@code manualRedactions} is {@code null}
 */
private Set<String> getForceAndRemoveIds(ManualRedactions manualRedactions) {

    Set<String> ids = new HashSet<>();
    if (manualRedactions == null) {
        return ids;
    }

    for (IdRemoval removal : manualRedactions.getIdsToRemove()) {
        ids.add(removal.getId());
    }
    for (ManualForceRedact forceRedact : manualRedactions.getForceRedacts()) {
        ids.add(forceRedact.getId());
    }
    return ids;
}
/**
 * Builds an {@link Image} from a redaction log entry.
 *
 * <p>Only the first position of the entry is used: it provides both the bounding rectangle and
 * the page of the resulting image. Type, section and section number are copied from the entry.
 *
 * @param entry redaction log entry to convert; must have at least one position
 * @return image carrying the entry's type, section data, page and bounding rectangle
 */
public Image convert(RedactionLogEntry entry) {

    Rectangle firstPosition = entry.getPositions().get(0);

    RedRectangle2D bounds = new RedRectangle2D(
            firstPosition.getTopLeft().getX(),
            firstPosition.getTopLeft().getY(),
            firstPosition.getWidth(),
            firstPosition.getHeight());

    return Image.builder()
            .type(entry.getType())
            .position(bounds)
            .sectionNumber(entry.getSectionNumber())
            .section(entry.getSection())
            .page(firstPosition.getPage())
            .build();
}
}

View File

@ -1,19 +1,25 @@
package com.iqser.red.service.redaction.v1.server.redaction.service; package com.iqser.red.service.redaction.v1.server.redaction.service;
import com.iqser.red.service.file.management.v1.api.model.FileType; import java.time.OffsetDateTime;
import com.iqser.red.service.redaction.v1.model.ChangeType; import java.util.ArrayList;
import com.iqser.red.service.redaction.v1.model.RedactionChangeLog; import java.util.HashMap;
import com.iqser.red.service.redaction.v1.model.RedactionChangeLogEntry; import java.util.HashSet;
import com.iqser.red.service.redaction.v1.model.RedactionLog; import java.util.List;
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry; import java.util.Map;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService; import java.util.Set;
import lombok.RequiredArgsConstructor; import java.util.stream.Collectors;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import java.util.ArrayList; import com.iqser.red.service.redaction.v1.model.Change;
import java.util.List; import com.iqser.red.service.redaction.v1.model.ChangeType;
import java.util.stream.Collectors; import com.iqser.red.service.redaction.v1.model.RedactionLog;
import com.iqser.red.service.redaction.v1.model.RedactionLogChanges;
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@Slf4j @Slf4j
@Service @Service
@ -22,72 +28,80 @@ public class RedactionChangeLogService {
private final RedactionStorageService redactionStorageService; private final RedactionStorageService redactionStorageService;
public RedactionChangeLog createAndStoreChangeLog(String projectId, String fileId, RedactionLog currentRedactionLog) {
try { public RedactionLogChanges computeChanges(String dossierId, String fileId, RedactionLog currentRedactionLog) {
RedactionLog previousRedactionLog = redactionStorageService.getRedactionLog(projectId, fileId);
var changeLog = createChangeLog(currentRedactionLog, previousRedactionLog);
redactionStorageService.storeObject(projectId, fileId, FileType.REDACTION_CHANGELOG, changeLog);
return changeLog;
} catch (Exception e) {
log.debug("Previous redaction log not available");
return null;
}
} long start = System.currentTimeMillis();
private RedactionChangeLog createChangeLog(RedactionLog currentRedactionLog, RedactionLog previousRedactionLog) {
RedactionLog previousRedactionLog = redactionStorageService.getRedactionLog(dossierId, fileId);
if (previousRedactionLog == null) { if (previousRedactionLog == null) {
return null; currentRedactionLog.getRedactionLogEntry().forEach(entry -> {
entry.getChanges().add(new Change(ChangeType.ADDED, OffsetDateTime.now()));
});
return new RedactionLogChanges(currentRedactionLog, false);
} }
List<RedactionLogEntry> added = new ArrayList<>(currentRedactionLog.getRedactionLogEntry()); List<RedactionLogEntry> notRemovedPreviousEntries = previousRedactionLog.getRedactionLogEntry()
added.removeAll(previousRedactionLog.getRedactionLogEntry()); .stream()
.filter(entry -> !entry.getChanges()
.get(entry.getChanges().size() - 1)
.getType()
.equals(ChangeType.REMOVED))
.collect(Collectors.toList());
List<RedactionLogEntry> removed = new ArrayList<>(previousRedactionLog.getRedactionLogEntry()); Set<RedactionLogEntry> added = new HashSet<>(currentRedactionLog.getRedactionLogEntry());
added.removeAll(notRemovedPreviousEntries);
Set<RedactionLogEntry> removed = new HashSet<>(notRemovedPreviousEntries);
removed.removeAll(currentRedactionLog.getRedactionLogEntry()); removed.removeAll(currentRedactionLog.getRedactionLogEntry());
List<RedactionChangeLogEntry> changeLogEntries = added.stream() Map<String, RedactionLogEntry> addedIds = new HashMap<>();
.map(entry -> convert(entry, ChangeType.ADDED)) added.forEach(entry -> {
.collect(Collectors.toList()); addedIds.put(entry.getId(), entry);
changeLogEntries.addAll(removed.stream() });
.map(entry -> convert(entry, ChangeType.REMOVED))
.collect(Collectors.toList()));
return new RedactionChangeLog(changeLogEntries, currentRedactionLog.getDictionaryVersion(), currentRedactionLog.getRulesVersion(), currentRedactionLog Set<String> removedIds = new HashSet<>();
.getRuleSetId()); removed.forEach(entry -> {
removedIds.add(entry.getId());
});
List<RedactionLogEntry> newRedactionLogEntries = previousRedactionLog.getRedactionLogEntry();
List<RedactionLogEntry> toRemove = new ArrayList<>();
newRedactionLogEntries.forEach(entry -> {
if (removedIds.contains(entry.getId()) && addedIds.containsKey(entry.getId())) {
List<Change> changes = entry.getChanges();
changes.add(new Change(ChangeType.CHANGED, OffsetDateTime.now()));
var newEntry = addedIds.get(entry.getId());
newEntry.setChanges(changes);
addedIds.put(entry.getId(), newEntry);
toRemove.add(entry);
} else if (removedIds.contains(entry.getId())) {
entry.getChanges().add(new Change(ChangeType.REMOVED, OffsetDateTime.now()));
} else if (addedIds.containsKey(entry.getId())) {
List<Change> changes = entry.getChanges();
changes.add(new Change(ChangeType.ADDED, OffsetDateTime.now()));
var newEntry = addedIds.get(entry.getId());
newEntry.setChanges(changes);
addedIds.put(entry.getId(), newEntry);
toRemove.add(entry);
} }
});
newRedactionLogEntries.removeAll(toRemove);
private RedactionChangeLogEntry convert(RedactionLogEntry entry, ChangeType changeType) { addedIds.forEach((k, v) -> {
if(v.getChanges().isEmpty()) {
v.getChanges().add(new Change(ChangeType.ADDED, OffsetDateTime.now()));
}
newRedactionLogEntries.add(v);
});
return RedactionChangeLogEntry.builder() currentRedactionLog.setRedactionLogEntry(newRedactionLogEntries);
.id(entry.getId())
.type(entry.getType()) log.info("Change computation took: {}", System.currentTimeMillis() - start);
.value(entry.getValue()) return new RedactionLogChanges(currentRedactionLog, !addedIds.isEmpty() || !removedIds.isEmpty());
.reason(entry.getReason())
.matchedRule(entry.getMatchedRule())
.legalBasis(entry.getLegalBasis())
.redacted(entry.isRedacted())
.isHint(entry.isHint())
.isRecommendation(entry.isRecommendation())
.section(entry.getSection())
.color(entry.getColor())
.positions(entry.getPositions())
.sectionNumber(entry.getSectionNumber())
.manual(entry.isManual())
.status(entry.getStatus())
.manualRedactionType(entry.getManualRedactionType())
.isDictionaryEntry(entry.isDictionaryEntry())
.textBefore(entry.getTextBefore())
.textAfter(entry.getTextAfter())
.comments(entry.getComments())
.changeType(changeType)
.isDossierDictionaryEntry(entry.isDossierDictionaryEntry())
.build();
} }
} }

View File

@ -1,22 +1,5 @@
package com.iqser.red.service.redaction.v1.server.redaction.service; package com.iqser.red.service.redaction.v1.server.redaction.service;
import com.iqser.red.service.redaction.v1.model.*;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.parsing.model.RedTextPosition;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import lombok.RequiredArgsConstructor;
import org.apache.commons.collections4.CollectionUtils;
import org.springframework.stereotype.Service;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashSet; import java.util.HashSet;
import java.util.List; import java.util.List;
@ -24,6 +7,22 @@ import java.util.Map;
import java.util.Set; import java.util.Set;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.commons.collections4.CollectionUtils;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
import com.iqser.red.service.redaction.v1.server.parsing.model.RedTextPosition;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
import com.iqser.red.service.redaction.v1.server.redaction.model.PageEntities;
import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
import lombok.RequiredArgsConstructor;
@Service @Service
@RequiredArgsConstructor @RequiredArgsConstructor
public class RedactionLogCreatorService { public class RedactionLogCreatorService {
@ -31,35 +30,27 @@ public class RedactionLogCreatorService {
private final DictionaryService dictionaryService; private final DictionaryService dictionaryService;
public void createRedactionLog(Document classifiedDoc, int numberOfPages, ManualRedactions manualRedactions, public List<RedactionLogEntry> createRedactionLog(PageEntities pageEntities, int numberOfPages,
String ruleSetId) { String dossierTemplateId) {
Set<Integer> manualRedactionPages = getManualRedactionPages(manualRedactions); List<RedactionLogEntry> entries = new ArrayList<>();
for (int page = 1; page <= numberOfPages; page++) { for (int page = 1; page <= numberOfPages; page++) {
if (pageEntities.getEntitiesPerPage().get(page) != null) {
addSectionGrid(classifiedDoc, page); entries.addAll(addEntries(pageEntities.getEntitiesPerPage(), page, dossierTemplateId));
if (classifiedDoc.getEntities().get(page) != null) {
classifiedDoc.getRedactionLogEntities()
.addAll(addEntries(classifiedDoc.getEntities(), manualRedactions, page, ruleSetId));
} }
if (manualRedactionPages.contains(page)) { if (pageEntities.getImagesPerPage().get(page) != null) {
classifiedDoc.getRedactionLogEntities() entries.addAll(addImageEntries(pageEntities.getImagesPerPage(), page, dossierTemplateId));
.addAll(addManualAddEntries(manualRedactions.getEntriesToAdd(), manualRedactions.getComments(), page, ruleSetId)); }
} }
if (classifiedDoc.getImages().get(page) != null && !classifiedDoc.getImages().get(page).isEmpty()) { return entries;
classifiedDoc.getRedactionLogEntities()
.addAll(addImageEntries(classifiedDoc.getImages(), manualRedactions, page, ruleSetId));
}
}
} }
public List<RedactionLogEntry> addImageEntries(Map<Integer, Set<Image>> images, ManualRedactions manualRedactions, public List<RedactionLogEntry> addImageEntries(Map<Integer, Set<Image>> images, int pageNumber,
int pageNumber, String ruleSetId) { String dossierTemplateId) {
List<RedactionLogEntry> redactionLogEntities = new ArrayList<>(); List<RedactionLogEntry> redactionLogEntities = new ArrayList<>();
@ -69,14 +60,14 @@ public class RedactionLogCreatorService {
RedactionLogEntry redactionLogEntry = RedactionLogEntry.builder() RedactionLogEntry redactionLogEntry = RedactionLogEntry.builder()
.id(id) .id(id)
.color(getColorForImage(image, ruleSetId, false)) .color(getColor(image.getType(), dossierTemplateId, image.isRedaction()))
.isImage(true) .isImage(true)
.type(image.getType()) .type(image.getType())
.redacted(image.isRedaction()) .redacted(image.isRedaction())
.reason(image.getRedactionReason()) .reason(image.getRedactionReason())
.legalBasis(image.getLegalBasis()) .legalBasis(image.getLegalBasis())
.matchedRule(image.getMatchedRule()) .matchedRule(image.getMatchedRule())
.isHint(dictionaryService.isHint(image.getType(), ruleSetId)) .isHint(dictionaryService.isHint(image.getType(), dossierTemplateId))
.manual(false) .manual(false)
.isDictionaryEntry(false) .isDictionaryEntry(false)
.isRecommendation(false) .isRecommendation(false)
@ -85,62 +76,9 @@ public class RedactionLogCreatorService {
.getWidth(), (float) image.getPosition().getHeight(), pageNumber))) .getWidth(), (float) image.getPosition().getHeight(), pageNumber)))
.sectionNumber(image.getSectionNumber()) .sectionNumber(image.getSectionNumber())
.section(image.getSection()) .section(image.getSection())
.imageHasTransparency(image.isHasTransparency())
.build(); .build();
if (manualRedactions != null && !manualRedactions.getIdsToRemove().isEmpty()) {
for (IdRemoval manualRemoval : manualRedactions.getIdsToRemove()) {
if (manualRemoval.getId().equals(id)) {
String manualOverrideReason = null;
if (manualRemoval.getStatus().equals(Status.APPROVED)) {
image.setRedaction(false);
redactionLogEntry.setRedacted(false);
redactionLogEntry.setStatus(Status.APPROVED);
manualOverrideReason = image.getRedactionReason() + ", removed by manual override";
redactionLogEntry.setColor(getColorForImage(image, ruleSetId, false));
} else if (manualRemoval.getStatus().equals(Status.REQUESTED)) {
manualOverrideReason = image.getRedactionReason() + ", requested to remove";
redactionLogEntry.setStatus(Status.REQUESTED);
redactionLogEntry.setColor(getColorForImage(image, ruleSetId, true));
} else {
redactionLogEntry.setStatus(Status.DECLINED);
}
image.setRedactionReason(manualOverrideReason != null ? manualOverrideReason : image.getRedactionReason());
redactionLogEntry.setReason(manualOverrideReason);
redactionLogEntry.setManual(true);
redactionLogEntry.setManualRedactionType(ManualRedactionType.REMOVE);
}
}
}
if (manualRedactions != null && !manualRedactions.getForceRedacts().isEmpty()) {
for (ManualForceRedact manualForceRedact : manualRedactions.getForceRedacts()) {
if (manualForceRedact.getId().equals(id)) {
String manualOverrideReason = null;
if (manualForceRedact.getStatus().equals(Status.APPROVED)) {
image.setRedaction(true);
redactionLogEntry.setRedacted(true);
redactionLogEntry.setStatus(Status.APPROVED);
redactionLogEntry.setColor(getColorForImage(image, ruleSetId, false));
manualOverrideReason = image.getRedactionReason() + ", forced by manual override";
redactionLogEntry.setLegalBasis(manualForceRedact.getLegalBasis());
} else if (manualForceRedact.getStatus().equals(Status.REQUESTED)) {
manualOverrideReason = image.getRedactionReason() + ", requested to force redact";
redactionLogEntry.setStatus(Status.REQUESTED);
redactionLogEntry.setColor(getColorForImage(image, ruleSetId, true));
redactionLogEntry.setLegalBasis(manualForceRedact.getLegalBasis());
} else {
redactionLogEntry.setStatus(Status.DECLINED);
}
image.setRedactionReason(manualOverrideReason != null ? manualOverrideReason : image.getRedactionReason());
redactionLogEntry.setReason(manualOverrideReason);
redactionLogEntry.setManual(true);
redactionLogEntry.setManualRedactionType(ManualRedactionType.FORCE_REDACT);
}
}
}
redactionLogEntities.add(redactionLogEntry); redactionLogEntities.add(redactionLogEntry);
} }
@ -148,25 +86,7 @@ public class RedactionLogCreatorService {
} }
private Set<Integer> getManualRedactionPages(ManualRedactions manualRedactions) { public List<RedactionLogEntry> addEntries(Map<Integer, List<Entity>> entities, int page, String dossierTemplateId) {
Set<Integer> manualRedactionPages = new HashSet<>();
if (manualRedactions == null) {
return manualRedactionPages;
}
manualRedactions.getEntriesToAdd().forEach(entry -> {
entry.getPositions().forEach(pos -> {
manualRedactionPages.add(pos.getPage());
});
});
return manualRedactionPages;
}
public List<RedactionLogEntry> addEntries(Map<Integer, List<Entity>> entities, ManualRedactions manualRedactions,
int page, String ruleSetId) {
List<RedactionLogEntry> redactionLogEntities = new ArrayList<>(); List<RedactionLogEntry> redactionLogEntities = new ArrayList<>();
@ -176,11 +96,9 @@ public class RedactionLogCreatorService {
entityLoop: entityLoop:
for (Entity entity : entities.get(page)) { for (Entity entity : entities.get(page)) {
List<Comment> comments = null;
for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) { for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) {
RedactionLogEntry redactionLogEntry = createRedactionLogEntry(entity, ruleSetId); RedactionLogEntry redactionLogEntry = createRedactionLogEntry(entity, dossierTemplateId);
if (processedIds.contains(entityPositionSequence.getId())) { if (processedIds.contains(entityPositionSequence.getId())) {
// TODO refactor this outer loop jump as soon as we have the time. // TODO refactor this outer loop jump as soon as we have the time.
@ -189,60 +107,7 @@ public class RedactionLogCreatorService {
processedIds.add(entityPositionSequence.getId()); processedIds.add(entityPositionSequence.getId());
} }
if (manualRedactions != null && !manualRedactions.getIdsToRemove().isEmpty()) { redactionLogEntry.setId(entityPositionSequence.getId());
for (IdRemoval manualRemoval : manualRedactions.getIdsToRemove()) {
if (manualRemoval.getId().equals(entityPositionSequence.getId())) {
comments = manualRedactions.getComments().get(manualRemoval.getId());
String manualOverrideReason = null;
if (manualRemoval.getStatus().equals(Status.APPROVED)) {
entity.setRedaction(false);
redactionLogEntry.setRedacted(false);
redactionLogEntry.setStatus(Status.APPROVED);
manualOverrideReason = entity.getRedactionReason() + ", removed by manual override";
redactionLogEntry.setColor(getColor(entity, ruleSetId, false));
} else if (manualRemoval.getStatus().equals(Status.REQUESTED)) {
manualOverrideReason = entity.getRedactionReason() + ", requested to remove";
redactionLogEntry.setStatus(Status.REQUESTED);
redactionLogEntry.setColor(getColor(entity, ruleSetId, true));
} else {
redactionLogEntry.setStatus(Status.DECLINED);
}
entity.setRedactionReason(manualOverrideReason != null ? manualOverrideReason : entity.getRedactionReason());
redactionLogEntry.setReason(manualOverrideReason);
redactionLogEntry.setManual(true);
redactionLogEntry.setManualRedactionType(ManualRedactionType.REMOVE);
}
}
}
if (manualRedactions != null && !manualRedactions.getForceRedacts().isEmpty()) {
for (ManualForceRedact manualForceRedact : manualRedactions.getForceRedacts()) {
if (manualForceRedact.getId().equals(entityPositionSequence.getId())) {
String manualOverrideReason = null;
if (manualForceRedact.getStatus().equals(Status.APPROVED)) {
entity.setRedaction(true);
redactionLogEntry.setRedacted(true);
redactionLogEntry.setStatus(Status.APPROVED);
redactionLogEntry.setColor(getColor(entity, ruleSetId, false));
manualOverrideReason = entity.getRedactionReason() + ", forced by manual override";
redactionLogEntry.setLegalBasis(manualForceRedact.getLegalBasis());
} else if (manualForceRedact.getStatus().equals(Status.REQUESTED)) {
manualOverrideReason = entity.getRedactionReason() + ", requested to force redact";
redactionLogEntry.setStatus(Status.REQUESTED);
redactionLogEntry.setColor(getColor(entity, ruleSetId, true));
redactionLogEntry.setLegalBasis(manualForceRedact.getLegalBasis());
} else {
redactionLogEntry.setStatus(Status.DECLINED);
}
entity.setRedactionReason(manualOverrideReason != null ? manualOverrideReason : entity.getRedactionReason());
redactionLogEntry.setReason(manualOverrideReason);
redactionLogEntry.setManual(true);
redactionLogEntry.setManualRedactionType(ManualRedactionType.FORCE_REDACT);
}
}
}
if (CollectionUtils.isNotEmpty(entityPositionSequence.getSequences())) { if (CollectionUtils.isNotEmpty(entityPositionSequence.getSequences())) {
List<Rectangle> rectanglesPerLine = getRectanglesPerLine(entityPositionSequence.getSequences() List<Rectangle> rectanglesPerLine = getRectanglesPerLine(entityPositionSequence.getSequences()
@ -250,17 +115,10 @@ public class RedactionLogCreatorService {
.flatMap(seq -> seq.getTextPositions().stream()) .flatMap(seq -> seq.getTextPositions().stream())
.collect(Collectors.toList()), page); .collect(Collectors.toList()), page);
if (manualRedactions != null) {
comments = manualRedactions.getComments().get(entityPositionSequence.getId());
}
redactionLogEntry.setComments(comments);
redactionLogEntry.getPositions().addAll(rectanglesPerLine); redactionLogEntry.getPositions().addAll(rectanglesPerLine);
} }
redactionLogEntry.setId(entityPositionSequence.getId());
// FIXME ids should never be null. Figure out why this happens. // FIXME ids should never be null. Figure out why this happens.
if (redactionLogEntry.getId() != null) { if (redactionLogEntry.getId() != null) {
redactionLogEntities.add(redactionLogEntry); redactionLogEntities.add(redactionLogEntry);
@ -276,20 +134,22 @@ public class RedactionLogCreatorService {
List<Rectangle> rectangles = new ArrayList<>(); List<Rectangle> rectangles = new ArrayList<>();
if (textPositions.size() == 1) { if (textPositions.size() == 1) {
rectangles.add( TextPositionSequence.fromData(textPositions, page).getRectangle()); rectangles.add(TextPositionSequence.fromData(textPositions, page).getRectangle());
} else { } else {
float y = textPositions.get(0).getYDirAdj(); float y = textPositions.get(0).getYDirAdj();
int startIndex = 0; int startIndex = 0;
for (int i = 1; i < textPositions.size(); i++) { for (int i = 1; i < textPositions.size(); i++) {
float yDirAdj = textPositions.get(i).getYDirAdj(); float yDirAdj = textPositions.get(i).getYDirAdj();
if (yDirAdj != y) { if (yDirAdj != y) {
rectangles.add( TextPositionSequence.fromData(textPositions.subList(startIndex, i), page).getRectangle()); rectangles.add(TextPositionSequence.fromData(textPositions.subList(startIndex, i), page)
.getRectangle());
y = yDirAdj; y = yDirAdj;
startIndex = i; startIndex = i;
} }
} }
if (startIndex != textPositions.size()) { if (startIndex != textPositions.size()) {
rectangles.add( TextPositionSequence.fromData(textPositions.subList(startIndex, textPositions.size()), page).getRectangle()); rectangles.add(TextPositionSequence.fromData(textPositions.subList(startIndex, textPositions.size()), page)
.getRectangle());
} }
} }
@ -297,80 +157,20 @@ public class RedactionLogCreatorService {
} }
public List<RedactionLogEntry> addManualAddEntries(Set<ManualRedactionEntry> manualAdds, private RedactionLogEntry createRedactionLogEntry(Entity entity, String dossierTemplateId) {
Map<String, List<Comment>> comments, int page,
String ruleSetId) {
List<RedactionLogEntry> redactionLogEntities = new ArrayList<>(); Set<String> referenceIds = new HashSet<>();
entity.getReferences().forEach(ref -> ref.getPositionSequences().forEach(pos -> referenceIds.add(pos.getId())));
if (manualAdds == null) {
return redactionLogEntities;
}
for (ManualRedactionEntry manualRedactionEntry : manualAdds) {
String id = manualRedactionEntry.getId();
RedactionLogEntry redactionLogEntry = createRedactionLogEntry(manualRedactionEntry, id, ruleSetId);
List<Rectangle> rectanglesOnPage = new ArrayList<>();
for (Rectangle rectangle : manualRedactionEntry.getPositions()) {
if (page == rectangle.getPage()) {
rectanglesOnPage.add(rectangle);
redactionLogEntry.getPositions().add(rectangle);
}
}
redactionLogEntry.setComments(comments.get(id));
if (!rectanglesOnPage.isEmpty() && !approvedAndShouldBeInDictionary(manualRedactionEntry)) {
redactionLogEntities.add(redactionLogEntry);
}
}
return redactionLogEntities;
}
private boolean approvedAndShouldBeInDictionary(ManualRedactionEntry manualRedactionEntry) {
return manualRedactionEntry.getStatus().equals(Status.APPROVED) && manualRedactionEntry.isAddToDictionary();
}
private RedactionLogEntry createRedactionLogEntry(ManualRedactionEntry manualRedactionEntry, String id,
String ruleSetId) {
return RedactionLogEntry.builder() return RedactionLogEntry.builder()
.id(id) .color(getColor(entity.getType(), dossierTemplateId, entity.isRedaction()))
.color(getColorForManualAdd(manualRedactionEntry.getType(), ruleSetId, manualRedactionEntry.getStatus()))
.reason(manualRedactionEntry.getReason())
.legalBasis(manualRedactionEntry.getLegalBasis())
.value(manualRedactionEntry.getValue())
.type(manualRedactionEntry.getType())
.redacted(true)
.isHint(false)
.section(manualRedactionEntry.getSection())
.sectionNumber(manualRedactionEntry.getSectionNumber())
.manual(true)
.status(manualRedactionEntry.getStatus())
.manualRedactionType(ManualRedactionType.ADD)
.isDictionaryEntry(false)
.isDossierDictionaryEntry(manualRedactionEntry.isAddToDossierDictionary())
.build();
}
private RedactionLogEntry createRedactionLogEntry(Entity entity, String ruleSetId) {
return RedactionLogEntry.builder()
.color(getColor(entity, ruleSetId, false))
.reason(entity.getRedactionReason()) .reason(entity.getRedactionReason())
.legalBasis(entity.getLegalBasis()) .legalBasis(entity.getLegalBasis())
.value(entity.getWord()) .value(entity.getWord())
.type(entity.getType()) .type(entity.getType())
.redacted(entity.isRedaction()) .redacted(entity.isRedaction())
.isHint(isHint(entity, ruleSetId)) .isHint(isHint(entity.getType(), dossierTemplateId))
.isRecommendation(isRecommendation(entity, ruleSetId)) .isRecommendation(isRecommendation(entity.getType(), dossierTemplateId))
.section(entity.getHeadline()) .section(entity.getHeadline())
.sectionNumber(entity.getSectionNumber()) .sectionNumber(entity.getSectionNumber())
.matchedRule(entity.getMatchedRule()) .matchedRule(entity.getMatchedRule())
@ -380,104 +180,30 @@ public class RedactionLogCreatorService {
.startOffset(entity.getStart()) .startOffset(entity.getStart())
.endOffset(entity.getEnd()) .endOffset(entity.getEnd())
.isDossierDictionaryEntry(entity.isDossierDictionaryEntry()) .isDossierDictionaryEntry(entity.isDossierDictionaryEntry())
.engines(entity.getEngines())
.reference(referenceIds)
.build(); .build();
} }
private float[] getColor(Entity entity, String ruleSetId, boolean requestedToRemove) { private float[] getColor(String type, String dossierTemplateId, boolean isRedaction) {
if (requestedToRemove) { if (!isRedaction && !isHint(type, dossierTemplateId)) {
return dictionaryService.getRequestRemoveColor(ruleSetId); return dictionaryService.getNotRedactedColor(dossierTemplateId);
} }
if (!entity.isRedaction() && !isHint(entity, ruleSetId)) { return dictionaryService.getColor(type, dossierTemplateId);
return dictionaryService.getNotRedactedColor(ruleSetId);
}
return dictionaryService.getColor(entity.getType(), ruleSetId);
} }
private float[] getColorForManualAdd(String type, String ruleSetId, Status status) { private boolean isHint(String type, String dossierTemplateId) {
if (status.equals(Status.REQUESTED)) { return dictionaryService.isHint(type, dossierTemplateId);
return dictionaryService.getRequestAddColor(ruleSetId);
} else if (status.equals(Status.DECLINED)) {
return dictionaryService.getNotRedactedColor(ruleSetId);
}
return getColor(type, ruleSetId);
} }
private float[] getColor(String type, String ruleSetId) { private boolean isRecommendation(String type, String dossierTemplateId) {
return dictionaryService.getColor(type, ruleSetId); return dictionaryService.isRecommendation(type, dossierTemplateId);
}
private float[] getColorForImage(Image image, String ruleSetId, boolean requestedToRemove) {
if (requestedToRemove) {
return dictionaryService.getRequestRemoveColor(ruleSetId);
}
if (!image.isRedaction() && !dictionaryService.isHint(image.getType(), ruleSetId)) {
return dictionaryService.getNotRedactedColor(ruleSetId);
}
return dictionaryService.getColor(image.getType(), ruleSetId);
}
private boolean isHint(Entity entity, String ruleSetId) {
return dictionaryService.isHint(entity.getType(), ruleSetId);
}
private boolean isRecommendation(Entity entity, String ruleSetId) {
return dictionaryService.isRecommendation(entity.getType(), ruleSetId);
}
private void addSectionGrid(Document classifiedDoc, int page) {
for (Paragraph paragraph : classifiedDoc.getParagraphs()) {
for (int i = 0; i <= paragraph.getPageBlocks().size() - 1; i++) {
AbstractTextContainer textBlock = paragraph.getPageBlocks().get(i);
if (textBlock.getPage() != page) {
continue;
}
if (textBlock instanceof TextBlock) {
classifiedDoc.getSectionGrid()
.getRectanglesPerPage()
.computeIfAbsent(page, (x) -> new ArrayList<>())
.add(new SectionRectangle(new Point(textBlock.getMinX(), textBlock.getMinY()), textBlock.getWidth(), textBlock
.getHeight(), i + 1, paragraph.getPageBlocks().size()));
} else if (textBlock instanceof Table) {
List<CellRectangle> cellRectangles = new ArrayList<>();
for (List<Cell> row : ((Table) textBlock).getRows()) {
for (Cell cell : row) {
if (cell != null) {
cellRectangles.add(new CellRectangle(new Point((float) cell.getX(), (float) cell.getY()), (float) cell
.getWidth(), (float) cell.getHeight()));
}
}
}
classifiedDoc.getSectionGrid()
.getRectanglesPerPage()
.computeIfAbsent(page, (x) -> new ArrayList<>())
.add(new SectionRectangle(new Point(textBlock.getMinX(), textBlock.getMinY()), textBlock.getWidth(), textBlock
.getHeight(), i + 1, paragraph.getPageBlocks().size(), cellRectangles));
}
}
}
} }
} }

View File

@ -0,0 +1,328 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import java.time.OffsetDateTime;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.model.Comment;
import com.iqser.red.service.redaction.v1.model.IdRemoval;
import com.iqser.red.service.redaction.v1.model.ManualForceRedact;
import com.iqser.red.service.redaction.v1.model.ManualImageRecategorization;
import com.iqser.red.service.redaction.v1.model.ManualLegalBasisChange;
import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry;
import com.iqser.red.service.redaction.v1.model.ManualRedactionType;
import com.iqser.red.service.redaction.v1.model.ManualRedactions;
import com.iqser.red.service.redaction.v1.model.RedactionLog;
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
import com.iqser.red.service.redaction.v1.model.Status;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@RequiredArgsConstructor
public class RedactionLogMergeService {
private final DictionaryService dictionaryService;
/**
 * Merges manual redactions into the given redaction log and flags entries located on excluded pages.
 * <p>
 * When {@code manualRedactions} is non-null, manually added entries are appended to the log, every
 * entry has the manual changes that target its id applied in the order produced by
 * {@link #createManualRedactionWrappers(ManualRedactions)}, and the comments stored under its id
 * are attached. Independently of that, every entry with at least one position on a page contained
 * in {@code excludedPages} is marked as excluded.
 *
 * @param redactionLog      the log to update; it is modified in place
 * @param dossierTemplateId id passed on to the color lookups
 * @param manualRedactions  manual changes to merge, may be {@code null}
 * @param excludedPages     page numbers to exclude, may be {@code null} or empty
 * @return the same {@code redactionLog} instance that was passed in
 */
public RedactionLog mergeRedactionLogData(RedactionLog redactionLog, String dossierTemplateId,
        ManualRedactions manualRedactions, Set<Integer> excludedPages) {

    log.info("Merging Redaction log with manual redactions {}", manualRedactions);
    if (manualRedactions != null) {

        var manualRedactionLogEntries = addManualAddEntries(manualRedactions.getEntriesToAdd(), manualRedactions.getComments(), dossierTemplateId);
        redactionLog.getRedactionLogEntry().addAll(manualRedactionLogEntries);

        // Index the (already sorted) wrappers by id once instead of re-filtering the whole list
        // for every log entry. groupingBy keeps the encounter order inside each group; wrappers
        // without an id can never match an entry and are dropped up front.
        Map<String, List<ManualRedactionWrapper>> wrappersById = createManualRedactionWrappers(manualRedactions).stream()
                .filter(wrapper -> wrapper.getId() != null)
                .collect(Collectors.groupingBy(ManualRedactionWrapper::getId));

        for (RedactionLogEntry entry : redactionLog.getRedactionLogEntry()) {
            processRedactionLogEntry(wrappersById.getOrDefault(entry.getId(), Collections.emptyList()), dossierTemplateId, entry);
            entry.setComments(manualRedactions.getComments().get(entry.getId()));
        }
    }

    // Page exclusion does not depend on manual redactions, so it is applied even when none exist.
    if (excludedPages != null && !excludedPages.isEmpty()) {
        for (RedactionLogEntry entry : redactionLog.getRedactionLogEntry()) {
            for (var position : entry.getPositions()) {
                if (excludedPages.contains(position.getPage())) {
                    entry.setExcluded(true);
                    break;
                }
            }
        }
    }

    return redactionLog;
}
/**
 * Wraps every manual change that has no soft-deleted time into a {@link ManualRedactionWrapper}
 * and returns the wrappers sorted by their natural order (the request date).
 * <p>
 * Image recategorizations, id removals, force redactions and legal basis changes are collected;
 * manually added entries are not part of this list.
 *
 * @param manualRedactions source of the manual changes
 * @return a new, sorted, mutable list of wrappers
 */
private List<ManualRedactionWrapper> createManualRedactionWrappers(ManualRedactions manualRedactions) {

    List<ManualRedactionWrapper> wrappers = new ArrayList<>();

    for (var recategorization : manualRedactions.getImageRecategorizations()) {
        if (recategorization.getSoftDeletedTime() != null) {
            continue;
        }
        wrappers.add(new ManualRedactionWrapper(recategorization.getId(), recategorization.getRequestDate(), recategorization));
    }

    for (var removal : manualRedactions.getIdsToRemove()) {
        if (removal.getSoftDeletedTime() != null) {
            continue;
        }
        wrappers.add(new ManualRedactionWrapper(removal.getId(), removal.getRequestDate(), removal));
    }

    for (var forceRedact : manualRedactions.getForceRedacts()) {
        if (forceRedact.getSoftDeletedTime() != null) {
            continue;
        }
        wrappers.add(new ManualRedactionWrapper(forceRedact.getId(), forceRedact.getRequestDate(), forceRedact));
    }

    for (var legalBasisChange : manualRedactions.getManualLegalBasisChanges()) {
        if (legalBasisChange.getSoftDeletedTime() != null) {
            continue;
        }
        wrappers.add(new ManualRedactionWrapper(legalBasisChange.getId(), legalBasisChange.getRequestDate(), legalBasisChange));
    }

    Collections.sort(wrappers);
    return wrappers;
}
/**
 * Applies the given manual changes, in list order, to a single redaction log entry.
 * <p>
 * Each wrapped item is dispatched by its runtime type to a dedicated helper. Items of any other
 * type are ignored.
 *
 * @param manualRedactionWrappers manual changes that target {@code redactionLogEntry}
 * @param dossierTemplateId       id passed on to the color lookups
 * @param redactionLogEntry       the entry to update in place
 */
private void processRedactionLogEntry(List<ManualRedactionWrapper> manualRedactionWrappers,
        String dossierTemplateId, RedactionLogEntry redactionLogEntry) {

    for (ManualRedactionWrapper mrw : manualRedactionWrappers) {
        Object item = mrw.getItem();

        if (item instanceof ManualImageRecategorization) {
            applyImageRecategorization((ManualImageRecategorization) item, dossierTemplateId, redactionLogEntry);
        }
        if (item instanceof IdRemoval) {
            applyIdRemoval((IdRemoval) item, dossierTemplateId, redactionLogEntry);
        }
        if (item instanceof ManualForceRedact) {
            applyForceRedact((ManualForceRedact) item, dossierTemplateId, redactionLogEntry);
        }
        if (item instanceof ManualLegalBasisChange) {
            applyLegalBasisChange((ManualLegalBasisChange) item, dossierTemplateId, redactionLogEntry);
        }
    }
}


/** Applies an image recategorization (approved, requested or declined) to the entry. */
private void applyImageRecategorization(ManualImageRecategorization imageRecategorization, String dossierTemplateId,
        RedactionLogEntry redactionLogEntry) {

    // A declined request must not wipe the reason the entry already carries.
    String manualOverrideReason = redactionLogEntry.getReason();

    if (imageRecategorization.getStatus().equals(Status.APPROVED)) {
        redactionLogEntry.setStatus(Status.APPROVED);
        // NOTE(review): the type changes here but the color is not recomputed - confirm this is intended.
        redactionLogEntry.setType(imageRecategorization.getType());
        manualOverrideReason = mergeReasonIfNecessary(redactionLogEntry.getReason(), ", recategorized by manual override");
    } else if (imageRecategorization.getStatus().equals(Status.REQUESTED)) {
        manualOverrideReason = mergeReasonIfNecessary(redactionLogEntry.getReason(), ", requested to recategorize");
        redactionLogEntry.setStatus(Status.REQUESTED);
        redactionLogEntry.setColor(getColor(redactionLogEntry.getType(), dossierTemplateId, false, redactionLogEntry.isRedacted(), false));
        redactionLogEntry.setRecategorizationType(imageRecategorization.getType());
    } else {
        redactionLogEntry.setStatus(Status.DECLINED);
    }

    redactionLogEntry.setManualRedactionUserId(imageRecategorization.getUser());
    redactionLogEntry.setReason(manualOverrideReason);
    redactionLogEntry.setManual(true);
    redactionLogEntry.setManualRedactionType(ManualRedactionType.RECATEGORIZE);
}


/** Applies a removal (approved, requested or declined) to the entry. */
private void applyIdRemoval(IdRemoval manualRemoval, String dossierTemplateId, RedactionLogEntry redactionLogEntry) {

    // A declined request must not wipe the reason the entry already carries.
    String manualOverrideReason = redactionLogEntry.getReason();

    if (manualRemoval.getStatus().equals(Status.APPROVED)) {
        redactionLogEntry.setRedacted(false);
        redactionLogEntry.setStatus(Status.APPROVED);
        manualOverrideReason = mergeReasonIfNecessary(redactionLogEntry.getReason(), ", removed by manual override");
        // skipped = true selects the "not redacted" color
        redactionLogEntry.setColor(getColor(redactionLogEntry.getType(), dossierTemplateId, false, redactionLogEntry.isRedacted(), true));
    } else if (manualRemoval.getStatus().equals(Status.REQUESTED)) {
        manualOverrideReason = mergeReasonIfNecessary(redactionLogEntry.getReason(), ", requested to remove");
        redactionLogEntry.setStatus(Status.REQUESTED);
        redactionLogEntry.setColor(getColor(redactionLogEntry.getType(), dossierTemplateId, true, redactionLogEntry.isRedacted(), false));
    } else {
        redactionLogEntry.setStatus(Status.DECLINED);
    }

    redactionLogEntry.setReason(manualOverrideReason);
    redactionLogEntry.setManual(true);
    redactionLogEntry.setManualRedactionUserId(manualRemoval.getUser());
    redactionLogEntry.setManualRedactionType(ManualRedactionType.REMOVE);
    // Both dictionary flags are driven by the same removal flag, as in the original code.
    redactionLogEntry.setDictionaryEntry(manualRemoval.isRemoveFromDictionary());
    redactionLogEntry.setDossierDictionaryEntry(manualRemoval.isRemoveFromDictionary());
}


/** Applies a force redaction (approved, requested or declined) to the entry. */
private void applyForceRedact(ManualForceRedact manualForceRedact, String dossierTemplateId, RedactionLogEntry redactionLogEntry) {

    // A declined request must not wipe the reason the entry already carries.
    String manualOverrideReason = redactionLogEntry.getReason();

    if (manualForceRedact.getStatus().equals(Status.APPROVED)) {
        redactionLogEntry.setRedacted(true);
        redactionLogEntry.setStatus(Status.APPROVED);
        redactionLogEntry.setColor(getColor(redactionLogEntry.getType(), dossierTemplateId, false, redactionLogEntry.isRedacted(), false));
        manualOverrideReason = mergeReasonIfNecessary(redactionLogEntry.getReason(), ", forced by manual override");
        redactionLogEntry.setLegalBasis(manualForceRedact.getLegalBasis());
    } else if (manualForceRedact.getStatus().equals(Status.REQUESTED)) {
        manualOverrideReason = mergeReasonIfNecessary(redactionLogEntry.getReason(), ", requested to force redact");
        redactionLogEntry.setStatus(Status.REQUESTED);
        redactionLogEntry.setColor(getColor(redactionLogEntry.getType(), dossierTemplateId, true, redactionLogEntry.isRedacted(), false));
        redactionLogEntry.setLegalBasis(manualForceRedact.getLegalBasis());
    } else {
        redactionLogEntry.setStatus(Status.DECLINED);
    }

    redactionLogEntry.setManualRedactionUserId(manualForceRedact.getUser());
    redactionLogEntry.setReason(manualOverrideReason);
    redactionLogEntry.setManual(true);
    redactionLogEntry.setManualRedactionType(ManualRedactionType.FORCE_REDACT);
}


/** Applies a legal basis change (approved, requested or declined) to the entry. */
private void applyLegalBasisChange(ManualLegalBasisChange manualLegalBasisChange, String dossierTemplateId,
        RedactionLogEntry redactionLogEntry) {

    // A declined request must not wipe the reason the entry already carries.
    String manualOverrideReason = redactionLogEntry.getReason();

    if (manualLegalBasisChange.getStatus().equals(Status.APPROVED)) {
        redactionLogEntry.setStatus(Status.APPROVED);
        manualOverrideReason = mergeReasonIfNecessary(redactionLogEntry.getReason(), ", legal basis was manually changed");
        redactionLogEntry.setLegalBasis(manualLegalBasisChange.getLegalBasis());
    } else if (manualLegalBasisChange.getStatus().equals(Status.REQUESTED)) {
        manualOverrideReason = mergeReasonIfNecessary(redactionLogEntry.getReason(), ", legal basis change requested");
        redactionLogEntry.setStatus(Status.REQUESTED);
        redactionLogEntry.setColor(getColor(redactionLogEntry.getType(), dossierTemplateId, true, redactionLogEntry.isRedacted(), false));
        // The requested value is stored separately; the effective legal basis is left untouched.
        redactionLogEntry.setLegalBasisChangeValue(manualLegalBasisChange.getLegalBasis());
    } else {
        redactionLogEntry.setStatus(Status.DECLINED);
    }

    redactionLogEntry.setManualRedactionUserId(manualLegalBasisChange.getUser());
    redactionLogEntry.setReason(manualOverrideReason);
    redactionLogEntry.setManual(true);
    redactionLogEntry.setManualRedactionType(ManualRedactionType.LEGAL_BASIS_CHANGE);
}
/**
 * Appends {@code addition} to {@code currentReason} unless the reason already contains it.
 *
 * @param currentReason the reason so far, may be {@code null}
 * @param addition      the text to append
 * @return the empty string when {@code currentReason} is {@code null}; otherwise the reason,
 *         extended by {@code addition} if it was not contained yet
 */
private String mergeReasonIfNecessary(String currentReason, String addition) {

    if (currentReason == null) {
        return "";
    }
    return currentReason.contains(addition) ? currentReason : currentReason + addition;
}
/**
 * Converts manually added redactions into redaction log entries.
 * <p>
 * Entries that are approved and flagged for a dictionary or dossier dictionary are skipped.
 * Every created entry receives the positions of the manual entry and the comments stored
 * under its id.
 *
 * @param manualAdds        the manually added redactions
 * @param comments          comments keyed by entry id
 * @param dossierTemplateId id passed on to the color lookups
 * @return a new list with one log entry per manual entry that was not skipped
 */
public List<RedactionLogEntry> addManualAddEntries(Set<ManualRedactionEntry> manualAdds,
        Map<String, List<Comment>> comments, String dossierTemplateId) {

    List<RedactionLogEntry> createdEntries = new ArrayList<>();

    for (ManualRedactionEntry manualAdd : manualAdds) {
        if (approvedAndShouldBeInDictionary(manualAdd)) {
            continue;
        }
        RedactionLogEntry logEntry = createRedactionLogEntry(manualAdd, manualAdd.getId(), dossierTemplateId);
        logEntry.setPositions(manualAdd.getPositions());
        logEntry.setComments(comments.get(manualAdd.getId()));
        createdEntries.add(logEntry);
    }

    return createdEntries;
}
/**
 * Tells whether a manual entry is approved and flagged to be added to the dictionary or the
 * dossier dictionary.
 */
private boolean approvedAndShouldBeInDictionary(ManualRedactionEntry manualRedactionEntry) {

    if (!manualRedactionEntry.getStatus().equals(Status.APPROVED)) {
        return false;
    }
    return manualRedactionEntry.isAddToDictionary() || manualRedactionEntry.isAddToDossierDictionary();
}
/**
 * Builds the redaction log entry for a manually added redaction.
 * <p>
 * The entry is always marked as manual, redacted and of manual type {@code ADD}; it is never a
 * hint and carries no section information (section {@code null}, section number {@code -1}).
 *
 * @param manualRedactionEntry the manual entry to convert
 * @param id                   the id assigned to the log entry
 * @param dossierTemplateId    id passed on to the color lookup
 * @return the newly built entry
 */
private RedactionLogEntry createRedactionLogEntry(ManualRedactionEntry manualRedactionEntry, String id,
        String dossierTemplateId) {

    float[] entryColor = getColorForManualAdd(manualRedactionEntry.getType(), dossierTemplateId, manualRedactionEntry.getStatus());

    var entryBuilder = RedactionLogEntry.builder()
            .id(id)
            .color(entryColor)
            .reason(manualRedactionEntry.getReason())
            .isDictionaryEntry(manualRedactionEntry.isAddToDictionary())
            .isDossierDictionaryEntry(manualRedactionEntry.isAddToDossierDictionary())
            .legalBasis(manualRedactionEntry.getLegalBasis())
            .value(manualRedactionEntry.getValue())
            .type(manualRedactionEntry.getType());

    // Fixed values shared by every manually added entry.
    entryBuilder = entryBuilder.redacted(true)
            .isHint(false)
            .section(null)
            .sectionNumber(-1)
            .manual(true);

    return entryBuilder.status(manualRedactionEntry.getStatus())
            .manualRedactionType(ManualRedactionType.ADD)
            .manualRedactionUserId(manualRedactionEntry.getUser())
            .build();
}
/**
 * Selects the color for an entry that was touched by a manual change.
 *
 * @param type              entry type used for the hint check and the type color lookup
 * @param dossierTemplateId id passed on to the dictionary service
 * @param requested         when {@code true} the "request remove" color is returned, whatever the other flags say
 * @param isRedaction       whether the entry is currently redacted
 * @param skipped           when {@code true} the "not redacted" color is used
 * @return the "not redacted" color for skipped entries and for entries that are neither redacted
 *         nor a hint; otherwise the color of the type
 */
private float[] getColor(String type, String dossierTemplateId, boolean requested, boolean isRedaction,
        boolean skipped) {

    if (requested) {
        return dictionaryService.getRequestRemoveColor(dossierTemplateId);
    }

    // The hint lookup only runs when the entry is neither skipped nor redacted.
    boolean shownAsNotRedacted = skipped || !(isRedaction || dictionaryService.isHint(type, dossierTemplateId));

    return shownAsNotRedacted
            ? dictionaryService.getNotRedactedColor(dossierTemplateId)
            : dictionaryService.getColor(type, dossierTemplateId);
}
/**
 * Selects the color for a manually added entry from its status: the "request add" color for
 * {@code REQUESTED}, the "not redacted" color for {@code DECLINED}, and the color of the type
 * for every other status.
 */
private float[] getColorForManualAdd(String type, String dossierTemplateId, Status status) {

    if (status.equals(Status.REQUESTED)) {
        return dictionaryService.getRequestAddColor(dossierTemplateId);
    }
    if (status.equals(Status.DECLINED)) {
        return dictionaryService.getNotRedactedColor(dossierTemplateId);
    }
    return getColor(type, dossierTemplateId);
}
/** Returns the color the dictionary service reports for the given type and dossier template. */
private float[] getColor(String type, String dossierTemplateId) {

    float[] typeColor = dictionaryService.getColor(type, dossierTemplateId);
    return typeColor;
}
/**
 * Pairs a manual change of any type with the id it targets and its request date, so that
 * changes of different types can be sorted together. The natural order compares the date only.
 */
@Data
@AllArgsConstructor
private static class ManualRedactionWrapper implements Comparable<ManualRedactionWrapper> {

    // id of the redaction log entry the change refers to
    private String id;
    // request date of the change; used as the sort key
    private OffsetDateTime date;
    // the wrapped manual change, inspected via instanceof by the consumer
    private Object item;


    @Override
    public int compareTo(ManualRedactionWrapper other) {

        OffsetDateTime otherDate = other.date;
        return date.compareTo(otherDate);
    }

}
}

View File

@ -0,0 +1,76 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import java.util.ArrayList;
import java.util.List;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.model.CellRectangle;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.SectionRectangle;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import lombok.RequiredArgsConstructor;
/**
 * Fills the section grid of a classified document with one rectangle per text block or table,
 * grouped by page.
 */
@Service
@RequiredArgsConstructor
public class SectionGridCreatorService {

    /**
     * Adds the section rectangles of pages {@code 1..numberOfPages} to the document's section grid.
     *
     * @param classifiedDoc the document whose section grid is filled in place
     * @param numberOfPages the last page number (inclusive) to process
     */
    public void createSectionGrid(Document classifiedDoc, int numberOfPages) {

        int page = 1;
        while (page <= numberOfPages) {
            addSectionGrid(classifiedDoc, page);
            page++;
        }
    }


    /** Adds a rectangle for every text block or table of the given page, in paragraph order. */
    private void addSectionGrid(Document classifiedDoc, int page) {

        for (Paragraph paragraph : classifiedDoc.getParagraphs()) {
            List<? extends AbstractTextContainer> blocks = paragraph.getPageBlocks();
            int blockCount = blocks.size();

            for (int index = 0; index < blockCount; index++) {
                AbstractTextContainer container = blocks.get(index);
                if (container.getPage() != page) {
                    continue;
                }

                // part number is 1-based; containers of any other kind are ignored
                if (container instanceof TextBlock) {
                    addRectangle(classifiedDoc, page, new SectionRectangle(new Point(container.getMinX(), container.getMinY()),
                            container.getWidth(), container.getHeight(), index + 1, blockCount));
                } else if (container instanceof Table) {
                    List<CellRectangle> cellRectangles = toCellRectangles((Table) container);
                    addRectangle(classifiedDoc, page, new SectionRectangle(new Point(container.getMinX(), container.getMinY()),
                            container.getWidth(), container.getHeight(), index + 1, blockCount, cellRectangles));
                }
            }
        }
    }


    /** Collects one rectangle per non-null cell of the table, row by row. */
    private List<CellRectangle> toCellRectangles(Table table) {

        List<CellRectangle> cellRectangles = new ArrayList<>();
        for (List<Cell> row : table.getRows()) {
            for (Cell cell : row) {
                if (cell == null) {
                    continue;
                }
                Point origin = new Point((float) cell.getX(), (float) cell.getY());
                cellRectangles.add(new CellRectangle(origin, (float) cell.getWidth(), (float) cell.getHeight()));
            }
        }
        return cellRectangles;
    }


    /** Appends the rectangle to the page's list in the section grid, creating the list on first use. */
    private void addRectangle(Document classifiedDoc, int page, SectionRectangle rectangle) {

        classifiedDoc.getSectionGrid()
                .getRectanglesPerPage()
                .computeIfAbsent(page, key -> new ArrayList<>())
                .add(rectangle);
    }

}

View File

@ -0,0 +1,220 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import org.apache.commons.collections4.CollectionUtils;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.SectionArea;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Footer;
import com.iqser.red.service.redaction.v1.server.classification.model.Header;
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
import com.iqser.red.service.redaction.v1.server.classification.model.SectionText;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.classification.model.UnclassifiedText;
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
import com.iqser.red.service.redaction.v1.server.redaction.model.ImageType;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@RequiredArgsConstructor
public class SectionTextBuilderService {
/**
 * Builds the section texts of a classified document.
 * <p>
 * For every paragraph the tables come first, followed by the paragraph text itself. Tables with
 * exactly two columns become a single section; all other tables produce one section per row.
 * Headers, footers and unclassified texts follow after all paragraphs. A running section number,
 * starting at 1, is advanced after every table and every text. Row-wise table processing
 * advances it per row as well, so numbers are not necessarily consecutive.
 *
 * @param classifiedDoc the document to convert
 * @return the section texts in the order described above
 */
public List<SectionText> buildSectionText(Document classifiedDoc) {

    List<SectionText> result = new ArrayList<>();
    AtomicInteger sectionCounter = new AtomicInteger(1);

    for (Paragraph paragraph : classifiedDoc.getParagraphs()) {

        for (Table table : paragraph.getTables()) {
            boolean twoColumnTable = table.getColCount() == 2;
            if (!twoColumnTable) {
                result.addAll(processTablePerRow(table, sectionCounter));
            } else {
                result.add(processTableAsOneText(table, sectionCounter));
            }
            sectionCounter.incrementAndGet();
        }

        SectionText paragraphSection = processText(paragraph.getSearchableText(), paragraph.getTextBlocks(), paragraph.getHeadline(),
                sectionCounter, paragraph.getImages());
        result.add(paragraphSection);
        sectionCounter.incrementAndGet();
    }

    for (Header header : classifiedDoc.getHeaders()) {
        SectionText headerSection = processText(header.getSearchableText(), header.getTextBlocks(), "Header", sectionCounter, new ArrayList<>());
        result.add(headerSection);
        sectionCounter.incrementAndGet();
    }

    for (Footer footer : classifiedDoc.getFooters()) {
        SectionText footerSection = processText(footer.getSearchableText(), footer.getTextBlocks(), "Footer", sectionCounter, new ArrayList<>());
        result.add(footerSection);
        sectionCounter.incrementAndGet();
    }

    for (UnclassifiedText unclassifiedText : classifiedDoc.getUnclassifiedTexts()) {
        SectionText unclassifiedSection = processText(unclassifiedText.getSearchableText(), unclassifiedText.getTextBlocks(), "",
                sectionCounter, new ArrayList<>());
        result.add(unclassifiedSection);
        sectionCounter.incrementAndGet();
    }

    return result;
}
/**
 * Converts a table into one section text per row.
 * <p>
 * Cells that are {@code null} or have no text blocks are skipped. For every remaining cell a
 * section area is recorded, and for non-header cells the cell value is registered in the tabular
 * data under the normalized name (line breaks, spaces and hyphens removed) of each of its header
 * cells. The section number is advanced after every row.
 *
 * @param table         the table to convert
 * @param sectionNumber running section counter, incremented once per row
 * @return one section text per table row
 */
private List<SectionText> processTablePerRow(Table table, AtomicInteger sectionNumber) {

    List<SectionText> sectionTexts = new ArrayList<>();
    for (List<Cell> row : table.getRows()) {
        SearchableText searchableRow = new SearchableText();
        Map<String, CellValue> tabularData = new HashMap<>();
        int start = 0;
        List<Integer> cellStarts = new ArrayList<>();
        SectionText sectionText = new SectionText();

        for (Cell cell : row) {

            // Rows may contain null cells (SectionGridCreatorService guards against them as well).
            if (cell == null || CollectionUtils.isEmpty(cell.getTextBlocks())) {
                continue;
            }

            // The page of the area is taken from the first sequence of the first text block.
            SectionArea sectionArea = new SectionArea(new Point((float) cell.getX(), (float) cell.getY()), (float) cell
                    .getWidth(), (float) cell.getHeight(), cell.getTextBlocks()
                    .get(0)
                    .getSequences()
                    .get(0)
                    .getPage());
            sectionText.getSectionAreas().add(sectionArea);
            sectionText.getTextBlocks().addAll(cell.getTextBlocks());

            int cellStart = start;

            if (!cell.isHeaderCell()) {
                cell.getHeaderCells().forEach(headerCell -> {
                    StringBuilder headerBuilder = new StringBuilder();
                    headerCell.getTextBlocks().forEach(textBlock -> headerBuilder.append(textBlock.getText()));
                    // Literal replacements; no regular expression needed.
                    String headerName = headerBuilder.toString()
                            .replace("\n", "")
                            .replace(" ", "")
                            .replace("-", "");
                    // With several header cells the last one wins as the area's header.
                    sectionArea.setHeader(headerName);
                    tabularData.put(headerName, new CellValue(cell.getTextBlocks(), cellStart));
                });
            }

            for (TextBlock textBlock : cell.getTextBlocks()) {
                // TODO avoid cell overlap merging.
                searchableRow.addAll(textBlock.getSequences());
            }

            cellStarts.add(cellStart);
            // Next cell starts after this cell's trimmed text plus one separator character.
            start = start + cell.toString().trim().length() + 1;
        }

        sectionText.setText(searchableRow.toString());
        sectionText.setHeadline(table.getHeadline());
        sectionText.setSectionNumber(sectionNumber.intValue());
        sectionText.setTable(true);
        sectionText.setTabularData(tabularData);
        sectionText.setCellStarts(cellStarts);
        sectionTexts.add(sectionText);
        sectionNumber.incrementAndGet();
    }

    return sectionTexts;
}
/**
 * Converts a whole table into a single section text.
 * <p>
 * Cells that are {@code null} or have no text blocks are skipped. For every remaining cell a
 * section area and its start offset are recorded and its text sequences are appended to the
 * table text. The section number is read but not advanced here.
 *
 * @param table         the table to convert
 * @param sectionNumber running section counter; only its current value is used
 * @return the section text covering the entire table
 */
private SectionText processTableAsOneText(Table table, AtomicInteger sectionNumber) {

    SearchableText entireTableText = new SearchableText();
    SectionText sectionText = new SectionText();
    int start = 0;
    List<Integer> cellStarts = new ArrayList<>();

    for (List<Cell> row : table.getRows()) {
        for (Cell cell : row) {

            // Rows may contain null cells (SectionGridCreatorService guards against them as well).
            if (cell == null || CollectionUtils.isEmpty(cell.getTextBlocks())) {
                continue;
            }

            // The page of the area is taken from the first sequence of the first text block.
            SectionArea sectionArea = new SectionArea(new Point((float) cell.getX(), (float) cell.getY()), (float) cell
                    .getWidth(), (float) cell.getHeight(), cell.getTextBlocks()
                    .get(0)
                    .getSequences()
                    .get(0)
                    .getPage());
            sectionText.getTextBlocks().addAll(cell.getTextBlocks());
            sectionText.getSectionAreas().add(sectionArea);

            for (TextBlock textBlock : cell.getTextBlocks()) {
                entireTableText.addAll(textBlock.getSequences());
            }

            cellStarts.add(start);
            // Next cell starts after this cell's trimmed text plus one separator character.
            start = start + cell.toString().trim().length() + 1;
        }
    }

    sectionText.setCellStarts(cellStarts);
    sectionText.setText(entireTableText.toString());
    sectionText.setHeadline(table.getHeadline());
    sectionText.setSectionNumber(sectionNumber.intValue());
    sectionText.setTable(true);
    return sectionText;
}
/**
 * Builds a non-table section text from a block of searchable text.
 *
 * @param searchableText      the text of the section
 * @param paragraphTextBlocks the text blocks of the section; each contributes one section area
 * @param headline            the headline stored on the section and on its images
 * @param sectionNumber       running section counter; only its current value is used
 * @param images              images of the section, converted via {@code convertImage}
 * @return the populated section text
 */
private SectionText processText(SearchableText searchableText, List<TextBlock> paragraphTextBlocks, String headline,
        AtomicInteger sectionNumber, List<PdfImage> images) {

    final int currentSection = sectionNumber.intValue();
    SectionText result = new SectionText();

    paragraphTextBlocks.forEach(block -> {
        Point origin = new Point(block.getMinX(), block.getMinY());
        result.getSectionAreas().add(new SectionArea(origin, block.getWidth(), block.getHeight(), block.getPage()));
    });

    result.setText(searchableText.toString());
    result.setHeadline(headline);
    result.setSectionNumber(currentSection);
    result.setTable(false);
    result.setImages(images.stream()
            .map(pdfImage -> convertImage(pdfImage, currentSection, headline))
            .collect(Collectors.toSet()));
    result.setTextBlocks(paragraphTextBlocks);

    return result;
}
/**
 * Converts an extracted PDF image into the redaction model image.
 * <p>
 * The type is {@code "image"} for {@code ImageType.OTHER}; otherwise it is the lower-cased
 * (root locale) name of the image type.
 *
 * @param pdfImage      the extracted image
 * @param sectionNumber number of the section the image belongs to
 * @param headline      headline of that section
 * @return the converted image
 */
private Image convertImage(PdfImage pdfImage, int sectionNumber, String headline) {

    String typeName;
    if (pdfImage.getImageType().equals(ImageType.OTHER)) {
        typeName = "image";
    } else {
        typeName = pdfImage.getImageType().name().toLowerCase(Locale.ROOT);
    }

    return Image.builder()
            .type(typeName)
            .position(pdfImage.getPosition())
            .sectionNumber(sectionNumber)
            .section(headline)
            .page(pdfImage.getPage())
            .hasTransparency(pdfImage.isHasTransparency())
            .build();
}
}

View File

@ -1,10 +1,12 @@
package com.iqser.red.service.redaction.v1.server.redaction.utils; package com.iqser.red.service.redaction.v1.server.redaction.utils;
import com.iqser.red.service.redaction.v1.model.Engine;
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary; import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncrementValue; import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncrementValue;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity; import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence; import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText; import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import lombok.experimental.UtilityClass; import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
@ -17,7 +19,6 @@ import java.util.stream.Collectors;
@SuppressWarnings("PMD") @SuppressWarnings("PMD")
public class EntitySearchUtils { public class EntitySearchUtils {
public boolean sectionContainsAny(String sectionText, Set<DictionaryIncrementValue> values) { public boolean sectionContainsAny(String sectionText, Set<DictionaryIncrementValue> values) {
String inputString = sectionText.toLowerCase(Locale.ROOT); String inputString = sectionText.toLowerCase(Locale.ROOT);
@ -38,10 +39,8 @@ public class EntitySearchUtils {
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString
.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) { .charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
if (value.isCaseinsensitive() || !value.isCaseinsensitive() && sectionText.substring(startIndex, stopIndex).equals(value.getValue())) {
return true; return true;
} }
}
} while (startIndex > -1); } while (startIndex > -1);
} }
return false; return false;
@ -49,7 +48,7 @@ public class EntitySearchUtils {
public Set<Entity> find(String inputString, Set<String> values, String type, String headline, int sectionNumber, public Set<Entity> find(String inputString, Set<String> values, String type, String headline, int sectionNumber,
boolean local, boolean isDossierDictionary) { boolean isDictionaryEntry, boolean isDossierDictionary, Engine engine) {
Set<Entity> found = new HashSet<>(); Set<Entity> found = new HashSet<>();
@ -69,7 +68,7 @@ public class EntitySearchUtils {
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString
.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) { .charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
found.add(new Entity(inputString.substring(startIndex, stopIndex), type, startIndex, stopIndex, headline, sectionNumber, !local, isDossierDictionary)); found.add(new Entity(inputString.substring(startIndex, stopIndex), type, startIndex, stopIndex, headline, sectionNumber, isDictionaryEntry, isDossierDictionary, engine));
} }
} while (startIndex > -1); } while (startIndex > -1);
} }
@ -98,8 +97,8 @@ public class EntitySearchUtils {
.sorted(Comparator.comparing(Entity::getStart)) .sorted(Comparator.comparing(Entity::getStart))
.collect(Collectors.toList()); .collect(Collectors.toList());
Entity firstEntity = orderedEntities.get(0); Entity firstEntity = orderedEntities.get(0);
List<EntityPositionSequence> positionSequences = text.getSequences(firstEntity.getWord().trim(), dictionary.isCaseInsensitiveDictionary(firstEntity List<EntityPositionSequence> positionSequences = text.getSequences(firstEntity.getWord()
.getType()), firstEntity.getTargetSequences()); .trim(), dictionary.isCaseInsensitiveDictionary(firstEntity.getType()), firstEntity.getTargetSequences());
for (int i = 0; i <= orderedEntities.size() - 1; i++) { for (int i = 0; i <= orderedEntities.size() - 1; i++) {
try { try {
@ -133,6 +132,7 @@ public class EntitySearchUtils {
public void addEntitiesWithHigherRank(Set<Entity> entities, Set<Entity> found, Dictionary dictionary) { public void addEntitiesWithHigherRank(Set<Entity> entities, Set<Entity> found, Dictionary dictionary) {
found.forEach(f -> addEntitiesWithHigherRank(entities, f, dictionary)); found.forEach(f -> addEntitiesWithHigherRank(entities, f, dictionary));
} }
@ -143,14 +143,33 @@ public class EntitySearchUtils {
Entity existing = entities.stream().filter(entity -> entity.equals(found)).findFirst().get(); Entity existing = entities.stream().filter(entity -> entity.equals(found)).findFirst().get();
if (dictionary.getDictionaryRank(existing.getType()) <= dictionary.getDictionaryRank(found.getType())) { if (dictionary.getDictionaryRank(existing.getType()) <= dictionary.getDictionaryRank(found.getType())) {
entities.remove(found); entities.remove(found);
entities.add(found);
} else {
existing.getEngines().addAll(found.getEngines());
} }
} } else {
entities.add(found); entities.add(found);
} }
}
public void addEntitiesIgnoreRank(Set<Entity> entities, Set<Entity> found) { public void addEntitiesIgnoreRank(Set<Entity> entities, Set<Entity> found) {
// HashSet keeps old value but we want the new. // HashSet keeps old value but we want the new.
entities.removeAll(found); entities.removeAll(found);
entities.addAll(found); entities.addAll(found);
} }
public void addOrAddEngine(Set<Entity> existing, Set<Entity> toBeAdded){
for(Entity toAdd: toBeAdded){
if (existing.contains(toAdd)) {
Entity existingEntity = existing.stream().filter(entity -> entity.equals(toAdd)).findFirst().get();
existingEntity.getEngines().addAll(toAdd.getEngines());
} else {
existing.add(toAdd);
}
}
}
} }

View File

@ -12,7 +12,7 @@ public class TextNormalizationUtilities {
* @return Text without line-break hyphenation. * @return Text without line-break hyphenation.
*/ */
public static String removeHyphenLineBreaks(String text) { public static String removeHyphenLineBreaks(String text) {
return text.replaceAll("([^\\s\\d\\-]{2,})[\\-\\u00AD]\\R|\n\r(.+ )", "$1$2"); return text.replaceAll("([^\\s\\d\\-]{2,})[\\-\\u00AD]\\R", "$1");
} }
} }

View File

@ -0,0 +1,165 @@
package com.iqser.red.service.redaction.v1.server.segmentation;
import java.awt.Graphics;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
import java.util.ArrayList;
import java.util.List;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
/**
 * Re-assembles images that were split into several consecutive fragments during PDF import and
 * drops images whose bounding box lies completely inside the bounding box of another image.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class ImageMergeService {

    /**
     * Merges consecutive image fragments and afterwards removes every image whose position
     * rectangle is contained in the position rectangle of another image of the result.
     *
     * <p>NOTE(review): two images with identical, non-empty position rectangles contain each
     * other, so both would be removed by the containment check - confirm that this is intended.
     *
     * @param images the images of one page, in extraction order
     * @param rotation the page rotation in degrees; only the value 90 is treated specially
     * @return the merged images; when {@code images} has fewer than two elements this is the
     *         list instance that was passed in
     */
    public List<PdfImage> mergeImages(List<PdfImage> images, int rotation) {

        List<PdfImage> mergedList = processImages(images, rotation);
        List<PdfImage> imagesInImage = new ArrayList<>();
        for (PdfImage image : mergedList) {
            for (PdfImage inner : mergedList) {
                if (image != inner && image.getPosition().contains(inner.getPosition().getX(), inner.getPosition().getY(), inner.getPosition().getWidth(), inner.getPosition().getHeight())) {
                    imagesInImage.add(inner);
                }
            }
        }
        mergedList.removeAll(imagesInImage);
        return mergedList;
    }


    /**
     * Merges images that were separated during PDF import and returns a new list of images.
     *
     * <p>The list is walked pairwise using the flags of {@link #getCandidatesList}: flag
     * {@code i} tells whether image {@code i} and image {@code i + 1} belong together.
     * {@code beginImage} stays {@code true} until the first merge candidate has been seen; from
     * then on the last element of {@code mergedList} is the image that is currently assembled.
     *
     * <p>NOTE(review): if the very first merge fails ({@code mergeTwoImages} returns
     * {@code null}), {@code countElementsInList} can still be 0 when the next flag is
     * {@code true}, and {@code mergedList.get(countElementsInList - 1)} would then be called
     * with index -1.
     *
     * <p>NOTE(review): when the last flag is {@code false} directly after a successful merge,
     * image {@code i} is added again although it is already part of the merged image; it looks
     * like the containment filter in {@link #mergeImages} is what removes it afterwards - confirm.
     */
    private List<PdfImage> processImages(List<PdfImage> imageList, int rotation) {

        if (imageList.size() > 1) {
            List<PdfImage> mergedList = new ArrayList<>();
            int countElementsInList = 0;
            boolean beginImage = true;

            // one flag per neighbouring pair: true = merge image i with image i + 1
            List<Boolean> candidatesList = getCandidatesList(imageList, rotation);

            // walk the flags; merge where requested and collect the results in mergedList
            for (int i = 0; i < candidatesList.size(); i++) {
                if (candidatesList.get(i)) {
                    if (beginImage) {
                        // start of a split image: merge two parts taken from imageList
                        PdfImage mergedImage = mergeTwoImages(imageList.get(i), imageList.get(i + 1), rotation);
                        // only keep the result if the merge was successful
                        if (mergedImage != null) {
                            mergedList.add(mergedImage);
                            countElementsInList++;
                        }
                    } else {
                        // middle of a split image: extend the last element of mergedList with the next part
                        PdfImage mergedImage = mergeTwoImages(mergedList.get(countElementsInList - 1), imageList.get(i + 1), rotation);
                        // only replace the element if the merge was successful
                        if (mergedImage != null) {
                            mergedList.set(countElementsInList - 1, mergedImage);
                        }
                    }
                    beginImage = false;
                } else {
                    if (i == candidatesList.size() - 1) {
                        // last flag is false: image i (unless it is already the last result) and image i + 1 must be added
                        if (countElementsInList > 0 && mergedList.get(countElementsInList - 1) == imageList.get(i)) {
                            mergedList.add(imageList.get(i + 1));
                        } else {
                            mergedList.add(imageList.get(i));
                            mergedList.add(imageList.get(i + 1));
                        }
                    } else {
                        if (beginImage) {
                            // no merge seen so far and image i is not split: add it unchanged
                            mergedList.add(imageList.get(i));
                            countElementsInList++;
                        } else {
                            // image i ends the current image: image i + 1 starts a new one
                            mergedList.add(imageList.get(i + 1));
                            countElementsInList++;
                        }
                    }
                }
            }
            return mergedList;
        } else {
            return imageList;
        }
    }


    /**
     * Draws {@code image2} below {@code image1} into a new RGB image and wraps it in a new
     * {@link PdfImage} located on the page of {@code image1}.
     *
     * @return the merged image, or {@code null} if drawing or creating the result failed
     */
    private PdfImage mergeTwoImages(PdfImage image1, PdfImage image2, int rotation) {

        // The sizes reported by getPosition() do not seem to match the pixel data; sizing the
        // canvas with them cuts off parts of the image. They are only used for the position.
        double width = image1.getPosition().getWidth();
        double width2 = image2.getPosition().getWidth();
        double height1 = image1.getPosition().getHeight();
        double height2 = image2.getPosition().getHeight();

        // Sizing the canvas with the dimensions stored in the image itself works.
        double img1height = image1.getImage().getHeight();
        double img1width = image1.getImage().getWidth();
        double img2height = image2.getImage().getHeight();

        BufferedImage mergedImage = new BufferedImage((int) img1width, (int) (img1height + img2height), BufferedImage.TYPE_INT_RGB);
        Graphics mergedImageGraphics = mergedImage.getGraphics();
        try {
            mergedImageGraphics.drawImage(image1.getImage(), 0, 0, null);
            mergedImageGraphics.drawImage(image2.getImage(), 0, (int) (img1height), null);

            // Position of the merged image: x of image1, y of image2; for a rotation of 90 the
            // widths are summed and the height of image1 is kept, otherwise the heights are summed.
            Rectangle2D pos = new Rectangle2D.Float();
            pos.setRect(image1.getPosition().getX(), image2.getPosition().getY(), rotation == 90 ? width + width2 : width, rotation == 90 ? height1 : height1 + height2);

            PdfImage newPdfImage = new PdfImage(mergedImage, pos, image1.getPage(), image1.isHasTransparency() || image2.isHasTransparency());

            image1.getImage().flush();
            image2.getImage().flush();
            mergedImage.flush();

            return newPdfImage;
        } catch (Exception e) {
            // failed to merge image
            log.error("Failed to merge image", e);
            return null;
        } finally {
            // The graphics context must be disposed on the failure path as well
            mergedImageGraphics.dispose();
        }
    }


    /**
     * Builds one flag per pair of neighbouring images: element {@code i} is {@code true} when
     * image {@code i} and image {@code i + 1} are candidates for merging.
     */
    private List<Boolean> getCandidatesList(List<PdfImage> imageList, int rotation) {

        List<Boolean> candidatesList = new ArrayList<>();
        for (int i = 1; i < imageList.size(); i++) {
            candidatesList.add(isCandidateForMerging(imageList.get(i - 1), imageList.get(i), rotation));
        }
        return candidatesList;
    }


    /**
     * Evaluates from coordinates, width and height whether two images are candidates for
     * merging. For a rotation of 90 the x/y coordinates and the width/height are swapped before
     * the comparison.
     */
    private boolean isCandidateForMerging(PdfImage image1, PdfImage image2, int rotation) {

        double x1 = rotation == 90 ? image1.getPosition().getY() : image1.getPosition().getX();
        double y1 = rotation == 90 ? image1.getPosition().getX() : image1.getPosition().getY();
        double width1 = rotation == 90 ? image1.getPosition().getHeight() : image1.getPosition().getWidth();
        double x2 = rotation == 90 ? image2.getPosition().getY() : image2.getPosition().getX();
        double y2 = rotation == 90 ? image2.getPosition().getX() : image2.getPosition().getY();
        double width2 = rotation == 90 ? image2.getPosition().getHeight() : image2.getPosition().getWidth();
        double height2 = rotation == 90 ? image2.getPosition().getWidth() : image2.getPosition().getHeight();

        // Same x and width, the (rounded up) height of image2 equals the (rounded up) distance
        // between the y-coordinates, and image2 is wider than a sixth of its height
        // -> treated as parts of the same picture.
        return x1 == x2 && width1 == width2 && Math.ceil(height2) == Math.ceil(rotation == 90 ? y2 - y1 : y1 - y2) && width2 > (height2 / 6);
    }

}

View File

@ -1,6 +1,19 @@
package com.iqser.red.service.redaction.v1.server.segmentation; package com.iqser.red.service.redaction.v1.server.segmentation;
import com.iqser.red.service.redaction.v1.model.Rectangle; import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.classification.model.Document; import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Page; import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
@ -15,24 +28,9 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractT
import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings; import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService; import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService;
import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService; import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.IOUtils;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.springframework.stereotype.Service;
import java.awt.Graphics;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
@Slf4j @Slf4j
@Service @Service
@ -47,13 +45,17 @@ public class PdfSegmentationService {
private final ClassificationService classificationService; private final ClassificationService classificationService;
private final SectionsBuilderService sectionsBuilderService; private final SectionsBuilderService sectionsBuilderService;
private final ImageClassificationService imageClassificationService; private final ImageClassificationService imageClassificationService;
private final ImageMergeService imageMergeService;
public Document parseDocument(InputStream documentInputStream) throws IOException { public Document parseDocument(InputStream documentInputStream) throws IOException {
return parseDocument(documentInputStream, false); return parseDocument(documentInputStream, false);
} }
public Document parseDocument(InputStream documentInputStream, boolean ignoreImages) throws IOException { public Document parseDocument(InputStream documentInputStream, boolean ignoreImages) throws IOException {
PDDocument pdDocument = null; PDDocument pdDocument = null;
try { try {
//create tempFile //create tempFile
@ -64,7 +66,6 @@ public class PdfSegmentationService {
Document document = new Document(); Document document = new Document();
List<Page> pages = new ArrayList<>(); List<Page> pages = new ArrayList<>();
pdDocument = reinitializePDDocument(tempFile, null); pdDocument = reinitializePDDocument(tempFile, null);
long pageCount = pdDocument.getNumberOfPages(); long pageCount = pdDocument.getNumberOfPages();
@ -101,32 +102,19 @@ public class PdfSegmentationService {
page.setRotation(rotation); page.setRotation(rotation);
page.setLandscape(isLandscape || isRotated); page.setLandscape(isLandscape || isRotated);
page.setPageNumber(pageNumber); page.setPageNumber(pageNumber);
List<PdfImage> mergedList = processImages(stripper.getImages());
List<PdfImage> imagesInImage = new ArrayList<>();
for(PdfImage image: mergedList){
for (PdfImage inner: mergedList){
if(image != inner && image.getPosition().contains(inner.getPosition().getX(), inner.getPosition().getY(), inner.getPosition().getWidth(), inner.getPosition().getHeight())){
imagesInImage.add(inner);
}
}
}
mergedList.removeAll(imagesInImage);
List<PdfImage> mergedList = imageMergeService.mergeImages(stripper.getImages(), rotation);
page.setImages(mergedList); page.setImages(mergedList);
tableExtractionService.extractTables(cleanRulings, page); tableExtractionService.extractTables(cleanRulings, page);
buildPageStatistics(page); buildPageStatistics(page);
increaseDocumentStatistics(page, document); increaseDocumentStatistics(page, document);
if (!ignoreImages) { if (!ignoreImages) {
imageClassificationService.classifyImages(page); imageClassificationService.classifyImages(page);
} }
pages.add(page); pages.add(page);
} }
document.setPages(pages); document.setPages(pages);
@ -149,7 +137,9 @@ public class PdfSegmentationService {
} }
} }
private PDDocument reinitializePDDocument(File tempFile, PDDocument pdDocument) throws IOException { private PDDocument reinitializePDDocument(File tempFile, PDDocument pdDocument) throws IOException {
if (pdDocument != null) { if (pdDocument != null) {
pdDocument.close(); pdDocument.close();
} }
@ -164,130 +154,6 @@ public class PdfSegmentationService {
return newPDDocument; return newPDDocument;
} }
// Merges images that were separated during PDF import and returns a new list of images.
// Flag i of the candidates list tells whether image i and image i + 1 belong together.
// beginImage stays true until the first merge candidate has been seen; from then on the last
// element of mergedList is the image that is currently being assembled.
// When imageList has fewer than two elements, the passed-in list itself is returned.
// NOTE(review): if the very first merge fails (mergeTwoImages returns null), countElementsInList
// can still be 0 when the next flag is true, and mergedList.get(countElementsInList - 1) would
// then be called with index -1.
private List<PdfImage> processImages(List<PdfImage> imageList) {
if (imageList.size() > 1) {
List<PdfImage> mergedList = new ArrayList<>();
int countElementsInList = 0;
boolean beginImage = true;
// one flag per neighbouring pair: true = merge image i with image i + 1, false = no merging
List<Boolean> candidatesList = getCandidatesList(imageList);
// walk the flags; merge where requested and collect the results in mergedList
for (int i = 0; i < candidatesList.size(); i++) {
if (candidatesList.get(i)) {
if (beginImage) {
// start of a split image: merge two parts taken from imageList
PdfImage mergedImage = mergeTwoImages(imageList.get(i), imageList.get(i + 1));
// only keep the result if the merge was successful
if (mergedImage != null) {
mergedList.add(mergedImage);
countElementsInList++;
}
} else {
// middle of a split image: extend the last element of mergedList with the next part from imageList
PdfImage mergedImage = mergeTwoImages(mergedList.get(countElementsInList - 1), imageList.get(i + 1));
// only replace the element if the merge was successful
if (mergedImage != null) {
mergedList.set(countElementsInList - 1, mergedImage);
}
}
beginImage = false;
} else {
// last flag is false: image i (unless it is already the last result) and image i + 1 must be added
if (i == candidatesList.size() - 1) {
if (countElementsInList > 0 && mergedList.get(countElementsInList - 1) == imageList.get(i)) {
mergedList.add(imageList.get(i + 1));
} else {
mergedList.add(imageList.get(i));
mergedList.add(imageList.get(i + 1));
}
} else {
// no merge seen so far and image i is not split: add it unchanged to the result list
if (beginImage) {
mergedList.add(imageList.get(i));
countElementsInList++;
} else {
// image i ends the current image: image i + 1 starts a new one
mergedList.add(imageList.get(i + 1));
countElementsInList++;
beginImage = false;
}
}
}
}
return mergedList;
} else {
return imageList;
}
}
/**
 * Draws {@code image2} below {@code image1} into a new RGB image and wraps it in a new
 * {@link PdfImage} located on the page of {@code image1}.
 *
 * @return the merged image, or {@code null} if drawing or creating the result failed
 */
private PdfImage mergeTwoImages(PdfImage image1, PdfImage image2) {
    // The sizes reported by getPosition() do not seem to match the pixel data; sizing the
    // canvas with them cuts off parts of the image. They are only used for the position.
    double width = image1.getPosition().getWidth();
    double height1 = image1.getPosition().getHeight();
    double height2 = image2.getPosition().getHeight();

    // Sizing the canvas with the dimensions stored in the image itself works.
    double img1height = image1.getImage().getHeight();
    double img1width = image1.getImage().getWidth();
    double img2height = image2.getImage().getHeight();

    BufferedImage mergedImage = new BufferedImage((int) img1width, (int) (img1height + img2height), BufferedImage.TYPE_INT_RGB);
    Graphics mergedImageGraphics = mergedImage.getGraphics();
    try {
        mergedImageGraphics.drawImage(image1.getImage(), 0, 0, null);
        mergedImageGraphics.drawImage(image2.getImage(), 0, (int) (img1height), null);

        // Position of the merged image: x and width of image1, y of image2, summed heights
        Rectangle2D pos = new Rectangle2D.Float();
        pos.setRect(image1.getPosition().getX(), image2.getPosition().getY(), width, height1 + height2);

        PdfImage newPdfImage = new PdfImage(mergedImage, pos, image1.getPage());

        image1.getImage().flush();
        image2.getImage().flush();
        mergedImage.flush();

        return newPdfImage;
    } catch (Exception e) {
        // failed to merge image
        log.error("Failed to merge image", e);
        return null;
    } finally {
        // The graphics context must be disposed on the failure path as well
        mergedImageGraphics.dispose();
    }
}
// Builds one flag per pair of neighbouring images: element i is true when
// image i and image i + 1 are candidates for merging.
private List<Boolean> getCandidatesList(List<PdfImage> imageList) {
    List<Boolean> mergeFlags = new ArrayList<>();
    int total = imageList.size();
    for (int next = 1; next < total; next++) {
        PdfImage previous = imageList.get(next - 1);
        PdfImage current = imageList.get(next);
        mergeFlags.add(isCandidateForMerging(previous, current));
    }
    return mergeFlags;
}
// Decides from coordinates, width and height whether two images are parts of the same
// picture: same x and width, the (rounded up) height of image2 equals the (rounded up)
// distance between the y-coordinates, and image2 is wider than a sixth of its height.
private boolean isCandidateForMerging(PdfImage image1, PdfImage image2) {
    if (image1.getPosition().getX() != image2.getPosition().getX()) {
        return false;
    }
    double secondWidth = image2.getPosition().getWidth();
    if (image1.getPosition().getWidth() != secondWidth) {
        return false;
    }
    double secondHeight = image2.getPosition().getHeight();
    double verticalGap = image1.getPosition().getY() - image2.getPosition().getY();
    if (Math.ceil(secondHeight) != Math.ceil(verticalGap)) {
        return false;
    }
    return secondWidth > (secondHeight / 6);
}
private void increaseDocumentStatistics(Page page, Document document) { private void increaseDocumentStatistics(Page page, Document document) {
@ -319,5 +185,4 @@ public class PdfSegmentationService {
} }
} }

View File

@ -15,4 +15,8 @@ public class RedactionServiceSettings {
private float maxImageCropboxRatio = 0.9f; private float maxImageCropboxRatio = 0.9f;
private int analysisVersion = 1;
private boolean enableEntityRecognition = true;
} }

View File

@ -1,10 +1,11 @@
package com.iqser.red.service.redaction.v1.server.storage; package com.iqser.red.service.redaction.v1.server.storage;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.file.management.v1.api.model.FileType; import com.iqser.red.service.persistence.service.v1.api.model.FileType;
import com.iqser.red.service.redaction.v1.model.RedactionLog; import com.iqser.red.service.redaction.v1.model.RedactionLog;
import com.iqser.red.service.redaction.v1.model.SectionGrid; import com.iqser.red.service.redaction.v1.model.SectionGrid;
import com.iqser.red.service.redaction.v1.server.classification.model.Text; import com.iqser.red.service.redaction.v1.server.classification.model.Text;
import com.iqser.red.service.redaction.v1.server.client.model.NerEntities;
import com.iqser.red.storage.commons.exception.StorageObjectDoesNotExist; import com.iqser.red.storage.commons.exception.StorageObjectDoesNotExist;
import com.iqser.red.storage.commons.service.StorageService; import com.iqser.red.storage.commons.service.StorageService;
import lombok.Getter; import lombok.Getter;
@ -32,16 +33,16 @@ public class RedactionStorageService {
@SneakyThrows @SneakyThrows
public void storeObject(String projectId, String fileId, FileType fileType, Object any) { public void storeObject(String dossierId, String fileId, FileType fileType, Object any) {
storageService.storeObject(StorageIdUtils.getStorageId(projectId, fileId, fileType), objectMapper.writeValueAsBytes(any)); storageService.storeObject(StorageIdUtils.getStorageId(dossierId, fileId, fileType), objectMapper.writeValueAsBytes(any));
} }
public RedactionLog getRedactionLog(String projectId, String fileId) { public RedactionLog getRedactionLog(String dossierId, String fileId) {
InputStreamResource inputStreamResource; InputStreamResource inputStreamResource;
try { try {
inputStreamResource = storageService.getObject(StorageIdUtils.getStorageId(projectId, fileId, FileType.REDACTION_LOG)); inputStreamResource = storageService.getObject(StorageIdUtils.getStorageId(dossierId, fileId, FileType.REDACTION_LOG));
} catch (StorageObjectDoesNotExist e) { } catch (StorageObjectDoesNotExist e) {
log.debug("Text not available."); log.debug("Text not available.");
return null; return null;
@ -55,11 +56,11 @@ public class RedactionStorageService {
} }
public Text getText(String projectId, String fileId) { public Text getText(String dossierId, String fileId) {
InputStreamResource inputStreamResource; InputStreamResource inputStreamResource;
try { try {
inputStreamResource = storageService.getObject(StorageIdUtils.getStorageId(projectId, fileId, FileType.TEXT)); inputStreamResource = storageService.getObject(StorageIdUtils.getStorageId(dossierId, fileId, FileType.TEXT));
} catch (StorageObjectDoesNotExist e) { } catch (StorageObjectDoesNotExist e) {
log.debug("Text not available."); log.debug("Text not available.");
return null; return null;
@ -73,9 +74,28 @@ public class RedactionStorageService {
} }
public SectionGrid getSectionGrid(String projectId, String fileId) { public NerEntities getNerEntities(String dossierId, String fileId) {
var sectionGrid = storageService.getObject(StorageIdUtils.getStorageId(projectId, fileId, FileType.SECTION_GRID)); InputStreamResource inputStreamResource;
try {
inputStreamResource = storageService.getObject(StorageIdUtils.getStorageId(dossierId, fileId, FileType.NER_ENTITIES));
} catch (StorageObjectDoesNotExist e) {
log.debug("NER Entities not available.");
return null;
}
try {
return objectMapper.readValue(inputStreamResource.getInputStream(), NerEntities.class);
} catch (IOException e) {
throw new RuntimeException("Could not convert NerEntities", e);
}
}
public SectionGrid getSectionGrid(String dossierId, String fileId) {
var sectionGrid = storageService.getObject(StorageIdUtils.getStorageId(dossierId, fileId, FileType.SECTION_GRID));
try { try {
return objectMapper.readValue(sectionGrid.getInputStream(), SectionGrid.class); return objectMapper.readValue(sectionGrid.getInputStream(), SectionGrid.class);
} catch (IOException e) { } catch (IOException e) {
@ -95,8 +115,8 @@ public class RedactionStorageService {
public static class StorageIdUtils { public static class StorageIdUtils {
public static String getStorageId(String projectId, String fileId, FileType fileType) { public static String getStorageId(String dossierId, String fileId, FileType fileType) {
return projectId + "/" + fileId + "." + fileType.name() + fileType.getExtension(); return dossierId + "/" + fileId + "." + fileType.name() + fileType.getExtension();
} }
} }

View File

@ -2,6 +2,8 @@ package com.iqser.red.service.redaction.v1.server.tableextraction.model;
import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.model.Rectangle; import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.iqser.red.service.redaction.v1.server.classification.model.Orientation;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import lombok.Data; import lombok.Data;
import lombok.NoArgsConstructor; import lombok.NoArgsConstructor;
@ -18,6 +20,8 @@ public abstract class AbstractTextContainer {
protected String classification; protected String classification;
protected int page; protected int page;
private Orientation orientation = Orientation.NONE;
public abstract String getText(); public abstract String getText();
public boolean contains(AbstractTextContainer other) { public boolean contains(AbstractTextContainer other) {

View File

@ -246,8 +246,12 @@ public class Ruling extends Line2D.Float {
public Ruling expand(float amount) { public Ruling expand(float amount) {
Ruling r = (Ruling) this.clone(); Ruling r = (Ruling) this.clone();
try {
r.setStart(this.getStart() - amount); r.setStart(this.getStart() - amount);
r.setEnd(this.getEnd() + amount); r.setEnd(this.getEnd() + amount);
} catch (UnsupportedOperationException e){
log.warn("Could not expand ruling!");
}
return r; return r;
} }

View File

@ -102,7 +102,7 @@ public class PdfVisualisationService {
contentStream.newLineAtOffset(textBlock.getMinX(), textBlock.getMaxY()); contentStream.newLineAtOffset(textBlock.getMinX(), textBlock.getMaxY());
contentStream.showText(textBlock.getClassification()); contentStream.showText(textBlock.getClassification() + textBlock.getOrientation());
contentStream.endText(); contentStream.endText();
} }

View File

@ -1,9 +1,9 @@
info: info:
description: Redaction Service Server V1 description: Redaction Service Server V1
configuration-service.url: "http://configuration-service-v1:8080" persistence-service.url: "http://persistence-service-v1:8080"
file-management-service.url: "http://file-management-service-v1:8080"
image-service.url: "http://image-service-v1:8080" image-service.url: "http://image-service-v1:8080"
entity-recognition-service.url: "http://entity-recognition-service-v1:8080"
server: server:
port: 8080 port: 8080

View File

@ -2,9 +2,11 @@ package com.iqser.red.service.redaction.v1.server;
import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.AmazonS3;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.configuration.v1.api.model.*; import com.iqser.red.service.persistence.service.v1.api.model.FileType;
import com.iqser.red.service.configuration.v1.api.resource.DictionaryResource; import com.iqser.red.service.persistence.service.v1.api.model.JSONPrimitive;
import com.iqser.red.service.file.management.v1.api.model.FileType; import com.iqser.red.service.persistence.service.v1.api.model.data.configuration.Colors;
import com.iqser.red.service.persistence.service.v1.api.model.data.configuration.DictionaryEntry;
import com.iqser.red.service.persistence.service.v1.api.model.data.configuration.Type;
import com.iqser.red.service.redaction.v1.model.*; import com.iqser.red.service.redaction.v1.model.*;
import com.iqser.red.service.redaction.v1.server.classification.model.SectionText; import com.iqser.red.service.redaction.v1.server.classification.model.SectionText;
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient; import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
@ -13,12 +15,14 @@ import com.iqser.red.service.redaction.v1.server.client.LegalBasisClient;
import com.iqser.red.service.redaction.v1.server.client.RulesClient; import com.iqser.red.service.redaction.v1.server.client.RulesClient;
import com.iqser.red.service.redaction.v1.server.controller.RedactionController; import com.iqser.red.service.redaction.v1.server.controller.RedactionController;
import com.iqser.red.service.redaction.v1.server.memory.MemoryStats; import com.iqser.red.service.redaction.v1.server.memory.MemoryStats;
import com.iqser.red.service.redaction.v1.server.redaction.service.ReanalyzeService; import com.iqser.red.service.redaction.v1.server.redaction.service.AnalyzeService;
import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader; import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities; import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService; import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
import com.iqser.red.storage.commons.service.StorageService; import com.iqser.red.storage.commons.service.StorageService;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.junit.After; import org.junit.After;
import org.junit.Before; import org.junit.Before;
@ -84,12 +88,11 @@ public class RedactionIntegrationTest {
private static final String PII = "PII"; private static final String PII = "PII";
@Autowired @Autowired
private RedactionController redactionController; private RedactionController redactionController;
@Autowired @Autowired
private ReanalyzeService reanalyzeService; private AnalyzeService analyzeService;
@Autowired @Autowired
private ObjectMapper objectMapper; private ObjectMapper objectMapper;
@ -127,9 +130,10 @@ public class RedactionIntegrationTest {
private final Map<String, Integer> rankTypeMap = new HashMap<>(); private final Map<String, Integer> rankTypeMap = new HashMap<>();
private final Colors colors = new Colors(); private final Colors colors = new Colors();
private final Map<String, Long> reanlysisVersions = new HashMap<>(); private final Map<String, Long> reanlysisVersions = new HashMap<>();
private final Set<String> deleted = new HashSet<>();
private final static String TEST_RULESET_ID = "123"; private final static String TEST_DOSSIER_TEMPLATE_ID = "123";
private final static String TEST_PROJECT_ID = "123"; private final static String TEST_DOSSIER_ID = "123";
private final static String TEST_FILE_ID = "123"; private final static String TEST_FILE_ID = "123";
@Configuration @Configuration
@ -152,18 +156,20 @@ public class RedactionIntegrationTest {
return kieServices.newKieContainer(kieModule.getReleaseId()); return kieServices.newKieContainer(kieModule.getReleaseId());
} }
@Bean @Bean
@Primary @Primary
public StorageService inmemoryStorage() { public StorageService inmemoryStorage() {
return new FileSystemBackedStorageService(); return new FileSystemBackedStorageService();
} }
} }
@After @After
public void cleanupStorage() { public void cleanupStorage() {
if (this.storageService instanceof FileSystemBackedStorageService) { if (this.storageService instanceof FileSystemBackedStorageService) {
((FileSystemBackedStorageService) this.storageService).clearStorage(); ((FileSystemBackedStorageService) this.storageService).clearStorage();
} }
@ -173,51 +179,64 @@ public class RedactionIntegrationTest {
@Before @Before
public void stubClients() { public void stubClients() {
when(rulesClient.getVersion(TEST_RULESET_ID)).thenReturn(0L); when(rulesClient.getVersion(TEST_DOSSIER_TEMPLATE_ID)).thenReturn(0L);
when(rulesClient.getRules(TEST_RULESET_ID)).thenReturn(new RulesResponse(RULES)); when(rulesClient.getRules(TEST_DOSSIER_TEMPLATE_ID)).thenReturn(JSONPrimitive.of(RULES));
loadDictionaryForTest(); loadDictionaryForTest();
loadTypeForTest(); loadTypeForTest();
when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(0L); when(dictionaryClient.getVersion(TEST_DOSSIER_TEMPLATE_ID)).thenReturn(0L);
when(dictionaryClient.getAllTypes(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(TypeResponse.builder() when(dictionaryClient.getAllTypesForDossierTemplate(TEST_DOSSIER_TEMPLATE_ID)).thenReturn(getTypeResponse());
.types(getTypeResponse())
.build());
when(dictionaryClient.getVersion(TEST_RULESET_ID, TEST_PROJECT_ID)).thenReturn(0L); when(dictionaryClient.getVersion(TEST_DOSSIER_TEMPLATE_ID)).thenReturn(0L);
when(dictionaryClient.getAllTypes(TEST_RULESET_ID, TEST_PROJECT_ID)).thenReturn(TypeResponse.builder() when(dictionaryClient.getAllTypesForDossier(TEST_DOSSIER_ID)).thenReturn(List.of(Type.builder()
.types(List.of(TypeResult.builder() .id(DOSSIER_REDACTIONS +":"+ TEST_DOSSIER_TEMPLATE_ID)
.type(DOSSIER_REDACTIONS) .type(DOSSIER_REDACTIONS)
.ruleSetId(TEST_RULESET_ID) .dossierTemplateId(TEST_DOSSIER_ID)
.hexColor( "#ffe187") .hexColor("#ffe187")
.isHint(hintTypeMap.get(DOSSIER_REDACTIONS)) .isHint(hintTypeMap.get(DOSSIER_REDACTIONS))
.isCaseInsensitive(caseInSensitiveMap.get(DOSSIER_REDACTIONS)) .isCaseInsensitive(caseInSensitiveMap.get(DOSSIER_REDACTIONS))
.isRecommendation(recommendationTypeMap.get(DOSSIER_REDACTIONS)) .isRecommendation(recommendationTypeMap.get(DOSSIER_REDACTIONS))
.rank(rankTypeMap.get(DOSSIER_REDACTIONS)) .rank(rankTypeMap.get(DOSSIER_REDACTIONS))
.build())) .build()));
.build());
when(dictionaryClient.getDictionaryForType(VERTEBRATE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(VERTEBRATE, false)); when(dictionaryClient.getDictionaryForType(VERTEBRATE + ":" + TEST_DOSSIER_TEMPLATE_ID))
when(dictionaryClient.getDictionaryForType(ADDRESS, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(ADDRESS, false)); .thenReturn(getDictionaryResponse(VERTEBRATE, false));
when(dictionaryClient.getDictionaryForType(AUTHOR, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(AUTHOR, false)); when(dictionaryClient.getDictionaryForType(ADDRESS+ ":" + TEST_DOSSIER_TEMPLATE_ID))
when(dictionaryClient.getDictionaryForType(SPONSOR, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(SPONSOR, false)); .thenReturn(getDictionaryResponse(ADDRESS, false));
when(dictionaryClient.getDictionaryForType(NO_REDACTION_INDICATOR, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(NO_REDACTION_INDICATOR, false)); when(dictionaryClient.getDictionaryForType(AUTHOR+ ":" + TEST_DOSSIER_TEMPLATE_ID))
when(dictionaryClient.getDictionaryForType(REDACTION_INDICATOR, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(REDACTION_INDICATOR, false)); .thenReturn(getDictionaryResponse(AUTHOR, false));
when(dictionaryClient.getDictionaryForType(HINT_ONLY, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(HINT_ONLY, false)); when(dictionaryClient.getDictionaryForType(SPONSOR+ ":" + TEST_DOSSIER_TEMPLATE_ID))
when(dictionaryClient.getDictionaryForType(MUST_REDACT, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(MUST_REDACT, false)); .thenReturn(getDictionaryResponse(SPONSOR, false));
when(dictionaryClient.getDictionaryForType(PUBLISHED_INFORMATION, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(PUBLISHED_INFORMATION, false)); when(dictionaryClient.getDictionaryForType(NO_REDACTION_INDICATOR+ ":" + TEST_DOSSIER_TEMPLATE_ID))
when(dictionaryClient.getDictionaryForType(TEST_METHOD, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(TEST_METHOD, false)); .thenReturn(getDictionaryResponse(NO_REDACTION_INDICATOR, false));
when(dictionaryClient.getDictionaryForType(PII, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(PII, false)); when(dictionaryClient.getDictionaryForType(REDACTION_INDICATOR+ ":" + TEST_DOSSIER_TEMPLATE_ID))
when(dictionaryClient.getDictionaryForType(RECOMMENDATION_AUTHOR, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(RECOMMENDATION_AUTHOR, false)); .thenReturn(getDictionaryResponse(REDACTION_INDICATOR, false));
when(dictionaryClient.getDictionaryForType(RECOMMENDATION_ADDRESS, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(RECOMMENDATION_ADDRESS, false)); when(dictionaryClient.getDictionaryForType(HINT_ONLY+ ":" + TEST_DOSSIER_TEMPLATE_ID))
when(dictionaryClient.getDictionaryForType(FALSE_POSITIVE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(FALSE_POSITIVE, false)); .thenReturn(getDictionaryResponse(HINT_ONLY, false));
when(dictionaryClient.getDictionaryForType(PURITY, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(PURITY, false)); when(dictionaryClient.getDictionaryForType(MUST_REDACT+ ":" + TEST_DOSSIER_TEMPLATE_ID))
when(dictionaryClient.getDictionaryForType(IMAGE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(IMAGE, false)); .thenReturn(getDictionaryResponse(MUST_REDACT, false));
when(dictionaryClient.getDictionaryForType(OCR, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(OCR, false)); when(dictionaryClient.getDictionaryForType(PUBLISHED_INFORMATION+ ":" + TEST_DOSSIER_TEMPLATE_ID))
when(dictionaryClient.getDictionaryForType(LOGO, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(LOGO, false)); .thenReturn(getDictionaryResponse(PUBLISHED_INFORMATION, false));
when(dictionaryClient.getDictionaryForType(SIGNATURE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(SIGNATURE, false)); when(dictionaryClient.getDictionaryForType(TEST_METHOD+ ":" + TEST_DOSSIER_TEMPLATE_ID))
when(dictionaryClient.getDictionaryForType(FORMULA, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(FORMULA, false)); .thenReturn(getDictionaryResponse(TEST_METHOD, false));
when(dictionaryClient.getDictionaryForType(DOSSIER_REDACTIONS, TEST_RULESET_ID, TEST_PROJECT_ID)).thenReturn(getDictionaryResponse(DOSSIER_REDACTIONS, true)); when(dictionaryClient.getDictionaryForType(PII+ ":" + TEST_DOSSIER_TEMPLATE_ID)).thenReturn(getDictionaryResponse(PII, false));
when(dictionaryClient.getColors(TEST_RULESET_ID)).thenReturn(colors); when(dictionaryClient.getDictionaryForType(RECOMMENDATION_AUTHOR+ ":" + TEST_DOSSIER_TEMPLATE_ID))
.thenReturn(getDictionaryResponse(RECOMMENDATION_AUTHOR, false));
when(dictionaryClient.getDictionaryForType(RECOMMENDATION_ADDRESS+ ":" + TEST_DOSSIER_TEMPLATE_ID))
.thenReturn(getDictionaryResponse(RECOMMENDATION_ADDRESS, false));
when(dictionaryClient.getDictionaryForType(FALSE_POSITIVE+ ":" + TEST_DOSSIER_TEMPLATE_ID))
.thenReturn(getDictionaryResponse(FALSE_POSITIVE, false));
when(dictionaryClient.getDictionaryForType(PURITY+ ":" + TEST_DOSSIER_TEMPLATE_ID))
.thenReturn(getDictionaryResponse(PURITY, false));
when(dictionaryClient.getDictionaryForType(IMAGE+ ":" + TEST_DOSSIER_TEMPLATE_ID)).thenReturn(getDictionaryResponse(IMAGE, false));
when(dictionaryClient.getDictionaryForType(OCR+ ":" + TEST_DOSSIER_TEMPLATE_ID)).thenReturn(getDictionaryResponse(OCR, false));
when(dictionaryClient.getDictionaryForType(LOGO+ ":" + TEST_DOSSIER_TEMPLATE_ID)).thenReturn(getDictionaryResponse(LOGO, false));
when(dictionaryClient.getDictionaryForType(SIGNATURE+ ":" + TEST_DOSSIER_TEMPLATE_ID))
.thenReturn(getDictionaryResponse(SIGNATURE, false));
when(dictionaryClient.getDictionaryForType(FORMULA+ ":" + TEST_DOSSIER_TEMPLATE_ID))
.thenReturn(getDictionaryResponse(FORMULA, false));
when(dictionaryClient.getDictionaryForType(DOSSIER_REDACTIONS+ ":" + TEST_DOSSIER_TEMPLATE_ID)).thenReturn(getDictionaryResponse(DOSSIER_REDACTIONS, true));
when(dictionaryClient.getColors(TEST_DOSSIER_TEMPLATE_ID)).thenReturn(colors);
} }
@ -455,13 +474,14 @@ public class RedactionIntegrationTest {
} }
private List<TypeResult> getTypeResponse() { private List<Type> getTypeResponse() {
return typeColorMap.entrySet() return typeColorMap.entrySet()
.stream() .stream()
.map(typeColor -> TypeResult.builder() .map(typeColor -> Type.builder()
.id(typeColor.getKey() + ":" + TEST_DOSSIER_TEMPLATE_ID)
.type(typeColor.getKey()) .type(typeColor.getKey())
.ruleSetId(TEST_RULESET_ID) .dossierTemplateId(TEST_DOSSIER_TEMPLATE_ID)
.hexColor(typeColor.getValue()) .hexColor(typeColor.getValue())
.isHint(hintTypeMap.get(typeColor.getKey())) .isHint(hintTypeMap.get(typeColor.getKey()))
.isCaseInsensitive(caseInSensitiveMap.get(typeColor.getKey())) .isCaseInsensitive(caseInSensitiveMap.get(typeColor.getKey()))
@ -473,11 +493,13 @@ public class RedactionIntegrationTest {
} }
private DictionaryResponse getDictionaryResponse(String type, boolean isDossierDictionary) { private Type getDictionaryResponse(String type, boolean isDossierDictionary) {
return DictionaryResponse.builder() return Type.builder()
.id(type + ":" +TEST_DOSSIER_TEMPLATE_ID)
.hexColor(typeColorMap.get(type)) .hexColor(typeColorMap.get(type))
.entries(isDossierDictionary ? toDictionaryEntry(dossierDictionary.get(type)) : toDictionaryEntry(dictionary.get(type))) .entries(isDossierDictionary ? toDictionaryEntry(dossierDictionary.get(type)) : toDictionaryEntry(dictionary
.get(type)))
.isHint(hintTypeMap.get(type)) .isHint(hintTypeMap.get(type))
.isCaseInsensitive(caseInSensitiveMap.get(type)) .isCaseInsensitive(caseInSensitiveMap.get(type))
.isRecommendation(recommendationTypeMap.get(type)) .isRecommendation(recommendationTypeMap.get(type))
@ -490,7 +512,11 @@ public class RedactionIntegrationTest {
List<DictionaryEntry> dictionaryEntries = new ArrayList<>(); List<DictionaryEntry> dictionaryEntries = new ArrayList<>();
entries.forEach(entry -> { entries.forEach(entry -> {
dictionaryEntries.add(new DictionaryEntry(entry, reanlysisVersions.containsKey(entry) ? reanlysisVersions.get(entry) : 0L, false)); dictionaryEntries.add(DictionaryEntry.builder()
.value(entry)
.version(reanlysisVersions.containsKey(entry) ? reanlysisVersions.get(entry) : 0L)
.deleted(deleted
.contains(entry) ? true : false).build());
}); });
return dictionaryEntries; return dictionaryEntries;
} }
@ -498,9 +524,11 @@ public class RedactionIntegrationTest {
@Test @Test
public void test270Rotated() { public void test270Rotated() {
AnalyzeRequest request = prepareStorage("files/Minimal Examples/270Rotated.pdf"); AnalyzeRequest request = prepareStorage("files/Minimal Examples/270Rotated.pdf");
MemoryStats.printMemoryStats(); MemoryStats.printMemoryStats();
AnalyzeResult result = reanalyzeService.analyze(request); analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
AnalyzeResult result = analyzeService.analyze(request);
assertThat(result).isNotNull(); assertThat(result).isNotNull();
} }
@ -508,12 +536,15 @@ public class RedactionIntegrationTest {
@Test @Test
@Ignore @Ignore
public void testLargeScannedFileOOM() { public void testLargeScannedFileOOM() {
AnalyzeRequest request = prepareStorage("scanned/VV-377031.pdf"); AnalyzeRequest request = prepareStorage("scanned/VV-377031.pdf");
MemoryStats.printMemoryStats(); MemoryStats.printMemoryStats();
AnalyzeResult result = reanalyzeService.analyze(request); analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
AnalyzeResult result = analyzeService.analyze(request);
assertThat(result).isNotNull(); assertThat(result).isNotNull();
} }
@Test @Test
public void testMergedImages() throws IOException { public void testMergedImages() throws IOException {
@ -521,11 +552,12 @@ public class RedactionIntegrationTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/merge_images.pdf"); ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/merge_images.pdf");
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream()); AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
AnalyzeResult result = reanalyzeService.analyze(request); analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
AnalyzeResult result = analyzeService.analyze(request);
Map<String, List<RedactionLogEntry>> duplicates = new HashMap<>(); Map<String, List<RedactionLogEntry>> duplicates = new HashMap<>();
var redactionLog = redactionStorageService.getRedactionLog(TEST_PROJECT_ID, TEST_FILE_ID); var redactionLog = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID);
redactionLog.getRedactionLogEntry().forEach(entry -> { redactionLog.getRedactionLogEntry().forEach(entry -> {
duplicates.computeIfAbsent(entry.getId(), v -> new ArrayList<>()).add(entry); duplicates.computeIfAbsent(entry.getId(), v -> new ArrayList<>()).add(entry);
@ -536,10 +568,10 @@ public class RedactionIntegrationTest {
}); });
dictionary.get(AUTHOR).add("Drinking water"); dictionary.get(AUTHOR).add("Drinking water");
when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(1L); when(dictionaryClient.getVersion(TEST_DOSSIER_TEMPLATE_ID)).thenReturn(1L);
AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder() AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder()
.projectId(TEST_PROJECT_ID) .dossierId(TEST_DOSSIER_ID)
.fileId(TEST_FILE_ID) .fileId(TEST_FILE_ID)
.build()); .build());
@ -547,19 +579,18 @@ public class RedactionIntegrationTest {
fileOutputStream.write(annotateResponse.getDocument()); fileOutputStream.write(annotateResponse.getDocument());
} }
long rstart = System.currentTimeMillis(); long rstart = System.currentTimeMillis();
reanalyzeService.reanalyze(request); analyzeService.reanalyze(request);
long rend = System.currentTimeMillis(); long rend = System.currentTimeMillis();
System.out.println("reanalysis analysis duration: " + (rend - rstart)); System.out.println("reanalysis analysis duration: " + (rend - rstart));
long end = System.currentTimeMillis(); long end = System.currentTimeMillis();
System.out.println("duration: " + (end - start)); System.out.println("duration: " + (end - start));
} }
@Test @Test
@Ignore @Ignore
public void noExceptionShouldBeThrownForAnyFiles() throws IOException { public void noExceptionShouldBeThrownForAnyFiles() throws IOException {
@ -577,11 +608,15 @@ public class RedactionIntegrationTest {
AnalyzeRequest request = prepareStorage(new FileInputStream((path))); AnalyzeRequest request = prepareStorage(new FileInputStream((path)));
System.out.println("Redacting file : " + path.getName()); System.out.println("Redacting file : " + path.getName());
AnalyzeResult result = reanalyzeService.analyze(request); analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
long fstart = System.currentTimeMillis();
AnalyzeResult result = analyzeService.analyze(request);
System.out.println("analysis analysis duration: " + (System.currentTimeMillis() - fstart));
Map<String, List<RedactionLogEntry>> duplicates = new HashMap<>(); Map<String, List<RedactionLogEntry>> duplicates = new HashMap<>();
var redactionLog = redactionStorageService.getRedactionLog(TEST_PROJECT_ID, TEST_FILE_ID); var redactionLog = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID);
redactionLog.getRedactionLogEntry().forEach(entry -> { redactionLog.getRedactionLogEntry().forEach(entry -> {
duplicates.computeIfAbsent(entry.getId(), v -> new ArrayList<>()).add(entry); duplicates.computeIfAbsent(entry.getId(), v -> new ArrayList<>()).add(entry);
@ -592,10 +627,10 @@ public class RedactionIntegrationTest {
}); });
dictionary.get(AUTHOR).add("Drinking water"); dictionary.get(AUTHOR).add("Drinking water");
when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(1L); when(dictionaryClient.getVersion(TEST_DOSSIER_TEMPLATE_ID)).thenReturn(1L);
long rstart = System.currentTimeMillis(); long rstart = System.currentTimeMillis();
reanalyzeService.reanalyze(request); analyzeService.reanalyze(request);
long rend = System.currentTimeMillis(); long rend = System.currentTimeMillis();
System.out.println("reanalysis analysis duration: " + (rend - rstart)); System.out.println("reanalysis analysis duration: " + (rend - rstart));
@ -631,13 +666,22 @@ public class RedactionIntegrationTest {
public void redactionTest() throws IOException { public void redactionTest() throws IOException {
long start = System.currentTimeMillis(); long start = System.currentTimeMillis();
ClassPathResource pdfFileResource = new ClassPathResource("files/new/Single Study - Oral (Gavage) Mouse.pdf"); ClassPathResource pdfFileResource = new ClassPathResource("files/S11.pdf");
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream()); AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
request.setExcludedPages(Set.of(1));
AnalyzeResult result = reanalyzeService.analyze(request); request.setFileAttributes(List.of(FileAttribute.builder()
.id("fileAttributeId")
.label("Vertebrate Study")
.placeholder("{fileattributes.vertebrateStudy}")
.value("true")
.build()));
var redactionLog = redactionStorageService.getRedactionLog(TEST_PROJECT_ID, TEST_FILE_ID); analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
var text = redactionStorageService.getText(TEST_PROJECT_ID, TEST_FILE_ID); AnalyzeResult result = analyzeService.analyze(request);
var redactionLog = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID);
var text = redactionStorageService.getText(TEST_DOSSIER_ID, TEST_FILE_ID);
redactionLog.getRedactionLogEntry().forEach(entry -> { redactionLog.getRedactionLogEntry().forEach(entry -> {
if (entry.isImage()) { if (entry.isImage()) {
@ -650,7 +694,7 @@ public class RedactionIntegrationTest {
System.out.println("first analysis duration: " + (end - start)); System.out.println("first analysis duration: " + (end - start));
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Test.json")) { try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Test.json")) {
fileOutputStream.write(objectMapper.writeValueAsBytes(redactionStorageService.getText(TEST_PROJECT_ID, TEST_FILE_ID))); fileOutputStream.write(objectMapper.writeValueAsBytes(redactionStorageService.getText(TEST_DOSSIER_ID, TEST_FILE_ID)));
} }
int correctFound = 0; int correctFound = 0;
@ -680,21 +724,43 @@ public class RedactionIntegrationTest {
dictionary.get(AUTHOR).add("physical"); dictionary.get(AUTHOR).add("physical");
reanlysisVersions.put("physical", 2L); reanlysisVersions.put("physical", 2L);
dictionary.get(VERTEBRATE).add("s-metolachlor"); deleted.add("David Chubb");
reanlysisVersions.put("s-metolachlor", 3L); deleted.add("mouse");
when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(3L); dictionary.get(FALSE_POSITIVE).add("David Chubb");
reanlysisVersions.put("David Chubb", 3L);
when(dictionaryClient.getDictionaryForType(VERTEBRATE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(VERTEBRATE, false)); reanlysisVersions.put("mouse", 3L);
when(dictionaryClient.getVersion(TEST_DOSSIER_TEMPLATE_ID)).thenReturn(3L);
when(dictionaryClient.getDictionaryForType(VERTEBRATE))
.thenReturn(getDictionaryResponse(VERTEBRATE, false));
when(dictionaryClient.getDictionaryForType(FALSE_POSITIVE))
.thenReturn(getDictionaryResponse(FALSE_POSITIVE, false));
start = System.currentTimeMillis(); start = System.currentTimeMillis();
AnalyzeResult reanalyzeResult = reanalyzeService.reanalyze(request);
ManualRedactions manualRedactions = new ManualRedactions();
manualRedactions.setImageRecategorizations(Set.of(ManualImageRecategorization.builder()
.id("37eee3e9d589a5cc529bfec38c3ba479")
.status(Status.APPROVED)
.type("signature")
.build()));
request.setManualRedactions(manualRedactions);
AnalyzeResult reanalyzeResult = analyzeService.reanalyze(request);
redactionLog = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID);
end = System.currentTimeMillis(); end = System.currentTimeMillis();
System.out.println("reanalysis analysis duration: " + (end - start)); System.out.println("reanalysis analysis duration: " + (end - start));
AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder() AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder()
.projectId(TEST_PROJECT_ID) .dossierId(TEST_DOSSIER_ID)
.fileId(TEST_FILE_ID) .fileId(TEST_FILE_ID)
.build()); .build());
@ -702,6 +768,20 @@ public class RedactionIntegrationTest {
fileOutputStream.write(annotateResponse.getDocument()); fileOutputStream.write(annotateResponse.getDocument());
} }
deleted.remove("mouse");
reanlysisVersions.put("mouse", 4L);
when(dictionaryClient.getVersion(TEST_DOSSIER_TEMPLATE_ID)).thenReturn(4L);
when(dictionaryClient.getDictionaryForType(VERTEBRATE))
.thenReturn(getDictionaryResponse(VERTEBRATE, false));
analyzeService.reanalyze(request);
redactionLog = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID);
System.out.println("hi");
} }
@ -712,10 +792,11 @@ public class RedactionIntegrationTest {
long start = System.currentTimeMillis(); long start = System.currentTimeMillis();
AnalyzeRequest request = prepareStorage("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf"); AnalyzeRequest request = prepareStorage("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");
AnalyzeResult result = reanalyzeService.analyze(request); analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
AnalyzeResult result = analyzeService.analyze(request);
AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder() AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder()
.projectId(TEST_PROJECT_ID) .dossierId(TEST_DOSSIER_ID)
.fileId(TEST_FILE_ID) .fileId(TEST_FILE_ID)
.build()); .build());
@ -771,26 +852,31 @@ public class RedactionIntegrationTest {
// manualRedactions.getEntriesToAdd().add(manualRedactionEntry); // manualRedactions.getEntriesToAdd().add(manualRedactionEntry);
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream()); AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
request.setManualRedactions(manualRedactions); request.setManualRedactions(manualRedactions);
AnalyzeResult result = reanalyzeService.analyze(request); analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
AnalyzeResult result = analyzeService.analyze(request);
manualRedactions.getEntriesToAdd().add(manualRedactionEntry); manualRedactions.getEntriesToAdd().add(manualRedactionEntry);
manualRedactions.setIdsToRemove(Set.of(IdRemoval.builder() manualRedactions.setIdsToRemove(Set.of(IdRemoval.builder()
.id("5b940b2cb401ed9f5be6fc24f6e77bcf") .id("5b940b2cb401ed9f5be6fc24f6e77bcf")
.status(Status.APPROVED) .status(Status.APPROVED)
.build())); .build()));
manualRedactions.setManualLegalBasisChanges(Set.of(ManualLegalBasisChange.builder()
.id("675eba69b0c2917de55462c817adaa05")
.legalBasis("Manual Legal Basis Change")
.status(Status.APPROVED)
.build()));
reanalyzeService.reanalyze(request); analyzeService.reanalyze(request);
var redactionLog = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID);
AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder() AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder()
.projectId(TEST_PROJECT_ID) .dossierId(TEST_DOSSIER_ID)
.fileId(TEST_FILE_ID) .fileId(TEST_FILE_ID)
.build()); .build());
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Annotated.pdf")) { try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Annotated.pdf")) {
fileOutputStream.write(annotateResponse.getDocument()); fileOutputStream.write(annotateResponse.getDocument());
} }
@ -805,15 +891,14 @@ public class RedactionIntegrationTest {
public void classificationTest() throws IOException { public void classificationTest() throws IOException {
System.out.println("classificationTest"); System.out.println("classificationTest");
ClassPathResource pdfFileResource = new ClassPathResource("files/Trinexapac/93 Trinexapac-ethyl_RAR_03_Volume_3CA_B-1_2017-03-31.pdf"); ClassPathResource pdfFileResource = new ClassPathResource("files/S11.pdf");
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream()); AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
RedactionRequest redactionRequest = RedactionRequest.builder() RedactionRequest redactionRequest = RedactionRequest.builder()
.projectId(request.getProjectId()) .dossierId(request.getDossierId())
.fileId(request.getFileId()) .fileId(request.getFileId())
.ruleSetId(request.getRuleSetId()) .dossierTemplateId(request.getDossierTemplateId())
.build(); .build();
RedactionResult result = redactionController.classify(redactionRequest); RedactionResult result = redactionController.classify(redactionRequest);
@ -828,14 +913,14 @@ public class RedactionIntegrationTest {
public void sectionsTest() throws IOException { public void sectionsTest() throws IOException {
System.out.println("sectionsTest"); System.out.println("sectionsTest");
ClassPathResource pdfFileResource = new ClassPathResource("files/Fludioxonil/51 " + "Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf"); ClassPathResource pdfFileResource = new ClassPathResource("files/S11.pdf");
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream()); AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
RedactionRequest redactionRequest = RedactionRequest.builder() RedactionRequest redactionRequest = RedactionRequest.builder()
.projectId(request.getProjectId()) .dossierId(request.getDossierId())
.fileId(request.getFileId()) .fileId(request.getFileId())
.ruleSetId(request.getRuleSetId()) .dossierTemplateId(request.getDossierTemplateId())
.build(); .build();
RedactionResult result = redactionController.sections(redactionRequest); RedactionResult result = redactionController.sections(redactionRequest);
@ -850,14 +935,14 @@ public class RedactionIntegrationTest {
public void htmlTablesTest() throws IOException { public void htmlTablesTest() throws IOException {
System.out.println("htmlTablesTest"); System.out.println("htmlTablesTest");
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf"); ClassPathResource pdfFileResource = new ClassPathResource("files/S11.pdf");
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream()); AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
RedactionRequest redactionRequest = RedactionRequest.builder() RedactionRequest redactionRequest = RedactionRequest.builder()
.projectId(request.getProjectId()) .dossierId(request.getDossierId())
.fileId(request.getFileId()) .fileId(request.getFileId())
.ruleSetId(request.getRuleSetId()) .dossierTemplateId(request.getDossierTemplateId())
.build(); .build();
RedactionResult result = redactionController.htmlTables(redactionRequest); RedactionResult result = redactionController.htmlTables(redactionRequest);
@ -877,9 +962,9 @@ public class RedactionIntegrationTest {
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream()); AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
RedactionRequest redactionRequest = RedactionRequest.builder() RedactionRequest redactionRequest = RedactionRequest.builder()
.projectId(request.getProjectId()) .dossierId(request.getDossierId())
.fileId(request.getFileId()) .fileId(request.getFileId())
.ruleSetId(request.getRuleSetId()) .dossierTemplateId(request.getDossierTemplateId())
.build(); .build();
RedactionResult result = redactionController.htmlTables(redactionRequest); RedactionResult result = redactionController.htmlTables(redactionRequest);
@ -897,9 +982,10 @@ public class RedactionIntegrationTest {
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream()); AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
AnalyzeResult result = reanalyzeService.analyze(request); analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
AnalyzeResult result = analyzeService.analyze(request);
var redactionLog = redactionStorageService.getRedactionLog(TEST_PROJECT_ID, TEST_FILE_ID); var redactionLog = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID);
redactionLog.getRedactionLogEntry().forEach(entry -> { redactionLog.getRedactionLogEntry().forEach(entry -> {
if (!entry.isHint()) { if (!entry.isHint()) {
@ -908,8 +994,10 @@ public class RedactionIntegrationTest {
}); });
} }
@SneakyThrows @SneakyThrows
private AnalyzeRequest prepareStorage(String file) { private AnalyzeRequest prepareStorage(String file) {
ClassPathResource pdfFileResource = new ClassPathResource(file); ClassPathResource pdfFileResource = new ClassPathResource(file);
return prepareStorage(pdfFileResource.getInputStream()); return prepareStorage(pdfFileResource.getInputStream());
@ -920,15 +1008,15 @@ public class RedactionIntegrationTest {
private AnalyzeRequest prepareStorage(InputStream stream) { private AnalyzeRequest prepareStorage(InputStream stream) {
AnalyzeRequest request = AnalyzeRequest.builder() AnalyzeRequest request = AnalyzeRequest.builder()
.ruleSetId(TEST_RULESET_ID) .dossierTemplateId(TEST_DOSSIER_TEMPLATE_ID)
.projectId(TEST_PROJECT_ID) .dossierId(TEST_DOSSIER_ID)
.fileId(TEST_FILE_ID) .fileId(TEST_FILE_ID)
.lastProcessed(OffsetDateTime.now()) .lastProcessed(OffsetDateTime.now())
.build(); .build();
var bytes = IOUtils.toByteArray(stream); var bytes = IOUtils.toByteArray(stream);
storageService.storeObject(RedactionStorageService.StorageIdUtils.getStorageId(TEST_PROJECT_ID, TEST_FILE_ID, FileType.ORIGIN), bytes); storageService.storeObject(RedactionStorageService.StorageIdUtils.getStorageId(TEST_DOSSIER_ID, TEST_FILE_ID, FileType.ORIGIN), bytes);
return request; return request;
@ -941,13 +1029,13 @@ public class RedactionIntegrationTest {
long start = System.currentTimeMillis(); long start = System.currentTimeMillis();
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/sponsor_companies.pdf"); ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/sponsor_companies.pdf");
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream()); AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
AnalyzeResult result = reanalyzeService.analyze(request); analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
AnalyzeResult result = analyzeService.analyze(request);
AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder() AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder()
.projectId(TEST_PROJECT_ID) .dossierId(TEST_DOSSIER_ID)
.fileId(TEST_FILE_ID) .fileId(TEST_FILE_ID)
.build()); .build());

View File

@ -0,0 +1,18 @@
package com.iqser.red.service.redaction.v1.server.redaction.rulebuilder;
import com.iqser.red.service.redaction.v1.model.RuleBuilderModel;
import org.junit.Test;
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
/**
 * Smoke test for {@link RuleBuilderModelService}: the model it returns must expose
 * more than one "when" clause and more than one "then" condition.
 */
public class RuleBuilderModelServiceTest {

    @Test
    public void testRuleBuilderModelProvider() {
        RuleBuilderModelService service = new RuleBuilderModelService();
        RuleBuilderModel ruleBuilderModel = service.getRuleBuilderModel();

        int whenClauseCount = ruleBuilderModel.getWhenClauses().size();
        assertThat(whenClauseCount).isGreaterThan(1);

        int thenConditionCount = ruleBuilderModel.getThenConditions().size();
        assertThat(thenConditionCount).isGreaterThan(1);
    }
}

View File

@ -1,511 +0,0 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import com.amazonaws.services.s3.AmazonS3;
import com.iqser.red.service.configuration.v1.api.model.*;
import com.iqser.red.service.configuration.v1.api.resource.DictionaryResource;
import com.iqser.red.service.redaction.v1.server.Application;
import com.iqser.red.service.redaction.v1.server.FileSystemBackedStorageService;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
import com.iqser.red.service.redaction.v1.server.client.LegalBasisClient;
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader;
import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService;
import com.iqser.red.storage.commons.service.StorageService;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.kie.api.KieServices;
import org.kie.api.builder.KieBuilder;
import org.kie.api.builder.KieFileSystem;
import org.kie.api.builder.KieModule;
import org.kie.api.runtime.KieContainer;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.Import;
import org.springframework.context.annotation.Primary;
import org.springframework.core.io.ClassPathResource;
import org.springframework.test.context.junit4.SpringRunner;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.concurrent.atomic.AtomicLong;
import static org.assertj.core.api.Assertions.assertThat;
import static org.mockito.Mockito.when;
@RunWith(SpringRunner.class)
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
@Import(EntityRedactionServiceTest.RedactionIntegrationTestConfiguration.class)
public class EntityRedactionServiceTest {
private static final String DEFAULT_RULES = loadFromClassPath("drools/rules.drl");
private static final String AUTHOR_CODE = "author";
private static final String ADDRESS_CODE = "address";
private static final String SPONSOR_CODE = "sponsor";
private static final AtomicLong DICTIONARY_VERSION = new AtomicLong();
private static final AtomicLong RULES_VERSION = new AtomicLong();
@MockBean
private DictionaryClient dictionaryClient;
@MockBean
private RulesClient rulesClient;
@Autowired
private EntityRedactionService entityRedactionService;
@Autowired
private PdfSegmentationService pdfSegmentationService;
@Autowired
private DroolsExecutionService droolsExecutionService;
@MockBean
private AmazonS3 amazonS3;
@MockBean
private LegalBasisClient legalBasisClient;
private final static String TEST_RULESET_ID = "123";
/**
 * Spring configuration used by this test: it builds a {@link KieContainer} from the
 * default rules text and registers a primary {@link StorageService} implemented by
 * {@link FileSystemBackedStorageService}. Rabbit auto-configuration is excluded.
 */
@Configuration
@EnableAutoConfiguration(exclude = {RabbitAutoConfiguration.class})
public static class RedactionIntegrationTestConfiguration {

    @Bean
    public KieContainer kieContainer() {
        KieServices services = KieServices.Factory.get();

        // Write the default rules into a fresh Kie file system under a fixed path.
        KieFileSystem fileSystem = services.newKieFileSystem();
        InputStream rulesStream = new ByteArrayInputStream(DEFAULT_RULES.getBytes(StandardCharsets.UTF_8));
        fileSystem.write("src/test/resources/drools/rules.drl", services.getResources().newInputStreamResource(rulesStream));

        // Build everything and create the container for the resulting module's release id.
        KieBuilder builder = services.newKieBuilder(fileSystem);
        builder.buildAll();
        KieModule builtModule = builder.getKieModule();
        return services.newKieContainer(builtModule.getReleaseId());
    }

    @Bean
    @Primary
    public StorageService inmemoryStorage() {
        return new FileSystemBackedStorageService();
    }
}
/**
 * Puts two entities into a set -- "nested" (position arguments 10, 16) and
 * "nesting nested" (position arguments 2, 16) -- and expects
 * {@code removeEntitiesContainedInLarger} to leave only the latter.
 */
@Test
public void testNestedEntitiesRemoval() {
    Entity inner = new Entity("nested", "fake type", 10, 16, "fake headline", 0, false, false);
    Entity outer = new Entity("nesting nested", "fake type", 2, 16, "fake headline", 0, false, false);
    Set<Entity> candidates = new HashSet<>(Arrays.asList(inner, outer));

    EntitySearchUtils.removeEntitiesContainedInLarger(candidates);

    assertThat(candidates).hasSize(1);
    assertThat(candidates).contains(outer);
}
/**
 * Processes "Single Table.pdf" with stubbed author and address dictionaries (the
 * sponsor dictionary is empty) and expects seven entities on the single page.
 */
@Test
public void testTableRedaction() throws IOException {
    ClassPathResource singleTablePdf = new ClassPathResource("files/Minimal Examples/Single Table.pdf");

    // Each stubbing hands out a fresh dictionary version number.
    when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet());

    List<String> authorNames = Arrays.asList("Casey, H.W.", "OLoughlin, C.K.", "Salamon, C.M.", "Smith, S.H.");
    DictionaryResponse authors = DictionaryResponse.builder().entries(toDictionaryEntry(authorNames)).build();
    when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(authors);

    List<String> addressLines = Collections.singletonList("Toxigenics, Inc., Decatur, IL 62526, USA");
    DictionaryResponse addresses = DictionaryResponse.builder().entries(toDictionaryEntry(addressLines)).build();
    when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addresses);

    DictionaryResponse sponsors = DictionaryResponse.builder().entries(Collections.emptyList()).build();
    when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsors);

    Document document = pdfSegmentationService.parseDocument(singleTablePdf.getInputStream());
    entityRedactionService.processDocument(document, TEST_RULESET_ID, null, "dossierId");

    assertThat(document.getEntities()).hasSize(1); // one page
    assertThat(document.getEntities().get(1)).hasSize(7); // 3 author cells, 1 address, 1 Y and 2 N entities
}
/**
 * Processes "nested_redaction.pdf" with stubbed author and address dictionaries (the
 * sponsor dictionary is empty) and expects seven entities on the single page.
 */
@Test
public void testNestedRedaction() throws IOException {
    ClassPathResource nestedRedactionPdf = new ClassPathResource("files/Minimal Examples/nested_redaction.pdf");

    // Each stubbing hands out a fresh dictionary version number.
    when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet());

    List<String> authorNames = Arrays.asList("Casey, H.W.", "OLoughlin, C.K.", "Salamon, C.M.", "Smith, S.H.");
    DictionaryResponse authors = DictionaryResponse.builder().entries(toDictionaryEntry(authorNames)).build();
    when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(authors);

    List<String> addressLines = Collections.singletonList("Toxigenics, Inc., Decatur, IL 62526, USA");
    DictionaryResponse addresses = DictionaryResponse.builder().entries(toDictionaryEntry(addressLines)).build();
    when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addresses);

    DictionaryResponse sponsors = DictionaryResponse.builder().entries(Collections.emptyList()).build();
    when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsors);

    Document document = pdfSegmentationService.parseDocument(nestedRedactionPdf.getInputStream());
    entityRedactionService.processDocument(document, TEST_RULESET_ID, null, "dossierId");

    assertThat(document.getEntities()).hasSize(1); // one page
    assertThat(document.getEntities().get(1)).hasSize(7); // 3 author cells, 1 address, 1 Y and 2 N entities
}
/**
 * Processes two documents with the author and address dictionaries loaded from the
 * CBI text resources (sponsor dictionary empty) and expects that no entity on any
 * page was matched by rule 9.
 */
@Test
public void testTrueNegativesInTable() throws IOException {
    when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet());

    List<String> authorNames = new ArrayList<>(ResourceLoader.load("dictionaries/CBI_author.txt"));
    DictionaryResponse authors = DictionaryResponse.builder().entries(toDictionaryEntry(authorNames)).build();
    when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(authors);

    List<String> addressLines = new ArrayList<>(ResourceLoader.load("dictionaries/CBI_address.txt"));
    DictionaryResponse addresses = DictionaryResponse.builder().entries(toDictionaryEntry(addressLines)).build();
    when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addresses);

    DictionaryResponse sponsors = DictionaryResponse.builder().entries(Collections.emptyList()).build();
    when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsors);

    // First document: Cyprodinil reference list.
    ClassPathResource referenceListPdf = new ClassPathResource("files/Cyprodinil/40 Cyprodinil - EU AIR3 - LCA Section 1" +
            " Supplement - Identity of the active substance - Reference list.pdf");
    Document referenceListDoc = pdfSegmentationService.parseDocument(referenceListPdf.getInputStream());
    entityRedactionService.processDocument(referenceListDoc, TEST_RULESET_ID, null, "dossierId");
    boolean referenceListFreeOfRule9 = referenceListDoc.getEntities()
            .values()
            .stream()
            .noneMatch(pageEntities -> pageEntities.stream().anyMatch(entity -> entity.getMatchedRule() == 9));
    assertThat(referenceListFreeOfRule9).isTrue();

    // Second document: plant protection product identity.
    ClassPathResource productIdentityPdf = new ClassPathResource("files/Compounds/27 A8637C - EU AIR3 - MCP Section 1 - Identity of " +
            "the plant protection product.pdf");
    Document productIdentityDoc = pdfSegmentationService.parseDocument(productIdentityPdf.getInputStream());
    entityRedactionService.processDocument(productIdentityDoc, TEST_RULESET_ID, null, "dossierId");
    boolean productIdentityFreeOfRule9 = productIdentityDoc.getEntities()
            .values()
            .stream()
            .noneMatch(pageEntities -> pageEntities.stream().anyMatch(entity -> entity.getMatchedRule() == 9));
    assertThat(productIdentityFreeOfRule9).isTrue();
}
/**
 * Processes "Row With Ambiguous Redaction.pdf" with all three dictionaries loaded from
 * the CBI text resources and expects exactly ten entities matched by rule 9 on the
 * single page.
 */
@Test
public void testFalsePositiveInWrongCell() throws IOException {
    ClassPathResource ambiguousRowPdf = new ClassPathResource("files/Minimal Examples/Row With Ambiguous Redaction.pdf");

    when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet());

    List<String> authorNames = new ArrayList<>(ResourceLoader.load("dictionaries/CBI_author.txt"));
    DictionaryResponse authors = DictionaryResponse.builder().entries(toDictionaryEntry(authorNames)).build();
    when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(authors);

    List<String> addressLines = new ArrayList<>(ResourceLoader.load("dictionaries/CBI_address.txt"));
    DictionaryResponse addresses = DictionaryResponse.builder().entries(toDictionaryEntry(addressLines)).build();
    when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addresses);

    List<String> sponsorNames = new ArrayList<>(ResourceLoader.load("dictionaries/CBI_sponsor.txt"));
    DictionaryResponse sponsors = DictionaryResponse.builder().entries(toDictionaryEntry(sponsorNames)).build();
    when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsors);

    Document document = pdfSegmentationService.parseDocument(ambiguousRowPdf.getInputStream());
    entityRedactionService.processDocument(document, TEST_RULESET_ID, null, "dossierId");

    assertThat(document.getEntities()).hasSize(1); // one page
    long rule9Matches = document.getEntities().get(1).stream()
            .filter(entity -> entity.getMatchedRule() == 9)
            .count();
    assertThat(rule9Matches).isEqualTo(10);
}
/**
 * Stubs the rules client with a single rule ("6: Redact contact information if applicant
 * is found"), updates the Drools rules for the test rule set, processes
 * "Applicant Producer Table.pdf" and expects 13 entities matched by rule 6 on the
 * single page.
 */
@Test
public void testApplicantInTableRedaction() throws IOException {
// DRL text of the rule under test; handed to the rules client stub below.
String tableRules = "package drools\n" +
"\n" +
"import com.iqser.red.service.redaction.v1.server.redaction.model.Section\n" +
"\n" +
"global Section section\n" +
"rule \"6: Redact contact information if applicant is found\"\n" +
" when\n" +
" eval(section.headlineContainsWord(\"applicant\") || section.getText().contains(\"Applicant\"));\n" +
" then\n" +
" section.redactLineAfter(\"Name:\", \"address\", 6,true, \"Applicant information was found\", \"Reg" +
" (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactBetween(\"Address:\", \"Contact\", \"address\", 6,true, \"Applicant information was found\", \"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactLineAfter(\"Contact point:\", \"address\", 6,true, \"Applicant information was found\", \"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactLineAfter(\"Phone:\", \"address\", 6,true, \"Applicant information was found\", " +
"\"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactLineAfter(\"Fax:\", \"address\", 6,true, \"Applicant information was found\", \"Reg " +
"(EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactLineAfter(\"Tel.:\", \"address\", 6,true, \"Applicant information was found\", \"Reg" +
" (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactLineAfter(\"Tel:\", \"address\", 6,true, \"Applicant information was found\", \"Reg " +
"(EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactLineAfter(\"E-mail:\", \"address\", 6,true, \"Applicant information was found\", " +
"\"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactLineAfter(\"Email:\", \"address\", 6,true, \"Applicant information was found\", " +
"\"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactLineAfter(\"Contact:\", \"address\", 6,true, \"Applicant information was found\", " +
"\"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactLineAfter(\"Telephone number:\", \"address\", 6,true, \"Applicant information was found\", \"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactLineAfter(\"Fax number:\", \"address\", 6,true, \"Applicant information was found\"," +
" \"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactLineAfter(\"Telephone:\", \"address\", 6,true, \"Applicant information was found\", " +
"\"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactBetween(\"No:\", \"Fax\", \"address\", 6,true, \"Applicant information was found\", " +
"\"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactBetween(\"Contact:\", \"Tel.:\", \"address\", 6,true, \"Applicant information was found\", \"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" end";
// Stub a new rules version and the rule text, then ask the Drools service to update
// the rules for the test rule set. The stubs must be in place before updateRules runs.
when(rulesClient.getVersion(TEST_RULESET_ID)).thenReturn(RULES_VERSION.incrementAndGet());
when(rulesClient.getRules(TEST_RULESET_ID)).thenReturn(new RulesResponse(tableRules));
droolsExecutionService.updateRules(TEST_RULESET_ID);
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Applicant Producer Table.pdf");
// Author and address dictionaries are loaded from the CBI text resources; the sponsor
// dictionary is stubbed as empty.
when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_author.txt"))))
.build();
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(dictionaryResponse);
DictionaryResponse addressResponse = DictionaryResponse.builder()
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_address.txt"))))
.build();
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressResponse);
DictionaryResponse sponsorResponse = DictionaryResponse.builder()
.entries(Collections.emptyList())
.build();
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsorResponse);
// Parse the PDF and run entity redaction on the parsed document.
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null, "dossierId");
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
// Count only the entities on page 1 whose matched rule number is 6.
assertThat(classifiedDoc.getEntities().get(1).stream()
.filter(entity -> entity.getMatchedRule() == 6)
.count()).isEqualTo(13);
}
/**
 * Stubs the rules client with a single rule ("11: Redact sponsor company"), updates the
 * Drools rules for the test rule set, processes "batches_new_line.pdf" and expects
 * exactly one entity matched by rule 11 on the single page.
 */
@Test
public void testSponsorInCell() throws IOException {
// DRL text of the rule under test; handed to the rules client stub below.
String tableRules = "package drools\n" +
"\n" +
"import com.iqser.red.service.redaction.v1.server.redaction.model.Section\n" +
"\n" +
"global Section section\n" + "rule \"11: Redact sponsor company\"\n" + " when\n" + " " +
"Section(searchText.toLowerCase().contains(\"batches produced at\"))\n" + " then\n" + " section" +
".redactIfPrecededBy(\"batches produced at\", \"sponsor\", 11, \"Redacted because it represents a " +
"sponsor company\", \"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" + " end";
// Stub a new rules version and the rule text, then ask the Drools service to update
// the rules for the test rule set. The stubs must be in place before updateRules runs.
when(rulesClient.getVersion(TEST_RULESET_ID)).thenReturn(RULES_VERSION.incrementAndGet());
when(rulesClient.getRules(TEST_RULESET_ID)).thenReturn(new RulesResponse(tableRules));
droolsExecutionService.updateRules(TEST_RULESET_ID);
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/batches_new_line.pdf");
// Address and author dictionaries are stubbed as empty; only the sponsor dictionary
// is loaded from the CBI text resource.
when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
DictionaryResponse addressResponse = DictionaryResponse.builder()
.entries(Collections.emptyList())
.build();
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressResponse);
DictionaryResponse authorResponse = DictionaryResponse.builder()
.entries(Collections.emptyList())
.build();
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(authorResponse);
DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_sponsor.txt"))))
.build();
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(dictionaryResponse);
// Parse the PDF and run entity redaction on the parsed document.
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null, "dossierId");
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
// Count only the entities on page 1 whose matched rule number is 11.
assertThat(classifiedDoc.getEntities().get(1).stream()
.filter(entity -> entity.getMatchedRule() == 11)
.count()).isEqualTo(1);
}
/**
 * Processes "Header Propagation.pdf" and then "Header Propagation2.pdf", each with its
 * own stubbed author dictionary and a single-entry address dictionary, and checks the
 * number of entities matched by rules 8 and 9 per page.
 */
@Test
public void headerPropagation() throws IOException {
    // ---- first document: two pages ----
    ClassPathResource firstPdf = new ClassPathResource("files/Minimal Examples/Header Propagation.pdf");

    when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet());

    List<String> firstAuthorNames = Arrays.asList("Bissig R.", "Thanei P.");
    DictionaryResponse firstAuthors = DictionaryResponse.builder().entries(toDictionaryEntry(firstAuthorNames)).build();
    when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(firstAuthors);

    List<String> firstAddressLines = Collections.singletonList("Novartis Crop Protection AG, Basel, Switzerland");
    DictionaryResponse firstAddresses = DictionaryResponse.builder().entries(toDictionaryEntry(firstAddressLines)).build();
    when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(firstAddresses);

    DictionaryResponse sponsors = DictionaryResponse.builder().entries(Collections.emptyList()).build();
    when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsors);

    Document firstDoc = pdfSegmentationService.parseDocument(firstPdf.getInputStream());
    entityRedactionService.processDocument(firstDoc, TEST_RULESET_ID, null, "dossierId");

    assertThat(firstDoc.getEntities()).hasSize(2); // two pages
    long firstDocPage1Rule9 = firstDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 9).count();
    assertThat(firstDocPage1Rule9).isEqualTo(8);
    long firstDocPage2Rule9 = firstDoc.getEntities().get(2).stream().filter(entity -> entity.getMatchedRule() == 9).count();
    assertThat(firstDocPage2Rule9).isEqualTo(5); // 2 names, 1 address, 2 Y

    // ---- second document: one page, new author dictionary and version ----
    ClassPathResource secondPdf = new ClassPathResource("files/Minimal Examples/Header Propagation2.pdf");

    List<String> secondAuthorNames = Arrays.asList("Tribolet, R.", "Muir, G.", "Kühne-Thu, H.", "Close, C.");
    DictionaryResponse secondAuthors = DictionaryResponse.builder().entries(toDictionaryEntry(secondAuthorNames)).build();
    when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
    when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(secondAuthors);

    List<String> secondAddressLines = Collections.singletonList("Novartis Crop Protection AG, Basel, Switzerland");
    DictionaryResponse secondAddresses = DictionaryResponse.builder().entries(toDictionaryEntry(secondAddressLines)).build();
    when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(secondAddresses);

    Document secondDoc = pdfSegmentationService.parseDocument(secondPdf.getInputStream());
    entityRedactionService.processDocument(secondDoc, TEST_RULESET_ID, null, "dossierId");

    assertThat(secondDoc.getEntities()).hasSize(1); // one page
    long secondDocRule9 = secondDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 9).count();
    assertThat(secondDocRule9).isEqualTo(3);
    long secondDocRule8 = secondDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 8).count();
    assertThat(secondDocRule8).isEqualTo(9);
}
/**
 * Currently ignored. Processes "Empty Tabular Data.pdf" with one author entry and one
 * address entry stubbed and expects six entities matched by rule 8 on the single page.
 */
@Test
@Ignore
public void testNGuideline() throws IOException {
    ClassPathResource emptyTabularPdf = new ClassPathResource("files/Minimal Examples/Empty Tabular Data.pdf");

    when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet());

    List<String> authorNames = Collections.singletonList("Aldershof S.");
    DictionaryResponse authors = DictionaryResponse.builder().entries(toDictionaryEntry(authorNames)).build();
    when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(authors);

    List<String> addressLines = Collections.singletonList("Novartis Crop Protection AG, Basel, Switzerland");
    DictionaryResponse addresses = DictionaryResponse.builder().entries(toDictionaryEntry(addressLines)).build();
    when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addresses);

    DictionaryResponse sponsors = DictionaryResponse.builder().entries(Collections.emptyList()).build();
    when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsors);

    Document document = pdfSegmentationService.parseDocument(emptyTabularPdf.getInputStream());
    entityRedactionService.processDocument(document, TEST_RULESET_ID, null, "dossierId");

    assertThat(document.getEntities()).hasSize(1); // one page
    long rule8Matches = document.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 8).count();
    assertThat(rule8Matches).isEqualTo(6);
}
/**
 * Runs before every test and installs the default stubs: a rules response containing
 * rules 8 and 9, the three dictionary types (author, address, sponsor) with their
 * colors, a dictionary response built without entries for each of those types, and
 * the color configuration. Individual tests stub these calls again where they need
 * specific data.
 */
@Before
public void stubRedaction() {
// DRL text with rule 8 (vertebrate study = N/No) and rule 9 (vertebrate study = Y/Yes).
String tableRules = "package drools\n" +
"\n" +
"import com.iqser.red.service.redaction.v1.server.redaction.model.Section\n" +
"\n" +
"global Section section\n" +
"rule \"8: Not redacted because Vertebrate Study = N\"\n" +
" when\n" +
" Section(rowEquals(\"Vertebrate study Y/N\", \"N\") || rowEquals(\"Vertebrate study Y/N\", \"No\"))\n" +
" then\n" +
" section.redactNotCell(\"Author(s)\", 8, \"name\", false, \"Not redacted because row is not a vertebrate study\");\n" +
" section.redactNot(\"address\", 8, \"Not redacted because row is not a vertebrate study\");\n" +
" section.highlightCell(\"Vertebrate study Y/N\", 8, \"hint_only\");\n" +
" end\n" +
"rule \"9: Redact Authors and Addresses in Reference Table, if it is a Vertebrate study\"\n" +
" when\n" +
" Section(rowEquals(\"Vertebrate study Y/N\", \"Y\") || rowEquals(\"Vertebrate study Y/N\", " +
"\"Yes\"))\n" +
" then\n" +
" section.redactCell(\"Author(s)\", 9, \"name\", false, \"Redacted because row is a vertebrate study\", \"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redact(\"address\", 9, \"Redacted because row is a vertebrate study\", \"Reg (EC) No" +
" 1107/2009 Art. 63 (2g)\");\n" +
" section.highlightCell(\"Vertebrate study Y/N\", 9, \"must_redact\");\n" +
" end";
// Each run hands out a fresh rules version together with the rule text above.
when(rulesClient.getVersion(TEST_RULESET_ID)).thenReturn(RULES_VERSION.incrementAndGet());
when(rulesClient.getRules(TEST_RULESET_ID)).thenReturn(new RulesResponse(tableRules));
// The three dictionary types known to the test rule set, each with its own hex color.
TypeResponse typeResponse = TypeResponse.builder()
.types(Arrays.asList(
TypeResult.builder().ruleSetId(TEST_RULESET_ID).type(AUTHOR_CODE).hexColor("#ffff00").build(),
TypeResult.builder().ruleSetId(TEST_RULESET_ID).type(ADDRESS_CODE).hexColor("#ff00ff").build(),
TypeResult.builder().ruleSetId(TEST_RULESET_ID).type(SPONSOR_CODE).hexColor("#00ffff").build()))
.build();
when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
when(dictionaryClient.getAllTypes(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(typeResponse);
// Default empty return to prevent NPEs
DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
.build();
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(dictionaryResponse);
DictionaryResponse addressResponse = DictionaryResponse.builder()
.build();
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressResponse);
DictionaryResponse sponsorResponse = DictionaryResponse.builder()
.build();
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsorResponse);
// Color configuration returned for the test rule set.
Colors colors = new Colors();
colors.setDefaultColor("#acfc00");
colors.setNotRedacted("#cccccc");
colors.setRequestAdd("#04b093");
colors.setRequestRemove("#04b093");
when(dictionaryClient.getColors(TEST_RULESET_ID)).thenReturn(colors);
}
/**
 * Reads a classpath resource into a string, decoding it as UTF-8.
 * The resource is read line by line and every line is terminated with "\n".
 *
 * @param path location of the resource, resolved through the class loader of {@code ResourceLoader}
 * @return the resource content
 * @throws IllegalArgumentException if the resource cannot be found or cannot be read
 */
private static String loadFromClassPath(String path) {
    URL resource = ResourceLoader.class.getClassLoader().getResource(path);
    if (resource == null) {
        // Report the path that was actually requested instead of a hard-coded file name.
        throw new IllegalArgumentException("could not load classpath resource: " + path);
    }
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(resource.openStream(), StandardCharsets.UTF_8))) {
        StringBuilder content = new StringBuilder();
        String line;
        while ((line = reader.readLine()) != null) {
            content.append(line).append("\n");
        }
        return content.toString();
    } catch (IOException e) {
        throw new IllegalArgumentException("could not load classpath resource: " + path, e);
    }
}
/**
 * Wraps every given string in a {@link DictionaryEntry} created with the fixed
 * arguments {@code 1L} and {@code false}, keeping the order of the input list.
 *
 * @param entries the raw dictionary values
 * @return a new mutable list with one entry per input value
 */
private List<DictionaryEntry> toDictionaryEntry(List<String> entries) {
    List<DictionaryEntry> converted = new ArrayList<>();
    for (String value : entries) {
        converted.add(new DictionaryEntry(value, 1L, false));
    }
    return converted;
}
}

View File

@ -0,0 +1,30 @@
package com.iqser.red.service.redaction.v1.server.redaction.utils;
import static org.assertj.core.api.Assertions.assertThat;
import java.util.HashSet;
import java.util.Set;
import org.junit.Test;
import com.iqser.red.service.redaction.v1.model.Engine;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
/**
 * Unit test for {@link EntitySearchUtils#removeEntitiesContainedInLarger}.
 */
public class EntitySearchUtilsTest {

    /**
     * Puts two entities into a set -- "nested" (position arguments 10, 16) and
     * "nesting nested" (position arguments 2, 16) -- and expects only the latter
     * to remain after the call.
     */
    @Test
    public void testNestedEntitiesRemoval() {
        Entity inner = new Entity("nested", "fake type", 10, 16, "fake headline", 0, false, false, Engine.RULE);
        Entity outer = new Entity("nesting nested", "fake type", 2, 16, "fake headline", 0, false, false, Engine.RULE);

        Set<Entity> candidates = new HashSet<>();
        candidates.add(inner);
        candidates.add(outer);

        EntitySearchUtils.removeEntitiesContainedInLarger(candidates);

        assertThat(candidates).hasSize(1).contains(outer);
    }
}

View File

@ -1,6 +1,6 @@
configuration-service.url: "http://configuration-service-v1:8080"
image-service.url: "http://image-service-v1:8080" image-service.url: "http://image-service-v1:8080"
file-management-service.url: "http://file-management-service-v1:8080" persistence-service.url: "http://persistence-service-v1:8080"
entity-recognition-service.url: "localhost:8080"
ribbon: ribbon:
ConnectTimeout: 600000 ConnectTimeout: 600000
@ -17,3 +17,4 @@ platform.multi-tenancy:
redaction-service: redaction-service:
enable-image-classification: false enable-image-classification: false
enable-entity-recognition: false

View File

@ -1652,3 +1652,5 @@ Zoecon Corp.
Zoecon Corp., Palo Alto, USA Zoecon Corp., Palo Alto, USA
Zyma SA Zyma SA
Zyma SA, Nyon, Switzerland Zyma SA, Nyon, Switzerland
Mambo-Tox Ltd. Biomedical Sciences Building Bassett Crescent East Southampton SO16 7PX UK
Syngenta Environmental Sciences Jealotts Hill International Research Centre Bracknell, Berkshire RG42 6EY UK

View File

@ -1676,7 +1676,6 @@ da Silva Rejane
Das R Das R
Das, R. Das, R.
Daughtry, CST Daughtry, CST
David Chubb
David Chubb|Lorraine Britton David Chubb|Lorraine Britton
David Clarke David Clarke
Davies Davies

View File

@ -236,3 +236,4 @@ No details reported
Not available Not available
Test facility Test facility
TBD TBD
David Chubb

View File

@ -56,8 +56,8 @@ rule "5: Do not redact Names and Addresses if no redaction Indicator is containe
when when
Section(matchesType("vertebrate"), matchesType("published_information")) Section(matchesType("vertebrate"), matchesType("published_information"))
then then
section.redactNot("CBI_author", 5, "Vertebrate and Published Information found"); section.redactNotAndReference("CBI_author","published_information", 5, "Vertebrate and Published Information found");
section.redactNot("CBI_address", 5, "Vertebrate and Published Information found"); section.redactNotAndReference("CBI_address","published_information", 5, "Vertebrate and Published Information found");
end end
@ -268,7 +268,7 @@ rule "18: Redact contact information if Producer is found"
rule "19: Redact AUTHOR(S)" rule "19: Redact AUTHOR(S)"
when when
Section(searchText.contains("AUTHOR(S):")) Section(searchText.contains("AUTHOR(S):") && fileAttributeByPlaceholderEquals("{fileattributes.vertebrateStudy}", "true"))
then then
section.redactLinesBetween("AUTHOR(S):", "COMPLETION DATE:", "PII", 19, true, "AUTHOR(S) was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); section.redactLinesBetween("AUTHOR(S):", "COMPLETION DATE:", "PII", 19, true, "AUTHOR(S) was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
end end