diff --git a/bamboo-specs/src/main/java/buildjob/PlanSpec.java b/bamboo-specs/src/main/java/buildjob/PlanSpec.java index d6d0ea67..e2c7fe08 100644 --- a/bamboo-specs/src/main/java/buildjob/PlanSpec.java +++ b/bamboo-specs/src/main/java/buildjob/PlanSpec.java @@ -1,7 +1,5 @@ package buildjob; -import static com.atlassian.bamboo.specs.builders.task.TestParserTask.createJUnitParserTask; - import com.atlassian.bamboo.specs.api.BambooSpec; import com.atlassian.bamboo.specs.api.builders.BambooKey; import com.atlassian.bamboo.specs.api.builders.docker.DockerConfiguration; @@ -24,6 +22,8 @@ import com.atlassian.bamboo.specs.builders.trigger.BitbucketServerTrigger; import com.atlassian.bamboo.specs.model.task.InjectVariablesScope; import com.atlassian.bamboo.specs.util.BambooServer; +import static com.atlassian.bamboo.specs.builders.task.TestParserTask.createJUnitParserTask; + /** * Plan configuration for Bamboo. * Learn more on: https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs @@ -33,6 +33,8 @@ public class PlanSpec { private static final String SERVICE_NAME = "redaction-service"; + private static final String JVM_ARGS =" -Xmx4g -XX:+ExitOnOutOfMemoryError -XX:SurvivorRatio=2 -XX:NewRatio=1 -XX:InitialTenuringThreshold=16 -XX:MaxTenuringThreshold=16 -XX:InitiatingHeapOccupancyPercent=35 "; + private static final String SERVICE_KEY = SERVICE_NAME.toUpperCase().replaceAll("-", ""); /** @@ -82,9 +84,12 @@ public class PlanSpec { .checkoutItems(new CheckoutItem().defaultRepository()), new ScriptTask() .description("Build") + .environmentVariables("MAVEN_OPTS="+JVM_ARGS) .inlineBody("#!/bin/bash\n" + "set -e\n" + + "export MAVEN_OPTS=\"$MAVEN_OPTS "+JVM_ARGS +"\"\n" + + "if [[ \"${bamboo.version_tag}\" != \"dev\" ]]; then ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn --no-transfer-progress -f ${bamboo_build_working_directory}/" + SERVICE_NAME + "-v1/pom.xml versions:set -DnewVersion=${bamboo.version_tag}; fi\n" + "if [[ \"${bamboo.version_tag}\" != 
\"dev\" ]]; then ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn --no-transfer-progress -f ${bamboo_build_working_directory}/" + SERVICE_NAME + "-image-v1/pom.xml versions:set -DnewVersion=${bamboo.version_tag}; fi\n" + diff --git a/redaction-service-v1/pom.xml b/redaction-service-v1/pom.xml index 0c8ed5ac..ee796600 100644 --- a/redaction-service-v1/pom.xml +++ b/redaction-service-v1/pom.xml @@ -5,7 +5,7 @@ platform-dependency com.iqser.red - 1.0.8 + 1.1.2 4.0.0 @@ -32,7 +32,7 @@ com.iqser.red platform-commons-dependency - 1.2.5 + 1.3.1 import pom @@ -52,4 +52,4 @@ - \ No newline at end of file + diff --git a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/AnalyzeRequest.java b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/AnalyzeRequest.java index 469947e6..113bdd43 100644 --- a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/AnalyzeRequest.java +++ b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/AnalyzeRequest.java @@ -5,13 +5,20 @@ import lombok.Builder; import lombok.Data; import lombok.NoArgsConstructor; +import java.time.OffsetDateTime; + @Data @Builder @NoArgsConstructor @AllArgsConstructor public class AnalyzeRequest { - private byte[] document; + private String projectId; + private String fileId; private String ruleSetId; + private boolean reanalyseOnlyIfPossible; private ManualRedactions manualRedactions; + private OffsetDateTime lastProcessed; + } + diff --git a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/AnalyzeResult.java b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/AnalyzeResult.java index 56bf4b6a..b30db8cb 100644 --- 
a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/AnalyzeResult.java +++ b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/AnalyzeResult.java @@ -11,9 +11,19 @@ import lombok.NoArgsConstructor; @AllArgsConstructor public class AnalyzeResult { + private String projectId; + private String fileId; + private long duration; private int numberOfPages; - private RedactionLog redactionLog; - private SectionGrid sectionGrid; - private Text text; + private boolean hasHints; + private boolean hasRequests; + private boolean hasRedactions; + private boolean hasImages; + private boolean hasUpdates; + private long dictionaryVersion; + private long dossierDictionaryVersion; + private long rulesVersion; + } + diff --git a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/AnnotateRequest.java b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/AnnotateRequest.java index ab9926e9..4f65d74e 100644 --- a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/AnnotateRequest.java +++ b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/AnnotateRequest.java @@ -11,7 +11,6 @@ import lombok.NoArgsConstructor; @AllArgsConstructor public class AnnotateRequest { - private byte[] document; - private RedactionLog redactionLog; - private SectionGrid sectionGrid; + private String projectId; + private String fileId; } diff --git a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/ChangeType.java b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/ChangeType.java new file mode 100644 index 00000000..0c902a8f --- /dev/null +++ 
b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/ChangeType.java @@ -0,0 +1,5 @@ +package com.iqser.red.service.redaction.v1.model; + +public enum ChangeType { + ADDED, REMOVED +} diff --git a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/Comment.java b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/Comment.java index f90d04d3..c45ae271 100644 --- a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/Comment.java +++ b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/Comment.java @@ -1,12 +1,12 @@ package com.iqser.red.service.redaction.v1.model; -import java.time.OffsetDateTime; - import lombok.AllArgsConstructor; import lombok.Builder; import lombok.Data; import lombok.NoArgsConstructor; +import java.time.OffsetDateTime; + @Data @Builder @AllArgsConstructor diff --git a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/ManualRedactionEntry.java b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/ManualRedactionEntry.java index e3303495..eb4fbecf 100644 --- a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/ManualRedactionEntry.java +++ b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/ManualRedactionEntry.java @@ -1,13 +1,13 @@ package com.iqser.red.service.redaction.v1.model; -import java.util.ArrayList; -import java.util.List; - import lombok.AllArgsConstructor; import lombok.Builder; import lombok.Data; import lombok.NoArgsConstructor; +import java.util.ArrayList; +import java.util.List; + @Data @Builder @AllArgsConstructor @@ -27,4 +27,6 @@ public class ManualRedactionEntry { private String 
section; private int sectionNumber; + private boolean addToDossierDictionary; + } diff --git a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/ManualRedactions.java b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/ManualRedactions.java index ee6e0ad1..af866d09 100644 --- a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/ManualRedactions.java +++ b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/ManualRedactions.java @@ -1,16 +1,16 @@ package com.iqser.red.service.redaction.v1.model; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; + import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.NoArgsConstructor; - @Data @Builder @AllArgsConstructor diff --git a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/SectionText.java b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/RedactionChangeLog.java similarity index 50% rename from redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/SectionText.java rename to redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/RedactionChangeLog.java index 3c8fa02a..1270b800 100644 --- a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/SectionText.java +++ b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/RedactionChangeLog.java @@ -1,25 +1,22 @@ package com.iqser.red.service.redaction.v1.model; +import 
lombok.AllArgsConstructor; +import lombok.Data; +import lombok.NoArgsConstructor; + import java.util.ArrayList; import java.util.List; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.NoArgsConstructor; - @Data -@Builder -@NoArgsConstructor @AllArgsConstructor -public class SectionText { +@NoArgsConstructor +public class RedactionChangeLog { - private int sectionNumber; - private String text; + private List redactionLogEntry = new ArrayList<>(); - private boolean isTable; - private String headline; + private long dictionaryVersion = -1; + private long rulesVersion = -1; - private List sectionAreas = new ArrayList<>(); + private String ruleSetId; } diff --git a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/RedactionChangeLogEntry.java b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/RedactionChangeLogEntry.java new file mode 100644 index 00000000..3dfbacce --- /dev/null +++ b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/RedactionChangeLogEntry.java @@ -0,0 +1,47 @@ +package com.iqser.red.service.redaction.v1.model; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; + +import java.util.ArrayList; +import java.util.List; + +@Data +@Builder +@NoArgsConstructor +@AllArgsConstructor +public class RedactionChangeLogEntry { + + private String id; + private String type; + private String value; + private String reason; + private int matchedRule; + private String legalBasis; + private boolean redacted; + private boolean isHint; + private boolean isRecommendation; + private String section; + private float[] color; + + @Builder.Default + private List positions = new ArrayList<>(); + private int sectionNumber; + private boolean manual; + private Status status; + private ManualRedactionType 
manualRedactionType; + private boolean isDictionaryEntry; + + private String textBefore; + private String textAfter; + + @Builder.Default + private List comments = new ArrayList<>(); + + private ChangeType changeType; + + private boolean isDossierDictionaryEntry; + +} diff --git a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/RedactionLog.java b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/RedactionLog.java index eb6203b8..b6c13b93 100644 --- a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/RedactionLog.java +++ b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/RedactionLog.java @@ -1,13 +1,11 @@ package com.iqser.red.service.redaction.v1.model; -import lombok.AllArgsConstructor; import lombok.Data; import lombok.NoArgsConstructor; import java.util.List; @Data -@AllArgsConstructor @NoArgsConstructor public class RedactionLog { @@ -17,15 +15,17 @@ public class RedactionLog { private long rulesVersion = -1; private String ruleSetId; - private String filename; + + private long dossierDictionaryVersion = -1; - public RedactionLog(List redactionLogEntry, long dictionaryVersion, long rulesVersion, String ruleSetId) { + public RedactionLog(List redactionLogEntry, long dictionaryVersion, long rulesVersion, String ruleSetId, long dossierDictionaryVersion) { this.redactionLogEntry = redactionLogEntry; this.dictionaryVersion = dictionaryVersion; this.rulesVersion = rulesVersion; this.ruleSetId = ruleSetId; + this.dossierDictionaryVersion = dossierDictionaryVersion; } diff --git a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/RedactionLogEntry.java b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/RedactionLogEntry.java index 2d5f5acb..6421c19e 100644 --- 
a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/RedactionLogEntry.java +++ b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/RedactionLogEntry.java @@ -1,13 +1,13 @@ package com.iqser.red.service.redaction.v1.model; -import java.util.ArrayList; -import java.util.List; - import lombok.AllArgsConstructor; import lombok.Builder; import lombok.Data; import lombok.NoArgsConstructor; +import java.util.ArrayList; +import java.util.List; + @Data @Builder @NoArgsConstructor @@ -45,4 +45,6 @@ public class RedactionLogEntry { private boolean isImage; + private boolean isDossierDictionaryEntry; + } diff --git a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/RedactionRequest.java b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/RedactionRequest.java index ab419184..fd525887 100644 --- a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/RedactionRequest.java +++ b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/RedactionRequest.java @@ -11,7 +11,8 @@ import lombok.NoArgsConstructor; @AllArgsConstructor public class RedactionRequest { - private byte[] document; + private String projectId; + private String fileId; private String ruleSetId; private ManualRedactions manualRedactions; } diff --git a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/RedactionResult.java b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/RedactionResult.java index 398f9fa5..80650eab 100644 --- a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/RedactionResult.java +++ 
b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/RedactionResult.java @@ -13,7 +13,5 @@ public class RedactionResult { private byte[] document; private int numberOfPages; - private RedactionLog redactionLog; - private SectionGrid sectionGrid; } diff --git a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/RenalyzeRequest.java b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/RenalyzeRequest.java deleted file mode 100644 index e11fee5d..00000000 --- a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/RenalyzeRequest.java +++ /dev/null @@ -1,22 +0,0 @@ -package com.iqser.red.service.redaction.v1.model; - -import java.time.OffsetDateTime; - -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.NoArgsConstructor; - -@Data -@Builder -@NoArgsConstructor -@AllArgsConstructor -public class RenalyzeRequest { - - private byte[] document; - private String ruleSetId; - private ManualRedactions manualRedactions; - private Text text; - private RedactionLog redactionLog; - private OffsetDateTime lastProcessed; -} diff --git a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/SectionArea.java b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/SectionArea.java index 3e02dce8..07e67c9f 100644 --- a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/SectionArea.java +++ b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/SectionArea.java @@ -27,7 +27,7 @@ public class SectionArea { private String header; public boolean contains(Rectangle other) { - return page == other.getPage() && this.topLeft.getX() <= other.getTopLeft().getX() && 
this.topLeft.getX() + this.getWidth() >= other.getTopLeft().getX() + other.getWidth() && this.getTopLeft().getY() <= other.getTopLeft().getY() && this.getTopLeft().getY() + this.getHeight() >= other.getTopLeft().getY() + other.getHeight(); + return page == other.getPage() && this.topLeft.getX() <= other.getTopLeft().getX() && this.topLeft.getX() + this.getWidth() >= other.getTopLeft().getX() + other.getWidth() && this.getTopLeft().getY() <= other.getTopLeft().getY() && this.getTopLeft().getY() + this.getHeight() >= other.getTopLeft().getY() + other.getHeight(); } } diff --git a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/SectionGrid.java b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/SectionGrid.java index ea5acb95..362b5c5c 100644 --- a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/SectionGrid.java +++ b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/SectionGrid.java @@ -1,13 +1,13 @@ package com.iqser.red.service.redaction.v1.model; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - import lombok.AllArgsConstructor; import lombok.Data; import lombok.NoArgsConstructor; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + @Data @AllArgsConstructor @NoArgsConstructor diff --git a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/SectionRectangle.java b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/SectionRectangle.java index b96e3572..38031b36 100644 --- a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/SectionRectangle.java +++ 
b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/SectionRectangle.java @@ -1,13 +1,13 @@ package com.iqser.red.service.redaction.v1.model; -import java.util.List; - import lombok.AllArgsConstructor; import lombok.Data; import lombok.NoArgsConstructor; import lombok.NonNull; import lombok.RequiredArgsConstructor; +import java.util.List; + @Data @AllArgsConstructor @NoArgsConstructor diff --git a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/resources/RedactionResource.java b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/resources/RedactionResource.java index b58dcd9c..de766ba5 100644 --- a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/resources/RedactionResource.java +++ b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/resources/RedactionResource.java @@ -1,14 +1,6 @@ package com.iqser.red.service.redaction.v1.resources; -import com.iqser.red.service.redaction.v1.model.AnalyzeRequest; -import com.iqser.red.service.redaction.v1.model.AnalyzeResult; -import com.iqser.red.service.redaction.v1.model.AnnotateRequest; -import com.iqser.red.service.redaction.v1.model.AnnotateResponse; -import com.iqser.red.service.redaction.v1.model.ReanalyzeResult; -import com.iqser.red.service.redaction.v1.model.RedactionRequest; -import com.iqser.red.service.redaction.v1.model.RedactionResult; -import com.iqser.red.service.redaction.v1.model.RenalyzeRequest; - +import com.iqser.red.service.redaction.v1.model.*; import org.springframework.http.MediaType; import org.springframework.web.bind.annotation.PathVariable; import org.springframework.web.bind.annotation.PostMapping; @@ -21,11 +13,6 @@ public interface RedactionResource { String RULE_SET_PARAMETER_NAME = "ruleSetId"; String RULE_SET_PATH_VARIABLE = "/{" + RULE_SET_PARAMETER_NAME + "}"; - 
@PostMapping(value = "/analyze", produces = MediaType.APPLICATION_JSON_VALUE, consumes = MediaType.APPLICATION_JSON_VALUE) - AnalyzeResult analyze(@RequestBody AnalyzeRequest analyzeRequest); - - @PostMapping(value = "/reanalyze", produces = MediaType.APPLICATION_JSON_VALUE, consumes = MediaType.APPLICATION_JSON_VALUE) - ReanalyzeResult reanalyze(@RequestBody RenalyzeRequest renalyzeRequest); @PostMapping(value = "/annotate", produces = MediaType.APPLICATION_JSON_VALUE, consumes = MediaType.APPLICATION_JSON_VALUE) AnnotateResponse annotate(@RequestBody AnnotateRequest annotateRequest); @@ -39,10 +26,10 @@ public interface RedactionResource { @PostMapping(value = "/debug/htmlTables", produces = MediaType.APPLICATION_JSON_VALUE, consumes = MediaType.APPLICATION_JSON_VALUE) RedactionResult htmlTables(@RequestBody RedactionRequest redactionRequest); - @PostMapping(value = "/rules/update"+RULE_SET_PATH_VARIABLE, consumes = MediaType.APPLICATION_JSON_VALUE) + @PostMapping(value = "/rules/update" + RULE_SET_PATH_VARIABLE, consumes = MediaType.APPLICATION_JSON_VALUE) void updateRules(@PathVariable(RULE_SET_PARAMETER_NAME) String ruleSetId); @PostMapping(value = "/rules/test", consumes = MediaType.APPLICATION_JSON_VALUE) void testRules(@RequestBody String rules); -} \ No newline at end of file +} diff --git a/redaction-service-v1/redaction-service-server-v1/pom.xml b/redaction-service-v1/redaction-service-server-v1/pom.xml index 31788d3a..1975a136 100644 --- a/redaction-service-v1/redaction-service-server-v1/pom.xml +++ b/redaction-service-v1/redaction-service-server-v1/pom.xml @@ -12,6 +12,10 @@ redaction-service-server-v1 + + com.iqser.red.commons + storage-commons + com.iqser.red.service redaction-service-api-v1 @@ -20,7 +24,18 @@ com.iqser.red.service configuration-service-api-v1 - 2.2.9 + 2.5.6 + + + com.iqser.red.service + file-management-service-api-v1 + 2.7.4 + + + com.iqser.red.service + redaction-service-api-v1 + + org.drools @@ -74,6 +89,12 @@ 
spring-cloud-starter-openfeign + + org.springframework.boot + spring-boot-starter-amqp + 2.3.1.RELEASE + + org.springframework.boot @@ -86,9 +107,9 @@ test - junit - junit - 4.12 + org.springframework.amqp + spring-rabbit-test + 2.3.1 test diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/Application.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/Application.java index f591b03f..58e2d815 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/Application.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/Application.java @@ -1,5 +1,8 @@ package com.iqser.red.service.redaction.v1.server; +import com.iqser.red.commons.spring.DefaultWebMvcConfiguration; +import com.iqser.red.service.redaction.v1.server.client.RulesClient; +import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings; import org.springframework.boot.SpringApplication; import org.springframework.boot.actuate.autoconfigure.security.servlet.ManagementWebSecurityAutoConfiguration; import org.springframework.boot.autoconfigure.SpringBootApplication; @@ -8,10 +11,6 @@ import org.springframework.boot.context.properties.EnableConfigurationProperties import org.springframework.cloud.openfeign.EnableFeignClients; import org.springframework.context.annotation.Import; -import com.iqser.red.commons.spring.DefaultWebMvcConfiguration; -import com.iqser.red.service.redaction.v1.server.client.RulesClient; -import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings; - @Import({DefaultWebMvcConfiguration.class}) @EnableFeignClients(basePackageClasses = RulesClient.class) @EnableConfigurationProperties(RedactionServiceSettings.class) @@ -19,8 +18,9 @@ import 
com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettin public class Application { public static void main(String[] args) { + System.setProperty("org.apache.pdfbox.rendering.UsePureJavaCMYKConversion", "true"); SpringApplication.run(Application.class, args); } -} \ No newline at end of file +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Document.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Document.java index 0fb41529..d312bd8c 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Document.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Document.java @@ -1,20 +1,19 @@ package com.iqser.red.service.redaction.v1.server.classification.model; +import com.iqser.red.service.redaction.v1.model.RedactionLogEntry; +import com.iqser.red.service.redaction.v1.model.SectionGrid; +import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryVersion; +import com.iqser.red.service.redaction.v1.server.redaction.model.Entity; +import com.iqser.red.service.redaction.v1.server.redaction.model.Image; +import lombok.Data; +import lombok.NoArgsConstructor; + import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; -import com.iqser.red.service.redaction.v1.model.RedactionLogEntry; -import com.iqser.red.service.redaction.v1.model.SectionGrid; -import com.iqser.red.service.redaction.v1.model.SectionText; -import com.iqser.red.service.redaction.v1.server.redaction.model.Entity; -import com.iqser.red.service.redaction.v1.server.redaction.model.Image; - -import lombok.Data; -import lombok.NoArgsConstructor; - @Data @NoArgsConstructor public class 
Document { @@ -33,7 +32,7 @@ public class Document { private List redactionLogEntities = new ArrayList<>(); private SectionGrid sectionGrid = new SectionGrid(); - private long dictionaryVersion; + private DictionaryVersion dictionaryVersion; private long rulesVersion; private List sectionText = new ArrayList<>(); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/FloatFrequencyCounter.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/FloatFrequencyCounter.java index 6828bd38..c232339b 100755 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/FloatFrequencyCounter.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/FloatFrequencyCounter.java @@ -1,5 +1,7 @@ package com.iqser.red.service.redaction.v1.server.classification.model; +import lombok.Getter; + import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; @@ -7,38 +9,35 @@ import java.util.List; import java.util.Map; import java.util.stream.Collectors; -import lombok.Getter; - -public class FloatFrequencyCounter -{ +public class FloatFrequencyCounter { @Getter Map countPerValue = new HashMap<>(); - public void add(float value){ - if(!countPerValue.containsKey(value)){ + public void add(float value) { + if (!countPerValue.containsKey(value)) { countPerValue.put(value, 1); } else { countPerValue.put(value, countPerValue.get(value) + 1); } } - public void addAll(Map otherCounter){ - for(Map.Entry entry: otherCounter.entrySet()){ - if(countPerValue.containsKey(entry.getKey())){ - countPerValue.put(entry.getKey(), countPerValue.get(entry.getKey())+ entry.getValue()); + public void addAll(Map otherCounter) { + for (Map.Entry entry : otherCounter.entrySet()) { + 
if (countPerValue.containsKey(entry.getKey())) { + countPerValue.put(entry.getKey(), countPerValue.get(entry.getKey()) + entry.getValue()); } else { countPerValue.put(entry.getKey(), entry.getValue()); } } } - public Float getMostPopular(){ + public Float getMostPopular() { Map.Entry mostPopular = null; - for(Map.Entry entry: countPerValue.entrySet()){ - if(mostPopular == null){ + for (Map.Entry entry : countPerValue.entrySet()) { + if (mostPopular == null) { mostPopular = entry; - } else if(entry.getValue() >= mostPopular.getValue()){ + } else if (entry.getValue() >= mostPopular.getValue()) { mostPopular = entry; } } @@ -46,12 +45,11 @@ public class FloatFrequencyCounter } - - public List getHighterThanMostPopular(){ + public List getHighterThanMostPopular() { Float mostPopular = getMostPopular(); List higher = new ArrayList<>(); - for(Float value: countPerValue.keySet()){ - if(value > mostPopular){ + for (Float value : countPerValue.keySet()) { + if (value > mostPopular) { higher.add(value); } } @@ -60,12 +58,12 @@ public class FloatFrequencyCounter } - public Float getHighest(){ + public Float getHighest() { Float highest = null; - for(Float value: countPerValue.keySet()){ - if (highest == null){ + for (Float value : countPerValue.keySet()) { + if (highest == null) { highest = value; - } else if(value > highest){ + } else if (value > highest) { highest = value; } } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Footer.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Footer.java index 37b691e6..b88a16b7 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Footer.java +++ 
b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Footer.java @@ -1,19 +1,19 @@ package com.iqser.red.service.redaction.v1.server.classification.model; -import java.util.List; - +import com.fasterxml.jackson.annotation.JsonIgnore; import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText; - import lombok.AllArgsConstructor; import lombok.Data; +import java.util.List; + @Data @AllArgsConstructor public class Footer { private List textBlocks; - + @JsonIgnore public SearchableText getSearchableText() { SearchableText searchableText = new SearchableText(); @@ -21,4 +21,4 @@ public class Footer { return searchableText; } -} \ No newline at end of file +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Header.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Header.java index 8a4b67ae..133e0245 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Header.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Header.java @@ -1,19 +1,19 @@ package com.iqser.red.service.redaction.v1.server.classification.model; -import java.util.List; - +import com.fasterxml.jackson.annotation.JsonIgnore; import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText; - import lombok.AllArgsConstructor; import lombok.Data; +import java.util.List; + @Data @AllArgsConstructor public class Header { private List textBlocks; - + @JsonIgnore public SearchableText getSearchableText() { SearchableText searchableText = new SearchableText(); @@ -21,4 +21,4 @@ public class Header { return searchableText; } -} \ No newline at end of file +} diff --git 
a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Page.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Page.java index af07f19f..873ae8a1 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Page.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Page.java @@ -1,15 +1,14 @@ package com.iqser.red.service.redaction.v1.server.classification.model; -import java.util.List; - import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage; import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle; - import lombok.Data; import lombok.NonNull; import lombok.RequiredArgsConstructor; +import java.util.List; + @Data @RequiredArgsConstructor public class Page { @@ -37,4 +36,4 @@ public class Page { return rotation != 0; } -} \ No newline at end of file +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Paragraph.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Paragraph.java index 07e6b6fa..5a661126 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Paragraph.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Paragraph.java @@ -1,19 +1,18 @@ package com.iqser.red.service.redaction.v1.server.classification.model; -import java.util.ArrayList; -import java.util.List; - import 
com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage; import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText; import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; - import lombok.Data; import lombok.NoArgsConstructor; +import java.util.ArrayList; +import java.util.List; + @Data @NoArgsConstructor -public class Paragraph implements Comparable{ +public class Paragraph implements Comparable { private List pageBlocks = new ArrayList<>(); private List images = new ArrayList<>(); @@ -62,4 +61,4 @@ public class Paragraph implements Comparable{ return 0; } -} \ No newline at end of file +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java new file mode 100644 index 00000000..77649132 --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java @@ -0,0 +1,52 @@ +package com.iqser.red.service.redaction.v1.server.classification.model; + +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.iqser.red.service.redaction.v1.model.SectionArea; +import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue; +import com.iqser.red.service.redaction.v1.server.redaction.model.Image; +import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; + +import java.util.*; + +@Data +@Builder +@NoArgsConstructor +@AllArgsConstructor +public class SectionText { + + private int sectionNumber; + private String text; + + private boolean 
isTable; + private String headline; + + private List sectionAreas = new ArrayList<>(); + private Set images = new HashSet<>(); + + private List textBlocks = new ArrayList<>(); + private Map tabularData = new HashMap<>(); + private List cellStarts = new ArrayList<>(); + + + public void setTabularData(Map tabularData) { + tabularData.remove(null); + this.tabularData = tabularData; + } + + @JsonIgnore + public SearchableText getSearchableText() { + + SearchableText searchableText = new SearchableText(); + textBlocks.forEach(block -> { + if (block != null) { + searchableText.addAll(block.getSequences()); + } + }); + return searchableText; + } + +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/StringFrequencyCounter.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/StringFrequencyCounter.java index 0cbdfcc0..8aeb451d 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/StringFrequencyCounter.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/StringFrequencyCounter.java @@ -1,10 +1,10 @@ package com.iqser.red.service.redaction.v1.server.classification.model; +import lombok.Getter; + import java.util.HashMap; import java.util.Map; -import lombok.Getter; - public class StringFrequencyCounter { @Getter @@ -46,4 +46,4 @@ public class StringFrequencyCounter { return mostPopular != null ? 
mostPopular.getKey() : null; } -} \ No newline at end of file +} diff --git a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/Text.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Text.java similarity index 72% rename from redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/Text.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Text.java index a2bc00e9..4df1ace3 100644 --- a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/Text.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Text.java @@ -1,17 +1,18 @@ -package com.iqser.red.service.redaction.v1.model; - -import java.util.ArrayList; -import java.util.List; +package com.iqser.red.service.redaction.v1.server.classification.model; import lombok.AllArgsConstructor; import lombok.Data; import lombok.NoArgsConstructor; +import java.util.ArrayList; +import java.util.List; + @Data @NoArgsConstructor @AllArgsConstructor public class Text { + private int numberOfPages; private List sectionTexts = new ArrayList<>(); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/TextBlock.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/TextBlock.java index 0b3b253c..63cfc11c 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/TextBlock.java +++ 
b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/TextBlock.java @@ -1,19 +1,21 @@ package com.iqser.red.service.redaction.v1.server.classification.model; +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; +import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities; +import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; + import java.util.ArrayList; import java.util.List; -import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; -import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities; - -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; - @AllArgsConstructor @Builder @Data +@NoArgsConstructor public class TextBlock extends AbstractTextContainer { @Builder.Default @@ -98,7 +100,6 @@ public class TextBlock extends AbstractTextContainer { } - @Override public String toString() { @@ -118,6 +119,7 @@ public class TextBlock extends AbstractTextContainer { } @Override + @JsonIgnore public String getText() { StringBuilder sb = new StringBuilder(); @@ -139,4 +141,4 @@ public class TextBlock extends AbstractTextContainer { } -} \ No newline at end of file +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/UnclassifiedText.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/UnclassifiedText.java index bfe56052..0d51a4f8 100644 --- 
a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/UnclassifiedText.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/UnclassifiedText.java @@ -1,19 +1,19 @@ package com.iqser.red.service.redaction.v1.server.classification.model; -import java.util.List; - +import com.fasterxml.jackson.annotation.JsonIgnore; import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText; - import lombok.AllArgsConstructor; import lombok.Data; +import java.util.List; + @Data @AllArgsConstructor public class UnclassifiedText { private List textBlocks; - + @JsonIgnore public SearchableText getSearchableText() { SearchableText searchableText = new SearchableText(); @@ -21,4 +21,4 @@ public class UnclassifiedText { return searchableText; } -} \ No newline at end of file +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/BlockificationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/BlockificationService.java index d4b83409..4badfec4 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/BlockificationService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/BlockificationService.java @@ -1,21 +1,20 @@ package com.iqser.red.service.redaction.v1.server.classification.service; -import java.util.ArrayList; -import java.util.List; - -import org.springframework.stereotype.Service; - -import com.iqser.red.service.redaction.v1.server.classification.utils.PositionUtils; import com.iqser.red.service.redaction.v1.server.classification.model.FloatFrequencyCounter; 
import com.iqser.red.service.redaction.v1.server.classification.model.Page; import com.iqser.red.service.redaction.v1.server.classification.model.StringFrequencyCounter; import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; +import com.iqser.red.service.redaction.v1.server.classification.utils.PositionUtils; import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; +import org.springframework.stereotype.Service; + +import java.util.ArrayList; +import java.util.List; @Service @SuppressWarnings("all") diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/ClassificationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/ClassificationService.java index 362ba551..1e72fd52 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/ClassificationService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/ClassificationService.java @@ -1,19 +1,17 @@ package com.iqser.red.service.redaction.v1.server.classification.service; -import java.util.List; -import java.util.regex.Pattern; - -import org.springframework.stereotype.Service; - import com.iqser.red.service.redaction.v1.server.classification.model.Document; import 
com.iqser.red.service.redaction.v1.server.classification.model.Page; import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; import com.iqser.red.service.redaction.v1.server.classification.utils.PositionUtils; import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle; - import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; +import org.springframework.stereotype.Service; + +import java.util.List; +import java.util.regex.Pattern; @Slf4j @Service diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/utils/PositionUtils.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/utils/PositionUtils.java index ce421eb6..98117bbe 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/utils/PositionUtils.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/utils/PositionUtils.java @@ -2,7 +2,6 @@ package com.iqser.red.service.redaction.v1.server.classification.utils; import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle; - import lombok.experimental.UtilityClass; @UtilityClass diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/client/FileStatusProcessingUpdateClient.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/client/FileStatusProcessingUpdateClient.java new file mode 100644 index 00000000..41182a31 --- /dev/null +++ 
b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/client/FileStatusProcessingUpdateClient.java @@ -0,0 +1,9 @@ +package com.iqser.red.service.redaction.v1.server.client; + + +import com.iqser.red.service.file.management.v1.api.resources.FileStatusProcessingUpdateResource; +import org.springframework.cloud.openfeign.FeignClient; + +@FeignClient(name = "FileStatusProcessingUpdateResource", url = "${file-management-service.url}") +public interface FileStatusProcessingUpdateClient extends FileStatusProcessingUpdateResource { +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/client/MockMultipartFile.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/client/MockMultipartFile.java index 5dc671e4..0951ffcd 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/client/MockMultipartFile.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/client/MockMultipartFile.java @@ -1,16 +1,16 @@ package com.iqser.red.service.redaction.v1.server.client; -import java.io.ByteArrayInputStream; -import java.io.File; -import java.io.IOException; -import java.io.InputStream; - import org.springframework.lang.NonNull; import org.springframework.lang.Nullable; import org.springframework.util.Assert; import org.springframework.util.FileCopyUtils; import org.springframework.web.multipart.MultipartFile; +import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; + public class MockMultipartFile implements MultipartFile { private final String name; @@ -22,13 +22,13 @@ public class MockMultipartFile implements MultipartFile { public MockMultipartFile(String name, @Nullable byte[] content) { - this(name, "", (String) null, (byte[]) 
content); + this(name, "", null, content); } public MockMultipartFile(String name, InputStream contentStream) throws IOException { - this(name, "", (String) null, (byte[]) FileCopyUtils.copyToByteArray(contentStream)); + this(name, "", null, FileCopyUtils.copyToByteArray(contentStream)); } @@ -78,7 +78,7 @@ public class MockMultipartFile implements MultipartFile { public long getSize() { - return (long) this.content.length; + return this.content.length; } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/client/RulesClient.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/client/RulesClient.java index ce41a3ce..35ffba79 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/client/RulesClient.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/client/RulesClient.java @@ -3,6 +3,6 @@ package com.iqser.red.service.redaction.v1.server.client; import com.iqser.red.service.configuration.v1.api.resource.RulesResource; import org.springframework.cloud.openfeign.FeignClient; -@FeignClient(name = RulesResource.SERVICE_NAME, url = "${configuration-service.url}") +@FeignClient(name = "RulesResource", url = "${configuration-service.url}") public interface RulesClient extends RulesResource { -} \ No newline at end of file +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/ControllerAdvice.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/ControllerAdvice.java index fe76646e..3262faf2 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/ControllerAdvice.java +++ 
b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/ControllerAdvice.java @@ -1,17 +1,15 @@ package com.iqser.red.service.redaction.v1.server.controller; -import java.time.OffsetDateTime; - import com.iqser.red.commons.spring.ErrorMessage; +import com.iqser.red.service.redaction.v1.server.exception.RulesValidationException; +import lombok.extern.slf4j.Slf4j; import org.springframework.http.HttpStatus; import org.springframework.web.bind.annotation.ExceptionHandler; import org.springframework.web.bind.annotation.ResponseBody; import org.springframework.web.bind.annotation.ResponseStatus; import org.springframework.web.bind.annotation.RestControllerAdvice; -import com.iqser.red.service.redaction.v1.server.exception.RulesValidationException; - -import lombok.extern.slf4j.Slf4j; +import java.time.OffsetDateTime; @Slf4j @RestControllerAdvice @@ -38,4 +36,4 @@ public class ControllerAdvice { return new ErrorMessage(OffsetDateTime.now(), e.getMessage()); } -} \ No newline at end of file +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java index 97a152f4..31f8583b 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java @@ -1,17 +1,10 @@ package com.iqser.red.service.redaction.v1.server.controller; -import com.iqser.red.service.redaction.v1.model.AnalyzeRequest; -import com.iqser.red.service.redaction.v1.model.AnalyzeResult; +import com.iqser.red.service.file.management.v1.api.model.FileType; import 
com.iqser.red.service.redaction.v1.model.AnnotateRequest; import com.iqser.red.service.redaction.v1.model.AnnotateResponse; -import com.iqser.red.service.redaction.v1.model.ReanalyzeResult; -import com.iqser.red.service.redaction.v1.model.RedactionLog; -import com.iqser.red.service.redaction.v1.model.RedactionLogEntry; import com.iqser.red.service.redaction.v1.model.RedactionRequest; import com.iqser.red.service.redaction.v1.model.RedactionResult; -import com.iqser.red.service.redaction.v1.model.RenalyzeRequest; -import com.iqser.red.service.redaction.v1.model.SectionGrid; -import com.iqser.red.service.redaction.v1.model.Text; import com.iqser.red.service.redaction.v1.resources.RedactionResource; import com.iqser.red.service.redaction.v1.server.classification.model.Document; import com.iqser.red.service.redaction.v1.server.classification.model.Page; @@ -19,27 +12,21 @@ import com.iqser.red.service.redaction.v1.server.exception.RedactionException; import com.iqser.red.service.redaction.v1.server.redaction.service.AnnotationService; import com.iqser.red.service.redaction.v1.server.redaction.service.DictionaryService; import com.iqser.red.service.redaction.v1.server.redaction.service.DroolsExecutionService; -import com.iqser.red.service.redaction.v1.server.redaction.service.EntityRedactionService; -import com.iqser.red.service.redaction.v1.server.redaction.service.ImageClassificationService; -import com.iqser.red.service.redaction.v1.server.redaction.service.ReanalyzeService; -import com.iqser.red.service.redaction.v1.server.redaction.service.RedactionLogCreatorService; import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService; +import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService; import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; import 
com.iqser.red.service.redaction.v1.server.visualization.service.PdfVisualisationService; - import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; - +import org.apache.pdfbox.io.MemoryUsageSetting; import org.apache.pdfbox.pdmodel.PDDocument; import org.springframework.web.bind.annotation.PathVariable; import org.springframework.web.bind.annotation.RequestBody; import org.springframework.web.bind.annotation.RestController; -import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; -import java.util.List; @Slf4j @RestController @@ -47,61 +34,24 @@ import java.util.List; public class RedactionController implements RedactionResource { private final PdfVisualisationService pdfVisualisationService; - private final PdfSegmentationService pdfSegmentationService; - private final RedactionLogCreatorService redactionLogCreatorService; - private final EntityRedactionService entityRedactionService; private final DroolsExecutionService droolsExecutionService; private final DictionaryService dictionaryService; private final AnnotationService annotationService; - private final ReanalyzeService reanalyzeService; - private final ImageClassificationService imageClassificationService; - - - @Override - public AnalyzeResult analyze(@RequestBody AnalyzeRequest analyzeRequest) { - - try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(analyzeRequest.getDocument()))) { - pdDocument.setAllSecurityToBeRemoved(true); - - Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); - - log.info("Document structure analysis successful, starting redaction analysis..."); - - imageClassificationService.classifyImages(classifiedDoc); - entityRedactionService.processDocument(classifiedDoc, analyzeRequest.getRuleSetId(), analyzeRequest.getManualRedactions()); - redactionLogCreatorService.createRedactionLog(classifiedDoc, pdDocument.getNumberOfPages(), analyzeRequest.getManualRedactions(), analyzeRequest - 
.getRuleSetId()); - - log.info("Redaction analysis successful..."); - - return AnalyzeResult.builder() - .sectionGrid(classifiedDoc.getSectionGrid()) - .redactionLog(new RedactionLog(classifiedDoc.getRedactionLogEntities(), classifiedDoc.getDictionaryVersion(), classifiedDoc - .getRulesVersion(), analyzeRequest.getRuleSetId())) - .numberOfPages(classifiedDoc.getPages().size()) - .text(new Text(classifiedDoc.getSectionText())) - .build(); - - } catch (Exception e) { - throw new RedactionException(e); - } - - } - - - public ReanalyzeResult reanalyze(@RequestBody RenalyzeRequest renalyzeRequest) { - - return reanalyzeService.reanalyze(renalyzeRequest); - } + private final PdfSegmentationService pdfSegmentationService; + private final RedactionStorageService redactionStorageService; public AnnotateResponse annotate(@RequestBody AnnotateRequest annotateRequest) { - try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(annotateRequest.getDocument()))) { + var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(annotateRequest.getProjectId(), annotateRequest.getFileId(), FileType.ORIGIN)); + var redactionLog = redactionStorageService.getRedactionLog(annotateRequest.getProjectId(), annotateRequest.getFileId()); + var sectionsGrid = redactionStorageService.getSectionGrid(annotateRequest.getProjectId(), annotateRequest.getFileId()); + try (PDDocument pdDocument = PDDocument.load(storedObjectStream, MemoryUsageSetting.setupTempFileOnly())) { pdDocument.setAllSecurityToBeRemoved(true); - dictionaryService.updateDictionary(annotateRequest.getRedactionLog().getRuleSetId()); - annotationService.annotate(pdDocument, annotateRequest.getRedactionLog(), annotateRequest.getSectionGrid()); + + dictionaryService.updateDictionary(redactionLog.getRuleSetId(), annotateRequest.getProjectId()); + annotationService.annotate(pdDocument, redactionLog, sectionsGrid); try (ByteArrayOutputStream byteArrayOutputStream = 
new ByteArrayOutputStream()) { pdDocument.save(byteArrayOutputStream); @@ -115,65 +65,80 @@ public class RedactionController implements RedactionResource { @Override - public RedactionResult classify(@RequestBody RedactionRequest pdfSegmentationRequest) { + public RedactionResult classify(@RequestBody RedactionRequest redactionRequest) { + var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN)); + try { + Document classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream); - try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(pdfSegmentationRequest.getDocument()))) { - pdDocument.setAllSecurityToBeRemoved(true); + storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN)); + try (PDDocument pdDocument = PDDocument.load(storedObjectStream)) { + pdDocument.setAllSecurityToBeRemoved(true); - Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); - pdfVisualisationService.visualizeClassifications(classifiedDoc, pdDocument); + pdfVisualisationService.visualizeClassifications(classifiedDoc, pdDocument); - return convert(pdDocument, classifiedDoc.getPages().size(), pdfSegmentationRequest.getRuleSetId()); + return convert(pdDocument, classifiedDoc.getPages().size()); + + } catch (IOException e) { + throw new RedactionException(e); + } } catch (IOException e) { throw new RedactionException(e); } + } - @Override public RedactionResult sections(@RequestBody RedactionRequest redactionRequest) { + var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN)); + try { + Document classifiedDoc = 
pdfSegmentationService.parseDocument(storedObjectStream); - try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(redactionRequest.getDocument()))) { - pdDocument.setAllSecurityToBeRemoved(true); + storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN)); + try (PDDocument pdDocument = PDDocument.load(storedObjectStream)) { + pdDocument.setAllSecurityToBeRemoved(true); - Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); - pdfVisualisationService.visualizeParagraphs(classifiedDoc, pdDocument); + pdfVisualisationService.visualizeParagraphs(classifiedDoc, pdDocument); + return convert(pdDocument, classifiedDoc.getPages().size()); - return convert(pdDocument, classifiedDoc.getPages().size(), redactionRequest.getRuleSetId()); + } catch (IOException e) { + throw new RedactionException(e); + } } catch (IOException e) { throw new RedactionException(e); } + } @Override public RedactionResult htmlTables(@RequestBody RedactionRequest redactionRequest) { - try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(redactionRequest.getDocument()))) { - pdDocument.setAllSecurityToBeRemoved(true); + Document classifiedDoc; - Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); - - StringBuilder sb = new StringBuilder(); - for (Page page : classifiedDoc.getPages()) { - for (AbstractTextContainer textContainer : page.getTextBlocks()) { - if (textContainer instanceof Table) { - Table table = (Table) textContainer; - sb.append(table.getTextAsHtml()).append("
").append("
"); - } - } - } - - return RedactionResult.builder().document(sb.toString().getBytes()).build(); - - } catch (IOException e) { + try { + var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN)); + classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream, true); + } catch (Exception e) { throw new RedactionException(e); } + + StringBuilder sb = new StringBuilder(); + for (Page page : classifiedDoc.getPages()) { + for (AbstractTextContainer textContainer : page.getTextBlocks()) { + if (textContainer instanceof Table) { + Table table = (Table) textContainer; + sb.append(table.getTextAsHtml()).append("
").append("
"); + } + } + } + + return RedactionResult.builder().document(sb.toString().getBytes()).build(); + } @@ -191,26 +156,17 @@ public class RedactionController implements RedactionResource { } - private RedactionResult convert(PDDocument document, int numberOfPages, String ruleSetId) throws IOException { - - return convert(document, numberOfPages, null, null, 0, 0, ruleSetId); - } - - - private RedactionResult convert(PDDocument document, int numberOfPages, - List redactionLogEntities, SectionGrid sectionGrid, - long dictionaryVersion, long rulesVersion, String ruleSetId) throws IOException { + private RedactionResult convert(PDDocument document, int numberOfPages) throws IOException { try (ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream()) { document.save(byteArrayOutputStream); return RedactionResult.builder() .document(byteArrayOutputStream.toByteArray()) .numberOfPages(numberOfPages) - .redactionLog(new RedactionLog(redactionLogEntities, dictionaryVersion, rulesVersion, ruleSetId)) - .sectionGrid(sectionGrid) .build(); } } + } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/memory/MemoryStats.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/memory/MemoryStats.java new file mode 100644 index 00000000..2491d9eb --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/memory/MemoryStats.java @@ -0,0 +1,52 @@ +package com.iqser.red.service.redaction.v1.server.memory; + +import lombok.extern.slf4j.Slf4j; + +import java.text.CharacterIterator; +import java.text.StringCharacterIterator; + +@Slf4j +public class MemoryStats { + + + public static void printMemoryStats() { + log.info("\n\n ------------------------------ \n" + + " Used Memory: " + humanReadableByteCountBin(getUsedMemory()) + "\n" + + " Free Memory: " + 
humanReadableByteCountBin(getFreeMemory()) + "\n" + + " Total Memory: " + humanReadableByteCountBin(getTotalMemory()) + "\n" + + " Max Memory: " + humanReadableByteCountBin(getMaxMemory()) + "\n" + + "\n ------------------------------ \n"); + } + + + public static String humanReadableByteCountBin(long bytes) { + long absB = bytes == Long.MIN_VALUE ? Long.MAX_VALUE : Math.abs(bytes); + if (absB < 1024) { + return bytes + " B"; + } + long value = absB; + CharacterIterator ci = new StringCharacterIterator("KMGTPE"); + for (int i = 40; i >= 0 && absB > 0xfffccccccccccccL >> i; i -= 10) { + value >>= 10; + ci.next(); + } + value *= Long.signum(bytes); + return String.format("%.1f %ciB", value / 1024.0, ci.current()); + } + + private static long getMaxMemory() { + return Runtime.getRuntime().maxMemory(); + } + + private static long getUsedMemory() { + return getMaxMemory() - getFreeMemory(); + } + + private static long getTotalMemory() { + return Runtime.getRuntime().totalMemory(); + } + + private static long getFreeMemory() { + return Runtime.getRuntime().freeMemory(); + } +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFAreaTextStripper.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFAreaTextStripper.java index 7e2e56c8..9b52bf7b 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFAreaTextStripper.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFAreaTextStripper.java @@ -1,17 +1,15 @@ package com.iqser.red.service.redaction.v1.server.parsing; +import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; +import lombok.Getter; +import lombok.Setter; +import org.apache.pdfbox.text.PDFTextStripperByArea; +import 
org.apache.pdfbox.text.TextPosition; + import java.io.IOException; import java.util.ArrayList; import java.util.List; -import org.apache.pdfbox.text.PDFTextStripperByArea; -import org.apache.pdfbox.text.TextPosition; - -import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; - -import lombok.Getter; -import lombok.Setter; - public class PDFAreaTextStripper extends PDFTextStripperByArea { @Getter @@ -76,7 +74,7 @@ public class PDFAreaTextStripper extends PDFTextStripperByArea { } - public void clearPositions(){ + public void clearPositions() { textPositionSequences = new ArrayList<>(); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFLinesTextStripper.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFLinesTextStripper.java index 4b680c32..aa69cbbc 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFLinesTextStripper.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFLinesTextStripper.java @@ -1,33 +1,16 @@ package com.iqser.red.service.redaction.v1.server.parsing; -import java.awt.geom.AffineTransform; -import java.awt.geom.Point2D; -import java.awt.geom.Rectangle2D; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; - +import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; +import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage; +import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling; +import lombok.Getter; +import lombok.Setter; +import lombok.extern.slf4j.Slf4j; +import org.apache.commons.lang3.reflect.FieldUtils; import org.apache.pdfbox.contentstream.operator.Operator; import org.apache.pdfbox.contentstream.operator.OperatorName; 
-import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColor; -import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorN; -import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorSpace; -import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceCMYKColor; -import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceGrayColor; -import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceRGBColor; -import org.apache.pdfbox.contentstream.operator.color.SetStrokingColor; -import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorN; -import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorSpace; -import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceCMYKColor; -import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceGrayColor; -import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceRGBColor; -import org.apache.pdfbox.contentstream.operator.state.SetFlatness; -import org.apache.pdfbox.contentstream.operator.state.SetLineCapStyle; -import org.apache.pdfbox.contentstream.operator.state.SetLineDashPattern; -import org.apache.pdfbox.contentstream.operator.state.SetLineJoinStyle; -import org.apache.pdfbox.contentstream.operator.state.SetLineMiterLimit; -import org.apache.pdfbox.contentstream.operator.state.SetLineWidth; -import org.apache.pdfbox.contentstream.operator.state.SetRenderingIntent; +import org.apache.pdfbox.contentstream.operator.color.*; +import org.apache.pdfbox.contentstream.operator.state.*; import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize; import org.apache.pdfbox.cos.COSBase; import org.apache.pdfbox.cos.COSName; @@ -40,40 +23,31 @@ import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.TextPosition; import org.apache.pdfbox.util.Matrix; -import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; -import 
com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling; - -import lombok.Getter; -import lombok.Setter; -import lombok.extern.slf4j.Slf4j; +import java.awt.geom.AffineTransform; +import java.awt.geom.Point2D; +import java.awt.geom.Rectangle2D; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; @Slf4j public class PDFLinesTextStripper extends PDFTextStripper { - @Setter - protected PDPage pdpage; - - @Getter - private int minCharWidth; - - @Getter - private int maxCharWidth; - - @Getter - private int minCharHeight; - - @Getter - private int maxCharHeight; - @Getter private final List textPositionSequences = new ArrayList<>(); - @Getter private final List rulings = new ArrayList<>(); - private final List graphicsPath = new ArrayList<>(); - + @Setter + protected PDPage pdpage; + @Getter + private int minCharWidth; + @Getter + private int maxCharWidth; + @Getter + private int minCharHeight; + @Getter + private int maxCharHeight; @Getter private List images = new ArrayList<>(); @@ -222,6 +196,9 @@ public class PDFLinesTextStripper extends PDFTextStripper { Rectangle2D rect = new Rectangle2D.Float((float) imageBounds.getX(), (float) imageBounds.getY(), (float) imageBounds .getWidth(), (float) imageBounds.getHeight()); + // Memory Hack - sofReference kills me + FieldUtils.writeField(pdfImage, "cachedImageSubsampling", -1, true); + if (rect.getHeight() > 2 && rect.getWidth() > 2) { this.images.add(new PdfImage(pdfImage.getImage(), rect, pageNumber)); } @@ -369,4 +346,4 @@ public class PDFLinesTextStripper extends PDFTextStripper { } -} \ No newline at end of file +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedTextPosition.java 
b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedTextPosition.java new file mode 100644 index 00000000..d8e72d22 --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedTextPosition.java @@ -0,0 +1,52 @@ +package com.iqser.red.service.redaction.v1.server.parsing.model; + +import com.fasterxml.jackson.annotation.JsonIgnore; +import lombok.Data; +import lombok.NoArgsConstructor; +import lombok.SneakyThrows; +import org.apache.pdfbox.text.TextPosition; +import org.springframework.beans.BeanUtils; + +@Data +@NoArgsConstructor +public class RedTextPosition { + + private String textMatrix; + private int rotation; + private float y; + private float pageHeight; + private float pageWidth; + private String unicode; + private float XDirAdj; + private float YDirAdj; + private float width; + private float heightDir; + + // not used in reanalysis + @JsonIgnore + private float widthOfSpace; + + // not used in reanalysis + @JsonIgnore + private float fontSizeInPt; + + // not used in reanalysis + @JsonIgnore + private String fontName; + + + @SneakyThrows + public static RedTextPosition fromTextPosition(TextPosition textPosition) { + var pos = new RedTextPosition(); + BeanUtils.copyProperties(textPosition, pos); + pos.setFontName(textPosition.getFont().getName()); + + pos.setFontSizeInPt(textPosition.getFontSizeInPt()); + + pos.setTextMatrix(textPosition.getTextMatrix().toString()); + + return pos; + } + + +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java index dbcc1e18..10b5abb1 100644 --- 
a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java @@ -1,32 +1,52 @@ package com.iqser.red.service.redaction.v1.server.parsing.model; +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import com.iqser.red.service.redaction.v1.model.Point; +import com.iqser.red.service.redaction.v1.model.Rectangle; +import lombok.Data; +import lombok.NoArgsConstructor; +import org.apache.pdfbox.text.TextPosition; + import java.util.ArrayList; import java.util.List; - -import org.apache.pdfbox.text.TextPosition; - -import com.iqser.red.service.redaction.v1.model.Point; -import com.iqser.red.service.redaction.v1.model.Rectangle; - -import lombok.Data; -import lombok.RequiredArgsConstructor; +import java.util.stream.Collectors; @Data -@RequiredArgsConstructor +@NoArgsConstructor +@JsonIgnoreProperties({ "empty" }) public class TextPositionSequence implements CharSequence { - private List textPositions = new ArrayList<>(); + private int page; + private List textPositions = new ArrayList<>(); - private final int page; + private float x1; + private float x2; + + public TextPositionSequence(int page) { + this.page = page; + } + + + public static TextPositionSequence fromData(List textPositions, int page) { + var textPositionSequence = new TextPositionSequence(); + textPositionSequence.textPositions = textPositions; + textPositionSequence.page = page; + + return textPositionSequence; + } public TextPositionSequence(List textPositions, int page) { - this.textPositions = textPositions; + this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList()); this.page = page; } + + + @Override public int length() { @@ -37,7 +57,7 @@ public 
class TextPositionSequence implements CharSequence { @Override public char charAt(int index) { - TextPosition textPosition = textPositionAt(index); + RedTextPosition textPosition = textPositionAt(index); String text = textPosition.getUnicode(); return text.charAt(0); } @@ -45,7 +65,7 @@ public class TextPositionSequence implements CharSequence { public char charAt(int index, boolean caseInSensitive) { - TextPosition textPosition = textPositionAt(index); + RedTextPosition textPosition = textPositionAt(index); String text = textPosition.getUnicode(); return caseInSensitive ? text.toLowerCase().charAt(0) : text.charAt(0); } @@ -54,7 +74,7 @@ public class TextPositionSequence implements CharSequence { @Override public TextPositionSequence subSequence(int start, int end) { - return new TextPositionSequence(textPositions.subList(start, end), page); + return fromData(textPositions.subList(start, end), page); } @@ -69,18 +89,25 @@ public class TextPositionSequence implements CharSequence { } - public TextPosition textPositionAt(int index) { + public RedTextPosition textPositionAt(int index) { return textPositions.get(index); } - public void add(TextPosition textPosition) { + public void add(RedTextPosition textPosition) { this.textPositions.add(textPosition); } + public void add(TextPosition textPosition) { + + this.textPositions.add(RedTextPosition.fromTextPosition(textPosition)); + } + + + @JsonIgnore public float getX1() { if (textPositions.get(0).getRotation() == 90) { @@ -91,6 +118,7 @@ public class TextPositionSequence implements CharSequence { } + @JsonIgnore public float getX2() { if (textPositions.get(0).getRotation() == 90) { @@ -101,13 +129,14 @@ public class TextPositionSequence implements CharSequence { } } - + @JsonIgnore public float getRotationAdjustedY() { return textPositions.get(0).getY(); } + @JsonIgnore public float getY1() { if (textPositions.get(0).getRotation() == 90) { @@ -118,6 +147,7 @@ public class TextPositionSequence implements CharSequence { 
} + @JsonIgnore public float getY2() { if (textPositions.get(0).getRotation() == 90) { @@ -128,38 +158,40 @@ public class TextPositionSequence implements CharSequence { } + @JsonIgnore public float getTextHeight() { return textPositions.get(0).getHeightDir() + 2; } + @JsonIgnore public float getHeight() { return getY2() - getY1(); } + @JsonIgnore public float getWidth() { return getX2() - getX1(); } + @JsonIgnore public String getFont() { - - return textPositions.get(0) - .getFont() - .toString() + return textPositions.get(0).getFontName() .toLowerCase() .replaceAll(",bold", "") .replaceAll(",italic", ""); } + @JsonIgnore public String getFontStyle() { - String lowercaseFontName = textPositions.get(0).getFont().toString().toLowerCase(); + String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase(); if (lowercaseFontName.contains("bold") && lowercaseFontName.contains("italic")) { return "bold, italic"; @@ -173,25 +205,25 @@ public class TextPositionSequence implements CharSequence { } - + @JsonIgnore public float getFontSize() { return textPositions.get(0).getFontSizeInPt(); } - + @JsonIgnore public float getSpaceWidth() { return textPositions.get(0).getWidthOfSpace(); } - + @JsonIgnore public int getRotation() { return textPositions.get(0).getRotation(); } - + @JsonIgnore public Rectangle getRectangle() { float height = getTextHeight(); @@ -223,4 +255,4 @@ public class TextPositionSequence implements CharSequence { return new Rectangle(new Point(posXInit, posYInit), posXEnd - posXInit, posYEnd - posYInit + height, page); } -} \ No newline at end of file +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/queue/MessagingConfiguration.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/queue/MessagingConfiguration.java new file mode 100644 index 00000000..965163c0 --- /dev/null +++ 
b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/queue/MessagingConfiguration.java @@ -0,0 +1,35 @@ +package com.iqser.red.service.redaction.v1.server.queue; + +import lombok.RequiredArgsConstructor; +import org.springframework.amqp.core.Queue; +import org.springframework.amqp.core.QueueBuilder; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; + +@Configuration +@RequiredArgsConstructor +public class MessagingConfiguration { + + + public static final String REDACTION_QUEUE = "redactionQueue"; + + public static final String REDACTION_DQL = "redactionDQL"; + + + @Bean + public Queue redactionQueue() { + + return QueueBuilder.durable(REDACTION_QUEUE) + .withArgument("x-dead-letter-exchange", "") + .withArgument("x-dead-letter-routing-key", REDACTION_QUEUE) + .maxPriority(2) + .build(); + } + + + @Bean + public Queue redactionDeadLetterQueue() { + + return QueueBuilder.durable(REDACTION_DQL).build(); + } +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/queue/RedactionMessageReceiver.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/queue/RedactionMessageReceiver.java new file mode 100644 index 00000000..2a08e4c8 --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/queue/RedactionMessageReceiver.java @@ -0,0 +1,54 @@ +package com.iqser.red.service.redaction.v1.server.queue; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.iqser.red.service.redaction.v1.model.AnalyzeRequest; +import com.iqser.red.service.redaction.v1.model.AnalyzeResult; +import com.iqser.red.service.redaction.v1.server.client.FileStatusProcessingUpdateClient; +import 
com.iqser.red.service.redaction.v1.server.redaction.service.ReanalyzeService; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.springframework.amqp.rabbit.annotation.RabbitHandler; +import org.springframework.amqp.rabbit.annotation.RabbitListener; +import org.springframework.stereotype.Service; + +import static com.iqser.red.service.redaction.v1.server.queue.MessagingConfiguration.REDACTION_DQL; +import static com.iqser.red.service.redaction.v1.server.queue.MessagingConfiguration.REDACTION_QUEUE; + +@Slf4j +@Service +@RequiredArgsConstructor +public class RedactionMessageReceiver { + + private final ObjectMapper objectMapper; + private final ReanalyzeService reanalyzeService; + private final FileStatusProcessingUpdateClient fileStatusProcessingUpdateClient; + + @RabbitHandler + @RabbitListener(queues = REDACTION_QUEUE) + public void receiveAnalyzeRequest(String in) throws JsonProcessingException { + + var analyzeRequest = objectMapper.readValue(in, AnalyzeRequest.class); + log.info("Processing analyze request: {}", analyzeRequest); + AnalyzeResult result; + if (analyzeRequest.isReanalyseOnlyIfPossible()) { + result = reanalyzeService.reanalyze(analyzeRequest); + } else { + result = reanalyzeService.analyze(analyzeRequest); + } + log.info("Successfully analyzed {}", analyzeRequest); + + fileStatusProcessingUpdateClient.analysisSuccessful(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), result); + } + + @RabbitHandler + @RabbitListener(queues = REDACTION_DQL) + public void receiveAnalyzeRequestDQL(String in) throws JsonProcessingException { + + var analyzeRequest = objectMapper.readValue(in, AnalyzeRequest.class); + log.info("Failed to process analyze request: {}", analyzeRequest); + + fileStatusProcessingUpdateClient.analysisFailed(analyzeRequest.getProjectId(), analyzeRequest.getFileId()); + } + +} diff --git 
a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/CellValue.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/CellValue.java index c566bbb5..e38c8cf2 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/CellValue.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/CellValue.java @@ -1,22 +1,25 @@ package com.iqser.red.service.redaction.v1.server.redaction.model; -import java.util.Iterator; -import java.util.List; - import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities; +import lombok.AllArgsConstructor; +import lombok.Data; +import lombok.NoArgsConstructor; -import lombok.Value; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; -@Value +@Data +@NoArgsConstructor +@AllArgsConstructor public class CellValue { - private List textBlocks; + private List textBlocks = new ArrayList<>(); private int rowSpanStart; - @Override public String toString() { @@ -47,4 +50,4 @@ public class CellValue { .replaceAll(" {2}", " "); } -} \ No newline at end of file +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Dictionary.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Dictionary.java index c1fd7719..3879bb80 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Dictionary.java +++ 
b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Dictionary.java @@ -1,13 +1,13 @@ package com.iqser.red.service.redaction.v1.server.redaction.model; +import lombok.Data; +import lombok.Getter; + import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; -import lombok.Data; -import lombok.Getter; - @Data public class Dictionary { @@ -18,18 +18,18 @@ public class Dictionary { private Map localAccessMap = new HashMap<>(); @Getter - private long version; + private DictionaryVersion version; - public Dictionary(List dictionaryModels, long dictionaryVersion){ + public Dictionary(List dictionaryModels, DictionaryVersion version) { this.dictionaryModels = dictionaryModels; this.dictionaryModels.forEach(dm -> localAccessMap.put(dm.getType(), dm)); - this.version = dictionaryVersion; + this.version = version; } - public int getDictionaryRank(String type){ - if(!localAccessMap.containsKey(type)){ + public int getDictionaryRank(String type) { + if (!localAccessMap.containsKey(type)) { return 0; } return localAccessMap.get(type).getRank(); @@ -60,7 +60,7 @@ public class Dictionary { public boolean containsValue(String type, String value) { - if (localAccessMap.containsKey(type) && localAccessMap.get(type) + return localAccessMap.containsKey(type) && localAccessMap.get(type) .getEntries() .contains(value) || localAccessMap.containsKey(type) && localAccessMap.get(type) .getLocalEntries() @@ -68,10 +68,7 @@ public class Dictionary { .getEntries() .contains(value) || localAccessMap.containsKey(RECOMMENDATION_PREFIX + type) && localAccessMap.get(RECOMMENDATION_PREFIX + type) .getLocalEntries() - .contains(value)) { - return true; - } - return false; + .contains(value); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/DictionaryIncrement.java 
b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/DictionaryIncrement.java index 2366527e..29a71403 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/DictionaryIncrement.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/DictionaryIncrement.java @@ -1,15 +1,15 @@ package com.iqser.red.service.redaction.v1.server.redaction.model; -import java.util.Set; - import lombok.AllArgsConstructor; import lombok.Data; +import java.util.Set; + @Data @AllArgsConstructor public class DictionaryIncrement { private Set values; - private long dictionaryVersion; + private DictionaryVersion dictionaryVersion; } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/DictionaryModel.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/DictionaryModel.java index c4b0ce7c..c3a80b96 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/DictionaryModel.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/DictionaryModel.java @@ -1,15 +1,14 @@ package com.iqser.red.service.redaction.v1.server.redaction.model; +import com.iqser.red.service.configuration.v1.api.model.DictionaryEntry; +import lombok.AllArgsConstructor; +import lombok.Data; + import java.io.Serializable; import java.util.Set; import java.util.stream.Collectors; -import com.iqser.red.service.configuration.v1.api.model.DictionaryEntry; - -import lombok.AllArgsConstructor; -import lombok.Data; - @Data @AllArgsConstructor public class DictionaryModel implements Serializable { @@ -22,9 +21,10 
@@ public class DictionaryModel implements Serializable { private boolean recommendation; private Set entries; private Set localEntries; + private boolean isDossierDictionary; - public Set getValues(boolean local){ - return local ? localEntries : entries.stream().filter(e -> !e.isDeleted()).map(e-> e.getValue()).collect(Collectors + public Set getValues(boolean local) { + return local ? localEntries : entries.stream().filter(e -> !e.isDeleted()).map(e -> e.getValue()).collect(Collectors .toSet()); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/DictionaryRepresentation.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/DictionaryRepresentation.java index 615e4dda..0f7b6820 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/DictionaryRepresentation.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/DictionaryRepresentation.java @@ -20,5 +20,4 @@ public class DictionaryRepresentation { private Map localAccessMap = new HashMap<>(); - } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/DictionaryVersion.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/DictionaryVersion.java new file mode 100644 index 00000000..6a69bb60 --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/DictionaryVersion.java @@ -0,0 +1,16 @@ +package com.iqser.red.service.redaction.v1.server.redaction.model; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; + +@Data 
+@Builder +@NoArgsConstructor +@AllArgsConstructor +public class DictionaryVersion { + + long rulesetVersion; + long dossierVersion; +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Entity.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Entity.java index 8c8c8952..c9fdc711 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Entity.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Entity.java @@ -1,13 +1,12 @@ package com.iqser.red.service.redaction.v1.server.redaction.model; +import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; +import lombok.Data; +import lombok.EqualsAndHashCode; + import java.util.ArrayList; import java.util.List; -import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; - -import lombok.Data; -import lombok.EqualsAndHashCode; - @Data @EqualsAndHashCode(onlyExplicitlyIncluded = true) public class Entity { @@ -38,8 +37,10 @@ public class Entity { private String textBefore; private String textAfter; + private boolean isDossierDictionaryEntry; - public Entity(String word, String type, boolean redaction, String redactionReason, List positionSequences, String headline, int matchedRule, int sectionNumber, String legalBasis, boolean isDictionaryEntry, String textBefore, String textAfter, Integer start, Integer end) { + + public Entity(String word, String type, boolean redaction, String redactionReason, List positionSequences, String headline, int matchedRule, int sectionNumber, String legalBasis, boolean isDictionaryEntry, String textBefore, String textAfter, Integer start, Integer end, boolean isDossierDictionaryEntry) { this.word = word; this.type = type; @@ -55,10 
+56,11 @@ public class Entity { this.textAfter = textAfter; this.start = start; this.end = end; + this.isDossierDictionaryEntry = isDossierDictionaryEntry; } - public Entity(String word, String type, Integer start, Integer end, String headline, int sectionNumber, boolean isDictionaryEntry) { + public Entity(String word, String type, Integer start, Integer end, String headline, int sectionNumber, boolean isDictionaryEntry, boolean isDossierDictionaryEntry) { this.word = word; this.type = type; @@ -67,6 +69,7 @@ public class Entity { this.headline = headline; this.sectionNumber = sectionNumber; this.isDictionaryEntry = isDictionaryEntry; + this.isDossierDictionaryEntry = isDossierDictionaryEntry; } } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/EntityPositionSequence.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/EntityPositionSequence.java index 9bd0fb38..6784707d 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/EntityPositionSequence.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/EntityPositionSequence.java @@ -1,24 +1,23 @@ package com.iqser.red.service.redaction.v1.server.redaction.model; -import java.util.ArrayList; -import java.util.List; - import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; - import lombok.AllArgsConstructor; import lombok.Data; import lombok.EqualsAndHashCode; import lombok.RequiredArgsConstructor; +import java.util.ArrayList; +import java.util.List; + @Data @RequiredArgsConstructor @AllArgsConstructor @EqualsAndHashCode public class EntityPositionSequence { + private final String id; @EqualsAndHashCode.Exclude private List sequences = new ArrayList<>(); private int 
pageNumber; - private final String id; } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Image.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Image.java index 377fd55b..766d607d 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Image.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Image.java @@ -1,7 +1,5 @@ package com.iqser.red.service.redaction.v1.server.redaction.model; -import java.awt.geom.Rectangle2D; - import lombok.AllArgsConstructor; import lombok.Builder; import lombok.Data; @@ -14,7 +12,7 @@ import lombok.NoArgsConstructor; public class Image { private String type; - private Rectangle2D position; + private RedRectangle2D position; private boolean redaction; private String redactionReason; private String legalBasis; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/PdfImage.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/PdfImage.java index 86cabcfa..1631717f 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/PdfImage.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/PdfImage.java @@ -1,28 +1,31 @@ package com.iqser.red.service.redaction.v1.server.redaction.model; +import com.fasterxml.jackson.annotation.JsonIgnore; +import lombok.Data; +import lombok.NonNull; +import lombok.RequiredArgsConstructor; + import java.awt.geom.Rectangle2D; import java.awt.image.BufferedImage; -import 
lombok.AllArgsConstructor; -import lombok.Data; -import lombok.NoArgsConstructor; -import lombok.NonNull; -import lombok.RequiredArgsConstructor; - @Data -@NoArgsConstructor -@AllArgsConstructor @RequiredArgsConstructor public class PdfImage { - @NonNull + @JsonIgnore private BufferedImage image; @NonNull - private Rectangle2D position; + private RedRectangle2D position; private ImageType imageType; private boolean isAppendedToParagraph; @NonNull private int page; -} \ No newline at end of file + public PdfImage(BufferedImage image, Rectangle2D position, int page) { + this.image = image; + this.position = new RedRectangle2D(position.getX(), position.getY(), position.getWidth(), position.getHeight()); + this.page = page; + } + +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/ReanalysisSection.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/ReanalysisSection.java deleted file mode 100644 index be141819..00000000 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/ReanalysisSection.java +++ /dev/null @@ -1,37 +0,0 @@ -package com.iqser.red.service.redaction.v1.server.redaction.model; - -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; - -import lombok.Data; -import lombok.NoArgsConstructor; - -@Data -@NoArgsConstructor -public class ReanalysisSection { - - private int sectionNumber; - private String headline; - private List textBlocks; - private Map tabularData = new HashMap<>(); - private List cellStarts; - private Set images = new HashSet<>(); - - - public SearchableText getSearchableText() { - - SearchableText searchableText = new SearchableText(); - 
textBlocks.forEach(block -> { - if (block instanceof TextBlock) { - searchableText.addAll(block.getSequences()); - } - }); - return searchableText; - } - -} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/RedRectangle2D.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/RedRectangle2D.java new file mode 100644 index 00000000..601d328c --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/RedRectangle2D.java @@ -0,0 +1,35 @@ +package com.iqser.red.service.redaction.v1.server.redaction.model; + + +import com.fasterxml.jackson.annotation.JsonIgnore; +import lombok.AllArgsConstructor; +import lombok.Data; +import lombok.NoArgsConstructor; + +@Data +@NoArgsConstructor +@AllArgsConstructor +public class RedRectangle2D { + + private double x; + private double y; + private double width; + private double height; + + @JsonIgnore + public boolean isEmpty() { + return width <= 0.0f || height <= 0.0f; + } + + public boolean contains(double x, double y, double w, double h) { + if (isEmpty() || w <= 0 || h <= 0) { + return false; + } + double x0 = getX(); + double y0 = getY(); + return x >= x0 && + y >= y0 && + (x + w) <= x0 + getWidth() && + (y + h) <= y0 + getHeight(); + } +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java index dfa500ea..b7277c9a 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java +++ 
b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java @@ -1,14 +1,14 @@ package com.iqser.red.service.redaction.v1.server.redaction.model; +import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; +import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder; +import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities; + import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.regex.Pattern; -import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; -import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder; -import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities; - public class SearchableText { private final List sequences = new ArrayList<>(); @@ -232,4 +232,4 @@ public class SearchableText { return sb.append("\n").toString(); } -} \ No newline at end of file +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java index 8dc46fee..fa44f983 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java @@ -1,6 +1,12 @@ package com.iqser.red.service.redaction.v1.server.redaction.model; -import static com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary.RECOMMENDATION_PREFIX; +import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; +import 
com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils; +import com.iqser.red.service.redaction.v1.server.redaction.utils.Patterns; +import lombok.Builder; +import lombok.Data; +import lombok.extern.slf4j.Slf4j; +import org.apache.commons.lang3.StringUtils; import java.util.Collection; import java.util.HashMap; @@ -11,15 +17,7 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; -import org.apache.commons.lang3.StringUtils; - -import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; -import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils; -import com.iqser.red.service.redaction.v1.server.redaction.utils.Patterns; - -import lombok.Builder; -import lombok.Data; -import lombok.extern.slf4j.Slf4j; +import static com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary.RECOMMENDATION_PREFIX; @Data @Slf4j @@ -413,7 +411,7 @@ public class Section { String text = caseInsensitive ? searchText.toLowerCase() : searchText; String searchValue = caseInsensitive ? 
value.toLowerCase() : value; - Set found = EntitySearchUtils.find(text, Set.of(searchValue), asType, headline, sectionNumber, true); + Set found = EntitySearchUtils.find(text, Set.of(searchValue), asType, headline, sectionNumber, true, false); found.forEach(entity -> { if (redacted) { @@ -439,7 +437,7 @@ public class Section { } else { String word = value.toString(); - Entity entity = new Entity(word, type, value.getRowSpanStart(), value.getRowSpanStart() + word.length(), headline, sectionNumber, false); + Entity entity = new Entity(word, type, value.getRowSpanStart(), value.getRowSpanStart() + word.length(), headline, sectionNumber, false, false); entity.setRedaction(redact); entity.setMatchedRule(ruleNumber); entity.setRedactionReason(reason); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/AnalyzeResponseService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/AnalyzeResponseService.java new file mode 100644 index 00000000..ca2962c2 --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/AnalyzeResponseService.java @@ -0,0 +1,48 @@ +package com.iqser.red.service.redaction.v1.server.redaction.service; + +import com.iqser.red.service.redaction.v1.model.AnalyzeResult; +import com.iqser.red.service.redaction.v1.model.RedactionChangeLog; +import com.iqser.red.service.redaction.v1.model.RedactionLog; +import com.iqser.red.service.redaction.v1.model.RedactionLogEntry; +import org.springframework.stereotype.Service; + +@Service +public class AnalyzeResponseService { + + public AnalyzeResult createAnalyzeResponse(String projectId, String fileId, long duration, int pageCount, RedactionLog redactionLog, RedactionChangeLog redactionChangeLog) { + boolean hasHints = 
redactionLog.getRedactionLogEntry().stream().anyMatch(RedactionLogEntry::isHint); + + boolean hasRequests = redactionLog.getRedactionLogEntry() + .stream() + .anyMatch(entry -> entry.isManual() && entry.getStatus() + .equals(com.iqser.red.service.redaction.v1.model.Status.REQUESTED)); + + boolean hasRedactions = redactionLog.getRedactionLogEntry() + .stream() + .anyMatch(entry -> entry.isRedacted() && !entry.isManual() || entry.isManual() && entry.getStatus() + .equals(com.iqser.red.service.redaction.v1.model.Status.APPROVED)); + + boolean hasImages = redactionLog.getRedactionLogEntry() + .stream() + .anyMatch(entry -> entry.isHint() && entry.getType().equals("image")); + + boolean hasUpdates = redactionChangeLog != null && redactionChangeLog.getRedactionLogEntry() != null && !redactionChangeLog + .getRedactionLogEntry() + .isEmpty() && redactionChangeLog.getRedactionLogEntry().stream().anyMatch(entry -> !entry.getType().equals("false_positive")); + + return AnalyzeResult.builder() + .projectId(projectId) + .fileId(fileId) + .duration(duration) + .numberOfPages(pageCount) + .hasHints(hasHints) + .hasRedactions(hasRedactions) + .hasRequests(hasRequests) + .hasImages(hasImages) + .hasUpdates(hasUpdates) + .rulesVersion(redactionLog.getRulesVersion()) + .dictionaryVersion(redactionLog.getDictionaryVersion()) + .dossierDictionaryVersion(redactionLog.getDossierDictionaryVersion()) + .build(); + } +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/AnnotationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/AnnotationService.java index 9af17d2f..dd0d1b75 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/AnnotationService.java +++ 
b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/AnnotationService.java @@ -1,14 +1,7 @@ package com.iqser.red.service.redaction.v1.server.redaction.service; -import java.awt.Color; -import java.io.IOException; -import java.util.ArrayList; -import java.util.GregorianCalendar; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; - +import com.iqser.red.service.redaction.v1.model.*; +import lombok.RequiredArgsConstructor; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPageContentStream; @@ -21,15 +14,14 @@ import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationText; import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationTextMarkup; import org.springframework.stereotype.Service; -import com.iqser.red.service.redaction.v1.model.CellRectangle; -import com.iqser.red.service.redaction.v1.model.Comment; -import com.iqser.red.service.redaction.v1.model.Rectangle; -import com.iqser.red.service.redaction.v1.model.RedactionLog; -import com.iqser.red.service.redaction.v1.model.RedactionLogEntry; -import com.iqser.red.service.redaction.v1.model.SectionGrid; -import com.iqser.red.service.redaction.v1.model.SectionRectangle; - -import lombok.RequiredArgsConstructor; +import java.awt.Color; +import java.io.IOException; +import java.util.ArrayList; +import java.util.GregorianCalendar; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; @Service @RequiredArgsConstructor diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/DictionaryService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/DictionaryService.java index 8fd948b1..d9577bd0 
100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/DictionaryService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/DictionaryService.java @@ -1,19 +1,6 @@ package com.iqser.red.service.redaction.v1.server.redaction.service; -import java.awt.Color; -import java.util.ArrayList; -import java.util.Comparator; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Locale; -import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; - -import org.apache.commons.collections4.CollectionUtils; -import org.apache.commons.lang3.SerializationUtils; -import org.springframework.stereotype.Service; +import static com.iqser.red.service.configuration.v1.api.resource.DictionaryResource.GLOBAL_DOSSIER; import com.iqser.red.service.configuration.v1.api.model.Colors; import com.iqser.red.service.configuration.v1.api.model.DictionaryEntry; @@ -25,10 +12,18 @@ import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncre import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncrementValue; import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryModel; import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryRepresentation; +import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryVersion; import feign.FeignException; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; +import org.apache.commons.collections4.CollectionUtils; +import org.apache.commons.lang3.SerializationUtils; +import org.springframework.stereotype.Service; + +import java.awt.Color; +import java.util.*; +import java.util.stream.Collectors; @Slf4j @Service @@ -37,53 +32,69 @@ public class DictionaryService { private final DictionaryClient dictionaryClient; - private Map 
dictionariesByRuleSets = new HashMap<>(); + private final Map dictionariesByRuleSets = new HashMap<>(); + private final Map dictionariesByDossier = new HashMap<>(); - public long updateDictionary(String ruleSetId) { + public DictionaryVersion updateDictionary(String ruleSetId, String dossierId) { - long version = dictionaryClient.getVersion(ruleSetId); - - var foundDictionary = dictionariesByRuleSets.get(ruleSetId); - - if (foundDictionary == null || version > foundDictionary.getDictionaryVersion()) { - updateDictionaryEntry(ruleSetId, version); + long rulesetDictionaryVersion = dictionaryClient.getVersion(ruleSetId, GLOBAL_DOSSIER); + var rulesetDictionary = dictionariesByRuleSets.get(ruleSetId); + if (rulesetDictionary == null || rulesetDictionaryVersion > rulesetDictionary.getDictionaryVersion()) { + updateDictionaryEntry(ruleSetId, rulesetDictionaryVersion, GLOBAL_DOSSIER); } - return version; + long dossierDictionaryVersion = dictionaryClient.getVersion(ruleSetId, dossierId); + var dossierDictionary = dictionariesByDossier.get(dossierId); + if (dossierDictionary == null || dossierDictionaryVersion > dossierDictionary.getDictionaryVersion()) { + updateDictionaryEntry(ruleSetId, dossierDictionaryVersion, dossierId); + } + + return DictionaryVersion.builder().rulesetVersion(rulesetDictionaryVersion).dossierVersion(dossierDictionaryVersion).build(); } - public DictionaryIncrement getDictionaryIncrements(String ruleSetId, long fromVersion) { + public DictionaryIncrement getDictionaryIncrements(String ruleSetId, DictionaryVersion fromVersion, String dossierId) { - long version = updateDictionary(ruleSetId); + DictionaryVersion version = updateDictionary(ruleSetId, dossierId); Set newValues = new HashSet<>(); List dictionaryModels = dictionariesByRuleSets.get(ruleSetId).getDictionary(); dictionaryModels.forEach(dictionaryModel -> { dictionaryModel.getEntries().forEach(dictionaryEntry -> { - if (dictionaryEntry.getVersion() > fromVersion) { + if 
(dictionaryEntry.getVersion() > fromVersion.getRulesetVersion()) { newValues.add(new DictionaryIncrementValue(dictionaryEntry.getValue(), dictionaryModel.isCaseInsensitive())); } }); }); + if(dictionariesByDossier.containsKey(dossierId)) { + dictionaryModels = dictionariesByDossier.get(dossierId).getDictionary(); + dictionaryModels.forEach(dictionaryModel -> { + dictionaryModel.getEntries().forEach(dictionaryEntry -> { + if (dictionaryEntry.getVersion() > fromVersion.getDossierVersion()) { + newValues.add(new DictionaryIncrementValue(dictionaryEntry.getValue(), dictionaryModel.isCaseInsensitive())); + } + }); + }); + } + return new DictionaryIncrement(newValues, version); } - private void updateDictionaryEntry(String ruleSetId, long version) { + private void updateDictionaryEntry(String ruleSetId, long version, String dossierId) { try { DictionaryRepresentation dictionaryRepresentation = new DictionaryRepresentation(); - TypeResponse typeResponse = dictionaryClient.getAllTypes(ruleSetId); + TypeResponse typeResponse = dictionaryClient.getAllTypes(ruleSetId, dossierId); if (typeResponse != null && CollectionUtils.isNotEmpty(typeResponse.getTypes())) { List dictionary = typeResponse.getTypes() .stream() .map(t -> new DictionaryModel(t.getType(), t.getRank(), convertColor(t.getHexColor()), t.isCaseInsensitive(), t - .isHint(), t.isRecommendation(), convertEntries(t), new HashSet<>())) + .isHint(), t.isRecommendation(), convertEntries(t, dossierId), new HashSet<>(),dossierId.equals(GLOBAL_DOSSIER) ? 
false : true)) .sorted(Comparator.comparingInt(DictionaryModel::getRank).reversed()) .collect(Collectors.toList()); @@ -99,7 +110,11 @@ public class DictionaryService { dictionaryRepresentation.setDictionaryVersion(version); dictionaryRepresentation.setDictionary(dictionary); - dictionariesByRuleSets.put(ruleSetId, dictionaryRepresentation); + if(dossierId.equals(GLOBAL_DOSSIER)) { + dictionariesByRuleSets.put(ruleSetId, dictionaryRepresentation); + } else { + dictionariesByDossier.put(dossierId, dictionaryRepresentation); + } } } catch (FeignException e) { log.warn("Got some unknown feignException", e); @@ -112,19 +127,19 @@ public class DictionaryService { dictionary.getDictionaryModels().forEach(dm -> { if (dm.isRecommendation() && !dm.getLocalEntries().isEmpty()) { - dictionaryClient.addEntries(dm.getType(), ruleSetId, new ArrayList<>(dm.getLocalEntries()), false); - long externalVersion = dictionaryClient.getVersion(ruleSetId); - if (externalVersion == dictionary.getVersion() + 1) { - dictionary.setVersion(externalVersion); + dictionaryClient.addEntries(dm.getType(), ruleSetId, new ArrayList<>(dm.getLocalEntries()), false, GLOBAL_DOSSIER); + long externalVersion = dictionaryClient.getVersion(ruleSetId, GLOBAL_DOSSIER); + if (externalVersion == dictionary.getVersion().getRulesetVersion() + 1) { + dictionary.getVersion().setRulesetVersion(externalVersion); } } }); } - private Set convertEntries(TypeResult t) { + private Set convertEntries(TypeResult t, String dossierId) { - Set entries = new HashSet<>(dictionaryClient.getDictionaryForType(t.getType(), t.getRuleSetId()) + Set entries = new HashSet<>(dictionaryClient.getDictionaryForType(t.getType(), t.getRuleSetId(), dossierId) .getEntries()); if (t.isCaseInsensitive()) { @@ -181,17 +196,26 @@ public class DictionaryService { } - public Dictionary getDeepCopyDictionary(String ruleSetId) { + public Dictionary getDeepCopyDictionary(String ruleSetId, String dossierId) { List copy = new ArrayList<>(); - var 
representation = dictionariesByRuleSets.get(ruleSetId); - var dictionary = dictionariesByRuleSets.get(ruleSetId).getDictionary(); - dictionary.forEach(dm -> { + var rulesetRepresentation = dictionariesByRuleSets.get(ruleSetId); + rulesetRepresentation.getDictionary().forEach(dm -> { copy.add(SerializationUtils.clone(dm)); }); - return new Dictionary(copy, representation.getDictionaryVersion()); + //TODO merge dictionaries if they have same names + long dossierDictionaryVersion = -1; + if(dictionariesByDossier.containsKey(dossierId)) { + var dossierRepresentation = dictionariesByDossier.get(dossierId); + dossierRepresentation.getDictionary().forEach(dm -> { + copy.add(SerializationUtils.clone(dm)); + }); + dossierDictionaryVersion = dossierRepresentation.getDictionaryVersion(); + } + + return new Dictionary(copy, DictionaryVersion.builder().rulesetVersion(rulesetRepresentation.getDictionaryVersion()).dossierVersion(dossierDictionaryVersion).build()); } @@ -212,4 +236,4 @@ public class DictionaryService { return dictionariesByRuleSets.get(ruleSetId).getRequestAddColor(); } -} \ No newline at end of file +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/DroolsExecutionService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/DroolsExecutionService.java index 7b8233b5..708efa12 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/DroolsExecutionService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/DroolsExecutionService.java @@ -1,11 +1,10 @@ package com.iqser.red.service.redaction.v1.server.redaction.service; -import java.io.ByteArrayInputStream; -import java.io.InputStream; -import java.nio.charset.StandardCharsets; -import 
java.util.HashMap; -import java.util.Map; - +import com.iqser.red.service.configuration.v1.api.model.RulesResponse; +import com.iqser.red.service.redaction.v1.server.client.RulesClient; +import com.iqser.red.service.redaction.v1.server.exception.RulesValidationException; +import com.iqser.red.service.redaction.v1.server.redaction.model.Section; +import lombok.RequiredArgsConstructor; import org.apache.commons.lang3.StringUtils; import org.kie.api.KieServices; import org.kie.api.builder.KieBuilder; @@ -15,12 +14,11 @@ import org.kie.api.runtime.KieContainer; import org.kie.api.runtime.KieSession; import org.springframework.stereotype.Service; -import com.iqser.red.service.configuration.v1.api.model.RulesResponse; -import com.iqser.red.service.redaction.v1.server.client.RulesClient; -import com.iqser.red.service.redaction.v1.server.exception.RulesValidationException; -import com.iqser.red.service.redaction.v1.server.redaction.model.Section; - -import lombok.RequiredArgsConstructor; +import java.io.ByteArrayInputStream; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.util.HashMap; +import java.util.Map; @Service @RequiredArgsConstructor @@ -28,9 +26,9 @@ public class DroolsExecutionService { private final RulesClient rulesClient; - private Map kieContainers = new HashMap<>(); + private final Map kieContainers = new HashMap<>(); - private Map rulesVersionPerRuleSetId = new HashMap<>(); + private final Map rulesVersionPerRuleSetId = new HashMap<>(); public KieContainer getKieContainer(String ruleSetId) { @@ -133,4 +131,4 @@ public class DroolsExecutionService { return rulesVersion.longValue(); } -} \ No newline at end of file +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java 
b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java index b26d2111..3e8875af 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java @@ -1,50 +1,27 @@ package com.iqser.red.service.redaction.v1.server.redaction.service; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Locale; -import java.util.Map; -import java.util.Set; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -import org.apache.commons.collections4.CollectionUtils; -import org.apache.commons.lang3.StringUtils; -import org.kie.api.runtime.KieContainer; -import org.springframework.stereotype.Service; - import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry; import com.iqser.red.service.redaction.v1.model.ManualRedactions; import com.iqser.red.service.redaction.v1.model.Point; import com.iqser.red.service.redaction.v1.model.Rectangle; import com.iqser.red.service.redaction.v1.model.SectionArea; -import com.iqser.red.service.redaction.v1.model.SectionText; -import com.iqser.red.service.redaction.v1.server.classification.model.Document; -import com.iqser.red.service.redaction.v1.server.classification.model.Footer; -import com.iqser.red.service.redaction.v1.server.classification.model.Header; -import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph; -import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; -import com.iqser.red.service.redaction.v1.server.classification.model.UnclassifiedText; -import 
com.iqser.red.service.redaction.v1.server.redaction.model.CellValue; +import com.iqser.red.service.redaction.v1.server.classification.model.*; import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary; -import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryModel; -import com.iqser.red.service.redaction.v1.server.redaction.model.Entity; -import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence; -import com.iqser.red.service.redaction.v1.server.redaction.model.Image; -import com.iqser.red.service.redaction.v1.server.redaction.model.ImageType; -import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage; -import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText; -import com.iqser.red.service.redaction.v1.server.redaction.model.Section; -import com.iqser.red.service.redaction.v1.server.redaction.model.SectionSearchableTextPair; +import com.iqser.red.service.redaction.v1.server.redaction.model.*; import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; - import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; +import org.apache.commons.collections4.CollectionUtils; +import org.apache.commons.lang3.StringUtils; +import org.kie.api.runtime.KieContainer; +import org.springframework.stereotype.Service; + +import java.util.*; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; +import java.util.stream.Stream; @Slf4j @Service @@ -56,13 +33,13 @@ public class EntityRedactionService { private final SurroundingWordsService surroundingWordsService; - public void processDocument(Document classifiedDoc, String ruleSetId, ManualRedactions manualRedactions) { + public void processDocument(Document classifiedDoc, String ruleSetId, ManualRedactions 
manualRedactions, String dossierId) { - dictionaryService.updateDictionary(ruleSetId); + dictionaryService.updateDictionary(ruleSetId, dossierId); KieContainer container = droolsExecutionService.updateRules(ruleSetId); long rulesVersion = droolsExecutionService.getRulesVersion(ruleSetId); - Dictionary dictionary = dictionaryService.getDeepCopyDictionary(ruleSetId); + Dictionary dictionary = dictionaryService.getDeepCopyDictionary(ruleSetId, dossierId); Set documentEntities = new HashSet<>(findEntities(classifiedDoc, container, manualRedactions, dictionary, false, null)); @@ -94,7 +71,7 @@ public class EntityRedactionService { .add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry .getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber(), entity .getLegalBasis(), entity.isDictionaryEntry(), entity.getTextBefore(), entity.getTextAfter(), entity - .getStart(), entity.getEnd())); + .getStart(), entity.getEnd(), entity.isDossierDictionaryEntry())); } } @@ -210,6 +187,7 @@ public class EntityRedactionService { .get(0) .getPage()); sectionText.getSectionAreas().add(sectionArea); + sectionText.getTextBlocks().addAll(cell.getTextBlocks()); addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber.intValue()); int cellStart = start; @@ -258,6 +236,8 @@ public class EntityRedactionService { sectionText.setHeadline(table.getHeadline()); sectionText.setSectionNumber(sectionNumber.intValue()); sectionText.setTable(true); + sectionText.setTabularData(tabularData); + sectionText.setCellStarts(cellStarts); classifiedDoc.getSectionText().add(sectionText); } @@ -290,6 +270,7 @@ public class EntityRedactionService { .getSequences() .get(0) .getPage()); + sectionText.getTextBlocks().addAll(cell.getTextBlocks()); sectionText.getSectionAreas().add(sectionArea); } @@ -348,6 +329,10 @@ public class EntityRedactionService { sectionText.setHeadline(headline); 
sectionText.setSectionNumber(sectionNumber.intValue()); sectionText.setTable(false); + sectionText.setImages(images.stream() + .map(image -> convert(image, sectionNumber.intValue(), headline)) + .collect(Collectors.toSet())); + sectionText.setTextBlocks(paragraphTextBlocks); classifiedDoc.getSectionText().add(sectionText); } @@ -386,9 +371,9 @@ public class EntityRedactionService { String lowercaseInputString = searchableString.toLowerCase(); for (DictionaryModel model : dictionary.getDictionaryModels()) { if (model.isCaseInsensitive()) { - found.addAll(EntitySearchUtils.find(lowercaseInputString, model.getValues(local), model.getType(), headline, sectionNumber, local)); + found.addAll(EntitySearchUtils.find(lowercaseInputString, model.getValues(local), model.getType(), headline, sectionNumber, local, model.isDossierDictionary())); } else { - found.addAll(EntitySearchUtils.find(searchableString, model.getValues(local), model.getType(), headline, sectionNumber, local)); + found.addAll(EntitySearchUtils.find(searchableString, model.getValues(local), model.getType(), headline, sectionNumber, local, model.isDossierDictionary())); } } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ImageClassificationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ImageClassificationService.java index 94dc3a94..a845af9c 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ImageClassificationService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ImageClassificationService.java @@ -1,21 +1,17 @@ package com.iqser.red.service.redaction.v1.server.redaction.service; -import java.io.ByteArrayOutputStream; -import java.io.IOException; - -import 
javax.imageio.ImageIO; - -import org.springframework.stereotype.Service; - -import com.iqser.red.service.redaction.v1.server.classification.model.Document; +import com.iqser.red.service.redaction.v1.server.classification.model.Page; import com.iqser.red.service.redaction.v1.server.client.ImageClassificationClient; import com.iqser.red.service.redaction.v1.server.client.ImageClassificationResponse; import com.iqser.red.service.redaction.v1.server.client.MockMultipartFile; import com.iqser.red.service.redaction.v1.server.redaction.model.ImageType; import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings; - import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; +import org.springframework.stereotype.Service; + +import javax.imageio.ImageIO; +import java.io.ByteArrayOutputStream; @Slf4j @Service @@ -26,37 +22,41 @@ public class ImageClassificationService { private final RedactionServiceSettings settings; - public void classifyImages(Document classifiedDoc) { + public void classifyImages(Page page) { - long start = System.currentTimeMillis(); - classifiedDoc.getPages().forEach(page -> { - page.getImages().forEach(image -> { + page.getImages().forEach(image -> { - if (settings.isEnableImageClassification()) { - try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { - ImageIO.write(image.getImage(), "png", baos); - ImageClassificationResponse response = imageClassificationClient.classify(new MockMultipartFile("file", "Image.png", "image/png", baos - .toByteArray())); - image.setImageType(ImageType.valueOf(response.getCategory())); + if (settings.isEnableImageClassification()) { - } catch (IOException e) { - log.error("Could not classify image", e); - } - } else { + long start = System.currentTimeMillis(); + try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + ImageIO.write(image.getImage(), "png", baos); + var mockFile = new MockMultipartFile("file", "Image.png", "image/png", baos.toByteArray()); + 
ImageClassificationResponse response = imageClassificationClient.classify(mockFile); + image.setImageType(ImageType.valueOf(response.getCategory())); + } catch (Exception e) { + log.error("Could not classify image", e); image.setImageType(ImageType.OTHER); } - if (image.getImageType().equals(ImageType.OTHER)) { - page.getTextBlocks().forEach(textblock -> { - if (image.getPosition() - .contains(textblock.getMinX(), textblock.getMinY(), textblock.getWidth(), textblock.getHeight())) { - image.setImageType(ImageType.OCR); - } - }); - } - }); + log.info("Image classification took: " + (System.currentTimeMillis() - start)); + } else { + image.setImageType(ImageType.OTHER); + } + + image.getImage().flush(); + image.setImage(null); + + if (image.getImageType().equals(ImageType.OTHER)) { + page.getTextBlocks().forEach(textblock -> { + if (image.getPosition() + .contains(textblock.getMinX(), textblock.getMinY(), textblock.getWidth(), textblock.getHeight())) { + image.setImageType(ImageType.OCR); + } + }); + } }); - log.info("Image classification took: " + (System.currentTimeMillis() - start)); + } } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java index 5ea56c79..edcdbee5 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java @@ -1,53 +1,30 @@ package com.iqser.red.service.redaction.v1.server.redaction.service; -import java.awt.geom.Rectangle2D; -import java.io.ByteArrayInputStream; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; -import 
java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; -import java.util.stream.Stream; +import com.iqser.red.service.file.management.v1.api.model.FileType; +import com.iqser.red.service.redaction.v1.model.*; +import com.iqser.red.service.redaction.v1.server.classification.model.Document; +import com.iqser.red.service.redaction.v1.server.classification.model.SectionText; +import com.iqser.red.service.redaction.v1.server.classification.model.Text; +import com.iqser.red.service.redaction.v1.server.exception.RedactionException; +import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary; +import com.iqser.red.service.redaction.v1.server.redaction.model.*; +import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils; +import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService; +import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService; + +import lombok.RequiredArgsConstructor; +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.pdmodel.PDPage; -import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.kie.api.runtime.KieContainer; import org.springframework.stereotype.Service; import org.springframework.web.bind.annotation.RequestBody; -import com.iqser.red.service.redaction.v1.model.Comment; -import com.iqser.red.service.redaction.v1.model.IdRemoval; -import com.iqser.red.service.redaction.v1.model.ManualForceRedact; -import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry; -import com.iqser.red.service.redaction.v1.model.ManualRedactions; -import com.iqser.red.service.redaction.v1.model.ReanalyzeResult; -import com.iqser.red.service.redaction.v1.model.Rectangle; -import com.iqser.red.service.redaction.v1.model.RedactionLogEntry; -import com.iqser.red.service.redaction.v1.model.RenalyzeRequest; -import 
com.iqser.red.service.redaction.v1.model.SectionArea; -import com.iqser.red.service.redaction.v1.model.SectionText; -import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; -import com.iqser.red.service.redaction.v1.server.exception.RedactionException; -import com.iqser.red.service.redaction.v1.server.parsing.PDFAreaTextStripper; -import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; -import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue; -import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary; -import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncrement; -import com.iqser.red.service.redaction.v1.server.redaction.model.Entity; -import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence; -import com.iqser.red.service.redaction.v1.server.redaction.model.Image; -import com.iqser.red.service.redaction.v1.server.redaction.model.ReanalysisSection; -import com.iqser.red.service.redaction.v1.server.redaction.model.Section; -import com.iqser.red.service.redaction.v1.server.redaction.model.SectionSearchableTextPair; -import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; - -import lombok.RequiredArgsConstructor; +import java.util.*; +import java.util.stream.Collectors; +import java.util.stream.Stream; +@Slf4j @Service @RequiredArgsConstructor public class ReanalyzeService { @@ -57,39 +34,98 @@ public class ReanalyzeService { private final SurroundingWordsService surroundingWordsService; private final EntityRedactionService entityRedactionService; private final RedactionLogCreatorService redactionLogCreatorService; + private final RedactionStorageService redactionStorageService; + private final PdfSegmentationService pdfSegmentationService; + private final RedactionChangeLogService redactionChangeLogService; + private 
final AnalyzeResponseService analyzeResponseService; - public ReanalyzeResult reanalyze(@RequestBody RenalyzeRequest renalyzeRequest) { + public AnalyzeResult analyze(AnalyzeRequest analyzeRequest) { - DictionaryIncrement dictionaryIncrement = dictionaryService.getDictionaryIncrements(renalyzeRequest.getRuleSetId(), renalyzeRequest - .getRedactionLog() - .getDictionaryVersion()); + long startTime = System.currentTimeMillis(); - Set manualForceAndRemoveIds = getForceAndRemoveIds(renalyzeRequest.getManualRedactions()); + var pageCount = 0; + Document classifiedDoc; + + try { + var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(analyzeRequest + .getProjectId(), analyzeRequest.getFileId(), FileType.ORIGIN)); + classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream); + pageCount = classifiedDoc.getPages().size(); + } catch (Exception e) { + throw new RedactionException(e); + } + log.info("Document structure analysis successful, starting redaction analysis..."); + + entityRedactionService.processDocument(classifiedDoc, analyzeRequest.getRuleSetId(), analyzeRequest.getManualRedactions(), analyzeRequest + .getProjectId()); + redactionLogCreatorService.createRedactionLog(classifiedDoc, pageCount, analyzeRequest.getManualRedactions(), analyzeRequest + .getRuleSetId()); + + log.info("Redaction analysis successful..."); + + var redactionLog = new RedactionLog(classifiedDoc.getRedactionLogEntities(), classifiedDoc.getDictionaryVersion() + .getRulesetVersion(), classifiedDoc.getRulesVersion(), analyzeRequest.getRuleSetId(), classifiedDoc.getDictionaryVersion() + .getDossierVersion()); + + log.info("Analyzed with rules {} and dictionary {} for ruleSet: {}", classifiedDoc.getRulesVersion(), classifiedDoc + .getDictionaryVersion(), analyzeRequest.getRuleSetId()); + + // first create changelog - this only happens when we migrate files analyzed via the old process and we don't want to lose changeLog
data + var changeLog = redactionChangeLogService.createAndStoreChangeLog(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), redactionLog); + // store redactionLog + redactionStorageService.storeObject(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog); + redactionStorageService.storeObject(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.TEXT, new Text(pageCount, classifiedDoc + .getSectionText())); + redactionStorageService.storeObject(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.SECTION_GRID, classifiedDoc + .getSectionGrid()); + + long duration = System.currentTimeMillis() - startTime; + return analyzeResponseService.createAnalyzeResponse(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), duration, pageCount, redactionLog, changeLog); + } + + + @SneakyThrows + public AnalyzeResult reanalyze(@RequestBody AnalyzeRequest analyzeRequest) { + + long startTime = System.currentTimeMillis(); + + var redactionLog = redactionStorageService.getRedactionLog(analyzeRequest.getProjectId(), analyzeRequest.getFileId()); + var text = redactionStorageService.getText(analyzeRequest.getProjectId(), analyzeRequest.getFileId()); + + // not yet ready for reanalysis + if (redactionLog == null || text == null || text.getNumberOfPages() == 0) { + return analyze(analyzeRequest); + } + + DictionaryIncrement dictionaryIncrement = dictionaryService.getDictionaryIncrements(analyzeRequest.getRuleSetId(), new DictionaryVersion(redactionLog + .getDictionaryVersion(), redactionLog.getDossierDictionaryVersion()), analyzeRequest.getProjectId()); + + Set manualForceAndRemoveIds = getForceAndRemoveIds(analyzeRequest.getManualRedactions()); Map> comments = null; Set manualAdds = null; - if (renalyzeRequest.getManualRedactions() != null) { + if (analyzeRequest.getManualRedactions() != null) { // TODO comments will be removed from redactionLog, so we ignore this first. 
- comments = renalyzeRequest.getManualRedactions().getComments(); - manualAdds = renalyzeRequest.getManualRedactions().getEntriesToAdd(); + comments = analyzeRequest.getManualRedactions().getComments(); + manualAdds = analyzeRequest.getManualRedactions().getEntriesToAdd(); } - Set sectionsToReanaylse = new HashSet<>(); + Set sectionsToReanalyse = new HashSet<>(); Map> imageEntries = new HashMap<>(); - for (RedactionLogEntry entry : renalyzeRequest.getRedactionLog().getRedactionLogEntry()) { + for (RedactionLogEntry entry : redactionLog.getRedactionLogEntry()) { if (entry.isManual() || manualForceAndRemoveIds.contains(entry.getId())) { - sectionsToReanaylse.add(entry.getSectionNumber()); + sectionsToReanalyse.add(entry.getSectionNumber()); } if (entry.isImage() || entry.getType().equals("image")) { imageEntries.computeIfAbsent(entry.getSectionNumber(), x -> new HashSet<>()).add(convert(entry)); } } - for (SectionText sectionText : renalyzeRequest.getText().getSectionTexts()) { + for (SectionText sectionText : text.getSectionTexts()) { if (EntitySearchUtils.sectionContainsAny(sectionText.getText(), dictionaryIncrement.getValues())) { - sectionsToReanaylse.add(sectionText.getSectionNumber()); + sectionsToReanalyse.add(sectionText.getSectionNumber()); } if (manualAdds != null) { @@ -106,187 +142,123 @@ public class ReanalyzeService { } } - if (sectionsToReanaylse.isEmpty() && (manualAdds == null || manualAdds.isEmpty())) { - renalyzeRequest.getRedactionLog().setDictionaryVersion(dictionaryIncrement.getDictionaryVersion()); - return ReanalyzeResult.builder().redactionLog(renalyzeRequest.getRedactionLog()).build(); + log.info("Should reanalyze {} sections for request: {}", sectionsToReanalyse.size(), analyzeRequest); + + if (sectionsToReanalyse.isEmpty() && (manualAdds == null || manualAdds.isEmpty())) { + return finalizeAnalysis(analyzeRequest, startTime, redactionLog, text, dictionaryIncrement); } - try (PDDocument pdDocument = PDDocument.load(new 
ByteArrayInputStream(renalyzeRequest.getDocument()))) { + List reanalysisSections = new ArrayList<>(); - List reanalysisSections = new ArrayList<>(); - for (SectionText sectionText : renalyzeRequest.getText().getSectionTexts()) { + for (SectionText sectionText : text.getSectionTexts()) { - if (!sectionsToReanaylse.contains(sectionText.getSectionNumber())) { - continue; - } + if (sectionsToReanalyse.contains(sectionText.getSectionNumber())) { + reanalysisSections.add(sectionText); + } + } - ReanalysisSection reanalysisSection = new ReanalysisSection(); - reanalysisSection.setHeadline(sectionText.getHeadline()); - reanalysisSection.setSectionNumber(sectionText.getSectionNumber()); - List textBlocks = new ArrayList<>(); + //-- - Map> sectionAreasPerPage = new HashMap<>(); - for (SectionArea sectionArea : sectionText.getSectionAreas()) { - sectionAreasPerPage.computeIfAbsent(sectionArea.getPage(), (x) -> new ArrayList<>()) - .add(sectionArea); - } + KieContainer kieContainer = droolsExecutionService.updateRules(analyzeRequest.getRuleSetId()); - Map tabularData = new HashMap<>(); - List cellStarts = new ArrayList<>(); - for (Integer page : sectionAreasPerPage.keySet()) { - List areasOnPage = sectionAreasPerPage.get(page); + Dictionary dictionary = dictionaryService.getDeepCopyDictionary(analyzeRequest.getRuleSetId(), analyzeRequest.getProjectId()); - PDPage pdPage = pdDocument.getPage(page - 1); - PDRectangle cropBox = pdPage.getCropBox(); - PDFAreaTextStripper textStripper = new PDFAreaTextStripper(); - textStripper.setPageNumber(page); + List sectionSearchableTextPairs = new ArrayList<>(); + for (SectionText reanalysisSection : reanalysisSections) { - int cellStart = 0; - for (SectionArea sectionArea : areasOnPage) { - - Rectangle2D rect = null; - if (pdPage.getRotation() == 90) { - rect = new Rectangle2D.Float(sectionArea.getTopLeft().getY(), sectionArea.getTopLeft() - .getX(), sectionArea.getHeight(), sectionArea.getWidth() + 0.001f); - } else { - rect = new 
Rectangle2D.Float(sectionArea.getTopLeft().getX(), -sectionArea.getTopLeft() - .getY() + cropBox.getUpperRightY() - sectionArea.getHeight(), sectionArea.getWidth(), sectionArea - .getHeight() + 0.001f); - } - - textStripper.addRegion(String.valueOf(1), rect); - textStripper.extractRegions(pdPage); - textStripper.getTextForRegion(String.valueOf(1)); - List positions = textStripper.getTextPositionSequences(); - - TextBlock textBlock = new TextBlock(sectionArea.getTopLeft().getX(), sectionArea.getTopLeft() - .getX() + sectionArea.getWidth(), sectionArea.getTopLeft() - .getY(), sectionArea.getTopLeft().getY() + sectionArea.getHeight(), positions, 0); - - if (sectionText.isTable()) { - Cell cell = new Cell(); - cell.addTextBlock(textBlock); - tabularData.put(sectionArea.getHeader(), new CellValue(cell.getTextBlocks(), cellStart)); - cellStarts.add(cellStart); - cellStart = cellStart + cell.toString().trim().length() + 1; - } - - textBlocks.add(textBlock); - textStripper.clearPositions(); - } - - } - reanalysisSection.setTextBlocks(textBlocks); - reanalysisSection.setTabularData(tabularData); - - if (sectionText.isTable()) { - reanalysisSection.setCellStarts(cellStarts); - } - if (imageEntries.containsKey(sectionText.getSectionNumber())) { - reanalysisSection.getImages().addAll(imageEntries.get(sectionText.getSectionNumber())); - } - - reanalysisSections.add(reanalysisSection); + Set entities = entityRedactionService.findEntities(reanalysisSection.getSearchableText(), reanalysisSection + .getHeadline(), reanalysisSection.getSectionNumber(), dictionary, false); + if (reanalysisSection.getCellStarts() != null) { + surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary, reanalysisSection + .getCellStarts()); + } else { + surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary); } - //-- + sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder() + 
.isLocal(false) + .dictionaryTypes(dictionary.getTypes()) + .entities(entities) + .text(reanalysisSection.getSearchableText().getAsStringWithLinebreaks()) + .searchText(reanalysisSection.getSearchableText().toString()) + .headline(reanalysisSection.getHeadline()) + .sectionNumber(reanalysisSection.getSectionNumber()) + .tabularData(reanalysisSection.getTabularData()) + .searchableText(reanalysisSection.getSearchableText()) + .dictionary(dictionary) + .images(reanalysisSection.getImages()) + .build(), reanalysisSection.getSearchableText())); + } - KieContainer kieContainer = droolsExecutionService.updateRules(renalyzeRequest.getRuleSetId()); + Set entities = new HashSet<>(); + Map> imagesPerPage = new HashMap<>(); + sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> { + Section analysedRowSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair.getSection()); + entities.addAll(analysedRowSection.getEntities()); + EntitySearchUtils.removeEntitiesContainedInLarger(entities); - Dictionary dictionary = dictionaryService.getDeepCopyDictionary(renalyzeRequest.getRuleSetId()); - - List sectionSearchableTextPairs = new ArrayList<>(); - for (ReanalysisSection reanalysisSection : reanalysisSections) { - - Set entities = entityRedactionService.findEntities(reanalysisSection.getSearchableText(), reanalysisSection - .getHeadline(), reanalysisSection.getSectionNumber(), dictionary, false); - if (reanalysisSection.getCellStarts() != null) { - surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary, reanalysisSection - .getCellStarts()); - } else { - surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary); - } - - sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder() - .isLocal(false) - .dictionaryTypes(dictionary.getTypes()) - .entities(entities) - .text(reanalysisSection.getSearchableText().getAsStringWithLinebreaks()) - 
.searchText(reanalysisSection.getSearchableText().toString()) - .headline(reanalysisSection.getHeadline()) - .sectionNumber(reanalysisSection.getSectionNumber()) - .tabularData(reanalysisSection.getTabularData()) - .searchableText(reanalysisSection.getSearchableText()) - .dictionary(dictionary) - .images(reanalysisSection.getImages()) - .build(), reanalysisSection.getSearchableText())); + for (Image image : analysedRowSection.getImages()) { + imagesPerPage.computeIfAbsent(image.getPage(), (a) -> new HashSet<>()).add(image); } - Set entities = new HashSet<>(); - Map> imagesPerPage = new HashMap<>(); - sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> { - Section analysedRowSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair - .getSection()); - entities.addAll(analysedRowSection.getEntities()); - EntitySearchUtils.removeEntitiesContainedInLarger(entities); + }); - for (Image image : analysedRowSection.getImages()) { - imagesPerPage.computeIfAbsent(image.getPage(), (a) -> new HashSet<>()).add(image); - } - - }); - - Map> entitiesPerPage = new HashMap<>(); - for (Entity entity : entities) { - Map> sequenceOnPage = new HashMap<>(); - for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) { - sequenceOnPage.computeIfAbsent(entityPositionSequence.getPageNumber(), (x) -> new ArrayList<>()) - .add(entityPositionSequence); - } - - for (Map.Entry> entry : sequenceOnPage.entrySet()) { - entitiesPerPage.computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>()) - .add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry - .getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber(), entity - .getLegalBasis(), entity.isDictionaryEntry(), entity.getTextBefore(), entity.getTextAfter(), entity - .getStart(), entity.getEnd())); - } + Map> entitiesPerPage = new HashMap<>(); + for (Entity entity : entities) { + Map> sequenceOnPage = 
new HashMap<>(); + for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) { + sequenceOnPage.computeIfAbsent(entityPositionSequence.getPageNumber(), (x) -> new ArrayList<>()) + .add(entityPositionSequence); } - List newRedactionLogEntries = new ArrayList<>(); - for (int page = 1; page <= pdDocument.getNumberOfPages(); page++) { - if (entitiesPerPage.get(page) != null) { - newRedactionLogEntries.addAll(redactionLogCreatorService.addEntries(entitiesPerPage, renalyzeRequest - .getManualRedactions(), page, renalyzeRequest.getRuleSetId())); - } + for (Map.Entry> entry : sequenceOnPage.entrySet()) { + entitiesPerPage.computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>()) + .add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry + .getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber(), entity + .getLegalBasis(), entity.isDictionaryEntry(), entity.getTextBefore(), entity.getTextAfter(), entity + .getStart(), entity.getEnd(), entity.isDossierDictionaryEntry())); + } + } - if (imagesPerPage.get(page) != null) { - newRedactionLogEntries.addAll(redactionLogCreatorService.addImageEntries(imagesPerPage, renalyzeRequest - .getManualRedactions(), page, renalyzeRequest.getRuleSetId())); - } - - newRedactionLogEntries.addAll(redactionLogCreatorService.addManualAddEntries(manualAdds, comments, page, renalyzeRequest + List newRedactionLogEntries = new ArrayList<>(); + for (int page = 1; page <= text.getNumberOfPages(); page++) { + if (entitiesPerPage.get(page) != null) { + newRedactionLogEntries.addAll(redactionLogCreatorService.addEntries(entitiesPerPage, analyzeRequest.getManualRedactions(), page, analyzeRequest .getRuleSetId())); } - Iterator itty = renalyzeRequest.getRedactionLog().getRedactionLogEntry().iterator(); - while (itty.hasNext()) { - RedactionLogEntry entry = itty.next(); - if (sectionsToReanaylse.contains(entry.getSectionNumber())) { - itty.remove(); - } 
+ if (imagesPerPage.get(page) != null) { + newRedactionLogEntries.addAll(redactionLogCreatorService.addImageEntries(imagesPerPage, analyzeRequest.getManualRedactions(), page, analyzeRequest + .getRuleSetId())); } - renalyzeRequest.getRedactionLog().getRedactionLogEntry().addAll(newRedactionLogEntries); - - renalyzeRequest.getRedactionLog().setDictionaryVersion(dictionaryIncrement.getDictionaryVersion()); - - return ReanalyzeResult.builder().redactionLog(renalyzeRequest.getRedactionLog()).build(); - - } catch (Exception e) { - throw new RedactionException(e); + newRedactionLogEntries.addAll(redactionLogCreatorService.addManualAddEntries(manualAdds, comments, page, analyzeRequest + .getRuleSetId())); } + redactionLog.getRedactionLogEntry() + .removeIf(entry -> sectionsToReanalyse.contains(entry.getSectionNumber()) && !entry.isImage() || entry.getSectionNumber() == 0 && !entry + .isImage()); + redactionLog.getRedactionLogEntry().addAll(newRedactionLogEntries); + return finalizeAnalysis(analyzeRequest, startTime, redactionLog, text, dictionaryIncrement); + + } + + + private AnalyzeResult finalizeAnalysis(@RequestBody AnalyzeRequest analyzeRequest, long startTime, + RedactionLog redactionLog, Text text, + DictionaryIncrement dictionaryIncrement) { + + redactionLog.setDictionaryVersion(dictionaryIncrement.getDictionaryVersion().getRulesetVersion()); + redactionLog.setDossierDictionaryVersion(dictionaryIncrement.getDictionaryVersion().getDossierVersion()); + + var changeLog = redactionChangeLogService.createAndStoreChangeLog(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), redactionLog); + redactionStorageService.storeObject(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog); + + long duration = System.currentTimeMillis() - startTime; + + return analyzeResponseService.createAnalyzeResponse(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), duration, text + .getNumberOfPages(), redactionLog, changeLog); } @@ 
-309,7 +281,7 @@ public class ReanalyzeService { return Image.builder() .type(entry.getType()) - .position(new Rectangle2D.Float(position.getTopLeft().getX(), position.getTopLeft() + .position(new RedRectangle2D(position.getTopLeft().getX(), position.getTopLeft() .getY(), position.getWidth(), position.getHeight())) .sectionNumber(entry.getSectionNumber()) .section(entry.getSection()) diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/RedactionChangeLogService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/RedactionChangeLogService.java new file mode 100644 index 00000000..72bb7954 --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/RedactionChangeLogService.java @@ -0,0 +1,93 @@ +package com.iqser.red.service.redaction.v1.server.redaction.service; + +import com.iqser.red.service.file.management.v1.api.model.FileType; +import com.iqser.red.service.redaction.v1.model.ChangeType; +import com.iqser.red.service.redaction.v1.model.RedactionChangeLog; +import com.iqser.red.service.redaction.v1.model.RedactionChangeLogEntry; +import com.iqser.red.service.redaction.v1.model.RedactionLog; +import com.iqser.red.service.redaction.v1.model.RedactionLogEntry; +import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.springframework.stereotype.Service; + +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Collectors; + +@Slf4j +@Service +@RequiredArgsConstructor +public class RedactionChangeLogService { + + private final RedactionStorageService redactionStorageService; + + public RedactionChangeLog createAndStoreChangeLog(String projectId, String fileId, RedactionLog currentRedactionLog) { 
+ + try { + RedactionLog previousRedactionLog = redactionStorageService.getRedactionLog(projectId, fileId); + var changeLog = createChangeLog(currentRedactionLog, previousRedactionLog); + redactionStorageService.storeObject(projectId, fileId, FileType.REDACTION_CHANGELOG, changeLog); + return changeLog; + } catch (Exception e) { + log.debug("Previous redaction log not available"); + return null; + } + + } + + + private RedactionChangeLog createChangeLog(RedactionLog currentRedactionLog, RedactionLog previousRedactionLog) { + + + if (previousRedactionLog == null) { + return null; + } + + List added = new ArrayList<>(currentRedactionLog.getRedactionLogEntry()); + added.removeAll(previousRedactionLog.getRedactionLogEntry()); + + List removed = new ArrayList<>(previousRedactionLog.getRedactionLogEntry()); + removed.removeAll(currentRedactionLog.getRedactionLogEntry()); + + List changeLogEntries = added.stream() + .map(entry -> convert(entry, ChangeType.ADDED)) + .collect(Collectors.toList()); + changeLogEntries.addAll(removed.stream() + .map(entry -> convert(entry, ChangeType.REMOVED)) + .collect(Collectors.toList())); + + return new RedactionChangeLog(changeLogEntries, currentRedactionLog.getDictionaryVersion(), currentRedactionLog.getRulesVersion(), currentRedactionLog + .getRuleSetId()); + } + + + private RedactionChangeLogEntry convert(RedactionLogEntry entry, ChangeType changeType) { + + return RedactionChangeLogEntry.builder() + .id(entry.getId()) + .type(entry.getType()) + .value(entry.getValue()) + .reason(entry.getReason()) + .matchedRule(entry.getMatchedRule()) + .legalBasis(entry.getLegalBasis()) + .redacted(entry.isRedacted()) + .isHint(entry.isHint()) + .isRecommendation(entry.isRecommendation()) + .section(entry.getSection()) + .color(entry.getColor()) + .positions(entry.getPositions()) + .sectionNumber(entry.getSectionNumber()) + .manual(entry.isManual()) + .status(entry.getStatus()) + .manualRedactionType(entry.getManualRedactionType()) + 
.isDictionaryEntry(entry.isDictionaryEntry()) + .textBefore(entry.getTextBefore()) + .textAfter(entry.getTextAfter()) + .comments(entry.getComments()) + .changeType(changeType) + .isDossierDictionaryEntry(entry.isDossierDictionaryEntry()) + .build(); + } + +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/RedactionLogCreatorService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/RedactionLogCreatorService.java index 44a8941a..4a591420 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/RedactionLogCreatorService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/RedactionLogCreatorService.java @@ -1,31 +1,10 @@ package com.iqser.red.service.redaction.v1.server.redaction.service; -import java.util.ArrayList; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; - -import org.apache.commons.collections4.CollectionUtils; -import org.apache.pdfbox.text.TextPosition; -import org.springframework.stereotype.Service; - -import com.iqser.red.service.redaction.v1.model.CellRectangle; -import com.iqser.red.service.redaction.v1.model.Comment; -import com.iqser.red.service.redaction.v1.model.IdRemoval; -import com.iqser.red.service.redaction.v1.model.ManualForceRedact; -import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry; -import com.iqser.red.service.redaction.v1.model.ManualRedactionType; -import com.iqser.red.service.redaction.v1.model.ManualRedactions; -import com.iqser.red.service.redaction.v1.model.Point; -import com.iqser.red.service.redaction.v1.model.Rectangle; -import com.iqser.red.service.redaction.v1.model.RedactionLogEntry; -import 
com.iqser.red.service.redaction.v1.model.SectionRectangle; -import com.iqser.red.service.redaction.v1.model.Status; +import com.iqser.red.service.redaction.v1.model.*; import com.iqser.red.service.redaction.v1.server.classification.model.Document; import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph; import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; +import com.iqser.red.service.redaction.v1.server.parsing.model.RedTextPosition; import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; import com.iqser.red.service.redaction.v1.server.redaction.model.Entity; import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence; @@ -34,8 +13,16 @@ import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder; import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; - import lombok.RequiredArgsConstructor; +import org.apache.commons.collections4.CollectionUtils; +import org.springframework.stereotype.Service; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; @Service @RequiredArgsConstructor @@ -285,24 +272,24 @@ public class RedactionLogCreatorService { } - private List getRectanglesPerLine(List textPositions, int page) { + private List getRectanglesPerLine(List textPositions, int page) { List rectangles = new ArrayList<>(); if (textPositions.size() == 1) { - rectangles.add(new TextPositionSequence(textPositions, page).getRectangle()); + rectangles.add( TextPositionSequence.fromData(textPositions, page).getRectangle()); } else { float y = textPositions.get(0).getYDirAdj(); int startIndex = 0; for (int i = 1; i < textPositions.size(); i++) { float 
yDirAdj = textPositions.get(i).getYDirAdj(); if (yDirAdj != y) { - rectangles.add(new TextPositionSequence(textPositions.subList(startIndex, i), page).getRectangle()); + rectangles.add( TextPositionSequence.fromData(textPositions.subList(startIndex, i), page).getRectangle()); y = yDirAdj; startIndex = i; } } if (startIndex != textPositions.size()) { - rectangles.add(new TextPositionSequence(textPositions.subList(startIndex, textPositions.size()), page).getRectangle()); + rectangles.add( TextPositionSequence.fromData(textPositions.subList(startIndex, textPositions.size()), page).getRectangle()); } } @@ -368,6 +355,7 @@ public class RedactionLogCreatorService { .status(manualRedactionEntry.getStatus()) .manualRedactionType(ManualRedactionType.ADD) .isDictionaryEntry(false) + .isDossierDictionaryEntry(manualRedactionEntry.isAddToDossierDictionary()) .build(); } @@ -391,6 +379,7 @@ public class RedactionLogCreatorService { .textBefore(entity.getTextBefore()) .startOffset(entity.getStart()) .endOffset(entity.getEnd()) + .isDossierDictionaryEntry(entity.isDossierDictionaryEntry()) .build(); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtils.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtils.java index f7fffaa4..f14eeb59 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtils.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtils.java @@ -1,25 +1,17 @@ package com.iqser.red.service.redaction.v1.server.redaction.utils; -import java.util.ArrayList; -import java.util.Comparator; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Locale; -import 
java.util.Map; -import java.util.Set; -import java.util.regex.Pattern; -import java.util.stream.Collectors; - import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary; import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncrementValue; import com.iqser.red.service.redaction.v1.server.redaction.model.Entity; import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence; import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText; - import lombok.experimental.UtilityClass; import lombok.extern.slf4j.Slf4j; +import java.util.*; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + @Slf4j @UtilityClass @SuppressWarnings("PMD") @@ -46,7 +38,7 @@ public class EntitySearchUtils { if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString .charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) { - if(value.isCaseinsensitive() || !value.isCaseinsensitive() && sectionText.substring(startIndex, stopIndex).equals(value.getValue())){ + if (value.isCaseinsensitive() || !value.isCaseinsensitive() && sectionText.substring(startIndex, stopIndex).equals(value.getValue())) { return true; } } @@ -57,7 +49,7 @@ public class EntitySearchUtils { public Set find(String inputString, Set values, String type, String headline, int sectionNumber, - boolean local) { + boolean local, boolean isDossierDictionary) { Set found = new HashSet<>(); @@ -77,7 +69,7 @@ public class EntitySearchUtils { if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString .charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) { - found.add(new Entity(inputString.substring(startIndex, stopIndex), type, startIndex, stopIndex, headline, sectionNumber, !local)); + 
found.add(new Entity(inputString.substring(startIndex, stopIndex), type, startIndex, stopIndex, headline, sectionNumber, !local, isDossierDictionary)); } } while (startIndex > -1); } @@ -147,16 +139,16 @@ public class EntitySearchUtils { public void addEntitiesWithHigherRank(Set entities, Entity found, Dictionary dictionary) { - if(entities.contains(found)){ + if (entities.contains(found)) { Entity existing = entities.stream().filter(entity -> entity.equals(found)).findFirst().get(); - if (dictionary.getDictionaryRank(existing.getType()) <= dictionary.getDictionaryRank(found.getType())){ + if (dictionary.getDictionaryRank(existing.getType()) <= dictionary.getDictionaryRank(found.getType())) { entities.remove(found); } } entities.add(found); } - public void addEntitiesIgnoreRank(Set entities, Set found){ + public void addEntitiesIgnoreRank(Set entities, Set found) { // HashSet keeps old value but we want the new. entities.removeAll(found); entities.addAll(found); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/IdBuilder.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/IdBuilder.java index ce3c7540..241aa1be 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/IdBuilder.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/IdBuilder.java @@ -1,15 +1,14 @@ package com.iqser.red.service.redaction.v1.server.redaction.utils; -import java.awt.geom.Rectangle2D; -import java.nio.charset.StandardCharsets; -import java.util.List; - import com.google.common.hash.HashFunction; import com.google.common.hash.Hashing; import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; - +import 
com.iqser.red.service.redaction.v1.server.redaction.model.RedRectangle2D; import lombok.experimental.UtilityClass; +import java.nio.charset.StandardCharsets; +import java.util.List; + @UtilityClass public class IdBuilder { @@ -26,14 +25,9 @@ public class IdBuilder { } - public String buildId(Rectangle2D rectangle2D, int page){ - - StringBuilder sb = new StringBuilder(); - sb.append("x").append(rectangle2D.getX()).append("y").append(rectangle2D.getY()).append("h").append(rectangle2D.getHeight()).append("w").append(rectangle2D.getWidth()).append("p").append(page); - - return hashFunction.hashString(sb.toString(), StandardCharsets.UTF_8).toString(); + public String buildId(RedRectangle2D rectangle2D, int page) { + return hashFunction.hashString("x" + rectangle2D.getX() + "y" + rectangle2D.getY() + "h" + rectangle2D.getHeight() + "w" + rectangle2D.getWidth() + "p" + page, StandardCharsets.UTF_8).toString(); } - } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/ResourceLoader.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/ResourceLoader.java index 5fc33bbd..accd2f30 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/ResourceLoader.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/ResourceLoader.java @@ -1,5 +1,7 @@ package com.iqser.red.service.redaction.v1.server.redaction.utils; +import lombok.experimental.UtilityClass; + import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; @@ -8,8 +10,6 @@ import java.nio.charset.StandardCharsets; import java.util.Set; import java.util.stream.Collectors; -import lombok.experimental.UtilityClass; - @UtilityClass public class ResourceLoader { @@ -27,4 +27,4 @@ public 
class ResourceLoader { } -} \ No newline at end of file +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/TextNormalizationUtilities.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/TextNormalizationUtilities.java index 374eebd3..b740286b 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/TextNormalizationUtilities.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/TextNormalizationUtilities.java @@ -7,6 +7,7 @@ public class TextNormalizationUtilities { /** * Revert hyphenation due to line breaks. + * * @param text Text to be processed. * @return Text without line-break hyphenation. */ @@ -14,4 +15,4 @@ public class TextNormalizationUtilities { return text.replaceAll("([^\\s\\d\\-]{2,})[\\-\\u00AD]\\R|\n\r(.+ )", "$1$2"); } -} \ No newline at end of file +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java index 5d88c0cb..270f95e1 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java @@ -1,88 +1,275 @@ package com.iqser.red.service.redaction.v1.server.segmentation; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; - -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.pdmodel.PDPage; 
-import org.apache.pdfbox.pdmodel.common.PDRectangle; -import org.springframework.stereotype.Service; - import com.iqser.red.service.redaction.v1.server.classification.model.Document; import com.iqser.red.service.redaction.v1.server.classification.model.Page; import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService; import com.iqser.red.service.redaction.v1.server.classification.service.ClassificationService; +import com.iqser.red.service.redaction.v1.server.memory.MemoryStats; import com.iqser.red.service.redaction.v1.server.parsing.PDFLinesTextStripper; import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; +import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage; +import com.iqser.red.service.redaction.v1.server.redaction.service.ImageClassificationService; import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings; import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService; import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService; - import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; +import org.apache.commons.io.IOUtils; +import org.apache.pdfbox.io.MemoryUsageSetting; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.springframework.stereotype.Service; + +import java.awt.Graphics; +import java.awt.geom.Rectangle2D; +import java.awt.image.BufferedImage; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.List; @Slf4j @Service @RequiredArgsConstructor public class 
PdfSegmentationService { + private final static int MAX_PAGES_BEFORE_GC = 250; + private final RulingCleaningService rulingCleaningService; private final TableExtractionService tableExtractionService; private final BlockificationService blockificationService; private final ClassificationService classificationService; private final SectionsBuilderService sectionsBuilderService; + private final ImageClassificationService imageClassificationService; - public Document parseDocument(PDDocument pdDocument) throws IOException { + public Document parseDocument(InputStream documentInputStream) throws IOException { + return parseDocument(documentInputStream, false); + } - Document document = new Document(); + public Document parseDocument(InputStream documentInputStream, boolean ignoreImages) throws IOException { + PDDocument pdDocument = null; + try { + //create tempFile + File tempFile = File.createTempFile("document", ".pdf"); + IOUtils.copy(documentInputStream, new FileOutputStream(tempFile)); - List pages = new ArrayList<>(); - PDFLinesTextStripper stripper = new PDFLinesTextStripper(); - for (int pageNumber = 1; pageNumber <= pdDocument.getNumberOfPages(); pageNumber++) { - PDPage pdPage = pdDocument.getPage(pageNumber - 1); - stripper.setPageNumber(pageNumber); - stripper.setStartPage(pageNumber); - stripper.setEndPage(pageNumber); - stripper.setPdpage(pdPage); - stripper.getText(pdDocument); + // initialize required variables + Document document = new Document(); + List pages = new ArrayList<>(); - PDRectangle pdr = pdPage.getMediaBox(); - boolean isLandscape = pdr.getWidth() > pdr.getHeight(); - int rotation = pdPage.getRotation(); - boolean isRotated = rotation != 0 && rotation != 360; + pdDocument = reinitializePDDocument(tempFile, null); + long pageCount = pdDocument.getNumberOfPages(); - CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(stripper.getRulings(), stripper.getMinCharWidth(), stripper - .getMaxCharHeight()); + for (int pageNumber = 1; 
pageNumber <= pageCount; pageNumber++) { - Page page = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings - .getVertical()); - page.setRotation(rotation); + if (pageNumber % MAX_PAGES_BEFORE_GC == 0) { + pdDocument = reinitializePDDocument(tempFile, pdDocument); + } - tableExtractionService.extractTables(cleanRulings, page); + PDFLinesTextStripper stripper = new PDFLinesTextStripper(); + PDPage pdPage = pdDocument.getPage(pageNumber - 1); + stripper.setPageNumber(pageNumber); + stripper.setStartPage(pageNumber); + stripper.setEndPage(pageNumber); + stripper.setPdpage(pdPage); + stripper.getText(pdDocument); - buildPageStatistics(page); + PDRectangle pdr = pdPage.getMediaBox(); + boolean isLandscape = pdr.getWidth() > pdr.getHeight(); - page.setLandscape(isLandscape || isRotated); + int rotation = pdPage.getRotation(); + boolean isRotated = rotation != 0 && rotation != 360; - page.setPageNumber(pageNumber); - increaseDocumentStatistics(page, document); + CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(stripper.getRulings(), stripper.getMinCharWidth(), stripper + .getMaxCharHeight()); - page.setImages(stripper.getImages()); - pages.add(page); + Page page = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings + .getVertical()); + + page.setRotation(rotation); + page.setLandscape(isLandscape || isRotated); + page.setPageNumber(pageNumber); + List mergedList = processImages(stripper.getImages()); + page.setImages(mergedList); + + tableExtractionService.extractTables(cleanRulings, page); + buildPageStatistics(page); + increaseDocumentStatistics(page, document); + + + if (!ignoreImages) { + imageClassificationService.classifyImages(page); + } + + pages.add(page); + + + } + + document.setPages(pages); + + classificationService.classifyDocument(document); + sectionsBuilderService.buildSections(document); + 
sectionsBuilderService.addImagesToSections(document); + + pdDocument = reinitializePDDocument(tempFile, pdDocument); + + IOUtils.close(pdDocument); + + tempFile.delete(); + + return document; + } finally { + if (pdDocument != null) { + pdDocument.close(); + } + } + } + + private PDDocument reinitializePDDocument(File tempFile, PDDocument pdDocument) throws IOException { + if (pdDocument != null) { + pdDocument.close(); + } + System.runFinalization(); + System.gc(); + + MemoryStats.printMemoryStats(); + + var newPDDocument = PDDocument.load(tempFile, MemoryUsageSetting.setupTempFileOnly()); + newPDDocument.setAllSecurityToBeRemoved(true); + + return newPDDocument; + } + + //merge images, if they are separated during pdf import, return new list of Pdfimages + private List processImages(List imageList) { + if (imageList.size() > 1) { + List mergedList = new ArrayList<>(); + int countElementsInList = 0; + boolean beginImage = true; + + // a List of Boolean, true = candidate for merging, false = no merging + List candidatesList = getCandidatesList(imageList); + + // loop through list, if there are candidates for merging (true), merge images and add it to mergedList + for (int i = 0; i < candidatesList.size(); i++) { + if (candidatesList.get(i)) { + if (beginImage) { + //begin of image, merge two parts of imageList + PdfImage mergedImage = mergeTwoImages(imageList.get(i), imageList.get(i + 1)); + // image merge successful + if (mergedImage != null) { + mergedList.add(mergedImage); + countElementsInList++; + } + } else { + //middle of an image, merge current piece auf mergedList with image of imageList + PdfImage mergedImage = mergeTwoImages(mergedList.get(countElementsInList - 1), imageList.get(i + 1)); + // image merge successful + if (mergedImage != null) { + mergedList.set(countElementsInList - 1, mergedImage); + } + } + beginImage = false; + } else { + // if the last candidate is false, then both images i and i+1 must be added + if (i == candidatesList.size() - 1) { 
+ if (countElementsInList > 0 && mergedList.get(countElementsInList - 1) == imageList.get(i)) { + mergedList.add(imageList.get(i + 1)); + } else { + mergedList.add(imageList.get(i)); + mergedList.add(imageList.get(i + 1)); + } + } else { + //first image is not splitted, add i to resultlist + if (beginImage) { + mergedList.add(imageList.get(i)); + countElementsInList++; + } else { + // i is the end of an image, add begin of new image + mergedList.add(imageList.get(i + 1)); + countElementsInList++; + beginImage = false; + } + } + } + } + return mergedList; + } else { + return imageList; + } + } + + private PdfImage mergeTwoImages(PdfImage image1, PdfImage image2) { + + // diese Angaben von getPosition scheinen nicht richtig zu sein, damit werden teile des Bildes abgeschnitten + double width = image1.getPosition().getWidth(); + double height1 = image1.getPosition().getHeight(); + double height2 = image2.getPosition().getHeight(); + // mit den Werten, die unter Image gespeichert sind, funktioniert es + double img1height = image1.getImage().getHeight(); + double img1width = image1.getImage().getWidth(); + double img2height = image2.getImage().getHeight(); + + BufferedImage mergedImage = new BufferedImage((int) img1width, (int) (img1height + img2height), BufferedImage.TYPE_INT_RGB); + Graphics mergedImageGraphics = mergedImage.getGraphics(); + try { + mergedImageGraphics.drawImage(image1.getImage(), 0, 0, null); + mergedImageGraphics.drawImage(image2.getImage(), 0, (int) (img1height), null); + + // set Image, Position and type for merged Image + //set position for merged image with values of image1 and the height of both + Rectangle2D pos = new Rectangle2D.Float(); + pos.setRect(image1.getPosition().getX(), image2.getPosition().getY(), width, height1 + height2); + PdfImage newPdfImage = new PdfImage(mergedImage, pos, image1.getPage()); + // Graphics need to be disposed + + image1.getImage().flush(); + image2.getImage().flush(); + + mergedImage.flush(); + 
mergedImageGraphics.dispose(); + + return newPdfImage; + } catch (Exception e) { + // failed to merge image + log.error("Failed to merge image", e); + return null; } - document.setPages(pages); - classificationService.classifyDocument(document); - sectionsBuilderService.buildSections(document); - sectionsBuilderService.addImagesToSections(document); + } - return document; + //make a list of true and false, if the image is a candidate for merging + private List getCandidatesList(List imageList) { + List candidatesList = new ArrayList<>(); + for (int i = 0; i < imageList.size(); i++) { + if (i >= 1) { + candidatesList.add(isCandidateForMerging(imageList.get(i - 1), imageList.get(i))); + } + } + return candidatesList; + } + + // evaluate if two images are candidates for merging, depending on their coordinates, width and height + private boolean isCandidateForMerging(PdfImage image1, PdfImage image2) { + double x1 = image1.getPosition().getX(); + double y1 = image1.getPosition().getY(); + double width1 = image1.getPosition().getWidth(); + double x2 = image2.getPosition().getX(); + double y2 = image2.getPosition().getY(); + double width2 = image2.getPosition().getWidth(); + double height2 = image2.getPosition().getHeight(); + //if the x-coordinates and widths of images are equal and the height is equal to difference between y-coordinates, + // then it is the same picture and has to be merged -> return true + return x1 == x2 && width1 == width2 && Math.ceil(height2) == Math.ceil(y1 - y2) && width2 > (height2 / 6); } @@ -116,4 +303,5 @@ public class PdfSegmentationService { } -} \ No newline at end of file + +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java index 41a8cf6f..7ee5a4c8 100644 --- 
a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java @@ -1,29 +1,15 @@ package com.iqser.red.service.redaction.v1.server.segmentation; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.SortedSet; -import java.util.TreeSet; -import java.util.stream.Collectors; - -import org.apache.commons.collections4.CollectionUtils; -import org.springframework.stereotype.Service; - -import com.iqser.red.service.redaction.v1.server.classification.model.Document; -import com.iqser.red.service.redaction.v1.server.classification.model.Footer; -import com.iqser.red.service.redaction.v1.server.classification.model.Header; -import com.iqser.red.service.redaction.v1.server.classification.model.Page; -import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph; -import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; -import com.iqser.red.service.redaction.v1.server.classification.model.UnclassifiedText; +import com.iqser.red.service.redaction.v1.server.classification.model.*; import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage; import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; +import org.apache.commons.collections4.CollectionUtils; +import org.springframework.stereotype.Service; + +import java.util.*; +import java.util.stream.Collectors; @Service public class SectionsBuilderService { @@ -121,6 +107,20 @@ public class SectionsBuilderService { } } + if 
(paragraphMap.isEmpty()) { + Paragraph paragraph = new Paragraph(); + document.getParagraphs().add(paragraph); + paragraphMap.computeIfAbsent(1, x -> new TreeSet<>()).add(paragraph); + } + + // first page is always a paragraph, else we can't process pages 1..N, + // where N is the first found page with a paragraph + if (paragraphMap.get(1) == null) { + Paragraph paragraph = new Paragraph(); + document.getParagraphs().add(paragraph); + paragraphMap.computeIfAbsent(1, x -> new TreeSet<>()).add(paragraph); + } + for (Page page : document.getPages()) { for (PdfImage image : page.getImages()) { SortedSet paragraphsOnPage = paragraphMap.get(page.getPageNumber()); @@ -296,4 +296,4 @@ public class SectionsBuilderService { } -} \ No newline at end of file +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/settings/RedactionServiceSettings.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/settings/RedactionServiceSettings.java index eb57a0f6..f7c9f894 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/settings/RedactionServiceSettings.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/settings/RedactionServiceSettings.java @@ -1,17 +1,16 @@ package com.iqser.red.service.redaction.v1.server.settings; -import org.springframework.boot.context.properties.ConfigurationProperties; - import lombok.Data; +import org.springframework.boot.context.properties.ConfigurationProperties; @Data @ConfigurationProperties("redaction-service") public class RedactionServiceSettings { - + private int numberOfSurroundingWords = 3; private int surroundingWordsOffsetWindow = 100; private boolean enableImageClassification = true; -} \ No newline at end of file +} diff --git 
a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/storage/RedactionStorageService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/storage/RedactionStorageService.java new file mode 100644 index 00000000..f350b7ac --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/storage/RedactionStorageService.java @@ -0,0 +1,103 @@ +package com.iqser.red.service.redaction.v1.server.storage; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.iqser.red.service.file.management.v1.api.model.FileType; +import com.iqser.red.service.redaction.v1.model.RedactionLog; +import com.iqser.red.service.redaction.v1.model.SectionGrid; +import com.iqser.red.service.redaction.v1.server.classification.model.Text; +import com.iqser.red.storage.commons.exception.StorageObjectDoesNotExist; +import com.iqser.red.storage.commons.service.StorageService; +import lombok.Getter; +import lombok.RequiredArgsConstructor; +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; +import org.springframework.core.io.InputStreamResource; +import org.springframework.stereotype.Service; + +import java.io.IOException; +import java.io.InputStream; + +@Slf4j +@Service +@RequiredArgsConstructor +public class RedactionStorageService { + + private final ObjectMapper objectMapper; + private final StorageService storageService; + + @SneakyThrows + public InputStream getStoredObject(String storageId) { + return storageService.getObject(storageId).getInputStream(); + } + + + @SneakyThrows + public void storeObject(String projectId, String fileId, FileType fileType, Object any) { + storageService.storeObject(StorageIdUtils.getStorageId(projectId, fileId, fileType), objectMapper.writeValueAsBytes(any)); + } + + + public RedactionLog getRedactionLog(String projectId, String fileId) { + + InputStreamResource 
inputStreamResource; + try { + inputStreamResource = storageService.getObject(StorageIdUtils.getStorageId(projectId, fileId, FileType.REDACTION_LOG)); + } catch (StorageObjectDoesNotExist e) { + log.debug("Text not available."); + return null; + } + + try { + return objectMapper.readValue(inputStreamResource.getInputStream(), RedactionLog.class); + } catch (IOException e) { + throw new RuntimeException("Could not convert RedactionLog", e); + } + } + + + public Text getText(String projectId, String fileId) { + + InputStreamResource inputStreamResource; + try { + inputStreamResource = storageService.getObject(StorageIdUtils.getStorageId(projectId, fileId, FileType.TEXT)); + } catch (StorageObjectDoesNotExist e) { + log.debug("Text not available."); + return null; + } + + try { + return objectMapper.readValue(inputStreamResource.getInputStream(), Text.class); + } catch (IOException e) { + throw new RuntimeException("Could not convert Text", e); + } + } + + + public SectionGrid getSectionGrid(String projectId, String fileId) { + + var sectionGrid = storageService.getObject(StorageIdUtils.getStorageId(projectId, fileId, FileType.SECTION_GRID)); + try { + return objectMapper.readValue(sectionGrid.getInputStream(), SectionGrid.class); + } catch (IOException e) { + throw new RuntimeException("Could not convert RedactionLog", e); + } + } + + + @RequiredArgsConstructor + public enum StorageType { + PARSED_DOCUMENT(".json"); + + @Getter + private final String extension; + + } + + public static class StorageIdUtils { + + public static String getStorageId(String projectId, String fileId, FileType fileType) { + return projectId + "/" + fileId + "." 
+ fileType.name() + fileType.getExtension(); + } + + } +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/AbstractTextContainer.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/AbstractTextContainer.java index faa2b690..b050e27b 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/AbstractTextContainer.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/AbstractTextContainer.java @@ -1,7 +1,7 @@ package com.iqser.red.service.redaction.v1.server.tableextraction.model; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.iqser.red.service.redaction.v1.model.Rectangle; - import lombok.AllArgsConstructor; import lombok.Data; import lombok.NoArgsConstructor; @@ -25,15 +25,17 @@ public abstract class AbstractTextContainer { } public boolean contains(Rectangle other) { - return page == other.getPage() && this.minX <= other.getTopLeft().getX() && this.maxX >= other.getTopLeft().getX() + other.getWidth() && this.minY <= other.getTopLeft().getY() && this.maxY >= other.getTopLeft().getY() + other.getHeight(); + return page == other.getPage() && this.minX <= other.getTopLeft().getX() && this.maxX >= other.getTopLeft().getX() + other.getWidth() && this.minY <= other.getTopLeft().getY() && this.maxY >= other.getTopLeft().getY() + other.getHeight(); } + @JsonIgnore public float getHeight() { return maxY - minY; } - + + @JsonIgnore public float getWidth() { return maxX - minX; } -} \ No newline at end of file +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Cell.java 
b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Cell.java index b6efcb3f..e14f5da0 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Cell.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Cell.java @@ -1,18 +1,17 @@ package com.iqser.red.service.redaction.v1.server.tableextraction.model; +import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; +import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; +import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities; +import lombok.Data; +import lombok.EqualsAndHashCode; +import lombok.NoArgsConstructor; + import java.awt.geom.Point2D; import java.util.ArrayList; import java.util.Iterator; import java.util.List; -import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; -import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; -import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities; - -import lombok.Data; -import lombok.EqualsAndHashCode; -import lombok.NoArgsConstructor; - @SuppressWarnings("serial") @Data @EqualsAndHashCode(callSuper = true) @@ -71,7 +70,4 @@ public class Cell extends Rectangle { } - - - -} \ No newline at end of file +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/CleanRulings.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/CleanRulings.java index f1a72a20..73fb9e13 100644 --- 
a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/CleanRulings.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/CleanRulings.java @@ -1,10 +1,10 @@ package com.iqser.red.service.redaction.v1.server.tableextraction.model; -import java.util.List; - import lombok.Builder; import lombok.Data; +import java.util.List; + @Data @Builder public class CleanRulings { diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Rectangle.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Rectangle.java index d897840f..2b1b7509 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Rectangle.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Rectangle.java @@ -8,170 +8,171 @@ import java.util.List; @SuppressWarnings("all") public class Rectangle extends Rectangle2D.Float { - /** - * Ill-defined comparator, from when Rectangle was Comparable. - * - * see https://github.com/tabulapdf/tabula-java/issues/116 - * @deprecated with no replacement - */ - @Deprecated - public static final Comparator ILL_DEFINED_ORDER = new Comparator() { - @Override public int compare(Rectangle o1, Rectangle o2) { - if (o1.equals(o2)) return 0; - if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD) { - return o1.isLtrDominant() == -1 && o2.isLtrDominant() == -1 - ? 
- java.lang.Double.compare(o1.getX(), o2.getX()) - : java.lang.Double.compare(o1.getX(), o2.getX()); - } else { - return java.lang.Float.compare(o1.getBottom(), o2.getBottom()); - } - } - }; - - protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f; + protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f; + /** + * Ill-defined comparator, from when Rectangle was Comparable. + *

+ * see https://github.com/tabulapdf/tabula-java/issues/116 + * + * @deprecated with no replacement + */ + @Deprecated + public static final Comparator ILL_DEFINED_ORDER = new Comparator() { + @Override + public int compare(Rectangle o1, Rectangle o2) { + if (o1.equals(o2)) return 0; + if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD) { + return o1.isLtrDominant() == -1 && o2.isLtrDominant() == -1 + ? -java.lang.Double.compare(o1.getX(), o2.getX()) + : java.lang.Double.compare(o1.getX(), o2.getX()); + } else { + return java.lang.Float.compare(o1.getBottom(), o2.getBottom()); + } + } + }; - public Rectangle() { - super(); - } + public Rectangle() { + super(); + } - public Rectangle(float top, float left, float width, float height) { - super(); - this.setRect(left, top, width, height); - } + public Rectangle(float top, float left, float width, float height) { + super(); + this.setRect(left, top, width, height); + } - public int compareTo(Rectangle other) { - return ILL_DEFINED_ORDER.compare(this, other); - } + /** + * @param rectangles + * @return minimum bounding box that contains all the rectangles + */ + public static Rectangle boundingBoxOf(List rectangles) { + float minx = java.lang.Float.MAX_VALUE; + float miny = java.lang.Float.MAX_VALUE; + float maxx = java.lang.Float.MIN_VALUE; + float maxy = java.lang.Float.MIN_VALUE; - // I'm bad at Java and need this for fancy sorting in - // technology.tabula.TextChunk. 
- public int isLtrDominant() { - return 0; - } + for (Rectangle r : rectangles) { + minx = (float) Math.min(r.getMinX(), minx); + miny = (float) Math.min(r.getMinY(), miny); + maxx = (float) Math.max(r.getMaxX(), maxx); + maxy = (float) Math.max(r.getMaxY(), maxy); + } + return new Rectangle(miny, minx, maxx - minx, maxy - miny); + } - public float getArea() { - return this.width * this.height; - } + public int compareTo(Rectangle other) { + return ILL_DEFINED_ORDER.compare(this, other); + } - public float verticalOverlap(Rectangle other) { - return Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop())); - } + // I'm bad at Java and need this for fancy sorting in + // technology.tabula.TextChunk. + public int isLtrDominant() { + return 0; + } - public boolean verticallyOverlaps(Rectangle other) { - return verticalOverlap(other) > 0; - } + public float getArea() { + return this.width * this.height; + } - public float horizontalOverlap(Rectangle other) { - return Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft())); - } + public float verticalOverlap(Rectangle other) { + return Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop())); + } - public boolean horizontallyOverlaps(Rectangle other) { - return horizontalOverlap(other) > 0; - } + public boolean verticallyOverlaps(Rectangle other) { + return verticalOverlap(other) > 0; + } - public float verticalOverlapRatio(Rectangle other) { - float rv = 0, delta = Math.min(this.getBottom() - this.getTop(), other.getBottom() - other.getTop()); + public float horizontalOverlap(Rectangle other) { + return Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft())); + } - if (other.getTop() <= this.getTop() && this.getTop() <= other.getBottom() - && other.getBottom() <= this.getBottom()) { - rv = (other.getBottom() - this.getTop()) / delta; - } else 
if (this.getTop() <= other.getTop() && other.getTop() <= this.getBottom() - && this.getBottom() <= other.getBottom()) { - rv = (this.getBottom() - other.getTop()) / delta; - } else if (this.getTop() <= other.getTop() && other.getTop() <= other.getBottom() - && other.getBottom() <= this.getBottom()) { - rv = (other.getBottom() - other.getTop()) / delta; - } else if (other.getTop() <= this.getTop() && this.getTop() <= this.getBottom() - && this.getBottom() <= other.getBottom()) { - rv = (this.getBottom() - this.getTop()) / delta; - } + public boolean horizontallyOverlaps(Rectangle other) { + return horizontalOverlap(other) > 0; + } - return rv; + public float verticalOverlapRatio(Rectangle other) { + float rv = 0, delta = Math.min(this.getBottom() - this.getTop(), other.getBottom() - other.getTop()); - } + if (other.getTop() <= this.getTop() && this.getTop() <= other.getBottom() + && other.getBottom() <= this.getBottom()) { + rv = (other.getBottom() - this.getTop()) / delta; + } else if (this.getTop() <= other.getTop() && other.getTop() <= this.getBottom() + && this.getBottom() <= other.getBottom()) { + rv = (this.getBottom() - other.getTop()) / delta; + } else if (this.getTop() <= other.getTop() && other.getTop() <= other.getBottom() + && other.getBottom() <= this.getBottom()) { + rv = (other.getBottom() - other.getTop()) / delta; + } else if (other.getTop() <= this.getTop() && this.getTop() <= this.getBottom() + && this.getBottom() <= other.getBottom()) { + rv = (this.getBottom() - this.getTop()) / delta; + } - public float overlapRatio(Rectangle other) { - double intersectionWidth = Math.max(0, - Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft())); - double intersectionHeight = Math.max(0, - Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop())); - double intersectionArea = Math.max(0, intersectionWidth * intersectionHeight); - double unionArea = this.getArea() + other.getArea() - 
intersectionArea; + return rv; - return (float) (intersectionArea / unionArea); - } + } - public Rectangle merge(Rectangle other) { - this.setRect(this.createUnion(other)); - return this; - } + public float overlapRatio(Rectangle other) { + double intersectionWidth = Math.max(0, + Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft())); + double intersectionHeight = Math.max(0, + Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop())); + double intersectionArea = Math.max(0, intersectionWidth * intersectionHeight); + double unionArea = this.getArea() + other.getArea() - intersectionArea; - public float getTop() { - return (float) this.getMinY(); - } + return (float) (intersectionArea / unionArea); + } - public void setTop(float top) { - float deltaHeight = top - this.y; - this.setRect(this.x, top, this.width, this.height - deltaHeight); - } + public Rectangle merge(Rectangle other) { + this.setRect(this.createUnion(other)); + return this; + } - public float getRight() { - return (float) this.getMaxX(); - } + public float getTop() { + return (float) this.getMinY(); + } - public void setRight(float right) { - this.setRect(this.x, this.y, right - this.x, this.height); - } + public void setTop(float top) { + float deltaHeight = top - this.y; + this.setRect(this.x, top, this.width, this.height - deltaHeight); + } - public float getLeft() { - return (float) this.getMinX(); - } + public float getRight() { + return (float) this.getMaxX(); + } - public void setLeft(float left) { - float deltaWidth = left - this.x; - this.setRect(left, this.y, this.width - deltaWidth, this.height); - } + public void setRight(float right) { + this.setRect(this.x, this.y, right - this.x, this.height); + } - public float getBottom() { - return (float) this.getMaxY(); - } + public float getLeft() { + return (float) this.getMinX(); + } - public void setBottom(float bottom) { - this.setRect(this.x, this.y, this.width, bottom - 
this.y); - } + public void setLeft(float left) { + float deltaWidth = left - this.x; + this.setRect(left, this.y, this.width - deltaWidth, this.height); + } - public Point2D[] getPoints() { - return new Point2D[] { new Point2D.Float(this.getLeft(), this.getTop()), - new Point2D.Float(this.getRight(), this.getTop()), new Point2D.Float(this.getRight(), this.getBottom()), - new Point2D.Float(this.getLeft(), this.getBottom()) }; - } + public float getBottom() { + return (float) this.getMaxY(); + } - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - String s = super.toString(); - sb.append(s.substring(0, s.length() - 1)); - sb.append(String.format(",bottom=%f,right=%f]", this.getBottom(), this.getRight())); - return sb.toString(); - } + public void setBottom(float bottom) { + this.setRect(this.x, this.y, this.width, bottom - this.y); + } - /** - * @param rectangles - * @return minimum bounding box that contains all the rectangles - */ - public static Rectangle boundingBoxOf(List rectangles) { - float minx = java.lang.Float.MAX_VALUE; - float miny = java.lang.Float.MAX_VALUE; - float maxx = java.lang.Float.MIN_VALUE; - float maxy = java.lang.Float.MIN_VALUE; + public Point2D[] getPoints() { + return new Point2D[]{new Point2D.Float(this.getLeft(), this.getTop()), + new Point2D.Float(this.getRight(), this.getTop()), new Point2D.Float(this.getRight(), this.getBottom()), + new Point2D.Float(this.getLeft(), this.getBottom())}; + } - for (Rectangle r : rectangles) { - minx = (float) Math.min(r.getMinX(), minx); - miny = (float) Math.min(r.getMinY(), miny); - maxx = (float) Math.max(r.getMaxX(), maxx); - maxy = (float) Math.max(r.getMaxY(), maxy); - } - return new Rectangle(miny, minx, maxx - minx, maxy - miny); - } + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + String s = super.toString(); + sb.append(s.substring(0, s.length() - 1)); + sb.append(String.format(",bottom=%f,right=%f]", this.getBottom(), 
this.getRight())); + return sb.toString(); + } } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/RectangleSpatialIndex.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/RectangleSpatialIndex.java index 79f08ec4..404b66e9 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/RectangleSpatialIndex.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/RectangleSpatialIndex.java @@ -1,12 +1,11 @@ package com.iqser.red.service.redaction.v1.server.tableextraction.model; -import java.util.ArrayList; -import java.util.List; - +import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils; import org.locationtech.jts.geom.Envelope; import org.locationtech.jts.index.strtree.STRtree; -import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils; +import java.util.ArrayList; +import java.util.List; @SuppressWarnings("all") public class RectangleSpatialIndex { diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Ruling.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Ruling.java index 98e3b300..e90c52b2 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Ruling.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Ruling.java @@ -1,20 +1,13 @@ package com.iqser.red.service.redaction.v1.server.tableextraction.model; +import 
com.iqser.red.service.redaction.v1.server.tableextraction.utils.CohenSutherlandClipping; +import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils; +import lombok.extern.slf4j.Slf4j; + import java.awt.geom.Line2D; import java.awt.geom.Point2D; import java.awt.geom.Rectangle2D; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.Formatter; -import java.util.List; -import java.util.Map; -import java.util.TreeMap; - -import com.iqser.red.service.redaction.v1.server.tableextraction.utils.CohenSutherlandClipping; -import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils; - -import lombok.extern.slf4j.Slf4j; +import java.util.*; @Slf4j @@ -23,13 +16,127 @@ public class Ruling extends Line2D.Float { private static int PERPENDICULAR_PIXEL_EXPAND_AMOUNT = 2; - private enum SOType {VERTICAL, HRIGHT, HLEFT} - - public Ruling(Point2D p1, Point2D p2) { super(p1, p2); } + public static List cropRulingsToArea(List rulings, Rectangle2D area) { + ArrayList rv = new ArrayList<>(); + for (Ruling r : rulings) { + if (r.intersects(area)) { + rv.add(r.intersect(area)); + } + } + return rv; + } + + // log(n) implementation of find_intersections + // based on http://people.csail.mit.edu/indyk/6.838-old/handouts/lec2.pdf + public static Map findIntersections(List horizontals, List verticals) { + + class SortObject { + protected SOType type; + protected float position; + protected Ruling ruling; + + public SortObject(SOType type, float position, Ruling ruling) { + this.type = type; + this.position = position; + this.ruling = ruling; + } + } + + List sos = new ArrayList<>(); + + TreeMap tree = new TreeMap<>(new Comparator() { + @Override + public int compare(Ruling o1, Ruling o2) { + return java.lang.Double.compare(o1.getTop(), o2.getTop()); + } + }); + + TreeMap rv = new TreeMap<>(new Comparator() { + @Override + public int compare(Point2D o1, Point2D o2) { + if (o1.getY() > o2.getY()) { + return 
1; + } + if (o1.getY() < o2.getY()) { + return -1; + } + if (o1.getX() > o2.getX()) { + return 1; + } + if (o1.getX() < o2.getX()) { + return -1; + } + return 0; + } + }); + + for (Ruling h : horizontals) { + sos.add(new SortObject(SOType.HLEFT, h.getLeft() - PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h)); + sos.add(new SortObject(SOType.HRIGHT, h.getRight() + PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h)); + } + + for (Ruling v : verticals) { + sos.add(new SortObject(SOType.VERTICAL, v.getLeft(), v)); + } + + Collections.sort(sos, new Comparator() { + @Override + public int compare(SortObject a, SortObject b) { + int rv; + if (Utils.feq(a.position, b.position)) { + if (a.type == SOType.VERTICAL && b.type == SOType.HLEFT) { + rv = 1; + } else if (a.type == SOType.VERTICAL && b.type == SOType.HRIGHT) { + rv = -1; + } else if (a.type == SOType.HLEFT && b.type == SOType.VERTICAL) { + rv = -1; + } else if (a.type == SOType.HRIGHT && b.type == SOType.VERTICAL) { + rv = 1; + } else { + rv = java.lang.Double.compare(a.position, b.position); + } + } else { + return java.lang.Double.compare(a.position, b.position); + } + return rv; + } + }); + + for (SortObject so : sos) { + switch (so.type) { + case VERTICAL: + for (Map.Entry h : tree.entrySet()) { + try { + Point2D i = h.getKey().intersectionPoint(so.ruling); + if (i == null) { + continue; + } + rv.put(i, + new Ruling[]{h.getKey().expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT), + so.ruling.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT)}); + } catch (UnsupportedOperationException e) { + log.info("Some line are oblique, ignoring..."); + continue; + } + } + break; + case HRIGHT: + tree.remove(so.ruling); + break; + case HLEFT: + tree.put(so.ruling, true); + break; + } + } + + return rv; + + } + public boolean vertical() { return this.length() > 0 && Utils.feq(this.x1, this.x2); //diff < ORIENTATION_CHECK_THRESHOLD; } @@ -38,13 +145,13 @@ public class Ruling extends Line2D.Float { return this.length() > 0 && Utils.feq(this.y1, this.y2); //diff < 
ORIENTATION_CHECK_THRESHOLD; } + // attributes that make sense only for non-oblique lines + // these are used to have a single collapse method (in page, currently) + public boolean oblique() { return !(this.vertical() || this.horizontal()); } - // attributes that make sense only for non-oblique lines - // these are used to have a single collapse method (in page, currently) - public float getPosition() { if (this.oblique()) { throw new UnsupportedOperationException(); @@ -52,7 +159,6 @@ public class Ruling extends Line2D.Float { return this.vertical() ? this.getLeft() : this.getTop(); } - public float getStart() { if (this.oblique()) { throw new UnsupportedOperationException(); @@ -102,12 +208,10 @@ public class Ruling extends Line2D.Float { } } - public boolean perpendicularTo(Ruling other) { return this.vertical() == other.horizontal(); } - public boolean nearlyIntersects(Ruling another, int colinearOrParallelExpandAmount) { if (this.intersectsLine(another)) { return true; @@ -238,7 +342,6 @@ public class Ruling extends Line2D.Float { return angle; } - @Override public String toString() { StringBuilder sb = new StringBuilder(); @@ -248,122 +351,7 @@ public class Ruling extends Line2D.Float { return rv; } - public static List cropRulingsToArea(List rulings, Rectangle2D area) { - ArrayList rv = new ArrayList<>(); - for (Ruling r : rulings) { - if (r.intersects(area)) { - rv.add(r.intersect(area)); - } - } - return rv; - } - - // log(n) implementation of find_intersections - // based on http://people.csail.mit.edu/indyk/6.838-old/handouts/lec2.pdf - public static Map findIntersections(List horizontals, List verticals) { - - class SortObject { - protected SOType type; - protected float position; - protected Ruling ruling; - - public SortObject(SOType type, float position, Ruling ruling) { - this.type = type; - this.position = position; - this.ruling = ruling; - } - } - - List sos = new ArrayList<>(); - - TreeMap tree = new TreeMap<>(new Comparator() { - @Override - 
public int compare(Ruling o1, Ruling o2) { - return java.lang.Double.compare(o1.getTop(), o2.getTop()); - } - }); - - TreeMap rv = new TreeMap<>(new Comparator() { - @Override - public int compare(Point2D o1, Point2D o2) { - if (o1.getY() > o2.getY()) { - return 1; - } - if (o1.getY() < o2.getY()) { - return -1; - } - if (o1.getX() > o2.getX()) { - return 1; - } - if (o1.getX() < o2.getX()) { - return -1; - } - return 0; - } - }); - - for (Ruling h : horizontals) { - sos.add(new SortObject(SOType.HLEFT, h.getLeft() - PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h)); - sos.add(new SortObject(SOType.HRIGHT, h.getRight() + PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h)); - } - - for (Ruling v : verticals) { - sos.add(new SortObject(SOType.VERTICAL, v.getLeft(), v)); - } - - Collections.sort(sos, new Comparator() { - @Override - public int compare(SortObject a, SortObject b) { - int rv; - if (Utils.feq(a.position, b.position)) { - if (a.type == SOType.VERTICAL && b.type == SOType.HLEFT) { - rv = 1; - } else if (a.type == SOType.VERTICAL && b.type == SOType.HRIGHT) { - rv = -1; - } else if (a.type == SOType.HLEFT && b.type == SOType.VERTICAL) { - rv = -1; - } else if (a.type == SOType.HRIGHT && b.type == SOType.VERTICAL) { - rv = 1; - } else { - rv = java.lang.Double.compare(a.position, b.position); - } - } else { - return java.lang.Double.compare(a.position, b.position); - } - return rv; - } - }); - - for (SortObject so : sos) { - switch (so.type) { - case VERTICAL: - for (Map.Entry h : tree.entrySet()) { - try { - Point2D i = h.getKey().intersectionPoint(so.ruling); - if (i == null) { - continue; - } - rv.put(i, - new Ruling[]{h.getKey().expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT), - so.ruling.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT)}); - } catch(UnsupportedOperationException e){ - log.info("Some line are oblique, ignoring..."); - continue; - } - } - break; - case HRIGHT: - tree.remove(so.ruling); - break; - case HLEFT: - tree.put(so.ruling, true); - break; - } - } - - return rv; - - } + 
private enum SOType {VERTICAL, HRIGHT, HLEFT} } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java index 8f55b482..6abc086e 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java @@ -1,22 +1,13 @@ package com.iqser.red.service.redaction.v1.server.tableextraction.model; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashMap; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.TreeMap; - -import org.apache.commons.collections4.CollectionUtils; - import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils; - import lombok.Getter; import lombok.Setter; import lombok.extern.slf4j.Slf4j; +import org.apache.commons.collections4.CollectionUtils; + +import java.util.*; @Slf4j public class Table extends AbstractTextContainer { @@ -24,21 +15,14 @@ public class Table extends AbstractTextContainer { private final TreeMap cells = new TreeMap<>(); private final RectangleSpatialIndex si = new RectangleSpatialIndex<>(); - + private final int rotation; @Getter @Setter private String headline; - private int unrotatedRowCount; - private int unrotatedColCount; - private int rowCount = -1; - private int colCount = -1; - - private final int rotation; - private List> rows; @@ -62,8 +46,8 @@ public class Table extends AbstractTextContainer { // Ignore rows that does not contain any cells and values. 
List> rowsToRemove = new ArrayList<>(); - for (List row: rows){ - if (row.size() == 1 && row.get(0).getTextBlocks().isEmpty()){ + for (List row : rows) { + if (row.size() == 1 && row.get(0).getTextBlocks().isEmpty()) { rowsToRemove.add(row); } } @@ -110,7 +94,7 @@ public class Table extends AbstractTextContainer { // we move from left to right and top to bottom for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) { List rowCells = rows.get(rowIndex); - if(rowCells.size() == 1){ + if (rowCells.size() == 1) { continue; } @@ -275,7 +259,7 @@ public class Table extends AbstractTextContainer { cells.sort(Collections.reverseOrder((arg0, arg1) -> Float.compare(Utils.round(arg0.getBottom(), 2), Utils.round(arg1 - .getBottom(), 2)))); + .getBottom(), 2)))); Iterator iter = cells.iterator(); Cell c = iter.next(); @@ -367,4 +351,4 @@ public class Table extends AbstractTextContainer { return sb.toString(); } -} \ No newline at end of file +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/service/RulingCleaningService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/service/RulingCleaningService.java index 82ca3bb7..6f6ea80a 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/service/RulingCleaningService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/service/RulingCleaningService.java @@ -1,19 +1,13 @@ package com.iqser.red.service.redaction.v1.server.tableextraction.service; -import java.awt.geom.Line2D; -import java.awt.geom.Point2D; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -import 
org.springframework.stereotype.Service; - import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling; import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils; +import org.springframework.stereotype.Service; + +import java.awt.geom.Line2D; +import java.awt.geom.Point2D; +import java.util.*; @Service public class RulingCleaningService { diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/service/TableExtractionService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/service/TableExtractionService.java index 3dddd34a..682eb03e 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/service/TableExtractionService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/service/TableExtractionService.java @@ -1,31 +1,57 @@ package com.iqser.red.service.redaction.v1.server.tableextraction.service; -import java.awt.geom.Point2D; -import java.util.ArrayList; -import java.util.Comparator; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; - -import org.springframework.stereotype.Service; - import com.iqser.red.service.redaction.v1.server.classification.model.Page; import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings; -import 
com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; +import com.iqser.red.service.redaction.v1.server.tableextraction.model.*; import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils; +import org.springframework.stereotype.Service; + +import java.awt.geom.Point2D; +import java.util.*; +import java.util.stream.Collectors; @Service public class TableExtractionService { + private static final Comparator X_FIRST_POINT_COMPARATOR = (arg0, arg1) -> { + + int rv = 0; + float arg0X = Utils.round(arg0.getX(), 2); + float arg0Y = Utils.round(arg0.getY(), 2); + float arg1X = Utils.round(arg1.getX(), 2); + float arg1Y = Utils.round(arg1.getY(), 2); + + if (arg0X > arg1X) { + rv = 1; + } else if (arg0X < arg1X) { + rv = -1; + } else if (arg0Y > arg1Y) { + rv = 1; + } else if (arg0Y < arg1Y) { + rv = -1; + } + return rv; + }; + private static final Comparator POINT_COMPARATOR = (arg0, arg1) -> { + + int rv = 0; + float arg0X = Utils.round(arg0.getX(), 2); + float arg0Y = Utils.round(arg0.getY(), 2); + float arg1X = Utils.round(arg1.getX(), 2); + float arg1Y = Utils.round(arg1.getY(), 2); + + if (arg0Y > arg1Y) { + rv = 1; + } else if (arg0Y < arg1Y) { + rv = -1; + } else if (arg0X > arg1X) { + rv = 1; + } else if (arg0X < arg1X) { + rv = -1; + } + return rv; + }; + public void extractTables(CleanRulings cleanRulings, Page page) { List cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical()); @@ -80,7 +106,6 @@ public class TableExtractionService { page.getTextBlocks().removeAll(toBeRemoved); } - public List findCells(List horizontalRulingLines, List verticalRulingLines) { List cellsFound = new ArrayList<>(); @@ -133,7 +158,6 @@ public class TableExtractionService { return cellsFound; } - private List findSpreadsheetsFromCells(List cells) { // via: 
http://stackoverflow.com/questions/13746284/merging-multiple-adjacent-rectangles-into-one-polygon List rectangles = new ArrayList<>(); @@ -233,47 +257,6 @@ public class TableExtractionService { return rectangles; } - - private static final Comparator X_FIRST_POINT_COMPARATOR = (arg0, arg1) -> { - - int rv = 0; - float arg0X = Utils.round(arg0.getX(), 2); - float arg0Y = Utils.round(arg0.getY(), 2); - float arg1X = Utils.round(arg1.getX(), 2); - float arg1Y = Utils.round(arg1.getY(), 2); - - if (arg0X > arg1X) { - rv = 1; - } else if (arg0X < arg1X) { - rv = -1; - } else if (arg0Y > arg1Y) { - rv = 1; - } else if (arg0Y < arg1Y) { - rv = -1; - } - return rv; - }; - - private static final Comparator POINT_COMPARATOR = (arg0, arg1) -> { - - int rv = 0; - float arg0X = Utils.round(arg0.getX(), 2); - float arg0Y = Utils.round(arg0.getY(), 2); - float arg1X = Utils.round(arg1.getX(), 2); - float arg1Y = Utils.round(arg1.getY(), 2); - - if (arg0Y > arg1Y) { - rv = 1; - } else if (arg0Y < arg1Y) { - rv = -1; - } else if (arg0X > arg1X) { - rv = 1; - } else if (arg0X < arg1X) { - rv = -1; - } - return rv; - }; - private enum Direction { HORIZONTAL, VERTICAL } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/utils/CohenSutherlandClipping.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/utils/CohenSutherlandClipping.java index d1f9ab06..bd4b9d0c 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/utils/CohenSutherlandClipping.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/utils/CohenSutherlandClipping.java @@ -19,21 +19,24 @@ import java.awt.geom.Rectangle2D; * clipping algorithm (line against clip rectangle). 
*/ @SuppressWarnings("all") -public final class CohenSutherlandClipping -{ +public final class CohenSutherlandClipping { + private static final int INSIDE = 0; + private static final int LEFT = 1; + private static final int RIGHT = 2; + private static final int BOTTOM = 4; + private static final int TOP = 8; private double xMin; private double yMin; private double xMax; private double yMax; - /** * Creates a Cohen Sutherland clipper with clip rect (0, 0, 0, 0). */ public CohenSutherlandClipping() { } - /** * Creates a Cohen Sutherland clipper with the given clip rectangle. + * * @param clip the clip rectangle to use */ public CohenSutherlandClipping(Rectangle2D clip) { @@ -42,6 +45,7 @@ public final class CohenSutherlandClipping /** * Sets the clip rectangle. + * * @param clip the clip rectangle */ public void setClip(Rectangle2D clip) { @@ -51,19 +55,13 @@ public final class CohenSutherlandClipping yMax = yMin + clip.getHeight(); } - private static final int INSIDE = 0; - private static final int LEFT = 1; - private static final int RIGHT = 2; - private static final int BOTTOM = 4; - private static final int TOP = 8; - private final int regionCode(double x, double y) { - int code = x < xMin - ? LEFT - : x > xMax + int code = x < xMin + ? LEFT + : x > xMax ? RIGHT : INSIDE; - if (y < yMin) code |= BOTTOM; + if (y < yMin) code |= BOTTOM; else if (y > yMax) code |= TOP; return code; } @@ -71,6 +69,7 @@ public final class CohenSutherlandClipping /** * Clips a given line against the clip rectangle. * The modification (if needed) is done in place. + * * @param line the line to clip * @return true if line is clipped, false if line is * totally outside the clip rect. @@ -87,9 +86,9 @@ public final class CohenSutherlandClipping boolean vertical = p1x == p2x; - double slope = vertical - ? 0d - : (p2y-p1y)/(p2x-p1x); + double slope = vertical + ? 
0d + : (p2y - p1y) / (p2x - p1x); int c1 = regionCode(p1x, p1y); int c2 = regionCode(p2x, p2y); @@ -103,31 +102,27 @@ public final class CohenSutherlandClipping if ((c & LEFT) != INSIDE) { qx = xMin; - qy = (Utils.feq(qx, p1x) ? 0 : qx-p1x)*slope + p1y; - } - else if ((c & RIGHT) != INSIDE) { + qy = (Utils.feq(qx, p1x) ? 0 : qx - p1x) * slope + p1y; + } else if ((c & RIGHT) != INSIDE) { qx = xMax; - qy = (Utils.feq(qx, p1x) ? 0 : qx-p1x)*slope + p1y; - } - else if ((c & BOTTOM) != INSIDE) { + qy = (Utils.feq(qx, p1x) ? 0 : qx - p1x) * slope + p1y; + } else if ((c & BOTTOM) != INSIDE) { qy = yMin; qx = vertical - ? p1x - : (Utils.feq(qy, p1y) ? 0 : qy-p1y)/slope + p1x; - } - else if ((c & TOP) != INSIDE) { + ? p1x + : (Utils.feq(qy, p1y) ? 0 : qy - p1y) / slope + p1x; + } else if ((c & TOP) != INSIDE) { qy = yMax; qx = vertical - ? p1x - : (Utils.feq(qy, p1y) ? 0 : qy-p1y)/slope + p1x; + ? p1x + : (Utils.feq(qy, p1y) ? 0 : qy - p1y) / slope + p1x; } if (c == c1) { p1x = qx; p1y = qy; - c1 = regionCode(p1x, p1y); - } - else { + c1 = regionCode(p1x, p1y); + } else { p2x = qx; p2y = qy; c2 = regionCode(p2x, p2y); @@ -137,4 +132,4 @@ public final class CohenSutherlandClipping return true; } } -// end of file \ No newline at end of file +// end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/utils/QuickSort.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/utils/QuickSort.java index 5b9c3b6c..909de599 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/utils/QuickSort.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/utils/QuickSort.java @@ -10,11 +10,6 @@ import java.util.List; */ public final class QuickSort { - private QuickSort() { - - } 
- - private static final Comparator OBJCOMP = new Comparator() { @Override public int compare(Comparable object1, Comparable object2) { @@ -24,6 +19,10 @@ public final class QuickSort { }; + private QuickSort() { + + } + /** * Sorts the given list using the given comparator. * diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/utils/Utils.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/utils/Utils.java index 62f72434..2a95ec3b 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/utils/Utils.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/utils/Utils.java @@ -1,11 +1,11 @@ package com.iqser.red.service.redaction.v1.server.tableextraction.utils; +import lombok.extern.slf4j.Slf4j; + import java.math.BigDecimal; import java.util.Comparator; import java.util.List; -import lombok.extern.slf4j.Slf4j; - @Slf4j @SuppressWarnings("all") public class Utils { diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/visualization/service/PdfVisualisationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/visualization/service/PdfVisualisationService.java index 43e2cf13..06ccb399 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/visualization/service/PdfVisualisationService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/visualization/service/PdfVisualisationService.java @@ -1,15 +1,5 @@ package com.iqser.red.service.redaction.v1.server.visualization.service; -import java.awt.Color; -import 
java.io.IOException; -import java.util.List; - -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.pdmodel.PDPage; -import org.apache.pdfbox.pdmodel.PDPageContentStream; -import org.apache.pdfbox.pdmodel.font.PDType1Font; -import org.springframework.stereotype.Service; - import com.iqser.red.service.redaction.v1.server.classification.model.Document; import com.iqser.red.service.redaction.v1.server.classification.model.Page; import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph; @@ -17,9 +7,17 @@ import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; - import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.PDPageContentStream; +import org.apache.pdfbox.pdmodel.font.PDType1Font; +import org.springframework.stereotype.Service; + +import java.awt.Color; +import java.io.IOException; +import java.util.List; @Slf4j @Service @@ -34,7 +32,7 @@ public class PdfVisualisationService { PDPage pdPage = document.getPage(page - 1); PDPageContentStream contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true); - for(Paragraph paragraph : classifiedDoc.getParagraphs()) { + for (Paragraph paragraph : classifiedDoc.getParagraphs()) { for (int i = 0; i <= paragraph.getPageBlocks().size() - 1; i++) { @@ -44,10 +42,10 @@ public class PdfVisualisationService { continue; } if (textBlock instanceof TextBlock) { - textBlock.setClassification((i+1) + "/" + paragraph.getPageBlocks().size()); + textBlock.setClassification((i + 1) + "/" + paragraph.getPageBlocks().size()); 
visualizeTextBlock((TextBlock) textBlock, contentStream); } else if (textBlock instanceof Table) { - textBlock.setClassification((i+1) + "/" + paragraph.getPageBlocks().size()); + textBlock.setClassification((i + 1) + "/" + paragraph.getPageBlocks().size()); visualizeTable((Table) textBlock, contentStream); } @@ -59,7 +57,6 @@ public class PdfVisualisationService { } - public void visualizeClassifications(Document classifiedDoc, PDDocument document) throws IOException { for (int page = 1; page <= document.getNumberOfPages(); page++) { diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/resources/application-dev.yaml b/redaction-service-v1/redaction-service-server-v1/src/main/resources/application-dev.yaml index 302d198a..2d266963 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/resources/application-dev.yaml +++ b/redaction-service-v1/redaction-service-server-v1/src/main/resources/application-dev.yaml @@ -1,4 +1,11 @@ server: port: 8083 -configuration-service.url: "http://localhost:8081" \ No newline at end of file +configuration-service.url: "http://localhost:8081" +file-management-service.url: "http://localhost:8085" + +storage: + bucket-name: 'redaction' + endpoint: 'http://localhost:9000' + key: minioadmin + secret: minioadmin diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/resources/application.yml b/redaction-service-v1/redaction-service-server-v1/src/main/resources/application.yml index efb01d6f..671d3b20 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/resources/application.yml +++ b/redaction-service-v1/redaction-service-server-v1/src/main/resources/application.yml @@ -2,6 +2,7 @@ info: description: Redaction Service Server V1 configuration-service.url: "http://configuration-service-v1:8080" +file-management-service.url: "http://file-management-service-v1:8080" image-service.url: "http://image-service-v1:8080" server: @@ -10,6 +11,20 @@ server: spring: profiles: active: 
kubernetes + rabbitmq: + host: ${RABBITMQ_HOST:localhost} + port: ${RABBITMQ_PORT:5672} + username: ${RABBITMQ_USERNAME:user} + password: ${RABBITMQ_PASSWORD:rabbitmq} + listener: + simple: + acknowledge-mode: AUTO + concurrency: 2 + retry: + enabled: true + max-attempts: 3 + max-interval: 15000 + prefetch: 1 management: endpoint: @@ -17,4 +32,11 @@ management: prometheus.enabled: ${monitoring.enabled:false} health.enabled: true endpoints.web.exposure.include: prometheus, health - metrics.export.prometheus.enabled: ${monitoring.enabled:false} \ No newline at end of file + metrics.export.prometheus.enabled: ${monitoring.enabled:false} + + +storage: + signer-type: 'AWSS3V4SignerType' + bucket-name: 'redaction' + region: 'us-east-1' + endpoint: 'https://s3.amazonaws.com' diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/FileSystemBackedStorageService.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/FileSystemBackedStorageService.java new file mode 100644 index 00000000..e37034ce --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/FileSystemBackedStorageService.java @@ -0,0 +1,51 @@ +package com.iqser.red.service.redaction.v1.server; + +import com.iqser.red.storage.commons.exception.StorageObjectDoesNotExist; +import com.iqser.red.storage.commons.service.StorageService; +import lombok.SneakyThrows; +import org.apache.commons.io.IOUtils; +import org.springframework.core.io.InputStreamResource; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.util.HashMap; +import java.util.Map; + +public class FileSystemBackedStorageService extends StorageService { + + private final Map dataMap = new HashMap<>(); + + public FileSystemBackedStorageService() { + super(null, null); + } + + @SneakyThrows + @Override + public 
InputStreamResource getObject(String objectId) { + + var res = dataMap.get(objectId); + if (res == null) { + throw new StorageObjectDoesNotExist(new RuntimeException()); + } + return new InputStreamResource(new FileInputStream(res)); + + } + + @SneakyThrows + @Override + public void storeObject(String objectId, byte[] data) { + File tempFile = File.createTempFile("test", ".tmp"); + + IOUtils.write(data, new FileOutputStream(tempFile)); + + dataMap.put(objectId, tempFile); + } + + public void clearStorage() { + this.dataMap.forEach((k, v) -> { + v.delete(); + }); + this.dataMap.clear(); + } +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java index be77a283..1608a982 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java @@ -1,30 +1,27 @@ package com.iqser.red.service.redaction.v1.server; -import static org.assertj.core.api.Assertions.assertThat; -import static org.mockito.Mockito.when; -import static org.springframework.boot.test.context.SpringBootTest.WebEnvironment.RANDOM_PORT; - -import java.io.BufferedReader; -import java.io.ByteArrayInputStream; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.net.URL; -import java.nio.charset.StandardCharsets; -import java.time.OffsetDateTime; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.UUID; -import 
java.util.stream.Collectors; - +import com.amazonaws.services.s3.AmazonS3; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.iqser.red.service.configuration.v1.api.model.*; +import com.iqser.red.service.configuration.v1.api.resource.DictionaryResource; +import com.iqser.red.service.file.management.v1.api.model.FileType; +import com.iqser.red.service.redaction.v1.model.*; +import com.iqser.red.service.redaction.v1.server.classification.model.SectionText; +import com.iqser.red.service.redaction.v1.server.client.DictionaryClient; +import com.iqser.red.service.redaction.v1.server.client.ImageClassificationClient; +import com.iqser.red.service.redaction.v1.server.client.RulesClient; +import com.iqser.red.service.redaction.v1.server.controller.RedactionController; +import com.iqser.red.service.redaction.v1.server.memory.MemoryStats; +import com.iqser.red.service.redaction.v1.server.redaction.service.ReanalyzeService; +import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader; +import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities; +import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService; +import com.iqser.red.storage.commons.service.StorageService; +import lombok.SneakyThrows; import org.apache.commons.io.IOUtils; +import org.junit.After; import org.junit.Before; +import org.junit.Ignore; import org.junit.Test; import org.junit.runner.RunWith; import org.kie.api.KieServices; @@ -32,48 +29,32 @@ import org.kie.api.builder.KieBuilder; import org.kie.api.builder.KieFileSystem; import org.kie.api.builder.KieModule; import org.kie.api.runtime.KieContainer; +import org.springframework.amqp.rabbit.core.RabbitTemplate; import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.autoconfigure.EnableAutoConfiguration; +import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration; import 
org.springframework.boot.test.context.SpringBootTest; -import org.springframework.boot.test.context.TestConfiguration; import org.springframework.boot.test.mock.mockito.MockBean; import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; +import org.springframework.context.annotation.Import; +import org.springframework.context.annotation.Primary; import org.springframework.core.io.ClassPathResource; import org.springframework.test.context.junit4.SpringRunner; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.iqser.red.service.configuration.v1.api.model.Colors; -import com.iqser.red.service.configuration.v1.api.model.DictionaryEntry; -import com.iqser.red.service.configuration.v1.api.model.DictionaryResponse; -import com.iqser.red.service.configuration.v1.api.model.RulesResponse; -import com.iqser.red.service.configuration.v1.api.model.TypeResponse; -import com.iqser.red.service.configuration.v1.api.model.TypeResult; -import com.iqser.red.service.redaction.v1.model.AnalyzeRequest; -import com.iqser.red.service.redaction.v1.model.AnalyzeResult; -import com.iqser.red.service.redaction.v1.model.AnnotateRequest; -import com.iqser.red.service.redaction.v1.model.AnnotateResponse; -import com.iqser.red.service.redaction.v1.model.Comment; -import com.iqser.red.service.redaction.v1.model.IdRemoval; -import com.iqser.red.service.redaction.v1.model.ManualForceRedact; -import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry; -import com.iqser.red.service.redaction.v1.model.ManualRedactions; -import com.iqser.red.service.redaction.v1.model.Point; -import com.iqser.red.service.redaction.v1.model.ReanalyzeResult; -import com.iqser.red.service.redaction.v1.model.Rectangle; -import com.iqser.red.service.redaction.v1.model.RedactionLogEntry; -import com.iqser.red.service.redaction.v1.model.RedactionRequest; -import com.iqser.red.service.redaction.v1.model.RedactionResult; -import 
com.iqser.red.service.redaction.v1.model.RenalyzeRequest; -import com.iqser.red.service.redaction.v1.model.SectionText; -import com.iqser.red.service.redaction.v1.model.Status; -import com.iqser.red.service.redaction.v1.server.client.DictionaryClient; -import com.iqser.red.service.redaction.v1.server.client.ImageClassificationClient; -import com.iqser.red.service.redaction.v1.server.client.RulesClient; -import com.iqser.red.service.redaction.v1.server.controller.RedactionController; -import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader; -import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities; +import java.io.*; +import java.net.URL; +import java.nio.charset.StandardCharsets; +import java.time.OffsetDateTime; +import java.util.*; +import java.util.stream.Collectors; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.Mockito.when; @RunWith(SpringRunner.class) -@SpringBootTest(webEnvironment = RANDOM_PORT) +@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT) +@Import(RedactionIntegrationTest.RedactionIntegrationTestConfiguration.class) public class RedactionIntegrationTest { private static final String RULES = loadFromClassPath("drools/rules.drl"); @@ -93,6 +74,7 @@ public class RedactionIntegrationTest { private static final String SIGNATURE = "signature"; private static final String FORMULA = "formula"; private static final String OCR = "ocr"; + private static final String DOSSIER_REDACTIONS = "dossier_redactions"; private static final String RECOMMENDATION_AUTHOR = "recommendation_CBI_author"; private static final String RECOMMENDATION_ADDRESS = "recommendation_CBI_address"; @@ -101,9 +83,13 @@ public class RedactionIntegrationTest { private static final String PII = "PII"; + @Autowired private RedactionController redactionController; + @Autowired + private ReanalyzeService reanalyzeService; + @Autowired private 
ObjectMapper objectMapper; @@ -116,7 +102,20 @@ public class RedactionIntegrationTest { @MockBean private ImageClassificationClient imageClassificationClient; + @Autowired + private RedactionStorageService redactionStorageService; + + @Autowired + private StorageService storageService; + + @MockBean + private AmazonS3 amazonS3; + + @MockBean + private RabbitTemplate rabbitTemplate; + private final Map> dictionary = new HashMap<>(); + private final Map> dossierDictionary = new HashMap<>(); private final Map typeColorMap = new HashMap<>(); private final Map hintTypeMap = new HashMap<>(); private final Map caseInSensitiveMap = new HashMap<>(); @@ -126,8 +125,11 @@ public class RedactionIntegrationTest { private final Map reanlysisVersions = new HashMap<>(); private final static String TEST_RULESET_ID = "123"; + private final static String TEST_PROJECT_ID = "123"; + private final static String TEST_FILE_ID = "123"; - @TestConfiguration + @Configuration + @EnableAutoConfiguration(exclude = {RabbitAutoConfiguration.class}) public static class RedactionIntegrationTestConfiguration { @Bean @@ -146,6 +148,21 @@ public class RedactionIntegrationTest { return kieServices.newKieContainer(kieModule.getReleaseId()); } + @Bean + @Primary + public StorageService inmemoryStorage() { + return new FileSystemBackedStorageService(); + } + + + } + + + @After + public void cleanupStorage() { + if (this.storageService instanceof FileSystemBackedStorageService) { + ((FileSystemBackedStorageService) this.storageService).clearStorage(); + } } @@ -158,30 +175,45 @@ public class RedactionIntegrationTest { loadDictionaryForTest(); loadTypeForTest(); - when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(0L); - when(dictionaryClient.getAllTypes(TEST_RULESET_ID)).thenReturn(TypeResponse.builder() + when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(0L); + when(dictionaryClient.getAllTypes(TEST_RULESET_ID, 
DictionaryResource.GLOBAL_DOSSIER)).thenReturn(TypeResponse.builder() .types(getTypeResponse()) .build()); - when(dictionaryClient.getDictionaryForType(VERTEBRATE, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(VERTEBRATE)); - when(dictionaryClient.getDictionaryForType(ADDRESS, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(ADDRESS)); - when(dictionaryClient.getDictionaryForType(AUTHOR, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(AUTHOR)); - when(dictionaryClient.getDictionaryForType(SPONSOR, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(SPONSOR)); - when(dictionaryClient.getDictionaryForType(NO_REDACTION_INDICATOR, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(NO_REDACTION_INDICATOR)); - when(dictionaryClient.getDictionaryForType(REDACTION_INDICATOR, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(REDACTION_INDICATOR)); - when(dictionaryClient.getDictionaryForType(HINT_ONLY, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(HINT_ONLY)); - when(dictionaryClient.getDictionaryForType(MUST_REDACT, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(MUST_REDACT)); - when(dictionaryClient.getDictionaryForType(PUBLISHED_INFORMATION, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(PUBLISHED_INFORMATION)); - when(dictionaryClient.getDictionaryForType(TEST_METHOD, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(TEST_METHOD)); - when(dictionaryClient.getDictionaryForType(PII, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(PII)); - when(dictionaryClient.getDictionaryForType(RECOMMENDATION_AUTHOR, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(RECOMMENDATION_AUTHOR)); - when(dictionaryClient.getDictionaryForType(RECOMMENDATION_ADDRESS, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(RECOMMENDATION_ADDRESS)); - when(dictionaryClient.getDictionaryForType(FALSE_POSITIVE, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(FALSE_POSITIVE)); - when(dictionaryClient.getDictionaryForType(PURITY, 
TEST_RULESET_ID)).thenReturn(getDictionaryResponse(PURITY)); - when(dictionaryClient.getDictionaryForType(IMAGE, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(IMAGE)); - when(dictionaryClient.getDictionaryForType(OCR, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(OCR)); - when(dictionaryClient.getDictionaryForType(LOGO, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(LOGO)); - when(dictionaryClient.getDictionaryForType(SIGNATURE, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(SIGNATURE)); - when(dictionaryClient.getDictionaryForType(FORMULA, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(FORMULA)); + + when(dictionaryClient.getVersion(TEST_RULESET_ID, TEST_PROJECT_ID)).thenReturn(0L); + when(dictionaryClient.getAllTypes(TEST_RULESET_ID, TEST_PROJECT_ID)).thenReturn(TypeResponse.builder() + .types(List.of(TypeResult.builder() + .type(DOSSIER_REDACTIONS) + .ruleSetId(TEST_RULESET_ID) + .hexColor( "#ffe187") + .isHint(hintTypeMap.get(DOSSIER_REDACTIONS)) + .isCaseInsensitive(caseInSensitiveMap.get(DOSSIER_REDACTIONS)) + .isRecommendation(recommendationTypeMap.get(DOSSIER_REDACTIONS)) + .rank(rankTypeMap.get(DOSSIER_REDACTIONS)) + .build())) + .build()); + + when(dictionaryClient.getDictionaryForType(VERTEBRATE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(VERTEBRATE, false)); + when(dictionaryClient.getDictionaryForType(ADDRESS, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(ADDRESS, false)); + when(dictionaryClient.getDictionaryForType(AUTHOR, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(AUTHOR, false)); + when(dictionaryClient.getDictionaryForType(SPONSOR, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(SPONSOR, false)); + when(dictionaryClient.getDictionaryForType(NO_REDACTION_INDICATOR, TEST_RULESET_ID, 
DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(NO_REDACTION_INDICATOR, false)); + when(dictionaryClient.getDictionaryForType(REDACTION_INDICATOR, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(REDACTION_INDICATOR, false)); + when(dictionaryClient.getDictionaryForType(HINT_ONLY, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(HINT_ONLY, false)); + when(dictionaryClient.getDictionaryForType(MUST_REDACT, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(MUST_REDACT, false)); + when(dictionaryClient.getDictionaryForType(PUBLISHED_INFORMATION, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(PUBLISHED_INFORMATION, false)); + when(dictionaryClient.getDictionaryForType(TEST_METHOD, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(TEST_METHOD, false)); + when(dictionaryClient.getDictionaryForType(PII, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(PII, false)); + when(dictionaryClient.getDictionaryForType(RECOMMENDATION_AUTHOR, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(RECOMMENDATION_AUTHOR, false)); + when(dictionaryClient.getDictionaryForType(RECOMMENDATION_ADDRESS, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(RECOMMENDATION_ADDRESS, false)); + when(dictionaryClient.getDictionaryForType(FALSE_POSITIVE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(FALSE_POSITIVE, false)); + when(dictionaryClient.getDictionaryForType(PURITY, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(PURITY, false)); + when(dictionaryClient.getDictionaryForType(IMAGE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(IMAGE, false)); + 
when(dictionaryClient.getDictionaryForType(OCR, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(OCR, false)); + when(dictionaryClient.getDictionaryForType(LOGO, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(LOGO, false)); + when(dictionaryClient.getDictionaryForType(SIGNATURE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(SIGNATURE, false)); + when(dictionaryClient.getDictionaryForType(FORMULA, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(FORMULA, false)); + when(dictionaryClient.getDictionaryForType(DOSSIER_REDACTIONS, TEST_RULESET_ID, TEST_PROJECT_ID)).thenReturn(getDictionaryResponse(DOSSIER_REDACTIONS, true)); when(dictionaryClient.getColors(TEST_RULESET_ID)).thenReturn(colors); } @@ -288,6 +320,11 @@ public class RedactionIntegrationTest { .stream() .map(this::cleanDictionaryEntry) .collect(Collectors.toSet())); + dossierDictionary.computeIfAbsent(DOSSIER_REDACTIONS, v -> new ArrayList<>()) + .addAll(ResourceLoader.load("dictionaries/dossier_redactions.txt") + .stream() + .map(this::cleanDictionaryEntry) + .collect(Collectors.toSet())); } @@ -340,6 +377,7 @@ public class RedactionIntegrationTest { hintTypeMap.put(FORMULA, false); hintTypeMap.put(LOGO, false); hintTypeMap.put(SIGNATURE, false); + hintTypeMap.put(DOSSIER_REDACTIONS, false); caseInSensitiveMap.put(VERTEBRATE, true); caseInSensitiveMap.put(ADDRESS, false); @@ -361,6 +399,7 @@ public class RedactionIntegrationTest { caseInSensitiveMap.put(SIGNATURE, true); caseInSensitiveMap.put(LOGO, true); caseInSensitiveMap.put(FORMULA, true); + caseInSensitiveMap.put(DOSSIER_REDACTIONS, false); recommendationTypeMap.put(VERTEBRATE, false); recommendationTypeMap.put(ADDRESS, false); @@ -382,6 +421,7 @@ public class RedactionIntegrationTest { recommendationTypeMap.put(FORMULA, false); recommendationTypeMap.put(SIGNATURE, false); 
recommendationTypeMap.put(LOGO, false); + recommendationTypeMap.put(DOSSIER_REDACTIONS, false); rankTypeMap.put(FALSE_POSITIVE, 160); rankTypeMap.put(PURITY, 155); @@ -403,6 +443,7 @@ public class RedactionIntegrationTest { rankTypeMap.put(LOGO, 28); rankTypeMap.put(SIGNATURE, 27); rankTypeMap.put(FORMULA, 26); + rankTypeMap.put(DOSSIER_REDACTIONS, 200); colors.setDefaultColor("#acfc00"); colors.setNotRedacted("#cccccc"); @@ -429,11 +470,11 @@ public class RedactionIntegrationTest { } - private DictionaryResponse getDictionaryResponse(String type) { + private DictionaryResponse getDictionaryResponse(String type, boolean isDossierDictionary) { return DictionaryResponse.builder() .hexColor(typeColorMap.get(type)) - .entries(toDictionaryEntry(dictionary.get(type))) + .entries(isDossierDictionary ? toDictionaryEntry(dossierDictionary.get(type)) : toDictionaryEntry(dictionary.get(type))) .isHint(hintTypeMap.get(type)) .isCaseInsensitive(caseInSensitiveMap.get(type)) .isRecommendation(recommendationTypeMap.get(type)) @@ -453,6 +494,71 @@ public class RedactionIntegrationTest { @Test + public void test270Rotated() { + AnalyzeRequest request = prepareStorage("files/Minimal Examples/270Rotated.pdf"); + MemoryStats.printMemoryStats(); + AnalyzeResult result = reanalyzeService.analyze(request); + assertThat(result).isNotNull(); + } + + + @Test + @Ignore + public void testLargeScannedFileOOM() { + AnalyzeRequest request = prepareStorage("scanned/VV-377031.pdf"); + MemoryStats.printMemoryStats(); + AnalyzeResult result = reanalyzeService.analyze(request); + assertThat(result).isNotNull(); + } + + @Test + public void testMergedImages() throws IOException { + + long start = System.currentTimeMillis(); + ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/merge_images.pdf"); + + AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream()); + AnalyzeResult result = reanalyzeService.analyze(request); + + Map> duplicates = new HashMap<>(); + 
+ var redactionLog = redactionStorageService.getRedactionLog(TEST_PROJECT_ID, TEST_FILE_ID); + + redactionLog.getRedactionLogEntry().forEach(entry -> { + duplicates.computeIfAbsent(entry.getId(), v -> new ArrayList<>()).add(entry); + }); + + duplicates.entrySet().forEach(entry -> { + assertThat(entry.getValue().size()).isEqualTo(1); + }); + + dictionary.get(AUTHOR).add("Drinking water"); + when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(1L); + + AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder() + .projectId(TEST_PROJECT_ID) + .fileId(TEST_FILE_ID) + .build()); + + try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Annotated3.pdf")) { + fileOutputStream.write(annotateResponse.getDocument()); + } + long rstart = System.currentTimeMillis(); + reanalyzeService.reanalyze(request); + + long rend = System.currentTimeMillis(); + System.out.println("reanalysis analysis duration: " + (rend - rstart)); + + + long end = System.currentTimeMillis(); + + System.out.println("duration: " + (end - start)); + + + } + + @Test + @Ignore public void noExceptionShouldBeThrownForAnyFiles() throws IOException { long start = System.currentTimeMillis(); @@ -465,15 +571,16 @@ public class RedactionIntegrationTest { input.addAll(getPathsRecursively(file)); } for (File path : input) { - AnalyzeRequest request = AnalyzeRequest.builder() - .ruleSetId(TEST_RULESET_ID) - .document(IOUtils.toByteArray(new FileInputStream(path))) - .build(); + + AnalyzeRequest request = prepareStorage(new FileInputStream((path))); System.out.println("Redacting file : " + path.getName()); - AnalyzeResult result = redactionController.analyze(request); + AnalyzeResult result = reanalyzeService.analyze(request); Map> duplicates = new HashMap<>(); - result.getRedactionLog().getRedactionLogEntry().forEach(entry -> { + + var redactionLog = redactionStorageService.getRedactionLog(TEST_PROJECT_ID, TEST_FILE_ID); + + 
redactionLog.getRedactionLogEntry().forEach(entry -> { duplicates.computeIfAbsent(entry.getId(), v -> new ArrayList<>()).add(entry); }); @@ -482,16 +589,10 @@ public class RedactionIntegrationTest { }); dictionary.get(AUTHOR).add("Drinking water"); - when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(1L); + when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(1L); long rstart = System.currentTimeMillis(); - ReanalyzeResult reanalyzeResult = redactionController.reanalyze(RenalyzeRequest.builder() - .redactionLog(result.getRedactionLog()) - .document(IOUtils.toByteArray(new FileInputStream(path))) - .manualRedactions(null) - .text(result.getText()) - .ruleSetId(TEST_RULESET_ID) - .build()); + reanalyzeService.reanalyze(request); long rend = System.currentTimeMillis(); System.out.println("reanalysis analysis duration: " + (rend - rstart)); @@ -526,18 +627,16 @@ public class RedactionIntegrationTest { @Test public void redactionTest() throws IOException { - System.out.println("redactionTest"); long start = System.currentTimeMillis(); ClassPathResource pdfFileResource = new ClassPathResource("files/new/Single Study - Oral (Gavage) Mouse.pdf"); + AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream()); - AnalyzeRequest request = AnalyzeRequest.builder() - .ruleSetId(TEST_RULESET_ID) - .document(IOUtils.toByteArray(pdfFileResource.getInputStream())) - .build(); + AnalyzeResult result = reanalyzeService.analyze(request); - AnalyzeResult result = redactionController.analyze(request); + var redactionLog = redactionStorageService.getRedactionLog(TEST_PROJECT_ID, TEST_FILE_ID); + var text = redactionStorageService.getText(TEST_PROJECT_ID, TEST_FILE_ID); - result.getRedactionLog().getRedactionLogEntry().forEach(entry -> { + redactionLog.getRedactionLogEntry().forEach(entry -> { if (entry.isImage()) { System.out.println("---->" + entry.getType()); } @@ -548,13 +647,13 @@ public class 
RedactionIntegrationTest { System.out.println("first analysis duration: " + (end - start)); try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Test.json")) { - fileOutputStream.write(objectMapper.writeValueAsBytes(result.getText())); + fileOutputStream.write(objectMapper.writeValueAsBytes(redactionStorageService.getText(TEST_PROJECT_ID, TEST_FILE_ID))); } int correctFound = 0; loop: - for (RedactionLogEntry redactionLogEntry : result.getRedactionLog().getRedactionLogEntry()) { - for (SectionText sectionText : result.getText().getSectionTexts()) { + for (RedactionLogEntry redactionLogEntry : redactionLog.getRedactionLogEntry()) { + for (SectionText sectionText : text.getSectionTexts()) { if (redactionLogEntry.isImage()) { correctFound++; continue loop; @@ -570,7 +669,7 @@ public class RedactionIntegrationTest { } } } - assertThat(correctFound).isEqualTo(result.getRedactionLog().getRedactionLogEntry().size()); + assertThat(correctFound).isEqualTo(redactionLog.getRedactionLogEntry().size()); dictionary.get(AUTHOR).add("properties"); reanlysisVersions.put("properties", 1L); @@ -581,25 +680,19 @@ public class RedactionIntegrationTest { dictionary.get(VERTEBRATE).add("s-metolachlor"); reanlysisVersions.put("s-metolachlor", 3L); - when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(3L); + when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(3L); - when(dictionaryClient.getDictionaryForType(VERTEBRATE, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(VERTEBRATE)); + when(dictionaryClient.getDictionaryForType(VERTEBRATE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(VERTEBRATE, false)); start = System.currentTimeMillis(); - ReanalyzeResult reanalyzeResult = redactionController.reanalyze(RenalyzeRequest.builder() - .redactionLog(result.getRedactionLog()) - .document(IOUtils.toByteArray(pdfFileResource.getInputStream())) - .text(result.getText()) - 
.ruleSetId(TEST_RULESET_ID) - .build()); + AnalyzeResult reanalyzeResult = reanalyzeService.reanalyze(request); end = System.currentTimeMillis(); System.out.println("reanalysis analysis duration: " + (end - start)); AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder() - .document(IOUtils.toByteArray(pdfFileResource.getInputStream())) - .redactionLog(reanalyzeResult.getRedactionLog()) - .sectionGrid(result.getSectionGrid()) + .projectId(TEST_PROJECT_ID) + .fileId(TEST_FILE_ID) .build()); try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Annotated.pdf")) { @@ -614,19 +707,13 @@ public class RedactionIntegrationTest { System.out.println("testTableRedaction"); long start = System.currentTimeMillis(); - ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf"); - AnalyzeRequest request = AnalyzeRequest.builder() - .ruleSetId(TEST_RULESET_ID) - .document(IOUtils.toByteArray(pdfFileResource.getInputStream())) - .build(); - - AnalyzeResult result = redactionController.analyze(request); + AnalyzeRequest request = prepareStorage("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf"); + AnalyzeResult result = reanalyzeService.analyze(request); AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder() - .document(IOUtils.toByteArray(pdfFileResource.getInputStream())) - .redactionLog(result.getRedactionLog()) - .sectionGrid(result.getSectionGrid()) + .projectId(TEST_PROJECT_ID) + .fileId(TEST_FILE_ID) .build()); try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Annotated.pdf")) { @@ -681,13 +768,10 @@ public class RedactionIntegrationTest { // manualRedactions.getEntriesToAdd().add(manualRedactionEntry); - AnalyzeRequest request = AnalyzeRequest.builder() - .ruleSetId(TEST_RULESET_ID) - .document(IOUtils.toByteArray(pdfFileResource.getInputStream())) - .manualRedactions(manualRedactions) - 
.build(); - AnalyzeResult result = redactionController.analyze(request); + AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream()); + request.setManualRedactions(manualRedactions); + AnalyzeResult result = reanalyzeService.analyze(request); manualRedactions.getEntriesToAdd().add(manualRedactionEntry); manualRedactions.setIdsToRemove(Set.of(IdRemoval.builder() @@ -695,20 +779,15 @@ public class RedactionIntegrationTest { .status(Status.APPROVED) .build())); - ReanalyzeResult reanalyzeResult = redactionController.reanalyze(RenalyzeRequest.builder() - .redactionLog(result.getRedactionLog()) - .document(IOUtils.toByteArray(pdfFileResource.getInputStream())) - .manualRedactions(manualRedactions) - .text(result.getText()) - .ruleSetId(TEST_RULESET_ID) - .build()); + reanalyzeService.reanalyze(request); + AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder() - .document(IOUtils.toByteArray(pdfFileResource.getInputStream())) - .redactionLog(reanalyzeResult.getRedactionLog()) - .sectionGrid(result.getSectionGrid()) + .projectId(TEST_PROJECT_ID) + .fileId(TEST_FILE_ID) .build()); + try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Annotated.pdf")) { fileOutputStream.write(annotateResponse.getDocument()); } @@ -725,11 +804,16 @@ public class RedactionIntegrationTest { System.out.println("classificationTest"); ClassPathResource pdfFileResource = new ClassPathResource("files/Trinexapac/93 Trinexapac-ethyl_RAR_03_Volume_3CA_B-1_2017-03-31.pdf"); - RedactionRequest request = RedactionRequest.builder() - .document(IOUtils.toByteArray(pdfFileResource.getInputStream())) + + AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream()); + + RedactionRequest redactionRequest = RedactionRequest.builder() + .projectId(request.getProjectId()) + .fileId(request.getFileId()) + .ruleSetId(request.getRuleSetId()) .build(); - RedactionResult result = redactionController.classify(request); + RedactionResult 
result = redactionController.classify(redactionRequest); try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Classified.pdf")) { fileOutputStream.write(result.getDocument()); @@ -743,11 +827,15 @@ public class RedactionIntegrationTest { System.out.println("sectionsTest"); ClassPathResource pdfFileResource = new ClassPathResource("files/Fludioxonil/51 " + "Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf"); - RedactionRequest request = RedactionRequest.builder() - .document(IOUtils.toByteArray(pdfFileResource.getInputStream())) + AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream()); + + RedactionRequest redactionRequest = RedactionRequest.builder() + .projectId(request.getProjectId()) + .fileId(request.getFileId()) + .ruleSetId(request.getRuleSetId()) .build(); - RedactionResult result = redactionController.sections(request); + RedactionResult result = redactionController.sections(redactionRequest); try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Sections.pdf")) { fileOutputStream.write(result.getDocument()); @@ -761,11 +849,15 @@ public class RedactionIntegrationTest { System.out.println("htmlTablesTest"); ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf"); - RedactionRequest request = RedactionRequest.builder() - .document(IOUtils.toByteArray(pdfFileResource.getInputStream())) + AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream()); + + RedactionRequest redactionRequest = RedactionRequest.builder() + .projectId(request.getProjectId()) + .fileId(request.getFileId()) + .ruleSetId(request.getRuleSetId()) .build(); - RedactionResult result = redactionController.htmlTables(request); + RedactionResult result = redactionController.htmlTables(redactionRequest); try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Tables.html")) { fileOutputStream.write(result.getDocument()); @@ -779,11 +871,15 @@ public class 
RedactionIntegrationTest { System.out.println("htmlTableRotationTest"); ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf"); - RedactionRequest request = RedactionRequest.builder() - .document(IOUtils.toByteArray(pdfFileResource.getInputStream())) + AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream()); + + RedactionRequest redactionRequest = RedactionRequest.builder() + .projectId(request.getProjectId()) + .fileId(request.getFileId()) + .ruleSetId(request.getRuleSetId()) .build(); - RedactionResult result = redactionController.htmlTables(request); + RedactionResult result = redactionController.htmlTables(redactionRequest); try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Tables.html")) { fileOutputStream.write(result.getDocument()); @@ -796,20 +892,45 @@ public class RedactionIntegrationTest { ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Phantom Cells.pdf"); - AnalyzeRequest request = AnalyzeRequest.builder() - .ruleSetId(TEST_RULESET_ID) - .document(IOUtils.toByteArray(pdfFileResource.getInputStream())) - .build(); + AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream()); - AnalyzeResult result = redactionController.analyze(request); + AnalyzeResult result = reanalyzeService.analyze(request); - result.getRedactionLog().getRedactionLogEntry().forEach(entry -> { + var redactionLog = redactionStorageService.getRedactionLog(TEST_PROJECT_ID, TEST_FILE_ID); + + redactionLog.getRedactionLogEntry().forEach(entry -> { if (!entry.isHint()) { assertThat(entry.getReason()).isEqualTo("Not redacted because row is not a vertebrate study"); } }); } + @SneakyThrows + private AnalyzeRequest prepareStorage(String file) { + ClassPathResource pdfFileResource = new ClassPathResource(file); + + return prepareStorage(pdfFileResource.getInputStream()); + } + + + @SneakyThrows + private AnalyzeRequest 
prepareStorage(InputStream stream) { + + AnalyzeRequest request = AnalyzeRequest.builder() + .ruleSetId(TEST_RULESET_ID) + .projectId(TEST_PROJECT_ID) + .fileId(TEST_FILE_ID) + .lastProcessed(OffsetDateTime.now()) + .build(); + + var bytes = IOUtils.toByteArray(stream); + + storageService.storeObject(RedactionStorageService.StorageIdUtils.getStorageId(TEST_PROJECT_ID, TEST_FILE_ID, FileType.ORIGIN), bytes); + + return request; + + } + @Test public void sponsorCompanyTest() throws IOException { @@ -817,17 +938,14 @@ public class RedactionIntegrationTest { long start = System.currentTimeMillis(); ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/sponsor_companies.pdf"); - AnalyzeRequest request = AnalyzeRequest.builder() - .ruleSetId(TEST_RULESET_ID) - .document(IOUtils.toByteArray(pdfFileResource.getInputStream())) - .build(); - AnalyzeResult result = redactionController.analyze(request); + AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream()); + + AnalyzeResult result = reanalyzeService.analyze(request); AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder() - .document(IOUtils.toByteArray(pdfFileResource.getInputStream())) - .redactionLog(result.getRedactionLog()) - .sectionGrid(result.getSectionGrid()) + .projectId(TEST_PROJECT_ID) + .fileId(TEST_FILE_ID) .build()); try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Annotated.pdf")) { @@ -858,4 +976,4 @@ public class RedactionIntegrationTest { } } -} \ No newline at end of file +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java index b7efed93..b6cb00ca 100644 --- 
a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java @@ -1,12 +1,10 @@ package com.iqser.red.service.redaction.v1.server.redaction.service; -import com.iqser.red.service.configuration.v1.api.model.Colors; -import com.iqser.red.service.configuration.v1.api.model.DictionaryEntry; -import com.iqser.red.service.configuration.v1.api.model.DictionaryResponse; -import com.iqser.red.service.configuration.v1.api.model.RulesResponse; -import com.iqser.red.service.configuration.v1.api.model.TypeResponse; -import com.iqser.red.service.configuration.v1.api.model.TypeResult; -import com.iqser.red.service.redaction.v1.model.RedactionRequest; +import com.amazonaws.services.s3.AmazonS3; +import com.iqser.red.service.configuration.v1.api.model.*; +import com.iqser.red.service.configuration.v1.api.resource.DictionaryResource; +import com.iqser.red.service.redaction.v1.server.Application; +import com.iqser.red.service.redaction.v1.server.FileSystemBackedStorageService; import com.iqser.red.service.redaction.v1.server.classification.model.Document; import com.iqser.red.service.redaction.v1.server.client.DictionaryClient; import com.iqser.red.service.redaction.v1.server.client.RulesClient; @@ -14,8 +12,7 @@ import com.iqser.red.service.redaction.v1.server.redaction.model.Entity; import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils; import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader; import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService; -import org.apache.commons.io.IOUtils; -import org.apache.pdfbox.pdmodel.PDDocument; +import com.iqser.red.storage.commons.service.StorageService; import org.junit.Before; import org.junit.Ignore; import 
org.junit.Test; @@ -26,10 +23,14 @@ import org.kie.api.builder.KieFileSystem; import org.kie.api.builder.KieModule; import org.kie.api.runtime.KieContainer; import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.autoconfigure.EnableAutoConfiguration; +import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration; import org.springframework.boot.test.context.SpringBootTest; -import org.springframework.boot.test.context.TestConfiguration; import org.springframework.boot.test.mock.mockito.MockBean; import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; +import org.springframework.context.annotation.Import; +import org.springframework.context.annotation.Primary; import org.springframework.core.io.ClassPathResource; import org.springframework.test.context.junit4.SpringRunner; @@ -40,21 +41,15 @@ import java.io.InputStream; import java.io.InputStreamReader; import java.net.URL; import java.nio.charset.StandardCharsets; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashSet; -import java.util.List; -import java.util.Set; +import java.util.*; import java.util.concurrent.atomic.AtomicLong; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import static org.assertj.core.api.Assertions.assertThat; import static org.mockito.Mockito.when; -@SpringBootTest @RunWith(SpringRunner.class) +@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT) +@Import(EntityRedactionServiceTest.RedactionIntegrationTestConfiguration.class) public class EntityRedactionServiceTest { private static final String DEFAULT_RULES = loadFromClassPath("drools/rules.drl"); @@ -80,9 +75,13 @@ public class EntityRedactionServiceTest { @Autowired private DroolsExecutionService droolsExecutionService; + @MockBean + private AmazonS3 amazonS3; + private final static String 
TEST_RULESET_ID = "123"; - @TestConfiguration + @Configuration + @EnableAutoConfiguration(exclude = {RabbitAutoConfiguration.class}) public static class RedactionIntegrationTestConfiguration { @Bean @@ -101,6 +100,13 @@ public class EntityRedactionServiceTest { return kieServices.newKieContainer(kieModule.getReleaseId()); } + + @Bean + @Primary + public StorageService inmemoryStorage() { + return new FileSystemBackedStorageService(); + } + } @@ -108,8 +114,8 @@ public class EntityRedactionServiceTest { public void testNestedEntitiesRemoval() { Set entities = new HashSet<>(); - Entity nested = new Entity("nested", "fake type", 10, 16, "fake headline", 0, false); - Entity nesting = new Entity("nesting nested", "fake type", 2, 16, "fake headline", 0, false); + Entity nested = new Entity("nested", "fake type", 10, 16, "fake headline", 0, false, false); + Entity nesting = new Entity("nesting nested", "fake type", 2, 16, "fake headline", 0, false, false); entities.add(nested); entities.add(nesting); EntitySearchUtils.removeEntitiesContainedInLarger(entities); @@ -125,31 +131,25 @@ public class EntityRedactionServiceTest { ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Single Table.pdf"); - RedactionRequest redactionRequest = RedactionRequest.builder() - .document(IOUtils.toByteArray(pdfFileResource.getInputStream())) - .build(); - DictionaryResponse dictionaryResponse = DictionaryResponse.builder() .entries(toDictionaryEntry(Arrays.asList("Casey, H.W.", "O’Loughlin, C.K.", "Salamon, C.M.", "Smith, S.H."))) .build(); - when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet()); - when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse); + when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet()); + when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID, 
DictionaryResource.GLOBAL_DOSSIER)).thenReturn(dictionaryResponse); DictionaryResponse addressResponse = DictionaryResponse.builder() .entries(toDictionaryEntry(Collections.singletonList("Toxigenics, Inc., Decatur, IL 62526, USA"))) .build(); - when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse); + when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressResponse); DictionaryResponse sponsorResponse = DictionaryResponse.builder() .entries(Collections.emptyList()) .build(); - when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse); + when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsorResponse); - try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(redactionRequest.getDocument()))) { - Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); - entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null); - assertThat(classifiedDoc.getEntities()).hasSize(1); // one page - assertThat(classifiedDoc.getEntities().get(1)).hasSize(7);// 3 author cells, 1 address, 1 Y and 2 N entities - } + Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream()); + entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null, "dossierId"); + assertThat(classifiedDoc.getEntities()).hasSize(1); // one page + assertThat(classifiedDoc.getEntities().get(1)).hasSize(7);// 3 author cells, 1 address, 1 Y and 2 N entities } @@ -158,30 +158,24 @@ public class EntityRedactionServiceTest { ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/nested_redaction.pdf"); - RedactionRequest redactionRequest = RedactionRequest.builder() - .document(IOUtils.toByteArray(pdfFileResource.getInputStream())) - .build(); - DictionaryResponse 
dictionaryResponse = DictionaryResponse.builder() .entries(toDictionaryEntry(Arrays.asList("Casey, H.W.", "O’Loughlin, C.K.", "Salamon, C.M.", "Smith, S.H."))) .build(); - when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet()); - when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse); + when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet()); + when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(dictionaryResponse); DictionaryResponse addressResponse = DictionaryResponse.builder() .entries(toDictionaryEntry(Collections.singletonList("Toxigenics, Inc., Decatur, IL 62526, USA"))) .build(); - when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse); + when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressResponse); DictionaryResponse sponsorResponse = DictionaryResponse.builder() .entries(Collections.emptyList()) .build(); - when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse); + when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsorResponse); - try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(redactionRequest.getDocument()))) { - Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); - entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null); - assertThat(classifiedDoc.getEntities()).hasSize(1); // one page - assertThat(classifiedDoc.getEntities().get(1)).hasSize(7);// 3 author cells, 1 address, 1 Y and 2 N entities - } + Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream()); + 
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null, "dossierId"); + assertThat(classifiedDoc.getEntities()).hasSize(1); // one page + assertThat(classifiedDoc.getEntities().get(1)).hasSize(7);// 3 author cells, 1 address, 1 Y and 2 N entities } @@ -190,64 +184,58 @@ public class EntityRedactionServiceTest { ClassPathResource pdfFileResource = new ClassPathResource("files/Cyprodinil/40 Cyprodinil - EU AIR3 - LCA Section 1" + " Supplement - Identity of the active substance - Reference list.pdf"); - when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet()); + when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet()); DictionaryResponse dictionaryResponse = DictionaryResponse.builder() .entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_author.txt")))) .build(); - when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse); + when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(dictionaryResponse); DictionaryResponse addressResponse = DictionaryResponse.builder() .entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_address.txt")))) .build(); - when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse); + when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressResponse); DictionaryResponse sponsorResponse = DictionaryResponse.builder() .entries(Collections.emptyList()) .build(); - when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse); - try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { - Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); - 
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null); - assertThat(classifiedDoc.getEntities() - .entrySet() - .stream() - .noneMatch(entry -> entry.getValue().stream().anyMatch(e -> e.getMatchedRule() == 9))).isTrue(); - } + when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsorResponse); + Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream()); + entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null, "dossierId"); + assertThat(classifiedDoc.getEntities() + .entrySet() + .stream() + .noneMatch(entry -> entry.getValue().stream().anyMatch(e -> e.getMatchedRule() == 9))).isTrue(); pdfFileResource = new ClassPathResource("files/Compounds/27 A8637C - EU AIR3 - MCP Section 1 - Identity of " + "the plant protection product.pdf"); - try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { - Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); - entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null); - assertThat(classifiedDoc.getEntities() - .entrySet() - .stream() - .noneMatch(entry -> entry.getValue().stream().anyMatch(e -> e.getMatchedRule() == 9))).isTrue(); - } + classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream()); + entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null, "dossierId"); + assertThat(classifiedDoc.getEntities() + .entrySet() + .stream() + .noneMatch(entry -> entry.getValue().stream().anyMatch(e -> e.getMatchedRule() == 9))).isTrue(); } @Test public void testFalsePositiveInWrongCell() throws IOException { ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Row With Ambiguous Redaction.pdf"); - when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet()); + when(dictionaryClient.getVersion(TEST_RULESET_ID, 
DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet()); DictionaryResponse dictionaryResponse = DictionaryResponse.builder() .entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_author.txt")))) .build(); - when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse); + when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(dictionaryResponse); DictionaryResponse addressResponse = DictionaryResponse.builder() .entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_address.txt")))) .build(); - when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse); + when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressResponse); DictionaryResponse sponsorResponse = DictionaryResponse.builder() .entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_sponsor.txt")))) .build(); - when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse); - try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { - Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); - entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null); - assertThat(classifiedDoc.getEntities()).hasSize(1); // one page - assertThat(classifiedDoc.getEntities().get(1).stream() - .filter(entity -> entity.getMatchedRule() == 9) - .count()).isEqualTo(10); - } + when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsorResponse); + Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream()); + entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null, "dossierId"); + 
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page + assertThat(classifiedDoc.getEntities().get(1).stream() + .filter(entity -> entity.getMatchedRule() == 9) + .count()).isEqualTo(10); } @@ -296,27 +284,25 @@ public class EntityRedactionServiceTest { droolsExecutionService.updateRules(TEST_RULESET_ID); ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Applicant Producer Table.pdf"); - when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet()); + when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet()); DictionaryResponse dictionaryResponse = DictionaryResponse.builder() .entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_author.txt")))) .build(); - when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse); + when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(dictionaryResponse); DictionaryResponse addressResponse = DictionaryResponse.builder() .entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_address.txt")))) .build(); - when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse); + when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressResponse); DictionaryResponse sponsorResponse = DictionaryResponse.builder() .entries(Collections.emptyList()) .build(); - when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse); - try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { - Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); - entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null); - 
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page - assertThat(classifiedDoc.getEntities().get(1).stream() - .filter(entity -> entity.getMatchedRule() == 6) - .count()).isEqualTo(13); - } + when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsorResponse); + Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream()); + entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null, "dossierId"); + assertThat(classifiedDoc.getEntities()).hasSize(1); // one page + assertThat(classifiedDoc.getEntities().get(1).stream() + .filter(entity -> entity.getMatchedRule() == 6) + .count()).isEqualTo(13); } @@ -337,27 +323,25 @@ public class EntityRedactionServiceTest { droolsExecutionService.updateRules(TEST_RULESET_ID); ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/batches_new_line.pdf"); - when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet()); + when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet()); DictionaryResponse addressResponse = DictionaryResponse.builder() .entries(Collections.emptyList()) .build(); - when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse); + when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressResponse); DictionaryResponse authorResponse = DictionaryResponse.builder() .entries(Collections.emptyList()) .build(); - when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(authorResponse); + when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(authorResponse); DictionaryResponse dictionaryResponse = DictionaryResponse.builder() 
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_sponsor.txt")))) .build(); - when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse); - try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { - Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); - entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null); - assertThat(classifiedDoc.getEntities()).hasSize(1); // one page - assertThat(classifiedDoc.getEntities().get(1).stream() - .filter(entity -> entity.getMatchedRule() == 11) - .count()).isEqualTo(1); - } + when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(dictionaryResponse); + Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream()); + entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null, "dossierId"); + assertThat(classifiedDoc.getEntities()).hasSize(1); // one page + assertThat(classifiedDoc.getEntities().get(1).stream() + .filter(entity -> entity.getMatchedRule() == 11) + .count()).isEqualTo(1); } @@ -371,24 +355,22 @@ public class EntityRedactionServiceTest { .entries(toDictionaryEntry(Arrays.asList("Bissig R.", "Thanei P."))) .build(); - when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet()); - when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse); + when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet()); + when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(dictionaryResponse); DictionaryResponse addressResponse = DictionaryResponse.builder() .entries(toDictionaryEntry(Collections.singletonList("Novartis Crop Protection AG, Basel, 
Switzerland"))) .build(); - when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse); + when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressResponse); DictionaryResponse sponsorResponse = DictionaryResponse.builder() .entries(Collections.emptyList()) .build(); - when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse); - try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { - Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); - entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null); - assertThat(classifiedDoc.getEntities()).hasSize(2); // two pages - assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(8); - assertThat(classifiedDoc.getEntities().get(2).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(5); // 2 names, 1 address, 2 Y - } + when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsorResponse); + Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream()); + entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null, "dossierId"); + assertThat(classifiedDoc.getEntities()).hasSize(2); // two pages + assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(8); + assertThat(classifiedDoc.getEntities().get(2).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(5); // 2 names, 1 address, 2 Y pdfFileResource = new ClassPathResource("files/Minimal Examples/Header Propagation2.pdf"); @@ -396,20 +378,18 @@ public class EntityRedactionServiceTest { .entries(toDictionaryEntry(Arrays.asList("Tribolet, R.", "Muir, G.", "Kühne-Thu, 
H.", "Close, C."))) .build(); - when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet()); - when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse); + when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet()); + when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(dictionaryResponse); addressResponse = DictionaryResponse.builder() .entries(toDictionaryEntry(Collections.singletonList("Novartis Crop Protection AG, Basel, Switzerland"))) .build(); - when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse); + when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressResponse); - try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { - Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); - entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null); - assertThat(classifiedDoc.getEntities()).hasSize(1); // one page - assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(3); - assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 8).count()).isEqualTo(9); - } + classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream()); + entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null, "dossierId"); + assertThat(classifiedDoc.getEntities()).hasSize(1); // one page + assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(3); + assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 8).count()).isEqualTo(9); 
} @@ -423,23 +403,21 @@ public class EntityRedactionServiceTest { .entries(toDictionaryEntry(Collections.singletonList("Aldershof S."))) .build(); - when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet()); - when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse); + when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet()); + when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(dictionaryResponse); DictionaryResponse addressResponse = DictionaryResponse.builder() .entries(toDictionaryEntry(Collections.singletonList("Novartis Crop Protection AG, Basel, Switzerland"))) .build(); - when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse); + when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressResponse); DictionaryResponse sponsorResponse = DictionaryResponse.builder() .entries(Collections.emptyList()) .build(); - when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse); - try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { - Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); - entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null); - assertThat(classifiedDoc.getEntities()).hasSize(1); // one page - assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 8).count()).isEqualTo(6); - } + when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsorResponse); + Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream()); + 
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null, "dossierId"); + assertThat(classifiedDoc.getEntities()).hasSize(1); // one page + assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 8).count()).isEqualTo(6); } @@ -476,19 +454,19 @@ public class EntityRedactionServiceTest { TypeResult.builder().ruleSetId(TEST_RULESET_ID).type(ADDRESS_CODE).hexColor("#ff00ff").build(), TypeResult.builder().ruleSetId(TEST_RULESET_ID).type(SPONSOR_CODE).hexColor("#00ffff").build())) .build(); - when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet()); - when(dictionaryClient.getAllTypes(TEST_RULESET_ID)).thenReturn(typeResponse); + when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet()); + when(dictionaryClient.getAllTypes(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(typeResponse); // Default empty return to prevent NPEs DictionaryResponse dictionaryResponse = DictionaryResponse.builder() .build(); - when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse); + when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(dictionaryResponse); DictionaryResponse addressResponse = DictionaryResponse.builder() .build(); - when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse); + when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressResponse); DictionaryResponse sponsorResponse = DictionaryResponse.builder() .build(); - when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse); + when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID, 
DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsorResponse); Colors colors = new Colors(); colors.setDefaultColor("#acfc00"); @@ -518,7 +496,7 @@ public class EntityRedactionServiceTest { } } - private List toDictionaryEntry(List entries){ + private List toDictionaryEntry(List entries) { List dictionaryEntries = new ArrayList<>(); entries.forEach(entry -> { dictionaryEntries.add(new DictionaryEntry(entry, 1L, false)); @@ -526,4 +504,4 @@ public class EntityRedactionServiceTest { return dictionaryEntries; } -} \ No newline at end of file +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java index 4f58b26d..44842b7d 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java @@ -1,7 +1,31 @@ package com.iqser.red.service.redaction.v1.server.segmentation; -import static org.assertj.core.api.Assertions.assertThat; +import com.amazonaws.services.s3.AmazonS3; +import com.iqser.red.service.redaction.v1.server.Application; +import com.iqser.red.service.redaction.v1.server.classification.model.Document; +import com.iqser.red.service.redaction.v1.server.classification.model.Page; +import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService; +import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage; +import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; +import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; +import 
com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService; +import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService; +import org.junit.Ignore; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.kie.api.runtime.KieContainer; +import org.springframework.amqp.rabbit.core.RabbitTemplate; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.autoconfigure.EnableAutoConfiguration; +import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.boot.test.mock.mockito.MockBean; +import org.springframework.context.annotation.Configuration; +import org.springframework.context.annotation.Import; +import org.springframework.core.io.ClassPathResource; +import org.springframework.test.context.junit4.SpringRunner; +import javax.imageio.ImageIO; import java.io.ByteArrayOutputStream; import java.io.FileOutputStream; import java.io.IOException; @@ -9,31 +33,12 @@ import java.util.Collections; import java.util.List; import java.util.stream.Collectors; -import javax.imageio.ImageIO; +import static org.assertj.core.api.Assertions.assertThat; -import org.apache.pdfbox.pdmodel.PDDocument; -import org.junit.Ignore; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.kie.api.runtime.KieContainer; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.boot.test.context.SpringBootTest; -import org.springframework.boot.test.mock.mockito.MockBean; -import org.springframework.core.io.ClassPathResource; -import org.springframework.test.context.junit4.SpringRunner; -import com.iqser.red.service.redaction.v1.server.classification.model.Document; -import com.iqser.red.service.redaction.v1.server.classification.model.Page; -import 
com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService; -import com.iqser.red.service.redaction.v1.server.redaction.model.Image; -import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; -import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService; -import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService; - -@SpringBootTest @RunWith(SpringRunner.class) +@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT) +@Import(PdfSegmentationServiceTest.TestConfiguration.class) public class PdfSegmentationServiceTest { @Autowired @@ -51,6 +56,28 @@ public class PdfSegmentationServiceTest { @MockBean private KieContainer kieContainer; + @MockBean + private AmazonS3 amazonS3; + + @MockBean + private RabbitTemplate rabbitTemplate; + + @Configuration + @EnableAutoConfiguration(exclude = { RabbitAutoConfiguration.class}) + public static class TestConfiguration { + + } + + @Test + public void testMergeImages() throws IOException { + + ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/270Rotated.pdf"); + + Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream()); + assertThat(document.getPages().get(0).getImages().size()).isEqualTo(1); + assertThat(document.getPages().get(1).getImages().size()).isEqualTo(0); + + } @Test @Ignore @@ -58,61 +85,78 @@ public class PdfSegmentationServiceTest { ClassPathResource pdfFileResource = new ClassPathResource("files/new/Single Study - Oral (Gavage) Mouse.pdf"); - try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { - Document document = pdfSegmentationService.parseDocument(pdDocument); - int i = 0; - for (Page page : 
document.getPages()) { - for (PdfImage image : page.getImages()) { - try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { - ImageIO.write(image.getImage(), "png", baos); - try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Image " + i + ".png")) { - fileOutputStream.write(baos.toByteArray()); - } + Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream()); + int i = 0; + for (Page page : document.getPages()) { + for (PdfImage image : page.getImages()) { + try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + ImageIO.write(image.getImage(), "png", baos); + try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Image " + i + ".png")) { + fileOutputStream.write(baos.toByteArray()); } - i++; } + i++; } } } + @Test + public void testPDFSegmentationWithComplexTable() throws IOException { + + ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Spanning Cells.pdf"); + + Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream()); + assertThat(document.getParagraphs() + .stream() + .flatMap(paragraph -> paragraph.getTables().stream()) + .collect(Collectors.toList())).isNotEmpty(); + Table table = document.getParagraphs() + .stream() + .flatMap(paragraph -> paragraph.getTables().stream()) + .collect(Collectors.toList()) + .get(0); + assertThat(table.getColCount()).isEqualTo(6); + assertThat(table.getRowCount()).isEqualTo(13); + assertThat(table.getRows().stream().mapToInt(List::size).sum()).isEqualTo(6 * 13); + } + + @Test public void testTableExtraction() throws IOException { ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Table.pdf"); - try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { - Document document = pdfSegmentationService.parseDocument(pdDocument); - assertThat(document.getParagraphs() - .stream() - .flatMap(paragraph -> 
paragraph.getTables().stream()) - .collect(Collectors.toList())).isNotEmpty(); - Table firstTable = document.getParagraphs() - .stream() - .flatMap(paragraph -> paragraph.getTables().stream()) - .collect(Collectors.toList()) - .get(0); - assertThat(firstTable.getColCount()).isEqualTo(8); - assertThat(firstTable.getRowCount()).isEqualTo(1); - Table secondTable = document.getParagraphs() - .stream() - .flatMap(paragraph -> paragraph.getTables().stream()) - .collect(Collectors.toList()) - .get(1); - assertThat(secondTable.getColCount()).isEqualTo(8); - assertThat(secondTable.getRowCount()).isEqualTo(2); - List> firstTableHeaderCells = firstTable.getRows() - .get(0) - .stream() - .map(Collections::singletonList) - .collect(Collectors.toList()); - assertThat(secondTable.getRows() - .stream() - .allMatch(row -> row.stream() - .map(Cell::getHeaderCells) - .collect(Collectors.toList()) - .equals(firstTableHeaderCells))).isTrue(); - } + Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream()); + assertThat(document.getParagraphs() + .stream() + .flatMap(paragraph -> paragraph.getTables().stream()) + .collect(Collectors.toList())).isNotEmpty(); + Table firstTable = document.getParagraphs() + .stream() + .flatMap(paragraph -> paragraph.getTables().stream()) + .collect(Collectors.toList()) + .get(0); + assertThat(firstTable.getColCount()).isEqualTo(8); + assertThat(firstTable.getRowCount()).isEqualTo(1); + Table secondTable = document.getParagraphs() + .stream() + .flatMap(paragraph -> paragraph.getTables().stream()) + .collect(Collectors.toList()) + .get(1); + assertThat(secondTable.getColCount()).isEqualTo(8); + assertThat(secondTable.getRowCount()).isEqualTo(2); + List> firstTableHeaderCells = firstTable.getRows() + .get(0) + .stream() + .map(Collections::singletonList) + .collect(Collectors.toList()); + assertThat(secondTable.getRows() + .stream() + .allMatch(row -> row.stream() + .map(Cell::getHeaderCells) + .collect(Collectors.toList()) 
+ .equals(firstTableHeaderCells))).isTrue(); } @@ -121,38 +165,36 @@ public class PdfSegmentationServiceTest { ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Multi Page Table.pdf"); - try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { - Document document = pdfSegmentationService.parseDocument(pdDocument); - assertThat(document.getParagraphs() - .stream() - .flatMap(paragraph -> paragraph.getTables().stream()) - .collect(Collectors.toList())).isNotEmpty(); - Table firstTable = document.getParagraphs() - .stream() - .flatMap(paragraph -> paragraph.getTables().stream()) - .collect(Collectors.toList()) - .get(0); - assertThat(firstTable.getColCount()).isEqualTo(9); - assertThat(firstTable.getRowCount()).isEqualTo(5); - Table secondTable = document.getParagraphs() - .stream() - .flatMap(paragraph -> paragraph.getTables().stream()) - .collect(Collectors.toList()) - .get(1); - assertThat(secondTable.getColCount()).isEqualTo(9); - assertThat(secondTable.getRowCount()).isEqualTo(6); - List> firstTableHeaderCells = firstTable.getRows() - .get(firstTable.getRowCount() - 1) - .stream() - .map(Cell::getHeaderCells) - .collect(Collectors.toList()); - assertThat(secondTable.getRows() - .stream() - .allMatch(row -> row.stream() - .map(Cell::getHeaderCells) - .collect(Collectors.toList()) - .equals(firstTableHeaderCells))).isTrue(); - } + Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream()); + assertThat(document.getParagraphs() + .stream() + .flatMap(paragraph -> paragraph.getTables().stream()) + .collect(Collectors.toList())).isNotEmpty(); + Table firstTable = document.getParagraphs() + .stream() + .flatMap(paragraph -> paragraph.getTables().stream()) + .collect(Collectors.toList()) + .get(0); + assertThat(firstTable.getColCount()).isEqualTo(9); + assertThat(firstTable.getRowCount()).isEqualTo(5); + Table secondTable = document.getParagraphs() + .stream() + 
.flatMap(paragraph -> paragraph.getTables().stream()) + .collect(Collectors.toList()) + .get(1); + assertThat(secondTable.getColCount()).isEqualTo(9); + assertThat(secondTable.getRowCount()).isEqualTo(6); + List> firstTableHeaderCells = firstTable.getRows() + .get(firstTable.getRowCount() - 1) + .stream() + .map(Cell::getHeaderCells) + .collect(Collectors.toList()); + assertThat(secondTable.getRows() + .stream() + .allMatch(row -> row.stream() + .map(Cell::getHeaderCells) + .collect(Collectors.toList()) + .equals(firstTableHeaderCells))).isTrue(); } @@ -161,38 +203,36 @@ public class PdfSegmentationServiceTest { ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Rotated Table Headers.pdf"); - try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { - Document document = pdfSegmentationService.parseDocument(pdDocument); - assertThat(document.getParagraphs() - .stream() - .flatMap(paragraph -> paragraph.getTables().stream()) - .collect(Collectors.toList())).isNotEmpty(); - Table firstTable = document.getParagraphs() - .stream() - .flatMap(paragraph -> paragraph.getTables().stream()) - .collect(Collectors.toList()) - .get(0); - assertThat(firstTable.getColCount()).isEqualTo(8); - assertThat(firstTable.getRowCount()).isEqualTo(1); - Table secondTable = document.getParagraphs() - .stream() - .flatMap(paragraph -> paragraph.getTables().stream()) - .collect(Collectors.toList()) - .get(1); - assertThat(secondTable.getColCount()).isEqualTo(8); - assertThat(secondTable.getRowCount()).isEqualTo(6); - List> firstTableHeaderCells = firstTable.getRows() - .get(0) - .stream() - .map(Collections::singletonList) - .collect(Collectors.toList()); - assertThat(secondTable.getRows() - .stream() - .allMatch(row -> row.stream() - .map(Cell::getHeaderCells) - .collect(Collectors.toList()) - .equals(firstTableHeaderCells))).isTrue(); - } + Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream()); + 
assertThat(document.getParagraphs() + .stream() + .flatMap(paragraph -> paragraph.getTables().stream()) + .collect(Collectors.toList())).isNotEmpty(); + Table firstTable = document.getParagraphs() + .stream() + .flatMap(paragraph -> paragraph.getTables().stream()) + .collect(Collectors.toList()) + .get(0); + assertThat(firstTable.getColCount()).isEqualTo(8); + assertThat(firstTable.getRowCount()).isEqualTo(1); + Table secondTable = document.getParagraphs() + .stream() + .flatMap(paragraph -> paragraph.getTables().stream()) + .collect(Collectors.toList()) + .get(1); + assertThat(secondTable.getColCount()).isEqualTo(8); + assertThat(secondTable.getRowCount()).isEqualTo(6); + List> firstTableHeaderCells = firstTable.getRows() + .get(0) + .stream() + .map(Collections::singletonList) + .collect(Collectors.toList()); + assertThat(secondTable.getRows() + .stream() + .allMatch(row -> row.stream() + .map(Cell::getHeaderCells) + .collect(Collectors.toList()) + .equals(firstTableHeaderCells))).isTrue(); } } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/application.yml b/redaction-service-v1/redaction-service-server-v1/src/test/resources/application.yml index 23e59464..4b511179 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/application.yml +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/application.yml @@ -1,5 +1,6 @@ configuration-service.url: "http://configuration-service-v1:8080" image-service.url: "http://image-service-v1:8080" +file-management-service.url: "http://file-management-service-v1:8080" ribbon: ConnectTimeout: 600000 diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/dossier_redactions.txt b/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/dossier_redactions.txt new file mode 100644 index 00000000..3840a8ac --- /dev/null +++ 
b/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/dossier_redactions.txt @@ -0,0 +1 @@ +Difenoconazole \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl index a2a78200..5f7e24f2 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl @@ -328,4 +328,12 @@ rule "28: Redact Logos" Section(matchesImageType("logo")) then section.redactImage("logo", 28, "Logo found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - end \ No newline at end of file + end + + +rule "29: Redact Dossier Redactions" + when + Section(matchesType("dossier_redactions")) + then + section.redact("dossier_redactions", 29, "Dossier Redaction found", "Article 39(1)(2) of Regulation (EC) No 178/2002"); + end \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/merge_images.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/merge_images.pdf new file mode 100644 index 00000000..a2decc1a Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/merge_images.pdf differ diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/scanned/VV-377031.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/scanned/VV-377031.pdf new file mode 100644 index 00000000..0fe661c4 Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/scanned/VV-377031.pdf differ