Merge branch 'master' of ssh://git.iqser.com:2222/red/redaction-service into Test

 Conflicts:
	redaction-service-v1/redaction-service-server-v1/pom.xml
This commit is contained in:
aoezyetimoglu 2021-05-06 15:04:02 +02:00
commit a6f8ea0f92
103 changed files with 2715 additions and 1891 deletions

View File

@ -1,7 +1,5 @@
package buildjob;
import static com.atlassian.bamboo.specs.builders.task.TestParserTask.createJUnitParserTask;
import com.atlassian.bamboo.specs.api.BambooSpec;
import com.atlassian.bamboo.specs.api.builders.BambooKey;
import com.atlassian.bamboo.specs.api.builders.docker.DockerConfiguration;
@ -24,6 +22,8 @@ import com.atlassian.bamboo.specs.builders.trigger.BitbucketServerTrigger;
import com.atlassian.bamboo.specs.model.task.InjectVariablesScope;
import com.atlassian.bamboo.specs.util.BambooServer;
import static com.atlassian.bamboo.specs.builders.task.TestParserTask.createJUnitParserTask;
/**
* Plan configuration for Bamboo.
* Learn more on: <a href="https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs">https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs</a>
@ -33,6 +33,8 @@ public class PlanSpec {
private static final String SERVICE_NAME = "redaction-service";
private static final String JVM_ARGS =" -Xmx4g -XX:+ExitOnOutOfMemoryError -XX:SurvivorRatio=2 -XX:NewRatio=1 -XX:InitialTenuringThreshold=16 -XX:MaxTenuringThreshold=16 -XX:InitiatingHeapOccupancyPercent=35 ";
private static final String SERVICE_KEY = SERVICE_NAME.toUpperCase().replaceAll("-", "");
/**
@ -82,9 +84,12 @@ public class PlanSpec {
.checkoutItems(new CheckoutItem().defaultRepository()),
new ScriptTask()
.description("Build")
.environmentVariables("MAVEN_OPTS="+JVM_ARGS)
.inlineBody("#!/bin/bash\n" +
"set -e\n" +
"export MAVEN_OPTS=\"$MAVEN_OPTS "+JVM_ARGS +"\"\n" +
"if [[ \"${bamboo.version_tag}\" != \"dev\" ]]; then ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn --no-transfer-progress -f ${bamboo_build_working_directory}/" + SERVICE_NAME + "-v1/pom.xml versions:set -DnewVersion=${bamboo.version_tag}; fi\n" +
"if [[ \"${bamboo.version_tag}\" != \"dev\" ]]; then ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn --no-transfer-progress -f ${bamboo_build_working_directory}/" + SERVICE_NAME + "-image-v1/pom.xml versions:set -DnewVersion=${bamboo.version_tag}; fi\n" +

View File

@ -5,7 +5,7 @@
<parent>
<artifactId>platform-dependency</artifactId>
<groupId>com.iqser.red</groupId>
<version>1.0.8</version>
<version>1.1.2</version>
</parent>
<modelVersion>4.0.0</modelVersion>
@ -32,7 +32,7 @@
<dependency>
<groupId>com.iqser.red</groupId>
<artifactId>platform-commons-dependency</artifactId>
<version>1.2.5</version>
<version>1.3.1</version>
<scope>import</scope>
<type>pom</type>
</dependency>
@ -52,4 +52,4 @@
</dependencyManagement>
</project>
</project>

View File

@ -5,13 +5,20 @@ import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.time.OffsetDateTime;
/**
 * Request payload for the redaction service's analyze operation.
 * Lombok generates accessors, equals/hashCode/toString, a builder and
 * both no-arg and all-args constructors.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class AnalyzeRequest {
// Raw bytes of the document to analyze (format not visible here —
// presumably PDF; confirm with callers).
private byte[] document;
// Identifiers locating the file within a project.
private String projectId;
private String fileId;
// Rule set to apply during analysis.
private String ruleSetId;
// NOTE(review): presumably "only re-analyze when prior results can be
// reused" — exact semantics not visible here; confirm in the server code.
private boolean reanalyseOnlyIfPossible;
// Manual redactions supplied by the user, merged into the analysis.
private ManualRedactions manualRedactions;
// Timestamp of the last processing run, if any.
private OffsetDateTime lastProcessed;
}

View File

@ -11,9 +11,19 @@ import lombok.NoArgsConstructor;
@AllArgsConstructor
public class AnalyzeResult {
// Identifiers of the analyzed file within its project.
private String projectId;
private String fileId;
// Processing duration (unit not visible here — presumably milliseconds; confirm).
private long duration;
private int numberOfPages;
// Results of the analysis.
private RedactionLog redactionLog;
private SectionGrid sectionGrid;
private Text text;
// Flags summarizing what the analysis found in the document.
private boolean hasHints;
private boolean hasRequests;
private boolean hasRedactions;
private boolean hasImages;
private boolean hasUpdates;
// Versions of the dictionaries and rules that produced this result.
private long dictionaryVersion;
private long dossierDictionaryVersion;
private long rulesVersion;
}

View File

@ -11,7 +11,6 @@ import lombok.NoArgsConstructor;
@AllArgsConstructor
public class AnnotateRequest {
// Raw bytes of the document to annotate.
private byte[] document;
// Redaction log and section grid from a prior analysis, used to place annotations.
private RedactionLog redactionLog;
private SectionGrid sectionGrid;
// Identifiers of the file within its project.
private String projectId;
private String fileId;
}

View File

@ -0,0 +1,5 @@
package com.iqser.red.service.redaction.v1.model;
/**
 * Direction of a change recorded for a redaction change-log entry:
 * the entry was either added or removed.
 */
public enum ChangeType {
ADDED, REMOVED
}

View File

@ -1,12 +1,12 @@
package com.iqser.red.service.redaction.v1.model;
import java.time.OffsetDateTime;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.time.OffsetDateTime;
@Data
@Builder
@AllArgsConstructor

View File

@ -1,13 +1,13 @@
package com.iqser.red.service.redaction.v1.model;
import java.util.ArrayList;
import java.util.List;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.util.ArrayList;
import java.util.List;
@Data
@Builder
@AllArgsConstructor
@ -27,4 +27,6 @@ public class ManualRedactionEntry {
private String section;
private int sectionNumber;
private boolean addToDossierDictionary;
}

View File

@ -1,16 +1,16 @@
package com.iqser.red.service.redaction.v1.model;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
@Data
@Builder
@AllArgsConstructor

View File

@ -1,25 +1,22 @@
package com.iqser.red.service.redaction.v1.model;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.util.ArrayList;
import java.util.List;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class SectionText {
@NoArgsConstructor
public class RedactionChangeLog {
private int sectionNumber;
private String text;
private List<RedactionChangeLogEntry> redactionLogEntry = new ArrayList<>();
private boolean isTable;
private String headline;
private long dictionaryVersion = -1;
private long rulesVersion = -1;
private List<SectionArea> sectionAreas = new ArrayList<>();
private String ruleSetId;
}

View File

@ -0,0 +1,47 @@
package com.iqser.red.service.redaction.v1.model;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.util.ArrayList;
import java.util.List;
/**
 * One entry of a redaction change log: the details of a redaction together
 * with the direction of its change ({@link ChangeType}).
 * Lombok generates accessors, equals/hashCode/toString, a builder and both
 * no-arg and all-args constructors. Field order must not change — it defines
 * the all-args constructor's parameter order.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class RedactionChangeLogEntry {
private String id;
// Type of the redacted item and the redacted value itself.
private String type;
private String value;
private String reason;
// Rule that matched this entry and the legal basis it refers to.
private int matchedRule;
private String legalBasis;
private boolean redacted;
private boolean isHint;
private boolean isRecommendation;
// Section name the entry belongs to.
private String section;
private float[] color;
// Rectangles covered by this entry; empty by default even when built via the builder.
@Builder.Default
private List<Rectangle> positions = new ArrayList<>();
private int sectionNumber;
// True when the redaction was placed manually rather than by a rule.
private boolean manual;
private Status status;
private ManualRedactionType manualRedactionType;
private boolean isDictionaryEntry;
// Text context surrounding the redacted value.
private String textBefore;
private String textAfter;
// Comments attached to the entry; empty by default even when built via the builder.
@Builder.Default
private List<Comment> comments = new ArrayList<>();
// Whether this entry was added or removed relative to the previous log.
private ChangeType changeType;
private boolean isDossierDictionaryEntry;
}

View File

@ -1,13 +1,11 @@
package com.iqser.red.service.redaction.v1.model;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.util.List;
@Data
@AllArgsConstructor
@NoArgsConstructor
public class RedactionLog {
@ -17,15 +15,17 @@ public class RedactionLog {
private long rulesVersion = -1;
private String ruleSetId;
private String filename;
private long dossierDictionaryVersion = -1;
public RedactionLog(List<RedactionLogEntry> redactionLogEntry, long dictionaryVersion, long rulesVersion, String ruleSetId) {
public RedactionLog(List<RedactionLogEntry> redactionLogEntry, long dictionaryVersion, long rulesVersion, String ruleSetId, long dossierDictionaryVersion) {
this.redactionLogEntry = redactionLogEntry;
this.dictionaryVersion = dictionaryVersion;
this.rulesVersion = rulesVersion;
this.ruleSetId = ruleSetId;
this.dossierDictionaryVersion = dossierDictionaryVersion;
}

View File

@ -1,13 +1,13 @@
package com.iqser.red.service.redaction.v1.model;
import java.util.ArrayList;
import java.util.List;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.util.ArrayList;
import java.util.List;
@Data
@Builder
@NoArgsConstructor
@ -45,4 +45,6 @@ public class RedactionLogEntry {
private boolean isImage;
private boolean isDossierDictionaryEntry;
}

View File

@ -11,7 +11,8 @@ import lombok.NoArgsConstructor;
@AllArgsConstructor
public class RedactionRequest {
// Raw bytes of the document to redact.
private byte[] document;
// Identifiers of the file within its project.
private String projectId;
private String fileId;
// Rule set to apply during redaction.
private String ruleSetId;
// Manual redactions to apply in addition to rule-based ones.
private ManualRedactions manualRedactions;
}

View File

@ -13,7 +13,5 @@ public class RedactionResult {
private byte[] document;
private int numberOfPages;
private RedactionLog redactionLog;
private SectionGrid sectionGrid;
}

View File

@ -1,22 +0,0 @@
package com.iqser.red.service.redaction.v1.model;
import java.time.OffsetDateTime;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Request payload for the reanalyze operation.
 * NOTE(review): the class name appears misspelled ("Renalyze" — presumably
 * "Reanalyze"); renaming would break callers, so it is only flagged here.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class RenalyzeRequest {
// Raw bytes of the document to re-analyze.
private byte[] document;
// Rule set to apply.
private String ruleSetId;
// Manual redactions supplied by the user.
private ManualRedactions manualRedactions;
// Previously extracted text and redaction log, available for reuse.
private Text text;
private RedactionLog redactionLog;
// Timestamp of the last processing run.
private OffsetDateTime lastProcessed;
}

View File

@ -27,7 +27,7 @@ public class SectionArea {
private String header;
public boolean contains(Rectangle other) {
return page == other.getPage() && this.topLeft.getX() <= other.getTopLeft().getX() && this.topLeft.getX() + this.getWidth() >= other.getTopLeft().getX() + other.getWidth() && this.getTopLeft().getY() <= other.getTopLeft().getY() && this.getTopLeft().getY() + this.getHeight() >= other.getTopLeft().getY() + other.getHeight();
return page == other.getPage() && this.topLeft.getX() <= other.getTopLeft().getX() && this.topLeft.getX() + this.getWidth() >= other.getTopLeft().getX() + other.getWidth() && this.getTopLeft().getY() <= other.getTopLeft().getY() && this.getTopLeft().getY() + this.getHeight() >= other.getTopLeft().getY() + other.getHeight();
}
}

View File

@ -1,13 +1,13 @@
package com.iqser.red.service.redaction.v1.model;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@Data
@AllArgsConstructor
@NoArgsConstructor

View File

@ -1,13 +1,13 @@
package com.iqser.red.service.redaction.v1.model;
import java.util.List;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.NonNull;
import lombok.RequiredArgsConstructor;
import java.util.List;
@Data
@AllArgsConstructor
@NoArgsConstructor

View File

@ -1,14 +1,6 @@
package com.iqser.red.service.redaction.v1.resources;
import com.iqser.red.service.redaction.v1.model.AnalyzeRequest;
import com.iqser.red.service.redaction.v1.model.AnalyzeResult;
import com.iqser.red.service.redaction.v1.model.AnnotateRequest;
import com.iqser.red.service.redaction.v1.model.AnnotateResponse;
import com.iqser.red.service.redaction.v1.model.ReanalyzeResult;
import com.iqser.red.service.redaction.v1.model.RedactionRequest;
import com.iqser.red.service.redaction.v1.model.RedactionResult;
import com.iqser.red.service.redaction.v1.model.RenalyzeRequest;
import com.iqser.red.service.redaction.v1.model.*;
import org.springframework.http.MediaType;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.PostMapping;
@ -21,11 +13,6 @@ public interface RedactionResource {
String RULE_SET_PARAMETER_NAME = "ruleSetId";
String RULE_SET_PATH_VARIABLE = "/{" + RULE_SET_PARAMETER_NAME + "}";
@PostMapping(value = "/analyze", produces = MediaType.APPLICATION_JSON_VALUE, consumes = MediaType.APPLICATION_JSON_VALUE)
AnalyzeResult analyze(@RequestBody AnalyzeRequest analyzeRequest);
@PostMapping(value = "/reanalyze", produces = MediaType.APPLICATION_JSON_VALUE, consumes = MediaType.APPLICATION_JSON_VALUE)
ReanalyzeResult reanalyze(@RequestBody RenalyzeRequest renalyzeRequest);
@PostMapping(value = "/annotate", produces = MediaType.APPLICATION_JSON_VALUE, consumes = MediaType.APPLICATION_JSON_VALUE)
AnnotateResponse annotate(@RequestBody AnnotateRequest annotateRequest);
@ -39,10 +26,10 @@ public interface RedactionResource {
@PostMapping(value = "/debug/htmlTables", produces = MediaType.APPLICATION_JSON_VALUE, consumes = MediaType.APPLICATION_JSON_VALUE)
RedactionResult htmlTables(@RequestBody RedactionRequest redactionRequest);
@PostMapping(value = "/rules/update"+RULE_SET_PATH_VARIABLE, consumes = MediaType.APPLICATION_JSON_VALUE)
@PostMapping(value = "/rules/update" + RULE_SET_PATH_VARIABLE, consumes = MediaType.APPLICATION_JSON_VALUE)
void updateRules(@PathVariable(RULE_SET_PARAMETER_NAME) String ruleSetId);
@PostMapping(value = "/rules/test", consumes = MediaType.APPLICATION_JSON_VALUE)
void testRules(@RequestBody String rules);
}
}

View File

@ -12,6 +12,10 @@
<artifactId>redaction-service-server-v1</artifactId>
<dependencies>
<dependency>
<groupId>com.iqser.red.commons</groupId>
<artifactId>storage-commons</artifactId>
</dependency>
<dependency>
<groupId>com.iqser.red.service</groupId>
<artifactId>redaction-service-api-v1</artifactId>
@ -20,7 +24,18 @@
<dependency>
<groupId>com.iqser.red.service</groupId>
<artifactId>configuration-service-api-v1</artifactId>
<version>2.2.9</version>
<version>2.5.6</version>
</dependency>
<dependency>
<groupId>com.iqser.red.service</groupId>
<artifactId>file-management-service-api-v1</artifactId>
<version>2.7.4</version>
<exclusions>
<exclusion>
<groupId>com.iqser.red.service</groupId>
<artifactId>redaction-service-api-v1</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.drools</groupId>
@ -74,6 +89,12 @@
<artifactId>spring-cloud-starter-openfeign</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-amqp</artifactId>
<version>2.3.1.RELEASE</version>
</dependency>
<!-- test dependencies -->
<dependency>
<groupId>org.springframework.boot</groupId>
@ -86,9 +107,9 @@
<scope>test</scope>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<groupId>org.springframework.amqp</groupId>
<artifactId>spring-rabbit-test</artifactId>
<version>2.3.1</version>
<scope>test</scope>
</dependency>
</dependencies>

View File

@ -1,5 +1,8 @@
package com.iqser.red.service.redaction.v1.server;
import com.iqser.red.commons.spring.DefaultWebMvcConfiguration;
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.actuate.autoconfigure.security.servlet.ManagementWebSecurityAutoConfiguration;
import org.springframework.boot.autoconfigure.SpringBootApplication;
@ -8,10 +11,6 @@ import org.springframework.boot.context.properties.EnableConfigurationProperties
import org.springframework.cloud.openfeign.EnableFeignClients;
import org.springframework.context.annotation.Import;
import com.iqser.red.commons.spring.DefaultWebMvcConfiguration;
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
@Import({DefaultWebMvcConfiguration.class})
@EnableFeignClients(basePackageClasses = RulesClient.class)
@EnableConfigurationProperties(RedactionServiceSettings.class)
@ -19,8 +18,9 @@ import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettin
public class Application {
public static void main(String[] args) {
System.setProperty("org.apache.pdfbox.rendering.UsePureJavaCMYKConversion", "true");
SpringApplication.run(Application.class, args);
}
}
}

View File

@ -1,20 +1,19 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
import com.iqser.red.service.redaction.v1.model.SectionGrid;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryVersion;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
import com.iqser.red.service.redaction.v1.model.SectionGrid;
import com.iqser.red.service.redaction.v1.model.SectionText;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
import lombok.Data;
import lombok.NoArgsConstructor;
@Data
@NoArgsConstructor
public class Document {
@ -33,7 +32,7 @@ public class Document {
private List<RedactionLogEntry> redactionLogEntities = new ArrayList<>();
private SectionGrid sectionGrid = new SectionGrid();
private long dictionaryVersion;
private DictionaryVersion dictionaryVersion;
private long rulesVersion;
private List<SectionText> sectionText = new ArrayList<>();

View File

@ -1,5 +1,7 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import lombok.Getter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
@ -7,38 +9,35 @@ import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import lombok.Getter;
public class FloatFrequencyCounter
{
public class FloatFrequencyCounter {
@Getter
Map<Float, Integer> countPerValue = new HashMap<>();
public void add(float value){
if(!countPerValue.containsKey(value)){
public void add(float value) {
if (!countPerValue.containsKey(value)) {
countPerValue.put(value, 1);
} else {
countPerValue.put(value, countPerValue.get(value) + 1);
}
}
public void addAll(Map<Float, Integer> otherCounter){
for(Map.Entry<Float, Integer> entry: otherCounter.entrySet()){
if(countPerValue.containsKey(entry.getKey())){
countPerValue.put(entry.getKey(), countPerValue.get(entry.getKey())+ entry.getValue());
public void addAll(Map<Float, Integer> otherCounter) {
for (Map.Entry<Float, Integer> entry : otherCounter.entrySet()) {
if (countPerValue.containsKey(entry.getKey())) {
countPerValue.put(entry.getKey(), countPerValue.get(entry.getKey()) + entry.getValue());
} else {
countPerValue.put(entry.getKey(), entry.getValue());
}
}
}
public Float getMostPopular(){
public Float getMostPopular() {
Map.Entry<Float, Integer> mostPopular = null;
for(Map.Entry<Float, Integer> entry: countPerValue.entrySet()){
if(mostPopular == null){
for (Map.Entry<Float, Integer> entry : countPerValue.entrySet()) {
if (mostPopular == null) {
mostPopular = entry;
} else if(entry.getValue() >= mostPopular.getValue()){
} else if (entry.getValue() >= mostPopular.getValue()) {
mostPopular = entry;
}
}
@ -46,12 +45,11 @@ public class FloatFrequencyCounter
}
public List<Float> getHighterThanMostPopular(){
public List<Float> getHighterThanMostPopular() {
Float mostPopular = getMostPopular();
List<Float> higher = new ArrayList<>();
for(Float value: countPerValue.keySet()){
if(value > mostPopular){
for (Float value : countPerValue.keySet()) {
if (value > mostPopular) {
higher.add(value);
}
}
@ -60,12 +58,12 @@ public class FloatFrequencyCounter
}
public Float getHighest(){
public Float getHighest() {
Float highest = null;
for(Float value: countPerValue.keySet()){
if (highest == null){
for (Float value : countPerValue.keySet()) {
if (highest == null) {
highest = value;
} else if(value > highest){
} else if (value > highest) {
highest = value;
}
}

View File

@ -1,19 +1,19 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import lombok.AllArgsConstructor;
import lombok.Data;
import java.util.List;
@Data
@AllArgsConstructor
public class Footer {
private List<TextBlock> textBlocks;
@JsonIgnore
public SearchableText getSearchableText() {
SearchableText searchableText = new SearchableText();
@ -21,4 +21,4 @@ public class Footer {
return searchableText;
}
}
}

View File

@ -1,19 +1,19 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import lombok.AllArgsConstructor;
import lombok.Data;
import java.util.List;
@Data
@AllArgsConstructor
public class Header {
private List<TextBlock> textBlocks;
@JsonIgnore
public SearchableText getSearchableText() {
SearchableText searchableText = new SearchableText();
@ -21,4 +21,4 @@ public class Header {
return searchableText;
}
}
}

View File

@ -1,15 +1,14 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import java.util.List;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle;
import lombok.Data;
import lombok.NonNull;
import lombok.RequiredArgsConstructor;
import java.util.List;
@Data
@RequiredArgsConstructor
public class Page {
@ -37,4 +36,4 @@ public class Page {
return rotation != 0;
}
}
}

View File

@ -1,19 +1,18 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import java.util.ArrayList;
import java.util.List;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.util.ArrayList;
import java.util.List;
@Data
@NoArgsConstructor
public class Paragraph implements Comparable{
public class Paragraph implements Comparable {
private List<AbstractTextContainer> pageBlocks = new ArrayList<>();
private List<PdfImage> images = new ArrayList<>();
@ -62,4 +61,4 @@ public class Paragraph implements Comparable{
return 0;
}
}
}

View File

@ -0,0 +1,52 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.model.SectionArea;
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.util.*;
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class SectionText {
private int sectionNumber;
private String text;
private boolean isTable;
private String headline;
private List<SectionArea> sectionAreas = new ArrayList<>();
private Set<Image> images = new HashSet<>();
private List<TextBlock> textBlocks = new ArrayList<>();
private Map<String, CellValue> tabularData = new HashMap<>();
private List<Integer> cellStarts = new ArrayList<>();
public void setTabularData(Map<String, CellValue> tabularData) {
tabularData.remove(null);
this.tabularData = tabularData;
}
@JsonIgnore
public SearchableText getSearchableText() {
SearchableText searchableText = new SearchableText();
textBlocks.forEach(block -> {
if (block != null) {
searchableText.addAll(block.getSequences());
}
});
return searchableText;
}
}

View File

@ -1,10 +1,10 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import lombok.Getter;
import java.util.HashMap;
import java.util.Map;
import lombok.Getter;
public class StringFrequencyCounter {
@Getter
@ -46,4 +46,4 @@ public class StringFrequencyCounter {
return mostPopular != null ? mostPopular.getKey() : null;
}
}
}

View File

@ -1,17 +1,18 @@
package com.iqser.red.service.redaction.v1.model;
import java.util.ArrayList;
import java.util.List;
package com.iqser.red.service.redaction.v1.server.classification.model;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.util.ArrayList;
import java.util.List;
/**
 * Extracted text of a document, split into per-section texts.
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
public class Text {
// Number of pages of the source document.
private int numberOfPages;
// One entry per section; initialized to empty so it is never null.
private List<SectionText> sectionTexts = new ArrayList<>();
}

View File

@ -1,19 +1,21 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.util.ArrayList;
import java.util.List;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
@AllArgsConstructor
@Builder
@Data
@NoArgsConstructor
public class TextBlock extends AbstractTextContainer {
@Builder.Default
@ -98,7 +100,6 @@ public class TextBlock extends AbstractTextContainer {
}
@Override
public String toString() {
@ -118,6 +119,7 @@ public class TextBlock extends AbstractTextContainer {
}
@Override
@JsonIgnore
public String getText() {
StringBuilder sb = new StringBuilder();
@ -139,4 +141,4 @@ public class TextBlock extends AbstractTextContainer {
}
}
}

View File

@ -1,19 +1,19 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import lombok.AllArgsConstructor;
import lombok.Data;
import java.util.List;
@Data
@AllArgsConstructor
public class UnclassifiedText {
private List<TextBlock> textBlocks;
@JsonIgnore
public SearchableText getSearchableText() {
SearchableText searchableText = new SearchableText();
@ -21,4 +21,4 @@ public class UnclassifiedText {
return searchableText;
}
}
}

View File

@ -1,21 +1,20 @@
package com.iqser.red.service.redaction.v1.server.classification.service;
import java.util.ArrayList;
import java.util.List;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.classification.utils.PositionUtils;
import com.iqser.red.service.redaction.v1.server.classification.model.FloatFrequencyCounter;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.model.StringFrequencyCounter;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.classification.utils.PositionUtils;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import org.springframework.stereotype.Service;
import java.util.ArrayList;
import java.util.List;
@Service
@SuppressWarnings("all")

View File

@ -1,19 +1,17 @@
package com.iqser.red.service.redaction.v1.server.classification.service;
import java.util.List;
import java.util.regex.Pattern;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.classification.utils.PositionUtils;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
import java.util.List;
import java.util.regex.Pattern;
@Slf4j
@Service

View File

@ -2,7 +2,6 @@ package com.iqser.red.service.redaction.v1.server.classification.utils;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle;
import lombok.experimental.UtilityClass;
@UtilityClass

View File

@ -0,0 +1,9 @@
package com.iqser.red.service.redaction.v1.server.client;
import com.iqser.red.service.file.management.v1.api.resources.FileStatusProcessingUpdateResource;
import org.springframework.cloud.openfeign.FeignClient;
/**
 * Feign client for the file-management service's file-status processing
 * update resource. The base URL is resolved from the
 * {@code file-management-service.url} configuration property.
 */
@FeignClient(name = "FileStatusProcessingUpdateResource", url = "${file-management-service.url}")
public interface FileStatusProcessingUpdateClient extends FileStatusProcessingUpdateResource {
}

View File

@ -1,16 +1,16 @@
package com.iqser.red.service.redaction.v1.server.client;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import org.springframework.lang.NonNull;
import org.springframework.lang.Nullable;
import org.springframework.util.Assert;
import org.springframework.util.FileCopyUtils;
import org.springframework.web.multipart.MultipartFile;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
public class MockMultipartFile implements MultipartFile {
private final String name;
@ -22,13 +22,13 @@ public class MockMultipartFile implements MultipartFile {
public MockMultipartFile(String name, @Nullable byte[] content) {
this(name, "", (String) null, (byte[]) content);
this(name, "", null, content);
}
public MockMultipartFile(String name, InputStream contentStream) throws IOException {
this(name, "", (String) null, (byte[]) FileCopyUtils.copyToByteArray(contentStream));
this(name, "", null, FileCopyUtils.copyToByteArray(contentStream));
}
@ -78,7 +78,7 @@ public class MockMultipartFile implements MultipartFile {
public long getSize() {
return (long) this.content.length;
return this.content.length;
}

View File

@ -3,6 +3,6 @@ package com.iqser.red.service.redaction.v1.server.client;
import com.iqser.red.service.configuration.v1.api.resource.RulesResource;
import org.springframework.cloud.openfeign.FeignClient;
@FeignClient(name = RulesResource.SERVICE_NAME, url = "${configuration-service.url}")
@FeignClient(name = "RulesResource", url = "${configuration-service.url}")
public interface RulesClient extends RulesResource {
}
}

View File

@ -1,17 +1,15 @@
package com.iqser.red.service.redaction.v1.server.controller;
import java.time.OffsetDateTime;
import com.iqser.red.commons.spring.ErrorMessage;
import com.iqser.red.service.redaction.v1.server.exception.RulesValidationException;
import lombok.extern.slf4j.Slf4j;
import org.springframework.http.HttpStatus;
import org.springframework.web.bind.annotation.ExceptionHandler;
import org.springframework.web.bind.annotation.ResponseBody;
import org.springframework.web.bind.annotation.ResponseStatus;
import org.springframework.web.bind.annotation.RestControllerAdvice;
import com.iqser.red.service.redaction.v1.server.exception.RulesValidationException;
import lombok.extern.slf4j.Slf4j;
import java.time.OffsetDateTime;
@Slf4j
@RestControllerAdvice
@ -38,4 +36,4 @@ public class ControllerAdvice {
return new ErrorMessage(OffsetDateTime.now(), e.getMessage());
}
}
}

View File

@ -1,17 +1,10 @@
package com.iqser.red.service.redaction.v1.server.controller;
import com.iqser.red.service.redaction.v1.model.AnalyzeRequest;
import com.iqser.red.service.redaction.v1.model.AnalyzeResult;
import com.iqser.red.service.file.management.v1.api.model.FileType;
import com.iqser.red.service.redaction.v1.model.AnnotateRequest;
import com.iqser.red.service.redaction.v1.model.AnnotateResponse;
import com.iqser.red.service.redaction.v1.model.ReanalyzeResult;
import com.iqser.red.service.redaction.v1.model.RedactionLog;
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
import com.iqser.red.service.redaction.v1.model.RedactionRequest;
import com.iqser.red.service.redaction.v1.model.RedactionResult;
import com.iqser.red.service.redaction.v1.model.RenalyzeRequest;
import com.iqser.red.service.redaction.v1.model.SectionGrid;
import com.iqser.red.service.redaction.v1.model.Text;
import com.iqser.red.service.redaction.v1.resources.RedactionResource;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
@ -19,27 +12,21 @@ import com.iqser.red.service.redaction.v1.server.exception.RedactionException;
import com.iqser.red.service.redaction.v1.server.redaction.service.AnnotationService;
import com.iqser.red.service.redaction.v1.server.redaction.service.DictionaryService;
import com.iqser.red.service.redaction.v1.server.redaction.service.DroolsExecutionService;
import com.iqser.red.service.redaction.v1.server.redaction.service.EntityRedactionService;
import com.iqser.red.service.redaction.v1.server.redaction.service.ImageClassificationService;
import com.iqser.red.service.redaction.v1.server.redaction.service.ReanalyzeService;
import com.iqser.red.service.redaction.v1.server.redaction.service.RedactionLogCreatorService;
import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import com.iqser.red.service.redaction.v1.server.visualization.service.PdfVisualisationService;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RestController;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.List;
@Slf4j
@RestController
@ -47,61 +34,24 @@ import java.util.List;
public class RedactionController implements RedactionResource {
private final PdfVisualisationService pdfVisualisationService;
private final PdfSegmentationService pdfSegmentationService;
private final RedactionLogCreatorService redactionLogCreatorService;
private final EntityRedactionService entityRedactionService;
private final DroolsExecutionService droolsExecutionService;
private final DictionaryService dictionaryService;
private final AnnotationService annotationService;
private final ReanalyzeService reanalyzeService;
private final ImageClassificationService imageClassificationService;
@Override
public AnalyzeResult analyze(@RequestBody AnalyzeRequest analyzeRequest) {
try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(analyzeRequest.getDocument()))) {
pdDocument.setAllSecurityToBeRemoved(true);
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
log.info("Document structure analysis successful, starting redaction analysis...");
imageClassificationService.classifyImages(classifiedDoc);
entityRedactionService.processDocument(classifiedDoc, analyzeRequest.getRuleSetId(), analyzeRequest.getManualRedactions());
redactionLogCreatorService.createRedactionLog(classifiedDoc, pdDocument.getNumberOfPages(), analyzeRequest.getManualRedactions(), analyzeRequest
.getRuleSetId());
log.info("Redaction analysis successful...");
return AnalyzeResult.builder()
.sectionGrid(classifiedDoc.getSectionGrid())
.redactionLog(new RedactionLog(classifiedDoc.getRedactionLogEntities(), classifiedDoc.getDictionaryVersion(), classifiedDoc
.getRulesVersion(), analyzeRequest.getRuleSetId()))
.numberOfPages(classifiedDoc.getPages().size())
.text(new Text(classifiedDoc.getSectionText()))
.build();
} catch (Exception e) {
throw new RedactionException(e);
}
}
public ReanalyzeResult reanalyze(@RequestBody RenalyzeRequest renalyzeRequest) {
return reanalyzeService.reanalyze(renalyzeRequest);
}
private final PdfSegmentationService pdfSegmentationService;
private final RedactionStorageService redactionStorageService;
public AnnotateResponse annotate(@RequestBody AnnotateRequest annotateRequest) {
try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(annotateRequest.getDocument()))) {
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(annotateRequest.getProjectId(), annotateRequest.getFileId(), FileType.ORIGIN));
var redactionLog = redactionStorageService.getRedactionLog(annotateRequest.getProjectId(), annotateRequest.getFileId());
var sectionsGrid = redactionStorageService.getSectionGrid(annotateRequest.getProjectId(), annotateRequest.getFileId());
try (PDDocument pdDocument = PDDocument.load(storedObjectStream, MemoryUsageSetting.setupTempFileOnly())) {
pdDocument.setAllSecurityToBeRemoved(true);
dictionaryService.updateDictionary(annotateRequest.getRedactionLog().getRuleSetId());
annotationService.annotate(pdDocument, annotateRequest.getRedactionLog(), annotateRequest.getSectionGrid());
dictionaryService.updateDictionary(redactionLog.getRuleSetId(), annotateRequest.getProjectId());
annotationService.annotate(pdDocument, redactionLog, sectionsGrid);
try (ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream()) {
pdDocument.save(byteArrayOutputStream);
@ -115,65 +65,80 @@ public class RedactionController implements RedactionResource {
@Override
public RedactionResult classify(@RequestBody RedactionRequest pdfSegmentationRequest) {
public RedactionResult classify(@RequestBody RedactionRequest redactionRequest) {
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN));
try {
Document classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream);
try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(pdfSegmentationRequest.getDocument()))) {
pdDocument.setAllSecurityToBeRemoved(true);
storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN));
try (PDDocument pdDocument = PDDocument.load(storedObjectStream)) {
pdDocument.setAllSecurityToBeRemoved(true);
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
pdfVisualisationService.visualizeClassifications(classifiedDoc, pdDocument);
pdfVisualisationService.visualizeClassifications(classifiedDoc, pdDocument);
return convert(pdDocument, classifiedDoc.getPages().size(), pdfSegmentationRequest.getRuleSetId());
return convert(pdDocument, classifiedDoc.getPages().size());
} catch (IOException e) {
throw new RedactionException(e);
}
} catch (IOException e) {
throw new RedactionException(e);
}
}
@Override
public RedactionResult sections(@RequestBody RedactionRequest redactionRequest) {
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN));
try {
Document classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream);
try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(redactionRequest.getDocument()))) {
pdDocument.setAllSecurityToBeRemoved(true);
storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN));
try (PDDocument pdDocument = PDDocument.load(storedObjectStream)) {
pdDocument.setAllSecurityToBeRemoved(true);
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
pdfVisualisationService.visualizeParagraphs(classifiedDoc, pdDocument);
pdfVisualisationService.visualizeParagraphs(classifiedDoc, pdDocument);
return convert(pdDocument, classifiedDoc.getPages().size());
return convert(pdDocument, classifiedDoc.getPages().size(), redactionRequest.getRuleSetId());
} catch (IOException e) {
throw new RedactionException(e);
}
} catch (IOException e) {
throw new RedactionException(e);
}
}
@Override
public RedactionResult htmlTables(@RequestBody RedactionRequest redactionRequest) {
try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(redactionRequest.getDocument()))) {
pdDocument.setAllSecurityToBeRemoved(true);
Document classifiedDoc;
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
StringBuilder sb = new StringBuilder();
for (Page page : classifiedDoc.getPages()) {
for (AbstractTextContainer textContainer : page.getTextBlocks()) {
if (textContainer instanceof Table) {
Table table = (Table) textContainer;
sb.append(table.getTextAsHtml()).append("<br />").append("<br />");
}
}
}
return RedactionResult.builder().document(sb.toString().getBytes()).build();
} catch (IOException e) {
try {
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN));
classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream, true);
} catch (Exception e) {
throw new RedactionException(e);
}
StringBuilder sb = new StringBuilder();
for (Page page : classifiedDoc.getPages()) {
for (AbstractTextContainer textContainer : page.getTextBlocks()) {
if (textContainer instanceof Table) {
Table table = (Table) textContainer;
sb.append(table.getTextAsHtml()).append("<br />").append("<br />");
}
}
}
return RedactionResult.builder().document(sb.toString().getBytes()).build();
}
@ -191,26 +156,17 @@ public class RedactionController implements RedactionResource {
}
private RedactionResult convert(PDDocument document, int numberOfPages, String ruleSetId) throws IOException {
return convert(document, numberOfPages, null, null, 0, 0, ruleSetId);
}
private RedactionResult convert(PDDocument document, int numberOfPages,
List<RedactionLogEntry> redactionLogEntities, SectionGrid sectionGrid,
long dictionaryVersion, long rulesVersion, String ruleSetId) throws IOException {
private RedactionResult convert(PDDocument document, int numberOfPages) throws IOException {
try (ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream()) {
document.save(byteArrayOutputStream);
return RedactionResult.builder()
.document(byteArrayOutputStream.toByteArray())
.numberOfPages(numberOfPages)
.redactionLog(new RedactionLog(redactionLogEntities, dictionaryVersion, rulesVersion, ruleSetId))
.sectionGrid(sectionGrid)
.build();
}
}
}

View File

@ -0,0 +1,52 @@
package com.iqser.red.service.redaction.v1.server.memory;
import lombok.extern.slf4j.Slf4j;

import java.text.CharacterIterator;
import java.text.StringCharacterIterator;
import java.util.Locale;
@Slf4j
public class MemoryStats {

	private MemoryStats() {
		// Utility class - no instances.
	}

	/**
	 * Logs a snapshot of the JVM heap (used / free / total / max) in
	 * human-readable binary units via SLF4J at INFO level.
	 */
	public static void printMemoryStats() {
		log.info("\n\n ------------------------------ \n" +
				" Used Memory: " + humanReadableByteCountBin(getUsedMemory()) + "\n" +
				" Free Memory: " + humanReadableByteCountBin(getFreeMemory()) + "\n" +
				" Total Memory: " + humanReadableByteCountBin(getTotalMemory()) + "\n" +
				" Max Memory: " + humanReadableByteCountBin(getMaxMemory()) + "\n" +
				"\n ------------------------------ \n");
	}

	/**
	 * Formats a byte count using binary (1024-based) units,
	 * e.g. {@code 1536 -> "1.5 KiB"}. Values below 1024 are returned as
	 * plain bytes ("512 B"). Negative values keep their sign.
	 *
	 * @param bytes byte count, may be negative or {@link Long#MIN_VALUE}
	 * @return human-readable representation such as "1.0 MiB"
	 */
	public static String humanReadableByteCountBin(long bytes) {
		// Long.MIN_VALUE has no positive counterpart; clamp to MAX_VALUE.
		long absB = bytes == Long.MIN_VALUE ? Long.MAX_VALUE : Math.abs(bytes);
		if (absB < 1024) {
			return bytes + " B";
		}
		long value = absB;
		CharacterIterator ci = new StringCharacterIterator("KMGTPE");
		// Shift down by one unit (10 bits) per iteration; the magic constant
		// guards against rounding past the largest representable unit.
		for (int i = 40; i >= 0 && absB > 0xfffccccccccccccL >> i; i -= 10) {
			value >>= 10;
			ci.next();
		}
		value *= Long.signum(bytes);
		// Locale.ROOT keeps the decimal separator a '.' regardless of the
		// default locale, so the log output is stable across machines.
		return String.format(Locale.ROOT, "%.1f %ciB", value / 1024.0, ci.current());
	}

	private static long getMaxMemory() {
		return Runtime.getRuntime().maxMemory();
	}

	private static long getUsedMemory() {
		// Used heap = currently allocated heap minus its free portion.
		// (The previous implementation subtracted free from max, which counts
		// not-yet-allocated heap as "used" and disagrees with the Total line
		// printed next to it.)
		return getTotalMemory() - getFreeMemory();
	}

	private static long getTotalMemory() {
		return Runtime.getRuntime().totalMemory();
	}

	private static long getFreeMemory() {
		return Runtime.getRuntime().freeMemory();
	}
}

View File

@ -1,17 +1,15 @@
package com.iqser.red.service.redaction.v1.server.parsing;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import lombok.Getter;
import lombok.Setter;
import org.apache.pdfbox.text.PDFTextStripperByArea;
import org.apache.pdfbox.text.TextPosition;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.pdfbox.text.PDFTextStripperByArea;
import org.apache.pdfbox.text.TextPosition;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import lombok.Getter;
import lombok.Setter;
public class PDFAreaTextStripper extends PDFTextStripperByArea {
@Getter
@ -76,7 +74,7 @@ public class PDFAreaTextStripper extends PDFTextStripperByArea {
}
public void clearPositions(){
public void clearPositions() {
textPositionSequences = new ArrayList<>();
}

View File

@ -1,33 +1,16 @@
package com.iqser.red.service.redaction.v1.server.parsing;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
import lombok.Getter;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.reflect.FieldUtils;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.contentstream.operator.OperatorName;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColor;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorN;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorSpace;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceCMYKColor;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceGrayColor;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceRGBColor;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingColor;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorN;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorSpace;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceCMYKColor;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceGrayColor;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceRGBColor;
import org.apache.pdfbox.contentstream.operator.state.SetFlatness;
import org.apache.pdfbox.contentstream.operator.state.SetLineCapStyle;
import org.apache.pdfbox.contentstream.operator.state.SetLineDashPattern;
import org.apache.pdfbox.contentstream.operator.state.SetLineJoinStyle;
import org.apache.pdfbox.contentstream.operator.state.SetLineMiterLimit;
import org.apache.pdfbox.contentstream.operator.state.SetLineWidth;
import org.apache.pdfbox.contentstream.operator.state.SetRenderingIntent;
import org.apache.pdfbox.contentstream.operator.color.*;
import org.apache.pdfbox.contentstream.operator.state.*;
import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
@ -40,40 +23,31 @@ import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.util.Matrix;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
import lombok.Getter;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
@Slf4j
public class PDFLinesTextStripper extends PDFTextStripper {
@Setter
protected PDPage pdpage;
@Getter
private int minCharWidth;
@Getter
private int maxCharWidth;
@Getter
private int minCharHeight;
@Getter
private int maxCharHeight;
@Getter
private final List<TextPositionSequence> textPositionSequences = new ArrayList<>();
@Getter
private final List<Ruling> rulings = new ArrayList<>();
private final List<Ruling> graphicsPath = new ArrayList<>();
@Setter
protected PDPage pdpage;
@Getter
private int minCharWidth;
@Getter
private int maxCharWidth;
@Getter
private int minCharHeight;
@Getter
private int maxCharHeight;
@Getter
private List<PdfImage> images = new ArrayList<>();
@ -222,6 +196,9 @@ public class PDFLinesTextStripper extends PDFTextStripper {
Rectangle2D rect = new Rectangle2D.Float((float) imageBounds.getX(), (float) imageBounds.getY(), (float) imageBounds
.getWidth(), (float) imageBounds.getHeight());
// Memory Hack - sofReference kills me
FieldUtils.writeField(pdfImage, "cachedImageSubsampling", -1, true);
if (rect.getHeight() > 2 && rect.getWidth() > 2) {
this.images.add(new PdfImage(pdfImage.getImage(), rect, pageNumber));
}
@ -369,4 +346,4 @@ public class PDFLinesTextStripper extends PDFTextStripper {
}
}
}

View File

@ -0,0 +1,52 @@
package com.iqser.red.service.redaction.v1.server.parsing.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.SneakyThrows;
import org.apache.pdfbox.text.TextPosition;
import org.springframework.beans.BeanUtils;
/**
 * Serializable snapshot of a PDFBox {@link TextPosition}: one positioned glyph
 * (unicode text plus page-relative geometry). Built via
 * {@link #fromTextPosition(TextPosition)} and, judging by the "not used in
 * reanalysis" markers below, persisted as JSON so a document can be reanalyzed
 * without re-parsing the PDF.
 *
 * NOTE(review): field names (including the unconventional capitalized
 * {@code XDirAdj}/{@code YDirAdj}) deliberately mirror TextPosition's bean
 * property names so that {@code BeanUtils.copyProperties} can match them —
 * do not rename without adjusting the copy logic.
 */
@Data
@NoArgsConstructor
public class RedTextPosition {

	// String form of the PDF text matrix (set explicitly in fromTextPosition).
	private String textMatrix;

	// Page rotation in degrees.
	private int rotation;

	private float y;

	private float pageHeight;

	private float pageWidth;

	// The glyph's unicode text.
	private String unicode;

	// Direction-adjusted X/Y coordinates (TextPosition.getXDirAdj/getYDirAdj).
	private float XDirAdj;

	private float YDirAdj;

	private float width;

	private float heightDir;

	// not used in reanalysis
	@JsonIgnore
	private float widthOfSpace;

	// not used in reanalysis
	@JsonIgnore
	private float fontSizeInPt;

	// not used in reanalysis
	@JsonIgnore
	private String fontName;

	/**
	 * Copies all matching bean properties from the PDFBox text position, then
	 * fills in the fields that need explicit conversion (font name, font size,
	 * text matrix as string). @SneakyThrows covers the checked reflection
	 * exceptions thrown by BeanUtils.
	 */
	@SneakyThrows
	public static RedTextPosition fromTextPosition(TextPosition textPosition) {
		var pos = new RedTextPosition();
		BeanUtils.copyProperties(textPosition, pos);
		pos.setFontName(textPosition.getFont().getName());
		pos.setFontSizeInPt(textPosition.getFontSizeInPt());
		pos.setTextMatrix(textPosition.getTextMatrix().toString());
		return pos;
	}
}

View File

@ -1,32 +1,52 @@
package com.iqser.red.service.redaction.v1.server.parsing.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.apache.pdfbox.text.TextPosition;
import java.util.ArrayList;
import java.util.List;
import org.apache.pdfbox.text.TextPosition;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import lombok.Data;
import lombok.RequiredArgsConstructor;
import java.util.stream.Collectors;
@Data
@RequiredArgsConstructor
@NoArgsConstructor
@JsonIgnoreProperties({ "empty" })
public class TextPositionSequence implements CharSequence {
private List<TextPosition> textPositions = new ArrayList<>();
private int page;
private List<RedTextPosition> textPositions = new ArrayList<>();
private final int page;
private float x1;
private float x2;
public TextPositionSequence(int page) {
this.page = page;
}
public static TextPositionSequence fromData(List<RedTextPosition> textPositions, int page) {
var textPositionSequence = new TextPositionSequence();
textPositionSequence.textPositions = textPositions;
textPositionSequence.page = page;
return textPositionSequence;
}
public TextPositionSequence(List<TextPosition> textPositions, int page) {
this.textPositions = textPositions;
this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList());
this.page = page;
}
@Override
public int length() {
@ -37,7 +57,7 @@ public class TextPositionSequence implements CharSequence {
@Override
public char charAt(int index) {
TextPosition textPosition = textPositionAt(index);
RedTextPosition textPosition = textPositionAt(index);
String text = textPosition.getUnicode();
return text.charAt(0);
}
@ -45,7 +65,7 @@ public class TextPositionSequence implements CharSequence {
public char charAt(int index, boolean caseInSensitive) {
TextPosition textPosition = textPositionAt(index);
RedTextPosition textPosition = textPositionAt(index);
String text = textPosition.getUnicode();
return caseInSensitive ? text.toLowerCase().charAt(0) : text.charAt(0);
}
@ -54,7 +74,7 @@ public class TextPositionSequence implements CharSequence {
@Override
public TextPositionSequence subSequence(int start, int end) {
return new TextPositionSequence(textPositions.subList(start, end), page);
return fromData(textPositions.subList(start, end), page);
}
@ -69,18 +89,25 @@ public class TextPositionSequence implements CharSequence {
}
public TextPosition textPositionAt(int index) {
public RedTextPosition textPositionAt(int index) {
return textPositions.get(index);
}
public void add(TextPosition textPosition) {
public void add(RedTextPosition textPosition) {
this.textPositions.add(textPosition);
}
public void add(TextPosition textPosition) {
this.textPositions.add(RedTextPosition.fromTextPosition(textPosition));
}
@JsonIgnore
public float getX1() {
if (textPositions.get(0).getRotation() == 90) {
@ -91,6 +118,7 @@ public class TextPositionSequence implements CharSequence {
}
@JsonIgnore
public float getX2() {
if (textPositions.get(0).getRotation() == 90) {
@ -101,13 +129,14 @@ public class TextPositionSequence implements CharSequence {
}
}
@JsonIgnore
public float getRotationAdjustedY() {
return textPositions.get(0).getY();
}
@JsonIgnore
public float getY1() {
if (textPositions.get(0).getRotation() == 90) {
@ -118,6 +147,7 @@ public class TextPositionSequence implements CharSequence {
}
@JsonIgnore
public float getY2() {
if (textPositions.get(0).getRotation() == 90) {
@ -128,38 +158,40 @@ public class TextPositionSequence implements CharSequence {
}
@JsonIgnore
public float getTextHeight() {
return textPositions.get(0).getHeightDir() + 2;
}
@JsonIgnore
public float getHeight() {
return getY2() - getY1();
}
@JsonIgnore
public float getWidth() {
return getX2() - getX1();
}
@JsonIgnore
public String getFont() {
return textPositions.get(0)
.getFont()
.toString()
return textPositions.get(0).getFontName()
.toLowerCase()
.replaceAll(",bold", "")
.replaceAll(",italic", "");
}
@JsonIgnore
public String getFontStyle() {
String lowercaseFontName = textPositions.get(0).getFont().toString().toLowerCase();
String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase();
if (lowercaseFontName.contains("bold") && lowercaseFontName.contains("italic")) {
return "bold, italic";
@ -173,25 +205,25 @@ public class TextPositionSequence implements CharSequence {
}
@JsonIgnore
public float getFontSize() {
return textPositions.get(0).getFontSizeInPt();
}
@JsonIgnore
public float getSpaceWidth() {
return textPositions.get(0).getWidthOfSpace();
}
@JsonIgnore
public int getRotation() {
return textPositions.get(0).getRotation();
}
@JsonIgnore
public Rectangle getRectangle() {
float height = getTextHeight();
@ -223,4 +255,4 @@ public class TextPositionSequence implements CharSequence {
return new Rectangle(new Point(posXInit, posYInit), posXEnd - posXInit, posYEnd - posYInit + height, page);
}
}
}

View File

@ -0,0 +1,35 @@
package com.iqser.red.service.redaction.v1.server.queue;
import lombok.RequiredArgsConstructor;
import org.springframework.amqp.core.Queue;
import org.springframework.amqp.core.QueueBuilder;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
/**
 * Declares the RabbitMQ queues used by the redaction service: a durable work
 * queue for analyze requests and a dead-letter queue for requests that could
 * not be processed.
 */
@Configuration
@RequiredArgsConstructor
public class MessagingConfiguration {

	/** Name of the durable work queue that analyze requests are published to. */
	public static final String REDACTION_QUEUE = "redactionQueue";

	/** Name of the dead-letter queue (listened to for failure reporting). */
	public static final String REDACTION_DQL = "redactionDQL";

	/**
	 * Main redaction work queue. Rejected/expired messages are dead-lettered
	 * through the default exchange (""), where the routing key is the name of
	 * the target queue.
	 */
	@Bean
	public Queue redactionQueue() {
		return QueueBuilder.durable(REDACTION_QUEUE)
				.withArgument("x-dead-letter-exchange", "")
				// BUG FIX: the routing key previously pointed back at
				// REDACTION_QUEUE, so failed messages were re-queued onto the
				// same work queue (a redelivery loop) instead of reaching the
				// dead-letter queue whose listener reports the failure.
				.withArgument("x-dead-letter-routing-key", REDACTION_DQL)
				.maxPriority(2)
				.build();
	}

	/**
	 * Dead-letter queue; consumed by the DQL listener to mark the analysis as
	 * failed.
	 */
	@Bean
	public Queue redactionDeadLetterQueue() {
		return QueueBuilder.durable(REDACTION_DQL).build();
	}
}

View File

@ -0,0 +1,54 @@
package com.iqser.red.service.redaction.v1.server.queue;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.redaction.v1.model.AnalyzeRequest;
import com.iqser.red.service.redaction.v1.model.AnalyzeResult;
import com.iqser.red.service.redaction.v1.server.client.FileStatusProcessingUpdateClient;
import com.iqser.red.service.redaction.v1.server.redaction.service.ReanalyzeService;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.amqp.rabbit.annotation.RabbitHandler;
import org.springframework.amqp.rabbit.annotation.RabbitListener;
import org.springframework.stereotype.Service;
import static com.iqser.red.service.redaction.v1.server.queue.MessagingConfiguration.REDACTION_DQL;
import static com.iqser.red.service.redaction.v1.server.queue.MessagingConfiguration.REDACTION_QUEUE;
/**
 * Consumes redaction messages from RabbitMQ: deserializes each JSON payload
 * into an {@code AnalyzeRequest}, runs the (re)analysis, and reports the
 * outcome to the file-status service. The dead-letter listener reports
 * failures for messages that could not be processed.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class RedactionMessageReceiver {

	private final ObjectMapper objectMapper;

	private final ReanalyzeService reanalyzeService;

	private final FileStatusProcessingUpdateClient fileStatusProcessingUpdateClient;

	/**
	 * Handles a message from the main redaction queue.
	 *
	 * @param in JSON-serialized AnalyzeRequest
	 * @throws JsonProcessingException if the payload cannot be deserialized
	 */
	@RabbitHandler
	@RabbitListener(queues = REDACTION_QUEUE)
	public void receiveAnalyzeRequest(String in) throws JsonProcessingException {
		var request = objectMapper.readValue(in, AnalyzeRequest.class);
		log.info("Processing analyze request: {}", request);

		// Reanalysis is used when the request allows it; otherwise a full
		// analysis is performed.
		var result = request.isReanalyseOnlyIfPossible()
				? reanalyzeService.reanalyze(request)
				: reanalyzeService.analyze(request);

		log.info("Successfully analyzed {}", request);
		fileStatusProcessingUpdateClient.analysisSuccessful(request.getProjectId(), request.getFileId(), result);
	}

	/**
	 * Handles a dead-lettered message: marks the file's analysis as failed.
	 *
	 * @param in JSON-serialized AnalyzeRequest
	 * @throws JsonProcessingException if the payload cannot be deserialized
	 */
	@RabbitHandler
	@RabbitListener(queues = REDACTION_DQL)
	public void receiveAnalyzeRequestDQL(String in) throws JsonProcessingException {
		var request = objectMapper.readValue(in, AnalyzeRequest.class);
		log.info("Failed to process analyze request: {}", request);
		fileStatusProcessingUpdateClient.analysisFailed(request.getProjectId(), request.getFileId());
	}
}

View File

@ -1,22 +1,25 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import java.util.Iterator;
import java.util.List;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.Value;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
@Value
@Data
@NoArgsConstructor
@AllArgsConstructor
public class CellValue {
private List<TextBlock> textBlocks;
private List<TextBlock> textBlocks = new ArrayList<>();
private int rowSpanStart;
@Override
public String toString() {
@ -47,4 +50,4 @@ public class CellValue {
.replaceAll(" {2}", " ");
}
}
}

View File

@ -1,13 +1,13 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import lombok.Data;
import lombok.Getter;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import lombok.Data;
import lombok.Getter;
@Data
public class Dictionary {
@ -18,18 +18,18 @@ public class Dictionary {
private Map<String, DictionaryModel> localAccessMap = new HashMap<>();
@Getter
private long version;
private DictionaryVersion version;
public Dictionary(List<DictionaryModel> dictionaryModels, long dictionaryVersion){
public Dictionary(List<DictionaryModel> dictionaryModels, DictionaryVersion version) {
this.dictionaryModels = dictionaryModels;
this.dictionaryModels.forEach(dm -> localAccessMap.put(dm.getType(), dm));
this.version = dictionaryVersion;
this.version = version;
}
public int getDictionaryRank(String type){
if(!localAccessMap.containsKey(type)){
public int getDictionaryRank(String type) {
if (!localAccessMap.containsKey(type)) {
return 0;
}
return localAccessMap.get(type).getRank();
@ -60,7 +60,7 @@ public class Dictionary {
public boolean containsValue(String type, String value) {
if (localAccessMap.containsKey(type) && localAccessMap.get(type)
return localAccessMap.containsKey(type) && localAccessMap.get(type)
.getEntries()
.contains(value) || localAccessMap.containsKey(type) && localAccessMap.get(type)
.getLocalEntries()
@ -68,10 +68,7 @@ public class Dictionary {
.getEntries()
.contains(value) || localAccessMap.containsKey(RECOMMENDATION_PREFIX + type) && localAccessMap.get(RECOMMENDATION_PREFIX + type)
.getLocalEntries()
.contains(value)) {
return true;
}
return false;
.contains(value);
}

View File

@ -1,15 +1,15 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import java.util.Set;
import lombok.AllArgsConstructor;
import lombok.Data;
import java.util.Set;
@Data
@AllArgsConstructor
public class DictionaryIncrement {
private Set<DictionaryIncrementValue> values;
private long dictionaryVersion;
private DictionaryVersion dictionaryVersion;
}

View File

@ -1,15 +1,14 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import com.iqser.red.service.configuration.v1.api.model.DictionaryEntry;
import lombok.AllArgsConstructor;
import lombok.Data;
import java.io.Serializable;
import java.util.Set;
import java.util.stream.Collectors;
import com.iqser.red.service.configuration.v1.api.model.DictionaryEntry;
import lombok.AllArgsConstructor;
import lombok.Data;
@Data
@AllArgsConstructor
public class DictionaryModel implements Serializable {
@ -22,9 +21,10 @@ public class DictionaryModel implements Serializable {
private boolean recommendation;
private Set<DictionaryEntry> entries;
private Set<String> localEntries;
private boolean isDossierDictionary;
public Set<String> getValues(boolean local){
return local ? localEntries : entries.stream().filter(e -> !e.isDeleted()).map(e-> e.getValue()).collect(Collectors
public Set<String> getValues(boolean local) {
return local ? localEntries : entries.stream().filter(e -> !e.isDeleted()).map(e -> e.getValue()).collect(Collectors
.toSet());
}

View File

@ -20,5 +20,4 @@ public class DictionaryRepresentation {
private Map<String, DictionaryModel> localAccessMap = new HashMap<>();
}

View File

@ -0,0 +1,16 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class DictionaryVersion {
long rulesetVersion;
long dossierVersion;
}

View File

@ -1,13 +1,12 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import lombok.Data;
import lombok.EqualsAndHashCode;
import java.util.ArrayList;
import java.util.List;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import lombok.Data;
import lombok.EqualsAndHashCode;
@Data
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class Entity {
@ -38,8 +37,10 @@ public class Entity {
private String textBefore;
private String textAfter;
private boolean isDossierDictionaryEntry;
public Entity(String word, String type, boolean redaction, String redactionReason, List<EntityPositionSequence> positionSequences, String headline, int matchedRule, int sectionNumber, String legalBasis, boolean isDictionaryEntry, String textBefore, String textAfter, Integer start, Integer end) {
public Entity(String word, String type, boolean redaction, String redactionReason, List<EntityPositionSequence> positionSequences, String headline, int matchedRule, int sectionNumber, String legalBasis, boolean isDictionaryEntry, String textBefore, String textAfter, Integer start, Integer end, boolean isDossierDictionaryEntry) {
this.word = word;
this.type = type;
@ -55,10 +56,11 @@ public class Entity {
this.textAfter = textAfter;
this.start = start;
this.end = end;
this.isDossierDictionaryEntry = isDossierDictionaryEntry;
}
public Entity(String word, String type, Integer start, Integer end, String headline, int sectionNumber, boolean isDictionaryEntry) {
public Entity(String word, String type, Integer start, Integer end, String headline, int sectionNumber, boolean isDictionaryEntry, boolean isDossierDictionaryEntry) {
this.word = word;
this.type = type;
@ -67,6 +69,7 @@ public class Entity {
this.headline = headline;
this.sectionNumber = sectionNumber;
this.isDictionaryEntry = isDictionaryEntry;
this.isDossierDictionaryEntry = isDossierDictionaryEntry;
}
}

View File

@ -1,24 +1,23 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import java.util.ArrayList;
import java.util.List;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.RequiredArgsConstructor;
import java.util.ArrayList;
import java.util.List;
@Data
@RequiredArgsConstructor
@AllArgsConstructor
@EqualsAndHashCode
public class EntityPositionSequence {
private final String id;
@EqualsAndHashCode.Exclude
private List<TextPositionSequence> sequences = new ArrayList<>();
private int pageNumber;
private final String id;
}

View File

@ -1,7 +1,5 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import java.awt.geom.Rectangle2D;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
@ -14,7 +12,7 @@ import lombok.NoArgsConstructor;
public class Image {
private String type;
private Rectangle2D position;
private RedRectangle2D position;
private boolean redaction;
private String redactionReason;
private String legalBasis;

View File

@ -1,28 +1,31 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import lombok.Data;
import lombok.NonNull;
import lombok.RequiredArgsConstructor;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.NonNull;
import lombok.RequiredArgsConstructor;
@Data
@NoArgsConstructor
@AllArgsConstructor
@RequiredArgsConstructor
public class PdfImage {
@NonNull
@JsonIgnore
private BufferedImage image;
@NonNull
private Rectangle2D position;
private RedRectangle2D position;
private ImageType imageType;
private boolean isAppendedToParagraph;
@NonNull
private int page;
}
public PdfImage(BufferedImage image, Rectangle2D position, int page) {
this.image = image;
this.position = new RedRectangle2D(position.getX(), position.getY(), position.getWidth(), position.getHeight());
this.page = page;
}
}

View File

@ -1,37 +0,0 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import lombok.Data;
import lombok.NoArgsConstructor;
@Data
@NoArgsConstructor
public class ReanalysisSection {
private int sectionNumber;
private String headline;
private List<TextBlock> textBlocks;
private Map<String, CellValue> tabularData = new HashMap<>();
private List<Integer> cellStarts;
private Set<Image> images = new HashSet<>();
public SearchableText getSearchableText() {
SearchableText searchableText = new SearchableText();
textBlocks.forEach(block -> {
if (block instanceof TextBlock) {
searchableText.addAll(block.getSequences());
}
});
return searchableText;
}
}

View File

@ -0,0 +1,35 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
@Data
@NoArgsConstructor
@AllArgsConstructor
public class RedRectangle2D {
private double x;
private double y;
private double width;
private double height;
@JsonIgnore
public boolean isEmpty() {
return width <= 0.0f || height <= 0.0f;
}
public boolean contains(double x, double y, double w, double h) {
if (isEmpty() || w <= 0 || h <= 0) {
return false;
}
double x0 = getX();
double y0 = getY();
return x >= x0 &&
y >= y0 &&
(x + w) <= x0 + getWidth() &&
(y + h) <= y0 + getHeight();
}
}

View File

@ -1,14 +1,14 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.regex.Pattern;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
public class SearchableText {
private final List<TextPositionSequence> sequences = new ArrayList<>();
@ -232,4 +232,4 @@ public class SearchableText {
return sb.append("\n").toString();
}
}
}

View File

@ -1,6 +1,12 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import static com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary.RECOMMENDATION_PREFIX;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
import com.iqser.red.service.redaction.v1.server.redaction.utils.Patterns;
import lombok.Builder;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import java.util.Collection;
import java.util.HashMap;
@ -11,15 +17,7 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
import com.iqser.red.service.redaction.v1.server.redaction.utils.Patterns;
import lombok.Builder;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;
import static com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary.RECOMMENDATION_PREFIX;
@Data
@Slf4j
@ -413,7 +411,7 @@ public class Section {
String text = caseInsensitive ? searchText.toLowerCase() : searchText;
String searchValue = caseInsensitive ? value.toLowerCase() : value;
Set<Entity> found = EntitySearchUtils.find(text, Set.of(searchValue), asType, headline, sectionNumber, true);
Set<Entity> found = EntitySearchUtils.find(text, Set.of(searchValue), asType, headline, sectionNumber, true, false);
found.forEach(entity -> {
if (redacted) {
@ -439,7 +437,7 @@ public class Section {
} else {
String word = value.toString();
Entity entity = new Entity(word, type, value.getRowSpanStart(), value.getRowSpanStart() + word.length(), headline, sectionNumber, false);
Entity entity = new Entity(word, type, value.getRowSpanStart(), value.getRowSpanStart() + word.length(), headline, sectionNumber, false, false);
entity.setRedaction(redact);
entity.setMatchedRule(ruleNumber);
entity.setRedactionReason(reason);

View File

@ -0,0 +1,48 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import com.iqser.red.service.redaction.v1.model.AnalyzeResult;
import com.iqser.red.service.redaction.v1.model.RedactionChangeLog;
import com.iqser.red.service.redaction.v1.model.RedactionLog;
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
import org.springframework.stereotype.Service;
@Service
public class AnalyzeResponseService {
public AnalyzeResult createAnalyzeResponse(String projectId, String fileId, long duration, int pageCount, RedactionLog redactionLog, RedactionChangeLog redactionChangeLog) {
boolean hasHints = redactionLog.getRedactionLogEntry().stream().anyMatch(RedactionLogEntry::isHint);
boolean hasRequests = redactionLog.getRedactionLogEntry()
.stream()
.anyMatch(entry -> entry.isManual() && entry.getStatus()
.equals(com.iqser.red.service.redaction.v1.model.Status.REQUESTED));
boolean hasRedactions = redactionLog.getRedactionLogEntry()
.stream()
.anyMatch(entry -> entry.isRedacted() && !entry.isManual() || entry.isManual() && entry.getStatus()
.equals(com.iqser.red.service.redaction.v1.model.Status.APPROVED));
boolean hasImages = redactionLog.getRedactionLogEntry()
.stream()
.anyMatch(entry -> entry.isHint() && entry.getType().equals("image"));
boolean hasUpdates = redactionChangeLog != null && redactionChangeLog.getRedactionLogEntry() != null && !redactionChangeLog
.getRedactionLogEntry()
.isEmpty() && redactionChangeLog.getRedactionLogEntry().stream().anyMatch(entry -> !entry.getType().equals("false_positive"));
return AnalyzeResult.builder()
.projectId(projectId)
.fileId(fileId)
.duration(duration)
.numberOfPages(pageCount)
.hasHints(hasHints)
.hasRedactions(hasRedactions)
.hasRequests(hasRequests)
.hasImages(hasImages)
.hasUpdates(hasUpdates)
.rulesVersion(redactionLog.getRulesVersion())
.dictionaryVersion(redactionLog.getDictionaryVersion())
.dossierDictionaryVersion(redactionLog.getDossierDictionaryVersion())
.build();
}
}

View File

@ -1,14 +1,7 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import java.awt.Color;
import java.io.IOException;
import java.util.ArrayList;
import java.util.GregorianCalendar;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import com.iqser.red.service.redaction.v1.model.*;
import lombok.RequiredArgsConstructor;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
@ -21,15 +14,14 @@ import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationText;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationTextMarkup;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.model.CellRectangle;
import com.iqser.red.service.redaction.v1.model.Comment;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.iqser.red.service.redaction.v1.model.RedactionLog;
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
import com.iqser.red.service.redaction.v1.model.SectionGrid;
import com.iqser.red.service.redaction.v1.model.SectionRectangle;
import lombok.RequiredArgsConstructor;
import java.awt.Color;
import java.io.IOException;
import java.util.ArrayList;
import java.util.GregorianCalendar;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
@Service
@RequiredArgsConstructor

View File

@ -1,19 +1,6 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import java.awt.Color;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.SerializationUtils;
import org.springframework.stereotype.Service;
import static com.iqser.red.service.configuration.v1.api.resource.DictionaryResource.GLOBAL_DOSSIER;
import com.iqser.red.service.configuration.v1.api.model.Colors;
import com.iqser.red.service.configuration.v1.api.model.DictionaryEntry;
@ -25,10 +12,18 @@ import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncre
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncrementValue;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryModel;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryRepresentation;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryVersion;
import feign.FeignException;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.SerializationUtils;
import org.springframework.stereotype.Service;
import java.awt.Color;
import java.util.*;
import java.util.stream.Collectors;
@Slf4j
@Service
@ -37,53 +32,69 @@ public class DictionaryService {
private final DictionaryClient dictionaryClient;
private Map<String, DictionaryRepresentation> dictionariesByRuleSets = new HashMap<>();
private final Map<String, DictionaryRepresentation> dictionariesByRuleSets = new HashMap<>();
private final Map<String, DictionaryRepresentation> dictionariesByDossier = new HashMap<>();
public long updateDictionary(String ruleSetId) {
public DictionaryVersion updateDictionary(String ruleSetId, String dossierId) {
long version = dictionaryClient.getVersion(ruleSetId);
var foundDictionary = dictionariesByRuleSets.get(ruleSetId);
if (foundDictionary == null || version > foundDictionary.getDictionaryVersion()) {
updateDictionaryEntry(ruleSetId, version);
long rulesetDictionaryVersion = dictionaryClient.getVersion(ruleSetId, GLOBAL_DOSSIER);
var rulesetDictionary = dictionariesByRuleSets.get(ruleSetId);
if (rulesetDictionary == null || rulesetDictionaryVersion > rulesetDictionary.getDictionaryVersion()) {
updateDictionaryEntry(ruleSetId, rulesetDictionaryVersion, GLOBAL_DOSSIER);
}
return version;
long dossierDictionaryVersion = dictionaryClient.getVersion(ruleSetId, dossierId);
var dossierDictionary = dictionariesByDossier.get(dossierId);
if (dossierDictionary == null || dossierDictionaryVersion > dossierDictionary.getDictionaryVersion()) {
updateDictionaryEntry(ruleSetId, dossierDictionaryVersion, dossierId);
}
return DictionaryVersion.builder().rulesetVersion(rulesetDictionaryVersion).dossierVersion(dossierDictionaryVersion).build();
}
public DictionaryIncrement getDictionaryIncrements(String ruleSetId, long fromVersion) {
public DictionaryIncrement getDictionaryIncrements(String ruleSetId, DictionaryVersion fromVersion, String dossierId) {
long version = updateDictionary(ruleSetId);
DictionaryVersion version = updateDictionary(ruleSetId, dossierId);
Set<DictionaryIncrementValue> newValues = new HashSet<>();
List<DictionaryModel> dictionaryModels = dictionariesByRuleSets.get(ruleSetId).getDictionary();
dictionaryModels.forEach(dictionaryModel -> {
dictionaryModel.getEntries().forEach(dictionaryEntry -> {
if (dictionaryEntry.getVersion() > fromVersion) {
if (dictionaryEntry.getVersion() > fromVersion.getRulesetVersion()) {
newValues.add(new DictionaryIncrementValue(dictionaryEntry.getValue(), dictionaryModel.isCaseInsensitive()));
}
});
});
if(dictionariesByDossier.containsKey(dossierId)) {
dictionaryModels = dictionariesByDossier.get(dossierId).getDictionary();
dictionaryModels.forEach(dictionaryModel -> {
dictionaryModel.getEntries().forEach(dictionaryEntry -> {
if (dictionaryEntry.getVersion() > fromVersion.getDossierVersion()) {
newValues.add(new DictionaryIncrementValue(dictionaryEntry.getValue(), dictionaryModel.isCaseInsensitive()));
}
});
});
}
return new DictionaryIncrement(newValues, version);
}
private void updateDictionaryEntry(String ruleSetId, long version) {
private void updateDictionaryEntry(String ruleSetId, long version, String dossierId) {
try {
DictionaryRepresentation dictionaryRepresentation = new DictionaryRepresentation();
TypeResponse typeResponse = dictionaryClient.getAllTypes(ruleSetId);
TypeResponse typeResponse = dictionaryClient.getAllTypes(ruleSetId, dossierId);
if (typeResponse != null && CollectionUtils.isNotEmpty(typeResponse.getTypes())) {
List<DictionaryModel> dictionary = typeResponse.getTypes()
.stream()
.map(t -> new DictionaryModel(t.getType(), t.getRank(), convertColor(t.getHexColor()), t.isCaseInsensitive(), t
.isHint(), t.isRecommendation(), convertEntries(t), new HashSet<>()))
.isHint(), t.isRecommendation(), convertEntries(t, dossierId), new HashSet<>(),dossierId.equals(GLOBAL_DOSSIER) ? false : true))
.sorted(Comparator.comparingInt(DictionaryModel::getRank).reversed())
.collect(Collectors.toList());
@ -99,7 +110,11 @@ public class DictionaryService {
dictionaryRepresentation.setDictionaryVersion(version);
dictionaryRepresentation.setDictionary(dictionary);
dictionariesByRuleSets.put(ruleSetId, dictionaryRepresentation);
if(dossierId.equals(GLOBAL_DOSSIER)) {
dictionariesByRuleSets.put(ruleSetId, dictionaryRepresentation);
} else {
dictionariesByDossier.put(dossierId, dictionaryRepresentation);
}
}
} catch (FeignException e) {
log.warn("Got some unknown feignException", e);
@ -112,19 +127,19 @@ public class DictionaryService {
dictionary.getDictionaryModels().forEach(dm -> {
if (dm.isRecommendation() && !dm.getLocalEntries().isEmpty()) {
dictionaryClient.addEntries(dm.getType(), ruleSetId, new ArrayList<>(dm.getLocalEntries()), false);
long externalVersion = dictionaryClient.getVersion(ruleSetId);
if (externalVersion == dictionary.getVersion() + 1) {
dictionary.setVersion(externalVersion);
dictionaryClient.addEntries(dm.getType(), ruleSetId, new ArrayList<>(dm.getLocalEntries()), false, GLOBAL_DOSSIER);
long externalVersion = dictionaryClient.getVersion(ruleSetId, GLOBAL_DOSSIER);
if (externalVersion == dictionary.getVersion().getRulesetVersion() + 1) {
dictionary.getVersion().setRulesetVersion(externalVersion);
}
}
});
}
private Set<DictionaryEntry> convertEntries(TypeResult t) {
private Set<DictionaryEntry> convertEntries(TypeResult t, String dossierId) {
Set<DictionaryEntry> entries = new HashSet<>(dictionaryClient.getDictionaryForType(t.getType(), t.getRuleSetId())
Set<DictionaryEntry> entries = new HashSet<>(dictionaryClient.getDictionaryForType(t.getType(), t.getRuleSetId(), dossierId)
.getEntries());
if (t.isCaseInsensitive()) {
@ -181,17 +196,26 @@ public class DictionaryService {
}
public Dictionary getDeepCopyDictionary(String ruleSetId) {
public Dictionary getDeepCopyDictionary(String ruleSetId, String dossierId) {
List<DictionaryModel> copy = new ArrayList<>();
var representation = dictionariesByRuleSets.get(ruleSetId);
var dictionary = dictionariesByRuleSets.get(ruleSetId).getDictionary();
dictionary.forEach(dm -> {
var rulesetRepresentation = dictionariesByRuleSets.get(ruleSetId);
rulesetRepresentation.getDictionary().forEach(dm -> {
copy.add(SerializationUtils.clone(dm));
});
return new Dictionary(copy, representation.getDictionaryVersion());
//TODO merge dictionaries if they have same names
long dossierDictionaryVersion = -1;
if(dictionariesByDossier.containsKey(dossierId)) {
var dossierRepresentation = dictionariesByDossier.get(dossierId);
dossierRepresentation.getDictionary().forEach(dm -> {
copy.add(SerializationUtils.clone(dm));
});
dossierDictionaryVersion = dossierRepresentation.getDictionaryVersion();
}
return new Dictionary(copy, DictionaryVersion.builder().rulesetVersion(rulesetRepresentation.getDictionaryVersion()).dossierVersion(dossierDictionaryVersion).build());
}
@ -212,4 +236,4 @@ public class DictionaryService {
return dictionariesByRuleSets.get(ruleSetId).getRequestAddColor();
}
}
}

View File

@ -1,11 +1,10 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import com.iqser.red.service.configuration.v1.api.model.RulesResponse;
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
import com.iqser.red.service.redaction.v1.server.exception.RulesValidationException;
import com.iqser.red.service.redaction.v1.server.redaction.model.Section;
import lombok.RequiredArgsConstructor;
import org.apache.commons.lang3.StringUtils;
import org.kie.api.KieServices;
import org.kie.api.builder.KieBuilder;
@ -15,12 +14,11 @@ import org.kie.api.runtime.KieContainer;
import org.kie.api.runtime.KieSession;
import org.springframework.stereotype.Service;
import com.iqser.red.service.configuration.v1.api.model.RulesResponse;
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
import com.iqser.red.service.redaction.v1.server.exception.RulesValidationException;
import com.iqser.red.service.redaction.v1.server.redaction.model.Section;
import lombok.RequiredArgsConstructor;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
@Service
@RequiredArgsConstructor
@ -28,9 +26,9 @@ public class DroolsExecutionService {
private final RulesClient rulesClient;
private Map<String, KieContainer> kieContainers = new HashMap<>();
private final Map<String, KieContainer> kieContainers = new HashMap<>();
private Map<String, Long> rulesVersionPerRuleSetId = new HashMap<>();
private final Map<String, Long> rulesVersionPerRuleSetId = new HashMap<>();
public KieContainer getKieContainer(String ruleSetId) {
@ -133,4 +131,4 @@ public class DroolsExecutionService {
return rulesVersion.longValue();
}
}
}

View File

@ -1,50 +1,27 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.kie.api.runtime.KieContainer;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry;
import com.iqser.red.service.redaction.v1.model.ManualRedactions;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.iqser.red.service.redaction.v1.model.SectionArea;
import com.iqser.red.service.redaction.v1.model.SectionText;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Footer;
import com.iqser.red.service.redaction.v1.server.classification.model.Header;
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.classification.model.UnclassifiedText;
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
import com.iqser.red.service.redaction.v1.server.classification.model.*;
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryModel;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
import com.iqser.red.service.redaction.v1.server.redaction.model.ImageType;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import com.iqser.red.service.redaction.v1.server.redaction.model.Section;
import com.iqser.red.service.redaction.v1.server.redaction.model.SectionSearchableTextPair;
import com.iqser.red.service.redaction.v1.server.redaction.model.*;
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.kie.api.runtime.KieContainer;
import org.springframework.stereotype.Service;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import java.util.stream.Stream;
@Slf4j
@Service
@ -56,13 +33,13 @@ public class EntityRedactionService {
private final SurroundingWordsService surroundingWordsService;
public void processDocument(Document classifiedDoc, String ruleSetId, ManualRedactions manualRedactions) {
public void processDocument(Document classifiedDoc, String ruleSetId, ManualRedactions manualRedactions, String dossierId) {
dictionaryService.updateDictionary(ruleSetId);
dictionaryService.updateDictionary(ruleSetId, dossierId);
KieContainer container = droolsExecutionService.updateRules(ruleSetId);
long rulesVersion = droolsExecutionService.getRulesVersion(ruleSetId);
Dictionary dictionary = dictionaryService.getDeepCopyDictionary(ruleSetId);
Dictionary dictionary = dictionaryService.getDeepCopyDictionary(ruleSetId, dossierId);
Set<Entity> documentEntities = new HashSet<>(findEntities(classifiedDoc, container, manualRedactions, dictionary, false, null));
@ -94,7 +71,7 @@ public class EntityRedactionService {
.add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry
.getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber(), entity
.getLegalBasis(), entity.isDictionaryEntry(), entity.getTextBefore(), entity.getTextAfter(), entity
.getStart(), entity.getEnd()));
.getStart(), entity.getEnd(), entity.isDossierDictionaryEntry()));
}
}
@ -210,6 +187,7 @@ public class EntityRedactionService {
.get(0)
.getPage());
sectionText.getSectionAreas().add(sectionArea);
sectionText.getTextBlocks().addAll(cell.getTextBlocks());
addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber.intValue());
int cellStart = start;
@ -258,6 +236,8 @@ public class EntityRedactionService {
sectionText.setHeadline(table.getHeadline());
sectionText.setSectionNumber(sectionNumber.intValue());
sectionText.setTable(true);
sectionText.setTabularData(tabularData);
sectionText.setCellStarts(cellStarts);
classifiedDoc.getSectionText().add(sectionText);
}
@ -290,6 +270,7 @@ public class EntityRedactionService {
.getSequences()
.get(0)
.getPage());
sectionText.getTextBlocks().addAll(cell.getTextBlocks());
sectionText.getSectionAreas().add(sectionArea);
}
@ -348,6 +329,10 @@ public class EntityRedactionService {
sectionText.setHeadline(headline);
sectionText.setSectionNumber(sectionNumber.intValue());
sectionText.setTable(false);
sectionText.setImages(images.stream()
.map(image -> convert(image, sectionNumber.intValue(), headline))
.collect(Collectors.toSet()));
sectionText.setTextBlocks(paragraphTextBlocks);
classifiedDoc.getSectionText().add(sectionText);
}
@ -386,9 +371,9 @@ public class EntityRedactionService {
String lowercaseInputString = searchableString.toLowerCase();
for (DictionaryModel model : dictionary.getDictionaryModels()) {
if (model.isCaseInsensitive()) {
found.addAll(EntitySearchUtils.find(lowercaseInputString, model.getValues(local), model.getType(), headline, sectionNumber, local));
found.addAll(EntitySearchUtils.find(lowercaseInputString, model.getValues(local), model.getType(), headline, sectionNumber, local, model.isDossierDictionary()));
} else {
found.addAll(EntitySearchUtils.find(searchableString, model.getValues(local), model.getType(), headline, sectionNumber, local));
found.addAll(EntitySearchUtils.find(searchableString, model.getValues(local), model.getType(), headline, sectionNumber, local, model.isDossierDictionary()));
}
}

View File

@ -1,21 +1,17 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import javax.imageio.ImageIO;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.client.ImageClassificationClient;
import com.iqser.red.service.redaction.v1.server.client.ImageClassificationResponse;
import com.iqser.red.service.redaction.v1.server.client.MockMultipartFile;
import com.iqser.red.service.redaction.v1.server.redaction.model.ImageType;
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
import javax.imageio.ImageIO;
import java.io.ByteArrayOutputStream;
@Slf4j
@Service
@ -26,37 +22,41 @@ public class ImageClassificationService {
private final RedactionServiceSettings settings;
public void classifyImages(Document classifiedDoc) {
public void classifyImages(Page page) {
long start = System.currentTimeMillis();
classifiedDoc.getPages().forEach(page -> {
page.getImages().forEach(image -> {
page.getImages().forEach(image -> {
if (settings.isEnableImageClassification()) {
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
ImageIO.write(image.getImage(), "png", baos);
ImageClassificationResponse response = imageClassificationClient.classify(new MockMultipartFile("file", "Image.png", "image/png", baos
.toByteArray()));
image.setImageType(ImageType.valueOf(response.getCategory()));
if (settings.isEnableImageClassification()) {
} catch (IOException e) {
log.error("Could not classify image", e);
}
} else {
long start = System.currentTimeMillis();
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
ImageIO.write(image.getImage(), "png", baos);
var mockFile = new MockMultipartFile("file", "Image.png", "image/png", baos.toByteArray());
ImageClassificationResponse response = imageClassificationClient.classify(mockFile);
image.setImageType(ImageType.valueOf(response.getCategory()));
} catch (Exception e) {
log.error("Could not classify image", e);
image.setImageType(ImageType.OTHER);
}
if (image.getImageType().equals(ImageType.OTHER)) {
page.getTextBlocks().forEach(textblock -> {
if (image.getPosition()
.contains(textblock.getMinX(), textblock.getMinY(), textblock.getWidth(), textblock.getHeight())) {
image.setImageType(ImageType.OCR);
}
});
}
});
log.info("Image classification took: " + (System.currentTimeMillis() - start));
} else {
image.setImageType(ImageType.OTHER);
}
image.getImage().flush();
image.setImage(null);
if (image.getImageType().equals(ImageType.OTHER)) {
page.getTextBlocks().forEach(textblock -> {
if (image.getPosition()
.contains(textblock.getMinX(), textblock.getMinY(), textblock.getWidth(), textblock.getHeight())) {
image.setImageType(ImageType.OCR);
}
});
}
});
log.info("Image classification took: " + (System.currentTimeMillis() - start));
}
}

View File

@ -1,53 +1,30 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import java.awt.geom.Rectangle2D;
import java.io.ByteArrayInputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.iqser.red.service.file.management.v1.api.model.FileType;
import com.iqser.red.service.redaction.v1.model.*;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.SectionText;
import com.iqser.red.service.redaction.v1.server.classification.model.Text;
import com.iqser.red.service.redaction.v1.server.exception.RedactionException;
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
import com.iqser.red.service.redaction.v1.server.redaction.model.*;
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.kie.api.runtime.KieContainer;
import org.springframework.stereotype.Service;
import org.springframework.web.bind.annotation.RequestBody;
import com.iqser.red.service.redaction.v1.model.Comment;
import com.iqser.red.service.redaction.v1.model.IdRemoval;
import com.iqser.red.service.redaction.v1.model.ManualForceRedact;
import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry;
import com.iqser.red.service.redaction.v1.model.ManualRedactions;
import com.iqser.red.service.redaction.v1.model.ReanalyzeResult;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
import com.iqser.red.service.redaction.v1.model.RenalyzeRequest;
import com.iqser.red.service.redaction.v1.model.SectionArea;
import com.iqser.red.service.redaction.v1.model.SectionText;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.exception.RedactionException;
import com.iqser.red.service.redaction.v1.server.parsing.PDFAreaTextStripper;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncrement;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
import com.iqser.red.service.redaction.v1.server.redaction.model.ReanalysisSection;
import com.iqser.red.service.redaction.v1.server.redaction.model.Section;
import com.iqser.red.service.redaction.v1.server.redaction.model.SectionSearchableTextPair;
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import lombok.RequiredArgsConstructor;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
@Slf4j
@Service
@RequiredArgsConstructor
public class ReanalyzeService {
@ -57,39 +34,98 @@ public class ReanalyzeService {
private final SurroundingWordsService surroundingWordsService;
private final EntityRedactionService entityRedactionService;
private final RedactionLogCreatorService redactionLogCreatorService;
private final RedactionStorageService redactionStorageService;
private final PdfSegmentationService pdfSegmentationService;
private final RedactionChangeLogService redactionChangeLogService;
private final AnalyzeResponseService analyzeResponseService;
public ReanalyzeResult reanalyze(@RequestBody RenalyzeRequest renalyzeRequest) {
public AnalyzeResult analyze(AnalyzeRequest analyzeRequest) {
DictionaryIncrement dictionaryIncrement = dictionaryService.getDictionaryIncrements(renalyzeRequest.getRuleSetId(), renalyzeRequest
.getRedactionLog()
.getDictionaryVersion());
long startTime = System.currentTimeMillis();
Set<String> manualForceAndRemoveIds = getForceAndRemoveIds(renalyzeRequest.getManualRedactions());
var pageCount = 0;
Document classifiedDoc;
try {
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(analyzeRequest
.getProjectId(), analyzeRequest.getFileId(), FileType.ORIGIN));
classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream);
pageCount = classifiedDoc.getPages().size();
} catch (Exception e) {
throw new RedactionException(e);
}
log.info("Document structure analysis successful, starting redaction analysis...");
entityRedactionService.processDocument(classifiedDoc, analyzeRequest.getRuleSetId(), analyzeRequest.getManualRedactions(), analyzeRequest
.getProjectId());
redactionLogCreatorService.createRedactionLog(classifiedDoc, pageCount, analyzeRequest.getManualRedactions(), analyzeRequest
.getRuleSetId());
log.info("Redaction analysis successful...");
var redactionLog = new RedactionLog(classifiedDoc.getRedactionLogEntities(), classifiedDoc.getDictionaryVersion()
.getRulesetVersion(), classifiedDoc.getRulesVersion(), analyzeRequest.getRuleSetId(), classifiedDoc.getDictionaryVersion()
.getDossierVersion());
log.info("Analyzed with rules {} and dictionary {} for ruleSet: {}", classifiedDoc.getRulesVersion(), classifiedDoc
.getDictionaryVersion(), analyzeRequest.getRuleSetId());
// first create changelog - this only happens when we migrate files analyzed via the old process and we don't want to lose changeLog data
var changeLog = redactionChangeLogService.createAndStoreChangeLog(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), redactionLog);
// store redactionLog
redactionStorageService.storeObject(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog);
redactionStorageService.storeObject(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.TEXT, new Text(pageCount, classifiedDoc
.getSectionText()));
redactionStorageService.storeObject(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.SECTION_GRID, classifiedDoc
.getSectionGrid());
long duration = System.currentTimeMillis() - startTime;
return analyzeResponseService.createAnalyzeResponse(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), duration, pageCount, redactionLog, changeLog);
}
@SneakyThrows
public AnalyzeResult reanalyze(@RequestBody AnalyzeRequest analyzeRequest) {
long startTime = System.currentTimeMillis();
var redactionLog = redactionStorageService.getRedactionLog(analyzeRequest.getProjectId(), analyzeRequest.getFileId());
var text = redactionStorageService.getText(analyzeRequest.getProjectId(), analyzeRequest.getFileId());
// not yet ready for reanalysis
if (redactionLog == null || text == null || text.getNumberOfPages() == 0) {
return analyze(analyzeRequest);
}
DictionaryIncrement dictionaryIncrement = dictionaryService.getDictionaryIncrements(analyzeRequest.getRuleSetId(), new DictionaryVersion(redactionLog
.getDictionaryVersion(), redactionLog.getDossierDictionaryVersion()), analyzeRequest.getProjectId());
Set<String> manualForceAndRemoveIds = getForceAndRemoveIds(analyzeRequest.getManualRedactions());
Map<String, List<Comment>> comments = null;
Set<ManualRedactionEntry> manualAdds = null;
if (renalyzeRequest.getManualRedactions() != null) {
if (analyzeRequest.getManualRedactions() != null) {
// TODO comments will be removed from redactionLog, so we ignore this first.
comments = renalyzeRequest.getManualRedactions().getComments();
manualAdds = renalyzeRequest.getManualRedactions().getEntriesToAdd();
comments = analyzeRequest.getManualRedactions().getComments();
manualAdds = analyzeRequest.getManualRedactions().getEntriesToAdd();
}
Set<Integer> sectionsToReanaylse = new HashSet<>();
Set<Integer> sectionsToReanalyse = new HashSet<>();
Map<Integer, Set<Image>> imageEntries = new HashMap<>();
for (RedactionLogEntry entry : renalyzeRequest.getRedactionLog().getRedactionLogEntry()) {
for (RedactionLogEntry entry : redactionLog.getRedactionLogEntry()) {
if (entry.isManual() || manualForceAndRemoveIds.contains(entry.getId())) {
sectionsToReanaylse.add(entry.getSectionNumber());
sectionsToReanalyse.add(entry.getSectionNumber());
}
if (entry.isImage() || entry.getType().equals("image")) {
imageEntries.computeIfAbsent(entry.getSectionNumber(), x -> new HashSet<>()).add(convert(entry));
}
}
for (SectionText sectionText : renalyzeRequest.getText().getSectionTexts()) {
for (SectionText sectionText : text.getSectionTexts()) {
if (EntitySearchUtils.sectionContainsAny(sectionText.getText(), dictionaryIncrement.getValues())) {
sectionsToReanaylse.add(sectionText.getSectionNumber());
sectionsToReanalyse.add(sectionText.getSectionNumber());
}
if (manualAdds != null) {
@ -106,187 +142,123 @@ public class ReanalyzeService {
}
}
if (sectionsToReanaylse.isEmpty() && (manualAdds == null || manualAdds.isEmpty())) {
renalyzeRequest.getRedactionLog().setDictionaryVersion(dictionaryIncrement.getDictionaryVersion());
return ReanalyzeResult.builder().redactionLog(renalyzeRequest.getRedactionLog()).build();
log.info("Should reanalyze {} sections for request: {}", sectionsToReanalyse.size(), analyzeRequest);
if (sectionsToReanalyse.isEmpty() && (manualAdds == null || manualAdds.isEmpty())) {
return finalizeAnalysis(analyzeRequest, startTime, redactionLog, text, dictionaryIncrement);
}
try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(renalyzeRequest.getDocument()))) {
List<SectionText> reanalysisSections = new ArrayList<>();
List<ReanalysisSection> reanalysisSections = new ArrayList<>();
for (SectionText sectionText : renalyzeRequest.getText().getSectionTexts()) {
for (SectionText sectionText : text.getSectionTexts()) {
if (!sectionsToReanaylse.contains(sectionText.getSectionNumber())) {
continue;
}
if (sectionsToReanalyse.contains(sectionText.getSectionNumber())) {
reanalysisSections.add(sectionText);
}
}
ReanalysisSection reanalysisSection = new ReanalysisSection();
reanalysisSection.setHeadline(sectionText.getHeadline());
reanalysisSection.setSectionNumber(sectionText.getSectionNumber());
List<TextBlock> textBlocks = new ArrayList<>();
//--
Map<Integer, List<SectionArea>> sectionAreasPerPage = new HashMap<>();
for (SectionArea sectionArea : sectionText.getSectionAreas()) {
sectionAreasPerPage.computeIfAbsent(sectionArea.getPage(), (x) -> new ArrayList<>())
.add(sectionArea);
}
KieContainer kieContainer = droolsExecutionService.updateRules(analyzeRequest.getRuleSetId());
Map<String, CellValue> tabularData = new HashMap<>();
List<Integer> cellStarts = new ArrayList<>();
for (Integer page : sectionAreasPerPage.keySet()) {
List<SectionArea> areasOnPage = sectionAreasPerPage.get(page);
Dictionary dictionary = dictionaryService.getDeepCopyDictionary(analyzeRequest.getRuleSetId(), analyzeRequest.getProjectId());
PDPage pdPage = pdDocument.getPage(page - 1);
PDRectangle cropBox = pdPage.getCropBox();
PDFAreaTextStripper textStripper = new PDFAreaTextStripper();
textStripper.setPageNumber(page);
List<SectionSearchableTextPair> sectionSearchableTextPairs = new ArrayList<>();
for (SectionText reanalysisSection : reanalysisSections) {
int cellStart = 0;
for (SectionArea sectionArea : areasOnPage) {
Rectangle2D rect = null;
if (pdPage.getRotation() == 90) {
rect = new Rectangle2D.Float(sectionArea.getTopLeft().getY(), sectionArea.getTopLeft()
.getX(), sectionArea.getHeight(), sectionArea.getWidth() + 0.001f);
} else {
rect = new Rectangle2D.Float(sectionArea.getTopLeft().getX(), -sectionArea.getTopLeft()
.getY() + cropBox.getUpperRightY() - sectionArea.getHeight(), sectionArea.getWidth(), sectionArea
.getHeight() + 0.001f);
}
textStripper.addRegion(String.valueOf(1), rect);
textStripper.extractRegions(pdPage);
textStripper.getTextForRegion(String.valueOf(1));
List<TextPositionSequence> positions = textStripper.getTextPositionSequences();
TextBlock textBlock = new TextBlock(sectionArea.getTopLeft().getX(), sectionArea.getTopLeft()
.getX() + sectionArea.getWidth(), sectionArea.getTopLeft()
.getY(), sectionArea.getTopLeft().getY() + sectionArea.getHeight(), positions, 0);
if (sectionText.isTable()) {
Cell cell = new Cell();
cell.addTextBlock(textBlock);
tabularData.put(sectionArea.getHeader(), new CellValue(cell.getTextBlocks(), cellStart));
cellStarts.add(cellStart);
cellStart = cellStart + cell.toString().trim().length() + 1;
}
textBlocks.add(textBlock);
textStripper.clearPositions();
}
}
reanalysisSection.setTextBlocks(textBlocks);
reanalysisSection.setTabularData(tabularData);
if (sectionText.isTable()) {
reanalysisSection.setCellStarts(cellStarts);
}
if (imageEntries.containsKey(sectionText.getSectionNumber())) {
reanalysisSection.getImages().addAll(imageEntries.get(sectionText.getSectionNumber()));
}
reanalysisSections.add(reanalysisSection);
Set<Entity> entities = entityRedactionService.findEntities(reanalysisSection.getSearchableText(), reanalysisSection
.getHeadline(), reanalysisSection.getSectionNumber(), dictionary, false);
if (reanalysisSection.getCellStarts() != null) {
surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary, reanalysisSection
.getCellStarts());
} else {
surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary);
}
//--
sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder()
.isLocal(false)
.dictionaryTypes(dictionary.getTypes())
.entities(entities)
.text(reanalysisSection.getSearchableText().getAsStringWithLinebreaks())
.searchText(reanalysisSection.getSearchableText().toString())
.headline(reanalysisSection.getHeadline())
.sectionNumber(reanalysisSection.getSectionNumber())
.tabularData(reanalysisSection.getTabularData())
.searchableText(reanalysisSection.getSearchableText())
.dictionary(dictionary)
.images(reanalysisSection.getImages())
.build(), reanalysisSection.getSearchableText()));
}
KieContainer kieContainer = droolsExecutionService.updateRules(renalyzeRequest.getRuleSetId());
Set<Entity> entities = new HashSet<>();
Map<Integer, Set<Image>> imagesPerPage = new HashMap<>();
sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> {
Section analysedRowSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair.getSection());
entities.addAll(analysedRowSection.getEntities());
EntitySearchUtils.removeEntitiesContainedInLarger(entities);
Dictionary dictionary = dictionaryService.getDeepCopyDictionary(renalyzeRequest.getRuleSetId());
List<SectionSearchableTextPair> sectionSearchableTextPairs = new ArrayList<>();
for (ReanalysisSection reanalysisSection : reanalysisSections) {
Set<Entity> entities = entityRedactionService.findEntities(reanalysisSection.getSearchableText(), reanalysisSection
.getHeadline(), reanalysisSection.getSectionNumber(), dictionary, false);
if (reanalysisSection.getCellStarts() != null) {
surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary, reanalysisSection
.getCellStarts());
} else {
surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary);
}
sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder()
.isLocal(false)
.dictionaryTypes(dictionary.getTypes())
.entities(entities)
.text(reanalysisSection.getSearchableText().getAsStringWithLinebreaks())
.searchText(reanalysisSection.getSearchableText().toString())
.headline(reanalysisSection.getHeadline())
.sectionNumber(reanalysisSection.getSectionNumber())
.tabularData(reanalysisSection.getTabularData())
.searchableText(reanalysisSection.getSearchableText())
.dictionary(dictionary)
.images(reanalysisSection.getImages())
.build(), reanalysisSection.getSearchableText()));
for (Image image : analysedRowSection.getImages()) {
imagesPerPage.computeIfAbsent(image.getPage(), (a) -> new HashSet<>()).add(image);
}
Set<Entity> entities = new HashSet<>();
Map<Integer, Set<Image>> imagesPerPage = new HashMap<>();
sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> {
Section analysedRowSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair
.getSection());
entities.addAll(analysedRowSection.getEntities());
EntitySearchUtils.removeEntitiesContainedInLarger(entities);
});
for (Image image : analysedRowSection.getImages()) {
imagesPerPage.computeIfAbsent(image.getPage(), (a) -> new HashSet<>()).add(image);
}
});
Map<Integer, List<Entity>> entitiesPerPage = new HashMap<>();
for (Entity entity : entities) {
Map<Integer, List<EntityPositionSequence>> sequenceOnPage = new HashMap<>();
for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) {
sequenceOnPage.computeIfAbsent(entityPositionSequence.getPageNumber(), (x) -> new ArrayList<>())
.add(entityPositionSequence);
}
for (Map.Entry<Integer, List<EntityPositionSequence>> entry : sequenceOnPage.entrySet()) {
entitiesPerPage.computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>())
.add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry
.getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber(), entity
.getLegalBasis(), entity.isDictionaryEntry(), entity.getTextBefore(), entity.getTextAfter(), entity
.getStart(), entity.getEnd()));
}
Map<Integer, List<Entity>> entitiesPerPage = new HashMap<>();
for (Entity entity : entities) {
Map<Integer, List<EntityPositionSequence>> sequenceOnPage = new HashMap<>();
for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) {
sequenceOnPage.computeIfAbsent(entityPositionSequence.getPageNumber(), (x) -> new ArrayList<>())
.add(entityPositionSequence);
}
List<RedactionLogEntry> newRedactionLogEntries = new ArrayList<>();
for (int page = 1; page <= pdDocument.getNumberOfPages(); page++) {
if (entitiesPerPage.get(page) != null) {
newRedactionLogEntries.addAll(redactionLogCreatorService.addEntries(entitiesPerPage, renalyzeRequest
.getManualRedactions(), page, renalyzeRequest.getRuleSetId()));
}
for (Map.Entry<Integer, List<EntityPositionSequence>> entry : sequenceOnPage.entrySet()) {
entitiesPerPage.computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>())
.add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry
.getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber(), entity
.getLegalBasis(), entity.isDictionaryEntry(), entity.getTextBefore(), entity.getTextAfter(), entity
.getStart(), entity.getEnd(), entity.isDossierDictionaryEntry()));
}
}
if (imagesPerPage.get(page) != null) {
newRedactionLogEntries.addAll(redactionLogCreatorService.addImageEntries(imagesPerPage, renalyzeRequest
.getManualRedactions(), page, renalyzeRequest.getRuleSetId()));
}
newRedactionLogEntries.addAll(redactionLogCreatorService.addManualAddEntries(manualAdds, comments, page, renalyzeRequest
List<RedactionLogEntry> newRedactionLogEntries = new ArrayList<>();
for (int page = 1; page <= text.getNumberOfPages(); page++) {
if (entitiesPerPage.get(page) != null) {
newRedactionLogEntries.addAll(redactionLogCreatorService.addEntries(entitiesPerPage, analyzeRequest.getManualRedactions(), page, analyzeRequest
.getRuleSetId()));
}
Iterator<RedactionLogEntry> itty = renalyzeRequest.getRedactionLog().getRedactionLogEntry().iterator();
while (itty.hasNext()) {
RedactionLogEntry entry = itty.next();
if (sectionsToReanaylse.contains(entry.getSectionNumber())) {
itty.remove();
}
if (imagesPerPage.get(page) != null) {
newRedactionLogEntries.addAll(redactionLogCreatorService.addImageEntries(imagesPerPage, analyzeRequest.getManualRedactions(), page, analyzeRequest
.getRuleSetId()));
}
renalyzeRequest.getRedactionLog().getRedactionLogEntry().addAll(newRedactionLogEntries);
renalyzeRequest.getRedactionLog().setDictionaryVersion(dictionaryIncrement.getDictionaryVersion());
return ReanalyzeResult.builder().redactionLog(renalyzeRequest.getRedactionLog()).build();
} catch (Exception e) {
throw new RedactionException(e);
newRedactionLogEntries.addAll(redactionLogCreatorService.addManualAddEntries(manualAdds, comments, page, analyzeRequest
.getRuleSetId()));
}
redactionLog.getRedactionLogEntry()
.removeIf(entry -> sectionsToReanalyse.contains(entry.getSectionNumber()) && !entry.isImage() || entry.getSectionNumber() == 0 && !entry
.isImage());
redactionLog.getRedactionLogEntry().addAll(newRedactionLogEntries);
return finalizeAnalysis(analyzeRequest, startTime, redactionLog, text, dictionaryIncrement);
}
private AnalyzeResult finalizeAnalysis(@RequestBody AnalyzeRequest analyzeRequest, long startTime,
RedactionLog redactionLog, Text text,
DictionaryIncrement dictionaryIncrement) {
redactionLog.setDictionaryVersion(dictionaryIncrement.getDictionaryVersion().getRulesetVersion());
redactionLog.setDossierDictionaryVersion(dictionaryIncrement.getDictionaryVersion().getDossierVersion());
var changeLog = redactionChangeLogService.createAndStoreChangeLog(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), redactionLog);
redactionStorageService.storeObject(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog);
long duration = System.currentTimeMillis() - startTime;
return analyzeResponseService.createAnalyzeResponse(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), duration, text
.getNumberOfPages(), redactionLog, changeLog);
}
@ -309,7 +281,7 @@ public class ReanalyzeService {
return Image.builder()
.type(entry.getType())
.position(new Rectangle2D.Float(position.getTopLeft().getX(), position.getTopLeft()
.position(new RedRectangle2D(position.getTopLeft().getX(), position.getTopLeft()
.getY(), position.getWidth(), position.getHeight()))
.sectionNumber(entry.getSectionNumber())
.section(entry.getSection())

View File

@ -0,0 +1,93 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import com.iqser.red.service.file.management.v1.api.model.FileType;
import com.iqser.red.service.redaction.v1.model.ChangeType;
import com.iqser.red.service.redaction.v1.model.RedactionChangeLog;
import com.iqser.red.service.redaction.v1.model.RedactionChangeLogEntry;
import com.iqser.red.service.redaction.v1.model.RedactionLog;
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
@Slf4j
@Service
@RequiredArgsConstructor
public class RedactionChangeLogService {

    private final RedactionStorageService redactionStorageService;

    /**
     * Builds the change log between the previously stored redaction log and the
     * current one, persists it, and returns it.
     *
     * <p>Best-effort by design: if the previous log cannot be read or the change
     * log cannot be stored, the analysis must not fail, so this returns
     * {@code null} instead of propagating the exception.
     *
     * @param projectId           project the file belongs to
     * @param fileId              file whose redaction log changed
     * @param currentRedactionLog the freshly computed redaction log
     * @return the stored change log, or {@code null} when there is no previous
     *         log to diff against (e.g. first analysis) or on failure
     */
    public RedactionChangeLog createAndStoreChangeLog(String projectId, String fileId, RedactionLog currentRedactionLog) {
        try {
            // Must be fetched BEFORE the caller persists the new redaction log,
            // so it still reflects the previous analysis run.
            RedactionLog previousRedactionLog = redactionStorageService.getRedactionLog(projectId, fileId);
            var changeLog = createChangeLog(currentRedactionLog, previousRedactionLog);
            // First analysis of a file yields no previous log and therefore no
            // change log; do not attempt to persist a null payload.
            if (changeLog != null) {
                redactionStorageService.storeObject(projectId, fileId, FileType.REDACTION_CHANGELOG, changeLog);
            }
            return changeLog;
        } catch (Exception e) {
            // Include the cause: without it, storage failures were silently
            // mislabeled as "previous log not available" and undiagnosable.
            log.debug("Could not create change log for project {} file {} (previous redaction log not available?)",
                    projectId, fileId, e);
            return null;
        }
    }

    /**
     * Diffs two redaction logs into a flat change log: entries present only in
     * the current log are ADDED, entries present only in the previous log are
     * REMOVED. Relies on {@code RedactionLogEntry#equals} for membership.
     *
     * @return the change log, or {@code null} when there is no previous log
     */
    private RedactionChangeLog createChangeLog(RedactionLog currentRedactionLog, RedactionLog previousRedactionLog) {
        if (previousRedactionLog == null) {
            return null;
        }
        List<RedactionLogEntry> added = new ArrayList<>(currentRedactionLog.getRedactionLogEntry());
        added.removeAll(previousRedactionLog.getRedactionLogEntry());
        List<RedactionLogEntry> removed = new ArrayList<>(previousRedactionLog.getRedactionLogEntry());
        removed.removeAll(currentRedactionLog.getRedactionLogEntry());
        List<RedactionChangeLogEntry> changeLogEntries = added.stream()
                .map(entry -> convert(entry, ChangeType.ADDED))
                .collect(Collectors.toList());
        changeLogEntries.addAll(removed.stream()
                .map(entry -> convert(entry, ChangeType.REMOVED))
                .collect(Collectors.toList()));
        return new RedactionChangeLog(changeLogEntries, currentRedactionLog.getDictionaryVersion(), currentRedactionLog.getRulesVersion(), currentRedactionLog
                .getRuleSetId());
    }

    /** Copies a redaction log entry into a change-log entry tagged with the given change type. */
    private RedactionChangeLogEntry convert(RedactionLogEntry entry, ChangeType changeType) {
        return RedactionChangeLogEntry.builder()
                .id(entry.getId())
                .type(entry.getType())
                .value(entry.getValue())
                .reason(entry.getReason())
                .matchedRule(entry.getMatchedRule())
                .legalBasis(entry.getLegalBasis())
                .redacted(entry.isRedacted())
                .isHint(entry.isHint())
                .isRecommendation(entry.isRecommendation())
                .section(entry.getSection())
                .color(entry.getColor())
                .positions(entry.getPositions())
                .sectionNumber(entry.getSectionNumber())
                .manual(entry.isManual())
                .status(entry.getStatus())
                .manualRedactionType(entry.getManualRedactionType())
                .isDictionaryEntry(entry.isDictionaryEntry())
                .textBefore(entry.getTextBefore())
                .textAfter(entry.getTextAfter())
                .comments(entry.getComments())
                .changeType(changeType)
                .isDossierDictionaryEntry(entry.isDossierDictionaryEntry())
                .build();
    }
}

View File

@ -1,31 +1,10 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.pdfbox.text.TextPosition;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.model.CellRectangle;
import com.iqser.red.service.redaction.v1.model.Comment;
import com.iqser.red.service.redaction.v1.model.IdRemoval;
import com.iqser.red.service.redaction.v1.model.ManualForceRedact;
import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry;
import com.iqser.red.service.redaction.v1.model.ManualRedactionType;
import com.iqser.red.service.redaction.v1.model.ManualRedactions;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
import com.iqser.red.service.redaction.v1.model.SectionRectangle;
import com.iqser.red.service.redaction.v1.model.Status;
import com.iqser.red.service.redaction.v1.model.*;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.parsing.model.RedTextPosition;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
@ -34,8 +13,16 @@ import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import lombok.RequiredArgsConstructor;
import org.apache.commons.collections4.CollectionUtils;
import org.springframework.stereotype.Service;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
@Service
@RequiredArgsConstructor
@ -285,24 +272,24 @@ public class RedactionLogCreatorService {
}
private List<Rectangle> getRectanglesPerLine(List<TextPosition> textPositions, int page) {
private List<Rectangle> getRectanglesPerLine(List<RedTextPosition> textPositions, int page) {
List<Rectangle> rectangles = new ArrayList<>();
if (textPositions.size() == 1) {
rectangles.add(new TextPositionSequence(textPositions, page).getRectangle());
rectangles.add( TextPositionSequence.fromData(textPositions, page).getRectangle());
} else {
float y = textPositions.get(0).getYDirAdj();
int startIndex = 0;
for (int i = 1; i < textPositions.size(); i++) {
float yDirAdj = textPositions.get(i).getYDirAdj();
if (yDirAdj != y) {
rectangles.add(new TextPositionSequence(textPositions.subList(startIndex, i), page).getRectangle());
rectangles.add( TextPositionSequence.fromData(textPositions.subList(startIndex, i), page).getRectangle());
y = yDirAdj;
startIndex = i;
}
}
if (startIndex != textPositions.size()) {
rectangles.add(new TextPositionSequence(textPositions.subList(startIndex, textPositions.size()), page).getRectangle());
rectangles.add( TextPositionSequence.fromData(textPositions.subList(startIndex, textPositions.size()), page).getRectangle());
}
}
@ -368,6 +355,7 @@ public class RedactionLogCreatorService {
.status(manualRedactionEntry.getStatus())
.manualRedactionType(ManualRedactionType.ADD)
.isDictionaryEntry(false)
.isDossierDictionaryEntry(manualRedactionEntry.isAddToDossierDictionary())
.build();
}
@ -391,6 +379,7 @@ public class RedactionLogCreatorService {
.textBefore(entity.getTextBefore())
.startOffset(entity.getStart())
.endOffset(entity.getEnd())
.isDossierDictionaryEntry(entity.isDossierDictionaryEntry())
.build();
}

View File

@ -1,25 +1,17 @@
package com.iqser.red.service.redaction.v1.server.redaction.utils;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncrementValue;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j;
import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
@Slf4j
@UtilityClass
@SuppressWarnings("PMD")
@ -46,7 +38,7 @@ public class EntitySearchUtils {
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString
.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
if(value.isCaseinsensitive() || !value.isCaseinsensitive() && sectionText.substring(startIndex, stopIndex).equals(value.getValue())){
if (value.isCaseinsensitive() || !value.isCaseinsensitive() && sectionText.substring(startIndex, stopIndex).equals(value.getValue())) {
return true;
}
}
@ -57,7 +49,7 @@ public class EntitySearchUtils {
public Set<Entity> find(String inputString, Set<String> values, String type, String headline, int sectionNumber,
boolean local) {
boolean local, boolean isDossierDictionary) {
Set<Entity> found = new HashSet<>();
@ -77,7 +69,7 @@ public class EntitySearchUtils {
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString
.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
found.add(new Entity(inputString.substring(startIndex, stopIndex), type, startIndex, stopIndex, headline, sectionNumber, !local));
found.add(new Entity(inputString.substring(startIndex, stopIndex), type, startIndex, stopIndex, headline, sectionNumber, !local, isDossierDictionary));
}
} while (startIndex > -1);
}
@ -147,16 +139,16 @@ public class EntitySearchUtils {
public void addEntitiesWithHigherRank(Set<Entity> entities, Entity found, Dictionary dictionary) {
if(entities.contains(found)){
if (entities.contains(found)) {
Entity existing = entities.stream().filter(entity -> entity.equals(found)).findFirst().get();
if (dictionary.getDictionaryRank(existing.getType()) <= dictionary.getDictionaryRank(found.getType())){
if (dictionary.getDictionaryRank(existing.getType()) <= dictionary.getDictionaryRank(found.getType())) {
entities.remove(found);
}
}
entities.add(found);
}
public void addEntitiesIgnoreRank(Set<Entity> entities, Set<Entity> found){
public void addEntitiesIgnoreRank(Set<Entity> entities, Set<Entity> found) {
// HashSet keeps old value but we want the new.
entities.removeAll(found);
entities.addAll(found);

View File

@ -1,15 +1,14 @@
package com.iqser.red.service.redaction.v1.server.redaction.utils;
import java.awt.geom.Rectangle2D;
import java.nio.charset.StandardCharsets;
import java.util.List;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.RedRectangle2D;
import lombok.experimental.UtilityClass;
import java.nio.charset.StandardCharsets;
import java.util.List;
@UtilityClass
public class IdBuilder {
@ -26,14 +25,9 @@ public class IdBuilder {
}
public String buildId(Rectangle2D rectangle2D, int page){
StringBuilder sb = new StringBuilder();
sb.append("x").append(rectangle2D.getX()).append("y").append(rectangle2D.getY()).append("h").append(rectangle2D.getHeight()).append("w").append(rectangle2D.getWidth()).append("p").append(page);
return hashFunction.hashString(sb.toString(), StandardCharsets.UTF_8).toString();
public String buildId(RedRectangle2D rectangle2D, int page) {
return hashFunction.hashString("x" + rectangle2D.getX() + "y" + rectangle2D.getY() + "h" + rectangle2D.getHeight() + "w" + rectangle2D.getWidth() + "p" + page, StandardCharsets.UTF_8).toString();
}
}

View File

@ -1,5 +1,7 @@
package com.iqser.red.service.redaction.v1.server.redaction.utils;
import lombok.experimental.UtilityClass;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
@ -8,8 +10,6 @@ import java.nio.charset.StandardCharsets;
import java.util.Set;
import java.util.stream.Collectors;
import lombok.experimental.UtilityClass;
@UtilityClass
public class ResourceLoader {
@ -27,4 +27,4 @@ public class ResourceLoader {
}
}
}

View File

@ -7,6 +7,7 @@ public class TextNormalizationUtilities {
/**
* Revert hyphenation due to line breaks.
*
* @param text Text to be processed.
* @return Text without line-break hyphenation.
*/
@ -14,4 +15,4 @@ public class TextNormalizationUtilities {
return text.replaceAll("([^\\s\\d\\-]{2,})[\\-\\u00AD]\\R|\n\r(.+ )", "$1$2");
}
}
}

View File

@ -1,88 +1,275 @@
package com.iqser.red.service.redaction.v1.server.segmentation;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService;
import com.iqser.red.service.redaction.v1.server.classification.service.ClassificationService;
import com.iqser.red.service.redaction.v1.server.memory.MemoryStats;
import com.iqser.red.service.redaction.v1.server.parsing.PDFLinesTextStripper;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import com.iqser.red.service.redaction.v1.server.redaction.service.ImageClassificationService;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService;
import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.IOUtils;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.springframework.stereotype.Service;
import java.awt.Graphics;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
@Slf4j
@Service
@RequiredArgsConstructor
public class PdfSegmentationService {
private final static int MAX_PAGES_BEFORE_GC = 250;
private final RulingCleaningService rulingCleaningService;
private final TableExtractionService tableExtractionService;
private final BlockificationService blockificationService;
private final ClassificationService classificationService;
private final SectionsBuilderService sectionsBuilderService;
private final ImageClassificationService imageClassificationService;
public Document parseDocument(PDDocument pdDocument) throws IOException {
public Document parseDocument(InputStream documentInputStream) throws IOException {
return parseDocument(documentInputStream, false);
}
Document document = new Document();
public Document parseDocument(InputStream documentInputStream, boolean ignoreImages) throws IOException {
PDDocument pdDocument = null;
try {
//create tempFile
File tempFile = File.createTempFile("document", ".pdf");
IOUtils.copy(documentInputStream, new FileOutputStream(tempFile));
List<Page> pages = new ArrayList<>();
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
for (int pageNumber = 1; pageNumber <= pdDocument.getNumberOfPages(); pageNumber++) {
PDPage pdPage = pdDocument.getPage(pageNumber - 1);
stripper.setPageNumber(pageNumber);
stripper.setStartPage(pageNumber);
stripper.setEndPage(pageNumber);
stripper.setPdpage(pdPage);
stripper.getText(pdDocument);
// initialize required variables
Document document = new Document();
List<Page> pages = new ArrayList<>();
PDRectangle pdr = pdPage.getMediaBox();
boolean isLandscape = pdr.getWidth() > pdr.getHeight();
int rotation = pdPage.getRotation();
boolean isRotated = rotation != 0 && rotation != 360;
pdDocument = reinitializePDDocument(tempFile, null);
long pageCount = pdDocument.getNumberOfPages();
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(stripper.getRulings(), stripper.getMinCharWidth(), stripper
.getMaxCharHeight());
for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
Page page = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings
.getVertical());
page.setRotation(rotation);
if (pageNumber % MAX_PAGES_BEFORE_GC == 0) {
pdDocument = reinitializePDDocument(tempFile, pdDocument);
}
tableExtractionService.extractTables(cleanRulings, page);
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
PDPage pdPage = pdDocument.getPage(pageNumber - 1);
stripper.setPageNumber(pageNumber);
stripper.setStartPage(pageNumber);
stripper.setEndPage(pageNumber);
stripper.setPdpage(pdPage);
stripper.getText(pdDocument);
buildPageStatistics(page);
PDRectangle pdr = pdPage.getMediaBox();
boolean isLandscape = pdr.getWidth() > pdr.getHeight();
page.setLandscape(isLandscape || isRotated);
int rotation = pdPage.getRotation();
boolean isRotated = rotation != 0 && rotation != 360;
page.setPageNumber(pageNumber);
increaseDocumentStatistics(page, document);
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(stripper.getRulings(), stripper.getMinCharWidth(), stripper
.getMaxCharHeight());
page.setImages(stripper.getImages());
pages.add(page);
Page page = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings
.getVertical());
page.setRotation(rotation);
page.setLandscape(isLandscape || isRotated);
page.setPageNumber(pageNumber);
List<PdfImage> mergedList = processImages(stripper.getImages());
page.setImages(mergedList);
tableExtractionService.extractTables(cleanRulings, page);
buildPageStatistics(page);
increaseDocumentStatistics(page, document);
if (!ignoreImages) {
imageClassificationService.classifyImages(page);
}
pages.add(page);
}
document.setPages(pages);
classificationService.classifyDocument(document);
sectionsBuilderService.buildSections(document);
sectionsBuilderService.addImagesToSections(document);
pdDocument = reinitializePDDocument(tempFile, pdDocument);
IOUtils.close(pdDocument);
tempFile.delete();
return document;
} finally {
if (pdDocument != null) {
pdDocument.close();
}
}
}
/**
 * Closes the current {@link PDDocument} (if any) and reloads a fresh instance from the
 * temp file, so that per-page caches accumulated during parsing can be released.
 *
 * @param tempFile   the PDF previously copied to disk; the source of the reload.
 * @param pdDocument the document to close first; may be {@code null} on the initial load.
 * @return a newly loaded document with all security removed.
 * @throws IOException if the document cannot be reloaded from the temp file.
 */
private PDDocument reinitializePDDocument(File tempFile, PDDocument pdDocument) throws IOException {
    if (pdDocument != null) {
        pdDocument.close();
    }
    // Explicit GC between reloads; paired with MAX_PAGES_BEFORE_GC to keep memory bounded
    // on very large documents. NOTE(review): System.runFinalization() is deprecated in
    // recent JDKs — confirm target JDK before keeping it.
    System.runFinalization();
    System.gc();
    MemoryStats.printMemoryStats();
    // Temp-file-backed memory usage keeps the page tree off the heap.
    var newPDDocument = PDDocument.load(tempFile, MemoryUsageSetting.setupTempFileOnly());
    newPDDocument.setAllSecurityToBeRemoved(true);
    return newPDDocument;
}
//merge images, if they are separated during pdf import, return new list of Pdfimages
private List<PdfImage> processImages(List<PdfImage> imageList) {
if (imageList.size() > 1) {
List<PdfImage> mergedList = new ArrayList<>();
int countElementsInList = 0;
boolean beginImage = true;
// a List of Boolean, true = candidate for merging, false = no merging
List<Boolean> candidatesList = getCandidatesList(imageList);
// loop through list, if there are candidates for merging (true), merge images and add it to mergedList
for (int i = 0; i < candidatesList.size(); i++) {
if (candidatesList.get(i)) {
if (beginImage) {
//begin of image, merge two parts of imageList
PdfImage mergedImage = mergeTwoImages(imageList.get(i), imageList.get(i + 1));
// image merge successful
if (mergedImage != null) {
mergedList.add(mergedImage);
countElementsInList++;
}
} else {
//middle of an image, merge current piece auf mergedList with image of imageList
PdfImage mergedImage = mergeTwoImages(mergedList.get(countElementsInList - 1), imageList.get(i + 1));
// image merge successful
if (mergedImage != null) {
mergedList.set(countElementsInList - 1, mergedImage);
}
}
beginImage = false;
} else {
// if the last candidate is false, then both images i and i+1 must be added
if (i == candidatesList.size() - 1) {
if (countElementsInList > 0 && mergedList.get(countElementsInList - 1) == imageList.get(i)) {
mergedList.add(imageList.get(i + 1));
} else {
mergedList.add(imageList.get(i));
mergedList.add(imageList.get(i + 1));
}
} else {
//first image is not splitted, add i to resultlist
if (beginImage) {
mergedList.add(imageList.get(i));
countElementsInList++;
} else {
// i is the end of an image, add begin of new image
mergedList.add(imageList.get(i + 1));
countElementsInList++;
beginImage = false;
}
}
}
}
return mergedList;
} else {
return imageList;
}
}
private PdfImage mergeTwoImages(PdfImage image1, PdfImage image2) {
// diese Angaben von getPosition scheinen nicht richtig zu sein, damit werden teile des Bildes abgeschnitten
double width = image1.getPosition().getWidth();
double height1 = image1.getPosition().getHeight();
double height2 = image2.getPosition().getHeight();
// mit den Werten, die unter Image gespeichert sind, funktioniert es
double img1height = image1.getImage().getHeight();
double img1width = image1.getImage().getWidth();
double img2height = image2.getImage().getHeight();
BufferedImage mergedImage = new BufferedImage((int) img1width, (int) (img1height + img2height), BufferedImage.TYPE_INT_RGB);
Graphics mergedImageGraphics = mergedImage.getGraphics();
try {
mergedImageGraphics.drawImage(image1.getImage(), 0, 0, null);
mergedImageGraphics.drawImage(image2.getImage(), 0, (int) (img1height), null);
// set Image, Position and type for merged Image
//set position for merged image with values of image1 and the height of both
Rectangle2D pos = new Rectangle2D.Float();
pos.setRect(image1.getPosition().getX(), image2.getPosition().getY(), width, height1 + height2);
PdfImage newPdfImage = new PdfImage(mergedImage, pos, image1.getPage());
// Graphics need to be disposed
image1.getImage().flush();
image2.getImage().flush();
mergedImage.flush();
mergedImageGraphics.dispose();
return newPdfImage;
} catch (Exception e) {
// failed to merge image
log.error("Failed to merge image", e);
return null;
}
document.setPages(pages);
classificationService.classifyDocument(document);
sectionsBuilderService.buildSections(document);
sectionsBuilderService.addImagesToSections(document);
}
return document;
//make a list of true and false, if the image is a candidate for merging
private List<Boolean> getCandidatesList(List<PdfImage> imageList) {
List<Boolean> candidatesList = new ArrayList<>();
for (int i = 0; i < imageList.size(); i++) {
if (i >= 1) {
candidatesList.add(isCandidateForMerging(imageList.get(i - 1), imageList.get(i)));
}
}
return candidatesList;
}
// evaluate if two images are candidates for merging, depending on their coordinates, width and height
private boolean isCandidateForMerging(PdfImage image1, PdfImage image2) {
double x1 = image1.getPosition().getX();
double y1 = image1.getPosition().getY();
double width1 = image1.getPosition().getWidth();
double x2 = image2.getPosition().getX();
double y2 = image2.getPosition().getY();
double width2 = image2.getPosition().getWidth();
double height2 = image2.getPosition().getHeight();
//if the x-coordinates and widths of images are equal and the height is equal to difference between y-coordinates,
// then it is the same picture and has to be merged -> return true
return x1 == x2 && width1 == width2 && Math.ceil(height2) == Math.ceil(y1 - y2) && width2 > (height2 / 6);
}
@ -116,4 +303,5 @@ public class PdfSegmentationService {
}
}
}

View File

@ -1,29 +1,15 @@
package com.iqser.red.service.redaction.v1.server.segmentation;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.stream.Collectors;
import org.apache.commons.collections4.CollectionUtils;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Footer;
import com.iqser.red.service.redaction.v1.server.classification.model.Header;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.classification.model.UnclassifiedText;
import com.iqser.red.service.redaction.v1.server.classification.model.*;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import org.apache.commons.collections4.CollectionUtils;
import org.springframework.stereotype.Service;
import java.util.*;
import java.util.stream.Collectors;
@Service
public class SectionsBuilderService {
@ -121,6 +107,20 @@ public class SectionsBuilderService {
}
}
if (paragraphMap.isEmpty()) {
Paragraph paragraph = new Paragraph();
document.getParagraphs().add(paragraph);
paragraphMap.computeIfAbsent(1, x -> new TreeSet<>()).add(paragraph);
}
// first page is always a paragraph, else we can't process pages 1..N,
// where N is the first found page with a paragraph
if (paragraphMap.get(1) == null) {
Paragraph paragraph = new Paragraph();
document.getParagraphs().add(paragraph);
paragraphMap.computeIfAbsent(1, x -> new TreeSet<>()).add(paragraph);
}
for (Page page : document.getPages()) {
for (PdfImage image : page.getImages()) {
SortedSet<Paragraph> paragraphsOnPage = paragraphMap.get(page.getPageNumber());
@ -296,4 +296,4 @@ public class SectionsBuilderService {
}
}
}

View File

@ -1,17 +1,16 @@
package com.iqser.red.service.redaction.v1.server.settings;
import org.springframework.boot.context.properties.ConfigurationProperties;
import lombok.Data;
import org.springframework.boot.context.properties.ConfigurationProperties;
@Data
@ConfigurationProperties("redaction-service")
public class RedactionServiceSettings {
private int numberOfSurroundingWords = 3;
private int surroundingWordsOffsetWindow = 100;
private boolean enableImageClassification = true;
}
}

View File

@ -0,0 +1,103 @@
package com.iqser.red.service.redaction.v1.server.storage;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.file.management.v1.api.model.FileType;
import com.iqser.red.service.redaction.v1.model.RedactionLog;
import com.iqser.red.service.redaction.v1.model.SectionGrid;
import com.iqser.red.service.redaction.v1.server.classification.model.Text;
import com.iqser.red.storage.commons.exception.StorageObjectDoesNotExist;
import com.iqser.red.storage.commons.service.StorageService;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.springframework.core.io.InputStreamResource;
import org.springframework.stereotype.Service;

import java.io.IOException;
import java.io.InputStream;

/**
 * Storage facade for redaction artifacts (redaction log, extracted text, section grid).
 * Objects are serialized to/from JSON via {@link ObjectMapper} and addressed by a
 * storage id derived from project id, file id and {@link FileType}.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class RedactionStorageService {

    private final ObjectMapper objectMapper;
    private final StorageService storageService;

    /**
     * Returns the raw input stream of the object stored under the given storage id.
     */
    @SneakyThrows
    public InputStream getStoredObject(String storageId) {
        return storageService.getObject(storageId).getInputStream();
    }

    /**
     * Serializes {@code any} as JSON and stores it under the storage id derived from
     * the given project/file/type triple.
     */
    @SneakyThrows
    public void storeObject(String projectId, String fileId, FileType fileType, Object any) {
        storageService.storeObject(StorageIdUtils.getStorageId(projectId, fileId, fileType), objectMapper.writeValueAsBytes(any));
    }

    /**
     * Loads the redaction log for the given file.
     *
     * @return the deserialized {@link RedactionLog}, or {@code null} if none is stored.
     */
    public RedactionLog getRedactionLog(String projectId, String fileId) {
        InputStreamResource inputStreamResource;
        try {
            inputStreamResource = storageService.getObject(StorageIdUtils.getStorageId(projectId, fileId, FileType.REDACTION_LOG));
        } catch (StorageObjectDoesNotExist e) {
            // Fixed message: previously said "Text not available." (copied from getText()).
            log.debug("RedactionLog not available.");
            return null;
        }
        try {
            return objectMapper.readValue(inputStreamResource.getInputStream(), RedactionLog.class);
        } catch (IOException e) {
            throw new RuntimeException("Could not convert RedactionLog", e);
        }
    }

    /**
     * Loads the extracted text for the given file.
     *
     * @return the deserialized {@link Text}, or {@code null} if none is stored.
     */
    public Text getText(String projectId, String fileId) {
        InputStreamResource inputStreamResource;
        try {
            inputStreamResource = storageService.getObject(StorageIdUtils.getStorageId(projectId, fileId, FileType.TEXT));
        } catch (StorageObjectDoesNotExist e) {
            log.debug("Text not available.");
            return null;
        }
        try {
            return objectMapper.readValue(inputStreamResource.getInputStream(), Text.class);
        } catch (IOException e) {
            throw new RuntimeException("Could not convert Text", e);
        }
    }

    /**
     * Loads the section grid for the given file. Unlike the other getters this does not
     * catch {@link StorageObjectDoesNotExist} — a missing grid propagates to the caller.
     */
    public SectionGrid getSectionGrid(String projectId, String fileId) {
        var sectionGrid = storageService.getObject(StorageIdUtils.getStorageId(projectId, fileId, FileType.SECTION_GRID));
        try {
            return objectMapper.readValue(sectionGrid.getInputStream(), SectionGrid.class);
        } catch (IOException e) {
            // Fixed message: previously said "Could not convert RedactionLog" (copy-paste).
            throw new RuntimeException("Could not convert SectionGrid", e);
        }
    }

    /** Known storage object flavors with their file extensions. */
    @RequiredArgsConstructor
    public enum StorageType {
        PARSED_DOCUMENT(".json");

        @Getter
        private final String extension;
    }

    /** Derives storage ids of the form {@code <project>/<file>.<TYPE><ext>}. */
    public static class StorageIdUtils {

        // Utility class: prevent instantiation.
        private StorageIdUtils() {
        }

        public static String getStorageId(String projectId, String fileId, FileType fileType) {
            return projectId + "/" + fileId + "." + fileType.name() + fileType.getExtension();
        }
    }
}

View File

@ -1,7 +1,7 @@
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
@ -25,15 +25,17 @@ public abstract class AbstractTextContainer {
}
public boolean contains(Rectangle other) {
return page == other.getPage() && this.minX <= other.getTopLeft().getX() && this.maxX >= other.getTopLeft().getX() + other.getWidth() && this.minY <= other.getTopLeft().getY() && this.maxY >= other.getTopLeft().getY() + other.getHeight();
return page == other.getPage() && this.minX <= other.getTopLeft().getX() && this.maxX >= other.getTopLeft().getX() + other.getWidth() && this.minY <= other.getTopLeft().getY() && this.maxY >= other.getTopLeft().getY() + other.getHeight();
}
@JsonIgnore
public float getHeight() {
return maxY - minY;
}
@JsonIgnore
public float getWidth() {
return maxX - minX;
}
}
}

View File

@ -1,18 +1,17 @@
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
@SuppressWarnings("serial")
@Data
@EqualsAndHashCode(callSuper = true)
@ -71,7 +70,4 @@ public class Cell extends Rectangle {
}
}
}

View File

@ -1,10 +1,10 @@
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
import java.util.List;
import lombok.Builder;
import lombok.Data;
import java.util.List;
@Data
@Builder
public class CleanRulings {

View File

@ -8,170 +8,171 @@ import java.util.List;
@SuppressWarnings("all")
public class Rectangle extends Rectangle2D.Float {
/**
* Ill-defined comparator, from when Rectangle was Comparable.
*
* see https://github.com/tabulapdf/tabula-java/issues/116
* @deprecated with no replacement
*/
@Deprecated
public static final Comparator<Rectangle> ILL_DEFINED_ORDER = new Comparator<Rectangle>() {
@Override public int compare(Rectangle o1, Rectangle o2) {
if (o1.equals(o2)) return 0;
if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD) {
return o1.isLtrDominant() == -1 && o2.isLtrDominant() == -1
? - java.lang.Double.compare(o1.getX(), o2.getX())
: java.lang.Double.compare(o1.getX(), o2.getX());
} else {
return java.lang.Float.compare(o1.getBottom(), o2.getBottom());
}
}
};
protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;
protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;
/**
* Ill-defined comparator, from when Rectangle was Comparable.
* <p>
* see https://github.com/tabulapdf/tabula-java/issues/116
*
* @deprecated with no replacement
*/
@Deprecated
public static final Comparator<Rectangle> ILL_DEFINED_ORDER = new Comparator<Rectangle>() {
@Override
public int compare(Rectangle o1, Rectangle o2) {
if (o1.equals(o2)) return 0;
if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD) {
return o1.isLtrDominant() == -1 && o2.isLtrDominant() == -1
? -java.lang.Double.compare(o1.getX(), o2.getX())
: java.lang.Double.compare(o1.getX(), o2.getX());
} else {
return java.lang.Float.compare(o1.getBottom(), o2.getBottom());
}
}
};
public Rectangle() {
super();
}
public Rectangle() {
super();
}
public Rectangle(float top, float left, float width, float height) {
super();
this.setRect(left, top, width, height);
}
public Rectangle(float top, float left, float width, float height) {
super();
this.setRect(left, top, width, height);
}
public int compareTo(Rectangle other) {
return ILL_DEFINED_ORDER.compare(this, other);
}
/**
* @param rectangles
* @return minimum bounding box that contains all the rectangles
*/
public static Rectangle boundingBoxOf(List<? extends Rectangle> rectangles) {
float minx = java.lang.Float.MAX_VALUE;
float miny = java.lang.Float.MAX_VALUE;
float maxx = java.lang.Float.MIN_VALUE;
float maxy = java.lang.Float.MIN_VALUE;
// I'm bad at Java and need this for fancy sorting in
// technology.tabula.TextChunk.
public int isLtrDominant() {
return 0;
}
for (Rectangle r : rectangles) {
minx = (float) Math.min(r.getMinX(), minx);
miny = (float) Math.min(r.getMinY(), miny);
maxx = (float) Math.max(r.getMaxX(), maxx);
maxy = (float) Math.max(r.getMaxY(), maxy);
}
return new Rectangle(miny, minx, maxx - minx, maxy - miny);
}
public float getArea() {
return this.width * this.height;
}
public int compareTo(Rectangle other) {
return ILL_DEFINED_ORDER.compare(this, other);
}
public float verticalOverlap(Rectangle other) {
return Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
}
// I'm bad at Java and need this for fancy sorting in
// technology.tabula.TextChunk.
public int isLtrDominant() {
return 0;
}
public boolean verticallyOverlaps(Rectangle other) {
return verticalOverlap(other) > 0;
}
public float getArea() {
return this.width * this.height;
}
public float horizontalOverlap(Rectangle other) {
return Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
}
public float verticalOverlap(Rectangle other) {
return Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
}
public boolean horizontallyOverlaps(Rectangle other) {
return horizontalOverlap(other) > 0;
}
public boolean verticallyOverlaps(Rectangle other) {
return verticalOverlap(other) > 0;
}
public float verticalOverlapRatio(Rectangle other) {
float rv = 0, delta = Math.min(this.getBottom() - this.getTop(), other.getBottom() - other.getTop());
public float horizontalOverlap(Rectangle other) {
return Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
}
if (other.getTop() <= this.getTop() && this.getTop() <= other.getBottom()
&& other.getBottom() <= this.getBottom()) {
rv = (other.getBottom() - this.getTop()) / delta;
} else if (this.getTop() <= other.getTop() && other.getTop() <= this.getBottom()
&& this.getBottom() <= other.getBottom()) {
rv = (this.getBottom() - other.getTop()) / delta;
} else if (this.getTop() <= other.getTop() && other.getTop() <= other.getBottom()
&& other.getBottom() <= this.getBottom()) {
rv = (other.getBottom() - other.getTop()) / delta;
} else if (other.getTop() <= this.getTop() && this.getTop() <= this.getBottom()
&& this.getBottom() <= other.getBottom()) {
rv = (this.getBottom() - this.getTop()) / delta;
}
public boolean horizontallyOverlaps(Rectangle other) {
return horizontalOverlap(other) > 0;
}
return rv;
public float verticalOverlapRatio(Rectangle other) {
float rv = 0, delta = Math.min(this.getBottom() - this.getTop(), other.getBottom() - other.getTop());
}
if (other.getTop() <= this.getTop() && this.getTop() <= other.getBottom()
&& other.getBottom() <= this.getBottom()) {
rv = (other.getBottom() - this.getTop()) / delta;
} else if (this.getTop() <= other.getTop() && other.getTop() <= this.getBottom()
&& this.getBottom() <= other.getBottom()) {
rv = (this.getBottom() - other.getTop()) / delta;
} else if (this.getTop() <= other.getTop() && other.getTop() <= other.getBottom()
&& other.getBottom() <= this.getBottom()) {
rv = (other.getBottom() - other.getTop()) / delta;
} else if (other.getTop() <= this.getTop() && this.getTop() <= this.getBottom()
&& this.getBottom() <= other.getBottom()) {
rv = (this.getBottom() - this.getTop()) / delta;
}
public float overlapRatio(Rectangle other) {
double intersectionWidth = Math.max(0,
Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
double intersectionHeight = Math.max(0,
Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
double intersectionArea = Math.max(0, intersectionWidth * intersectionHeight);
double unionArea = this.getArea() + other.getArea() - intersectionArea;
return rv;
return (float) (intersectionArea / unionArea);
}
}
public Rectangle merge(Rectangle other) {
this.setRect(this.createUnion(other));
return this;
}
public float overlapRatio(Rectangle other) {
double intersectionWidth = Math.max(0,
Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
double intersectionHeight = Math.max(0,
Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
double intersectionArea = Math.max(0, intersectionWidth * intersectionHeight);
double unionArea = this.getArea() + other.getArea() - intersectionArea;
public float getTop() {
return (float) this.getMinY();
}
return (float) (intersectionArea / unionArea);
}
public void setTop(float top) {
float deltaHeight = top - this.y;
this.setRect(this.x, top, this.width, this.height - deltaHeight);
}
public Rectangle merge(Rectangle other) {
this.setRect(this.createUnion(other));
return this;
}
public float getRight() {
return (float) this.getMaxX();
}
public float getTop() {
return (float) this.getMinY();
}
public void setRight(float right) {
this.setRect(this.x, this.y, right - this.x, this.height);
}
public void setTop(float top) {
float deltaHeight = top - this.y;
this.setRect(this.x, top, this.width, this.height - deltaHeight);
}
public float getLeft() {
return (float) this.getMinX();
}
public float getRight() {
return (float) this.getMaxX();
}
public void setLeft(float left) {
float deltaWidth = left - this.x;
this.setRect(left, this.y, this.width - deltaWidth, this.height);
}
public void setRight(float right) {
this.setRect(this.x, this.y, right - this.x, this.height);
}
public float getBottom() {
return (float) this.getMaxY();
}
public float getLeft() {
return (float) this.getMinX();
}
public void setBottom(float bottom) {
this.setRect(this.x, this.y, this.width, bottom - this.y);
}
public void setLeft(float left) {
float deltaWidth = left - this.x;
this.setRect(left, this.y, this.width - deltaWidth, this.height);
}
public Point2D[] getPoints() {
return new Point2D[] { new Point2D.Float(this.getLeft(), this.getTop()),
new Point2D.Float(this.getRight(), this.getTop()), new Point2D.Float(this.getRight(), this.getBottom()),
new Point2D.Float(this.getLeft(), this.getBottom()) };
}
public float getBottom() {
return (float) this.getMaxY();
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
String s = super.toString();
sb.append(s.substring(0, s.length() - 1));
sb.append(String.format(",bottom=%f,right=%f]", this.getBottom(), this.getRight()));
return sb.toString();
}
public void setBottom(float bottom) {
this.setRect(this.x, this.y, this.width, bottom - this.y);
}
/**
* @param rectangles
* @return minimum bounding box that contains all the rectangles
*/
public static Rectangle boundingBoxOf(List<? extends Rectangle> rectangles) {
float minx = java.lang.Float.MAX_VALUE;
float miny = java.lang.Float.MAX_VALUE;
float maxx = java.lang.Float.MIN_VALUE;
float maxy = java.lang.Float.MIN_VALUE;
public Point2D[] getPoints() {
return new Point2D[]{new Point2D.Float(this.getLeft(), this.getTop()),
new Point2D.Float(this.getRight(), this.getTop()), new Point2D.Float(this.getRight(), this.getBottom()),
new Point2D.Float(this.getLeft(), this.getBottom())};
}
for (Rectangle r : rectangles) {
minx = (float) Math.min(r.getMinX(), minx);
miny = (float) Math.min(r.getMinY(), miny);
maxx = (float) Math.max(r.getMaxX(), maxx);
maxy = (float) Math.max(r.getMaxY(), maxy);
}
return new Rectangle(miny, minx, maxx - minx, maxy - miny);
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
String s = super.toString();
sb.append(s.substring(0, s.length() - 1));
sb.append(String.format(",bottom=%f,right=%f]", this.getBottom(), this.getRight()));
return sb.toString();
}
}

View File

@ -1,12 +1,11 @@
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
import java.util.ArrayList;
import java.util.List;
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
import org.locationtech.jts.geom.Envelope;
import org.locationtech.jts.index.strtree.STRtree;
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
import java.util.ArrayList;
import java.util.List;
@SuppressWarnings("all")
public class RectangleSpatialIndex<T extends Rectangle> {

View File

@ -1,20 +1,13 @@
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.CohenSutherlandClipping;
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
import lombok.extern.slf4j.Slf4j;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Formatter;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.CohenSutherlandClipping;
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
import lombok.extern.slf4j.Slf4j;
import java.util.*;
@Slf4j
@ -23,13 +16,127 @@ public class Ruling extends Line2D.Float {
private static int PERPENDICULAR_PIXEL_EXPAND_AMOUNT = 2;
private enum SOType {VERTICAL, HRIGHT, HLEFT}
public Ruling(Point2D p1, Point2D p2) {
super(p1, p2);
}
public static List<Ruling> cropRulingsToArea(List<Ruling> rulings, Rectangle2D area) {
ArrayList<Ruling> rv = new ArrayList<>();
for (Ruling r : rulings) {
if (r.intersects(area)) {
rv.add(r.intersect(area));
}
}
return rv;
}
// log(n) implementation of find_intersections
// based on http://people.csail.mit.edu/indyk/6.838-old/handouts/lec2.pdf
public static Map<Point2D, Ruling[]> findIntersections(List<Ruling> horizontals, List<Ruling> verticals) {
class SortObject {
protected SOType type;
protected float position;
protected Ruling ruling;
public SortObject(SOType type, float position, Ruling ruling) {
this.type = type;
this.position = position;
this.ruling = ruling;
}
}
List<SortObject> sos = new ArrayList<>();
TreeMap<Ruling, Boolean> tree = new TreeMap<>(new Comparator<Ruling>() {
@Override
public int compare(Ruling o1, Ruling o2) {
return java.lang.Double.compare(o1.getTop(), o2.getTop());
}
});
TreeMap<Point2D, Ruling[]> rv = new TreeMap<>(new Comparator<Point2D>() {
@Override
public int compare(Point2D o1, Point2D o2) {
if (o1.getY() > o2.getY()) {
return 1;
}
if (o1.getY() < o2.getY()) {
return -1;
}
if (o1.getX() > o2.getX()) {
return 1;
}
if (o1.getX() < o2.getX()) {
return -1;
}
return 0;
}
});
for (Ruling h : horizontals) {
sos.add(new SortObject(SOType.HLEFT, h.getLeft() - PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h));
sos.add(new SortObject(SOType.HRIGHT, h.getRight() + PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h));
}
for (Ruling v : verticals) {
sos.add(new SortObject(SOType.VERTICAL, v.getLeft(), v));
}
Collections.sort(sos, new Comparator<SortObject>() {
@Override
public int compare(SortObject a, SortObject b) {
int rv;
if (Utils.feq(a.position, b.position)) {
if (a.type == SOType.VERTICAL && b.type == SOType.HLEFT) {
rv = 1;
} else if (a.type == SOType.VERTICAL && b.type == SOType.HRIGHT) {
rv = -1;
} else if (a.type == SOType.HLEFT && b.type == SOType.VERTICAL) {
rv = -1;
} else if (a.type == SOType.HRIGHT && b.type == SOType.VERTICAL) {
rv = 1;
} else {
rv = java.lang.Double.compare(a.position, b.position);
}
} else {
return java.lang.Double.compare(a.position, b.position);
}
return rv;
}
});
for (SortObject so : sos) {
switch (so.type) {
case VERTICAL:
for (Map.Entry<Ruling, Boolean> h : tree.entrySet()) {
try {
Point2D i = h.getKey().intersectionPoint(so.ruling);
if (i == null) {
continue;
}
rv.put(i,
new Ruling[]{h.getKey().expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT),
so.ruling.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT)});
} catch (UnsupportedOperationException e) {
log.info("Some line are oblique, ignoring...");
continue;
}
}
break;
case HRIGHT:
tree.remove(so.ruling);
break;
case HLEFT:
tree.put(so.ruling, true);
break;
}
}
return rv;
}
public boolean vertical() {
return this.length() > 0 && Utils.feq(this.x1, this.x2); //diff < ORIENTATION_CHECK_THRESHOLD;
}
@ -38,13 +145,13 @@ public class Ruling extends Line2D.Float {
return this.length() > 0 && Utils.feq(this.y1, this.y2); //diff < ORIENTATION_CHECK_THRESHOLD;
}
// attributes that make sense only for non-oblique lines
// these are used to have a single collapse method (in page, currently)
public boolean oblique() {
return !(this.vertical() || this.horizontal());
}
// attributes that make sense only for non-oblique lines
// these are used to have a single collapse method (in page, currently)
public float getPosition() {
if (this.oblique()) {
throw new UnsupportedOperationException();
@ -52,7 +159,6 @@ public class Ruling extends Line2D.Float {
return this.vertical() ? this.getLeft() : this.getTop();
}
public float getStart() {
if (this.oblique()) {
throw new UnsupportedOperationException();
@ -102,12 +208,10 @@ public class Ruling extends Line2D.Float {
}
}
public boolean perpendicularTo(Ruling other) {
return this.vertical() == other.horizontal();
}
public boolean nearlyIntersects(Ruling another, int colinearOrParallelExpandAmount) {
if (this.intersectsLine(another)) {
return true;
@ -238,7 +342,6 @@ public class Ruling extends Line2D.Float {
return angle;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
@ -248,122 +351,7 @@ public class Ruling extends Line2D.Float {
return rv;
}
public static List<Ruling> cropRulingsToArea(List<Ruling> rulings, Rectangle2D area) {
ArrayList<Ruling> rv = new ArrayList<>();
for (Ruling r : rulings) {
if (r.intersects(area)) {
rv.add(r.intersect(area));
}
}
return rv;
}
// log(n) implementation of find_intersections
// based on http://people.csail.mit.edu/indyk/6.838-old/handouts/lec2.pdf
public static Map<Point2D, Ruling[]> findIntersections(List<Ruling> horizontals, List<Ruling> verticals) {
class SortObject {
protected SOType type;
protected float position;
protected Ruling ruling;
public SortObject(SOType type, float position, Ruling ruling) {
this.type = type;
this.position = position;
this.ruling = ruling;
}
}
List<SortObject> sos = new ArrayList<>();
TreeMap<Ruling, Boolean> tree = new TreeMap<>(new Comparator<Ruling>() {
@Override
public int compare(Ruling o1, Ruling o2) {
return java.lang.Double.compare(o1.getTop(), o2.getTop());
}
});
TreeMap<Point2D, Ruling[]> rv = new TreeMap<>(new Comparator<Point2D>() {
@Override
public int compare(Point2D o1, Point2D o2) {
if (o1.getY() > o2.getY()) {
return 1;
}
if (o1.getY() < o2.getY()) {
return -1;
}
if (o1.getX() > o2.getX()) {
return 1;
}
if (o1.getX() < o2.getX()) {
return -1;
}
return 0;
}
});
for (Ruling h : horizontals) {
sos.add(new SortObject(SOType.HLEFT, h.getLeft() - PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h));
sos.add(new SortObject(SOType.HRIGHT, h.getRight() + PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h));
}
for (Ruling v : verticals) {
sos.add(new SortObject(SOType.VERTICAL, v.getLeft(), v));
}
Collections.sort(sos, new Comparator<SortObject>() {
@Override
public int compare(SortObject a, SortObject b) {
int rv;
if (Utils.feq(a.position, b.position)) {
if (a.type == SOType.VERTICAL && b.type == SOType.HLEFT) {
rv = 1;
} else if (a.type == SOType.VERTICAL && b.type == SOType.HRIGHT) {
rv = -1;
} else if (a.type == SOType.HLEFT && b.type == SOType.VERTICAL) {
rv = -1;
} else if (a.type == SOType.HRIGHT && b.type == SOType.VERTICAL) {
rv = 1;
} else {
rv = java.lang.Double.compare(a.position, b.position);
}
} else {
return java.lang.Double.compare(a.position, b.position);
}
return rv;
}
});
for (SortObject so : sos) {
switch (so.type) {
case VERTICAL:
for (Map.Entry<Ruling, Boolean> h : tree.entrySet()) {
try {
Point2D i = h.getKey().intersectionPoint(so.ruling);
if (i == null) {
continue;
}
rv.put(i,
new Ruling[]{h.getKey().expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT),
so.ruling.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT)});
} catch(UnsupportedOperationException e){
log.info("Some line are oblique, ignoring...");
continue;
}
}
break;
case HRIGHT:
tree.remove(so.ruling);
break;
case HLEFT:
tree.put(so.ruling, true);
break;
}
}
return rv;
}
private enum SOType {VERTICAL, HRIGHT, HLEFT}
}

View File

@ -1,22 +1,13 @@
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import org.apache.commons.collections4.CollectionUtils;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
import lombok.Getter;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections4.CollectionUtils;
import java.util.*;
@Slf4j
public class Table extends AbstractTextContainer {
@ -24,21 +15,14 @@ public class Table extends AbstractTextContainer {
private final TreeMap<CellPosition, Cell> cells = new TreeMap<>();
private final RectangleSpatialIndex<Cell> si = new RectangleSpatialIndex<>();
private final int rotation;
@Getter
@Setter
private String headline;
private int unrotatedRowCount;
private int unrotatedColCount;
private int rowCount = -1;
private int colCount = -1;
private final int rotation;
private List<List<Cell>> rows;
@ -62,8 +46,8 @@ public class Table extends AbstractTextContainer {
// Ignore rows that does not contain any cells and values.
List<List<Cell>> rowsToRemove = new ArrayList<>();
for (List<Cell> row: rows){
if (row.size() == 1 && row.get(0).getTextBlocks().isEmpty()){
for (List<Cell> row : rows) {
if (row.size() == 1 && row.get(0).getTextBlocks().isEmpty()) {
rowsToRemove.add(row);
}
}
@ -110,7 +94,7 @@ public class Table extends AbstractTextContainer {
// we move from left to right and top to bottom
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
List<Cell> rowCells = rows.get(rowIndex);
if(rowCells.size() == 1){
if (rowCells.size() == 1) {
continue;
}
@ -275,7 +259,7 @@ public class Table extends AbstractTextContainer {
cells.sort(Collections.reverseOrder((arg0, arg1) -> Float.compare(Utils.round(arg0.getBottom(), 2),
Utils.round(arg1
.getBottom(), 2))));
.getBottom(), 2))));
Iterator<Cell> iter = cells.iterator();
Cell c = iter.next();
@ -367,4 +351,4 @@ public class Table extends AbstractTextContainer {
return sb.toString();
}
}
}

View File

@ -1,19 +1,13 @@
package com.iqser.red.service.redaction.v1.server.tableextraction.service;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
import org.springframework.stereotype.Service;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.util.*;
@Service
public class RulingCleaningService {

View File

@ -1,31 +1,57 @@
package com.iqser.red.service.redaction.v1.server.tableextraction.service;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.*;
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
import org.springframework.stereotype.Service;
import java.awt.geom.Point2D;
import java.util.*;
import java.util.stream.Collectors;
@Service
public class TableExtractionService {
private static final Comparator<Point2D> X_FIRST_POINT_COMPARATOR = (arg0, arg1) -> {
int rv = 0;
float arg0X = Utils.round(arg0.getX(), 2);
float arg0Y = Utils.round(arg0.getY(), 2);
float arg1X = Utils.round(arg1.getX(), 2);
float arg1Y = Utils.round(arg1.getY(), 2);
if (arg0X > arg1X) {
rv = 1;
} else if (arg0X < arg1X) {
rv = -1;
} else if (arg0Y > arg1Y) {
rv = 1;
} else if (arg0Y < arg1Y) {
rv = -1;
}
return rv;
};
private static final Comparator<Point2D> POINT_COMPARATOR = (arg0, arg1) -> {
int rv = 0;
float arg0X = Utils.round(arg0.getX(), 2);
float arg0Y = Utils.round(arg0.getY(), 2);
float arg1X = Utils.round(arg1.getX(), 2);
float arg1Y = Utils.round(arg1.getY(), 2);
if (arg0Y > arg1Y) {
rv = 1;
} else if (arg0Y < arg1Y) {
rv = -1;
} else if (arg0X > arg1X) {
rv = 1;
} else if (arg0X < arg1X) {
rv = -1;
}
return rv;
};
public void extractTables(CleanRulings cleanRulings, Page page) {
List<Cell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
@ -80,7 +106,6 @@ public class TableExtractionService {
page.getTextBlocks().removeAll(toBeRemoved);
}
public List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
List<Cell> cellsFound = new ArrayList<>();
@ -133,7 +158,6 @@ public class TableExtractionService {
return cellsFound;
}
private List<Rectangle> findSpreadsheetsFromCells(List<? extends Rectangle> cells) {
// via: http://stackoverflow.com/questions/13746284/merging-multiple-adjacent-rectangles-into-one-polygon
List<Rectangle> rectangles = new ArrayList<>();
@ -233,47 +257,6 @@ public class TableExtractionService {
return rectangles;
}
private static final Comparator<Point2D> X_FIRST_POINT_COMPARATOR = (arg0, arg1) -> {
int rv = 0;
float arg0X = Utils.round(arg0.getX(), 2);
float arg0Y = Utils.round(arg0.getY(), 2);
float arg1X = Utils.round(arg1.getX(), 2);
float arg1Y = Utils.round(arg1.getY(), 2);
if (arg0X > arg1X) {
rv = 1;
} else if (arg0X < arg1X) {
rv = -1;
} else if (arg0Y > arg1Y) {
rv = 1;
} else if (arg0Y < arg1Y) {
rv = -1;
}
return rv;
};
private static final Comparator<Point2D> POINT_COMPARATOR = (arg0, arg1) -> {
int rv = 0;
float arg0X = Utils.round(arg0.getX(), 2);
float arg0Y = Utils.round(arg0.getY(), 2);
float arg1X = Utils.round(arg1.getX(), 2);
float arg1Y = Utils.round(arg1.getY(), 2);
if (arg0Y > arg1Y) {
rv = 1;
} else if (arg0Y < arg1Y) {
rv = -1;
} else if (arg0X > arg1X) {
rv = 1;
} else if (arg0X < arg1X) {
rv = -1;
}
return rv;
};
private enum Direction {
HORIZONTAL, VERTICAL
}

View File

@ -19,21 +19,24 @@ import java.awt.geom.Rectangle2D;
* clipping algorithm (line against clip rectangle).
*/
@SuppressWarnings("all")
public final class CohenSutherlandClipping
{
public final class CohenSutherlandClipping {
private static final int INSIDE = 0;
private static final int LEFT = 1;
private static final int RIGHT = 2;
private static final int BOTTOM = 4;
private static final int TOP = 8;
private double xMin;
private double yMin;
private double xMax;
private double yMax;
/**
* Creates a Cohen Sutherland clipper with clip rect (0, 0, 0, 0).
*/
public CohenSutherlandClipping() {
}
/**
* Creates a Cohen Sutherland clipper with the given clip rectangle.
*
* @param clip the clip rectangle to use
*/
public CohenSutherlandClipping(Rectangle2D clip) {
@ -42,6 +45,7 @@ public final class CohenSutherlandClipping
/**
* Sets the clip rectangle.
*
* @param clip the clip rectangle
*/
public void setClip(Rectangle2D clip) {
@ -51,19 +55,13 @@ public final class CohenSutherlandClipping
yMax = yMin + clip.getHeight();
}
private static final int INSIDE = 0;
private static final int LEFT = 1;
private static final int RIGHT = 2;
private static final int BOTTOM = 4;
private static final int TOP = 8;
private final int regionCode(double x, double y) {
int code = x < xMin
? LEFT
: x > xMax
int code = x < xMin
? LEFT
: x > xMax
? RIGHT
: INSIDE;
if (y < yMin) code |= BOTTOM;
if (y < yMin) code |= BOTTOM;
else if (y > yMax) code |= TOP;
return code;
}
@ -71,6 +69,7 @@ public final class CohenSutherlandClipping
/**
* Clips a given line against the clip rectangle.
* The modification (if needed) is done in place.
*
* @param line the line to clip
* @return true if line is clipped, false if line is
* totally outside the clip rect.
@ -87,9 +86,9 @@ public final class CohenSutherlandClipping
boolean vertical = p1x == p2x;
double slope = vertical
? 0d
: (p2y-p1y)/(p2x-p1x);
double slope = vertical
? 0d
: (p2y - p1y) / (p2x - p1x);
int c1 = regionCode(p1x, p1y);
int c2 = regionCode(p2x, p2y);
@ -103,31 +102,27 @@ public final class CohenSutherlandClipping
if ((c & LEFT) != INSIDE) {
qx = xMin;
qy = (Utils.feq(qx, p1x) ? 0 : qx-p1x)*slope + p1y;
}
else if ((c & RIGHT) != INSIDE) {
qy = (Utils.feq(qx, p1x) ? 0 : qx - p1x) * slope + p1y;
} else if ((c & RIGHT) != INSIDE) {
qx = xMax;
qy = (Utils.feq(qx, p1x) ? 0 : qx-p1x)*slope + p1y;
}
else if ((c & BOTTOM) != INSIDE) {
qy = (Utils.feq(qx, p1x) ? 0 : qx - p1x) * slope + p1y;
} else if ((c & BOTTOM) != INSIDE) {
qy = yMin;
qx = vertical
? p1x
: (Utils.feq(qy, p1y) ? 0 : qy-p1y)/slope + p1x;
}
else if ((c & TOP) != INSIDE) {
? p1x
: (Utils.feq(qy, p1y) ? 0 : qy - p1y) / slope + p1x;
} else if ((c & TOP) != INSIDE) {
qy = yMax;
qx = vertical
? p1x
: (Utils.feq(qy, p1y) ? 0 : qy-p1y)/slope + p1x;
? p1x
: (Utils.feq(qy, p1y) ? 0 : qy - p1y) / slope + p1x;
}
if (c == c1) {
p1x = qx;
p1y = qy;
c1 = regionCode(p1x, p1y);
}
else {
c1 = regionCode(p1x, p1y);
} else {
p2x = qx;
p2y = qy;
c2 = regionCode(p2x, p2y);
@ -137,4 +132,4 @@ public final class CohenSutherlandClipping
return true;
}
}
// end of file
// end of file

View File

@ -10,11 +10,6 @@ import java.util.List;
*/
public final class QuickSort {
private QuickSort() {
}
private static final Comparator<? extends Comparable> OBJCOMP = new Comparator<Comparable>() {
@Override
public int compare(Comparable object1, Comparable object2) {
@ -24,6 +19,10 @@ public final class QuickSort {
};
private QuickSort() {
}
/**
* Sorts the given list using the given comparator.
*

View File

@ -1,11 +1,11 @@
package com.iqser.red.service.redaction.v1.server.tableextraction.utils;
import lombok.extern.slf4j.Slf4j;
import java.math.BigDecimal;
import java.util.Comparator;
import java.util.List;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@SuppressWarnings("all")
public class Utils {

View File

@ -1,15 +1,5 @@
package com.iqser.red.service.redaction.v1.server.visualization.service;
import java.awt.Color;
import java.io.IOException;
import java.util.List;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
@ -17,9 +7,17 @@ import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.springframework.stereotype.Service;
import java.awt.Color;
import java.io.IOException;
import java.util.List;
@Slf4j
@Service
@ -34,7 +32,7 @@ public class PdfVisualisationService {
PDPage pdPage = document.getPage(page - 1);
PDPageContentStream contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true);
for(Paragraph paragraph : classifiedDoc.getParagraphs()) {
for (Paragraph paragraph : classifiedDoc.getParagraphs()) {
for (int i = 0; i <= paragraph.getPageBlocks().size() - 1; i++) {
@ -44,10 +42,10 @@ public class PdfVisualisationService {
continue;
}
if (textBlock instanceof TextBlock) {
textBlock.setClassification((i+1) + "/" + paragraph.getPageBlocks().size());
textBlock.setClassification((i + 1) + "/" + paragraph.getPageBlocks().size());
visualizeTextBlock((TextBlock) textBlock, contentStream);
} else if (textBlock instanceof Table) {
textBlock.setClassification((i+1) + "/" + paragraph.getPageBlocks().size());
textBlock.setClassification((i + 1) + "/" + paragraph.getPageBlocks().size());
visualizeTable((Table) textBlock, contentStream);
}
@ -59,7 +57,6 @@ public class PdfVisualisationService {
}
public void visualizeClassifications(Document classifiedDoc, PDDocument document) throws IOException {
for (int page = 1; page <= document.getNumberOfPages(); page++) {

View File

@ -1,4 +1,11 @@
server:
port: 8083
configuration-service.url: "http://localhost:8081"
configuration-service.url: "http://localhost:8081"
file-management-service.url: "http://localhost:8085"
storage:
bucket-name: 'redaction'
endpoint: 'http://localhost:9000'
key: minioadmin
secret: minioadmin

View File

@ -2,6 +2,7 @@ info:
description: Redaction Service Server V1
configuration-service.url: "http://configuration-service-v1:8080"
file-management-service.url: "http://file-management-service-v1:8080"
image-service.url: "http://image-service-v1:8080"
server:
@ -10,6 +11,20 @@ server:
spring:
profiles:
active: kubernetes
rabbitmq:
host: ${RABBITMQ_HOST:localhost}
port: ${RABBITMQ_PORT:5672}
username: ${RABBITMQ_USERNAME:user}
password: ${RABBITMQ_PASSWORD:rabbitmq}
listener:
simple:
acknowledge-mode: AUTO
concurrency: 2
retry:
enabled: true
max-attempts: 3
max-interval: 15000
prefetch: 1
management:
endpoint:
@ -17,4 +32,11 @@ management:
prometheus.enabled: ${monitoring.enabled:false}
health.enabled: true
endpoints.web.exposure.include: prometheus, health
metrics.export.prometheus.enabled: ${monitoring.enabled:false}
metrics.export.prometheus.enabled: ${monitoring.enabled:false}
storage:
signer-type: 'AWSS3V4SignerType'
bucket-name: 'redaction'
region: 'us-east-1'
endpoint: 'https://s3.amazonaws.com'

View File

@ -0,0 +1,51 @@
package com.iqser.red.service.redaction.v1.server;
import com.iqser.red.storage.commons.exception.StorageObjectDoesNotExist;
import com.iqser.red.storage.commons.service.StorageService;
import lombok.SneakyThrows;
import org.apache.commons.io.IOUtils;
import org.springframework.core.io.InputStreamResource;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.util.HashMap;
import java.util.Map;
public class FileSystemBackedStorageService extends StorageService {
private final Map<String, File> dataMap = new HashMap<>();
public FileSystemBackedStorageService() {
super(null, null);
}
@SneakyThrows
@Override
public InputStreamResource getObject(String objectId) {
var res = dataMap.get(objectId);
if (res == null) {
throw new StorageObjectDoesNotExist(new RuntimeException());
}
return new InputStreamResource(new FileInputStream(res));
}
@SneakyThrows
@Override
public void storeObject(String objectId, byte[] data) {
File tempFile = File.createTempFile("test", ".tmp");
IOUtils.write(data, new FileOutputStream(tempFile));
dataMap.put(objectId, tempFile);
}
public void clearStorage() {
this.dataMap.forEach((k, v) -> {
v.delete();
});
this.dataMap.clear();
}
}

View File

@ -1,30 +1,27 @@
package com.iqser.red.service.redaction.v1.server;
import static org.assertj.core.api.Assertions.assertThat;
import static org.mockito.Mockito.when;
import static org.springframework.boot.test.context.SpringBootTest.WebEnvironment.RANDOM_PORT;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.time.OffsetDateTime;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import java.util.stream.Collectors;
import com.amazonaws.services.s3.AmazonS3;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.configuration.v1.api.model.*;
import com.iqser.red.service.configuration.v1.api.resource.DictionaryResource;
import com.iqser.red.service.file.management.v1.api.model.FileType;
import com.iqser.red.service.redaction.v1.model.*;
import com.iqser.red.service.redaction.v1.server.classification.model.SectionText;
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
import com.iqser.red.service.redaction.v1.server.client.ImageClassificationClient;
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
import com.iqser.red.service.redaction.v1.server.controller.RedactionController;
import com.iqser.red.service.redaction.v1.server.memory.MemoryStats;
import com.iqser.red.service.redaction.v1.server.redaction.service.ReanalyzeService;
import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
import com.iqser.red.storage.commons.service.StorageService;
import lombok.SneakyThrows;
import org.apache.commons.io.IOUtils;
import org.junit.After;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.kie.api.KieServices;
@ -32,48 +29,32 @@ import org.kie.api.builder.KieBuilder;
import org.kie.api.builder.KieFileSystem;
import org.kie.api.builder.KieModule;
import org.kie.api.runtime.KieContainer;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.boot.test.context.TestConfiguration;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.Import;
import org.springframework.context.annotation.Primary;
import org.springframework.core.io.ClassPathResource;
import org.springframework.test.context.junit4.SpringRunner;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.configuration.v1.api.model.Colors;
import com.iqser.red.service.configuration.v1.api.model.DictionaryEntry;
import com.iqser.red.service.configuration.v1.api.model.DictionaryResponse;
import com.iqser.red.service.configuration.v1.api.model.RulesResponse;
import com.iqser.red.service.configuration.v1.api.model.TypeResponse;
import com.iqser.red.service.configuration.v1.api.model.TypeResult;
import com.iqser.red.service.redaction.v1.model.AnalyzeRequest;
import com.iqser.red.service.redaction.v1.model.AnalyzeResult;
import com.iqser.red.service.redaction.v1.model.AnnotateRequest;
import com.iqser.red.service.redaction.v1.model.AnnotateResponse;
import com.iqser.red.service.redaction.v1.model.Comment;
import com.iqser.red.service.redaction.v1.model.IdRemoval;
import com.iqser.red.service.redaction.v1.model.ManualForceRedact;
import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry;
import com.iqser.red.service.redaction.v1.model.ManualRedactions;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.ReanalyzeResult;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
import com.iqser.red.service.redaction.v1.model.RedactionRequest;
import com.iqser.red.service.redaction.v1.model.RedactionResult;
import com.iqser.red.service.redaction.v1.model.RenalyzeRequest;
import com.iqser.red.service.redaction.v1.model.SectionText;
import com.iqser.red.service.redaction.v1.model.Status;
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
import com.iqser.red.service.redaction.v1.server.client.ImageClassificationClient;
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
import com.iqser.red.service.redaction.v1.server.controller.RedactionController;
import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
import java.io.*;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.time.OffsetDateTime;
import java.util.*;
import java.util.stream.Collectors;
import static org.assertj.core.api.Assertions.assertThat;
import static org.mockito.Mockito.when;
@RunWith(SpringRunner.class)
@SpringBootTest(webEnvironment = RANDOM_PORT)
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
@Import(RedactionIntegrationTest.RedactionIntegrationTestConfiguration.class)
public class RedactionIntegrationTest {
private static final String RULES = loadFromClassPath("drools/rules.drl");
@ -93,6 +74,7 @@ public class RedactionIntegrationTest {
private static final String SIGNATURE = "signature";
private static final String FORMULA = "formula";
private static final String OCR = "ocr";
private static final String DOSSIER_REDACTIONS = "dossier_redactions";
private static final String RECOMMENDATION_AUTHOR = "recommendation_CBI_author";
private static final String RECOMMENDATION_ADDRESS = "recommendation_CBI_address";
@ -101,9 +83,13 @@ public class RedactionIntegrationTest {
private static final String PII = "PII";
@Autowired
private RedactionController redactionController;
@Autowired
private ReanalyzeService reanalyzeService;
@Autowired
private ObjectMapper objectMapper;
@ -116,7 +102,20 @@ public class RedactionIntegrationTest {
@MockBean
private ImageClassificationClient imageClassificationClient;
@Autowired
private RedactionStorageService redactionStorageService;
@Autowired
private StorageService storageService;
@MockBean
private AmazonS3 amazonS3;
@MockBean
private RabbitTemplate rabbitTemplate;
private final Map<String, List<String>> dictionary = new HashMap<>();
private final Map<String, List<String>> dossierDictionary = new HashMap<>();
private final Map<String, String> typeColorMap = new HashMap<>();
private final Map<String, Boolean> hintTypeMap = new HashMap<>();
private final Map<String, Boolean> caseInSensitiveMap = new HashMap<>();
@ -126,8 +125,11 @@ public class RedactionIntegrationTest {
private final Map<String, Long> reanlysisVersions = new HashMap<>();
private final static String TEST_RULESET_ID = "123";
private final static String TEST_PROJECT_ID = "123";
private final static String TEST_FILE_ID = "123";
@TestConfiguration
@Configuration
@EnableAutoConfiguration(exclude = {RabbitAutoConfiguration.class})
public static class RedactionIntegrationTestConfiguration {
@Bean
@ -146,6 +148,21 @@ public class RedactionIntegrationTest {
return kieServices.newKieContainer(kieModule.getReleaseId());
}
@Bean
@Primary
public StorageService inmemoryStorage() {
return new FileSystemBackedStorageService();
}
}
@After
public void cleanupStorage() {
if (this.storageService instanceof FileSystemBackedStorageService) {
((FileSystemBackedStorageService) this.storageService).clearStorage();
}
}
@ -158,30 +175,45 @@ public class RedactionIntegrationTest {
loadDictionaryForTest();
loadTypeForTest();
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(0L);
when(dictionaryClient.getAllTypes(TEST_RULESET_ID)).thenReturn(TypeResponse.builder()
when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(0L);
when(dictionaryClient.getAllTypes(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(TypeResponse.builder()
.types(getTypeResponse())
.build());
when(dictionaryClient.getDictionaryForType(VERTEBRATE, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(VERTEBRATE));
when(dictionaryClient.getDictionaryForType(ADDRESS, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(ADDRESS));
when(dictionaryClient.getDictionaryForType(AUTHOR, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(AUTHOR));
when(dictionaryClient.getDictionaryForType(SPONSOR, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(SPONSOR));
when(dictionaryClient.getDictionaryForType(NO_REDACTION_INDICATOR, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(NO_REDACTION_INDICATOR));
when(dictionaryClient.getDictionaryForType(REDACTION_INDICATOR, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(REDACTION_INDICATOR));
when(dictionaryClient.getDictionaryForType(HINT_ONLY, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(HINT_ONLY));
when(dictionaryClient.getDictionaryForType(MUST_REDACT, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(MUST_REDACT));
when(dictionaryClient.getDictionaryForType(PUBLISHED_INFORMATION, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(PUBLISHED_INFORMATION));
when(dictionaryClient.getDictionaryForType(TEST_METHOD, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(TEST_METHOD));
when(dictionaryClient.getDictionaryForType(PII, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(PII));
when(dictionaryClient.getDictionaryForType(RECOMMENDATION_AUTHOR, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(RECOMMENDATION_AUTHOR));
when(dictionaryClient.getDictionaryForType(RECOMMENDATION_ADDRESS, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(RECOMMENDATION_ADDRESS));
when(dictionaryClient.getDictionaryForType(FALSE_POSITIVE, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(FALSE_POSITIVE));
when(dictionaryClient.getDictionaryForType(PURITY, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(PURITY));
when(dictionaryClient.getDictionaryForType(IMAGE, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(IMAGE));
when(dictionaryClient.getDictionaryForType(OCR, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(OCR));
when(dictionaryClient.getDictionaryForType(LOGO, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(LOGO));
when(dictionaryClient.getDictionaryForType(SIGNATURE, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(SIGNATURE));
when(dictionaryClient.getDictionaryForType(FORMULA, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(FORMULA));
when(dictionaryClient.getVersion(TEST_RULESET_ID, TEST_PROJECT_ID)).thenReturn(0L);
when(dictionaryClient.getAllTypes(TEST_RULESET_ID, TEST_PROJECT_ID)).thenReturn(TypeResponse.builder()
.types(List.of(TypeResult.builder()
.type(DOSSIER_REDACTIONS)
.ruleSetId(TEST_RULESET_ID)
.hexColor( "#ffe187")
.isHint(hintTypeMap.get(DOSSIER_REDACTIONS))
.isCaseInsensitive(caseInSensitiveMap.get(DOSSIER_REDACTIONS))
.isRecommendation(recommendationTypeMap.get(DOSSIER_REDACTIONS))
.rank(rankTypeMap.get(DOSSIER_REDACTIONS))
.build()))
.build());
when(dictionaryClient.getDictionaryForType(VERTEBRATE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(VERTEBRATE, false));
when(dictionaryClient.getDictionaryForType(ADDRESS, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(ADDRESS, false));
when(dictionaryClient.getDictionaryForType(AUTHOR, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(AUTHOR, false));
when(dictionaryClient.getDictionaryForType(SPONSOR, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(SPONSOR, false));
when(dictionaryClient.getDictionaryForType(NO_REDACTION_INDICATOR, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(NO_REDACTION_INDICATOR, false));
when(dictionaryClient.getDictionaryForType(REDACTION_INDICATOR, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(REDACTION_INDICATOR, false));
when(dictionaryClient.getDictionaryForType(HINT_ONLY, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(HINT_ONLY, false));
when(dictionaryClient.getDictionaryForType(MUST_REDACT, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(MUST_REDACT, false));
when(dictionaryClient.getDictionaryForType(PUBLISHED_INFORMATION, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(PUBLISHED_INFORMATION, false));
when(dictionaryClient.getDictionaryForType(TEST_METHOD, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(TEST_METHOD, false));
when(dictionaryClient.getDictionaryForType(PII, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(PII, false));
when(dictionaryClient.getDictionaryForType(RECOMMENDATION_AUTHOR, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(RECOMMENDATION_AUTHOR, false));
when(dictionaryClient.getDictionaryForType(RECOMMENDATION_ADDRESS, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(RECOMMENDATION_ADDRESS, false));
when(dictionaryClient.getDictionaryForType(FALSE_POSITIVE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(FALSE_POSITIVE, false));
when(dictionaryClient.getDictionaryForType(PURITY, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(PURITY, false));
when(dictionaryClient.getDictionaryForType(IMAGE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(IMAGE, false));
when(dictionaryClient.getDictionaryForType(OCR, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(OCR, false));
when(dictionaryClient.getDictionaryForType(LOGO, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(LOGO, false));
when(dictionaryClient.getDictionaryForType(SIGNATURE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(SIGNATURE, false));
when(dictionaryClient.getDictionaryForType(FORMULA, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(FORMULA, false));
when(dictionaryClient.getDictionaryForType(DOSSIER_REDACTIONS, TEST_RULESET_ID, TEST_PROJECT_ID)).thenReturn(getDictionaryResponse(DOSSIER_REDACTIONS, true));
when(dictionaryClient.getColors(TEST_RULESET_ID)).thenReturn(colors);
}
@ -288,6 +320,11 @@ public class RedactionIntegrationTest {
.stream()
.map(this::cleanDictionaryEntry)
.collect(Collectors.toSet()));
dossierDictionary.computeIfAbsent(DOSSIER_REDACTIONS, v -> new ArrayList<>())
.addAll(ResourceLoader.load("dictionaries/dossier_redactions.txt")
.stream()
.map(this::cleanDictionaryEntry)
.collect(Collectors.toSet()));
}
@ -340,6 +377,7 @@ public class RedactionIntegrationTest {
hintTypeMap.put(FORMULA, false);
hintTypeMap.put(LOGO, false);
hintTypeMap.put(SIGNATURE, false);
hintTypeMap.put(DOSSIER_REDACTIONS, false);
caseInSensitiveMap.put(VERTEBRATE, true);
caseInSensitiveMap.put(ADDRESS, false);
@ -361,6 +399,7 @@ public class RedactionIntegrationTest {
caseInSensitiveMap.put(SIGNATURE, true);
caseInSensitiveMap.put(LOGO, true);
caseInSensitiveMap.put(FORMULA, true);
caseInSensitiveMap.put(DOSSIER_REDACTIONS, false);
recommendationTypeMap.put(VERTEBRATE, false);
recommendationTypeMap.put(ADDRESS, false);
@ -382,6 +421,7 @@ public class RedactionIntegrationTest {
recommendationTypeMap.put(FORMULA, false);
recommendationTypeMap.put(SIGNATURE, false);
recommendationTypeMap.put(LOGO, false);
recommendationTypeMap.put(DOSSIER_REDACTIONS, false);
rankTypeMap.put(FALSE_POSITIVE, 160);
rankTypeMap.put(PURITY, 155);
@ -403,6 +443,7 @@ public class RedactionIntegrationTest {
rankTypeMap.put(LOGO, 28);
rankTypeMap.put(SIGNATURE, 27);
rankTypeMap.put(FORMULA, 26);
rankTypeMap.put(DOSSIER_REDACTIONS, 200);
colors.setDefaultColor("#acfc00");
colors.setNotRedacted("#cccccc");
@ -429,11 +470,11 @@ public class RedactionIntegrationTest {
}
private DictionaryResponse getDictionaryResponse(String type) {
private DictionaryResponse getDictionaryResponse(String type, boolean isDossierDictionary) {
return DictionaryResponse.builder()
.hexColor(typeColorMap.get(type))
.entries(toDictionaryEntry(dictionary.get(type)))
.entries(isDossierDictionary ? toDictionaryEntry(dossierDictionary.get(type)) : toDictionaryEntry(dictionary.get(type)))
.isHint(hintTypeMap.get(type))
.isCaseInsensitive(caseInSensitiveMap.get(type))
.isRecommendation(recommendationTypeMap.get(type))
@ -453,6 +494,71 @@ public class RedactionIntegrationTest {
@Test
public void test270Rotated() {
AnalyzeRequest request = prepareStorage("files/Minimal Examples/270Rotated.pdf");
MemoryStats.printMemoryStats();
AnalyzeResult result = reanalyzeService.analyze(request);
assertThat(result).isNotNull();
}
@Test
@Ignore
public void testLargeScannedFileOOM() {
AnalyzeRequest request = prepareStorage("scanned/VV-377031.pdf");
MemoryStats.printMemoryStats();
AnalyzeResult result = reanalyzeService.analyze(request);
assertThat(result).isNotNull();
}
@Test
public void testMergedImages() throws IOException {
long start = System.currentTimeMillis();
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/merge_images.pdf");
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
AnalyzeResult result = reanalyzeService.analyze(request);
Map<String, List<RedactionLogEntry>> duplicates = new HashMap<>();
var redactionLog = redactionStorageService.getRedactionLog(TEST_PROJECT_ID, TEST_FILE_ID);
redactionLog.getRedactionLogEntry().forEach(entry -> {
duplicates.computeIfAbsent(entry.getId(), v -> new ArrayList<>()).add(entry);
});
duplicates.entrySet().forEach(entry -> {
assertThat(entry.getValue().size()).isEqualTo(1);
});
dictionary.get(AUTHOR).add("Drinking water");
when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(1L);
AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder()
.projectId(TEST_PROJECT_ID)
.fileId(TEST_FILE_ID)
.build());
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Annotated3.pdf")) {
fileOutputStream.write(annotateResponse.getDocument());
}
long rstart = System.currentTimeMillis();
reanalyzeService.reanalyze(request);
long rend = System.currentTimeMillis();
System.out.println("reanalysis analysis duration: " + (rend - rstart));
long end = System.currentTimeMillis();
System.out.println("duration: " + (end - start));
}
@Test
@Ignore
public void noExceptionShouldBeThrownForAnyFiles() throws IOException {
long start = System.currentTimeMillis();
@ -465,15 +571,16 @@ public class RedactionIntegrationTest {
input.addAll(getPathsRecursively(file));
}
for (File path : input) {
AnalyzeRequest request = AnalyzeRequest.builder()
.ruleSetId(TEST_RULESET_ID)
.document(IOUtils.toByteArray(new FileInputStream(path)))
.build();
AnalyzeRequest request = prepareStorage(new FileInputStream((path)));
System.out.println("Redacting file : " + path.getName());
AnalyzeResult result = redactionController.analyze(request);
AnalyzeResult result = reanalyzeService.analyze(request);
Map<String, List<RedactionLogEntry>> duplicates = new HashMap<>();
result.getRedactionLog().getRedactionLogEntry().forEach(entry -> {
var redactionLog = redactionStorageService.getRedactionLog(TEST_PROJECT_ID, TEST_FILE_ID);
redactionLog.getRedactionLogEntry().forEach(entry -> {
duplicates.computeIfAbsent(entry.getId(), v -> new ArrayList<>()).add(entry);
});
@ -482,16 +589,10 @@ public class RedactionIntegrationTest {
});
dictionary.get(AUTHOR).add("Drinking water");
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(1L);
when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(1L);
long rstart = System.currentTimeMillis();
ReanalyzeResult reanalyzeResult = redactionController.reanalyze(RenalyzeRequest.builder()
.redactionLog(result.getRedactionLog())
.document(IOUtils.toByteArray(new FileInputStream(path)))
.manualRedactions(null)
.text(result.getText())
.ruleSetId(TEST_RULESET_ID)
.build());
reanalyzeService.reanalyze(request);
long rend = System.currentTimeMillis();
System.out.println("reanalysis analysis duration: " + (rend - rstart));
@ -526,18 +627,16 @@ public class RedactionIntegrationTest {
@Test
public void redactionTest() throws IOException {
System.out.println("redactionTest");
long start = System.currentTimeMillis();
ClassPathResource pdfFileResource = new ClassPathResource("files/new/Single Study - Oral (Gavage) Mouse.pdf");
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
AnalyzeRequest request = AnalyzeRequest.builder()
.ruleSetId(TEST_RULESET_ID)
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
.build();
AnalyzeResult result = reanalyzeService.analyze(request);
AnalyzeResult result = redactionController.analyze(request);
var redactionLog = redactionStorageService.getRedactionLog(TEST_PROJECT_ID, TEST_FILE_ID);
var text = redactionStorageService.getText(TEST_PROJECT_ID, TEST_FILE_ID);
result.getRedactionLog().getRedactionLogEntry().forEach(entry -> {
redactionLog.getRedactionLogEntry().forEach(entry -> {
if (entry.isImage()) {
System.out.println("---->" + entry.getType());
}
@ -548,13 +647,13 @@ public class RedactionIntegrationTest {
System.out.println("first analysis duration: " + (end - start));
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Test.json")) {
fileOutputStream.write(objectMapper.writeValueAsBytes(result.getText()));
fileOutputStream.write(objectMapper.writeValueAsBytes(redactionStorageService.getText(TEST_PROJECT_ID, TEST_FILE_ID)));
}
int correctFound = 0;
loop:
for (RedactionLogEntry redactionLogEntry : result.getRedactionLog().getRedactionLogEntry()) {
for (SectionText sectionText : result.getText().getSectionTexts()) {
for (RedactionLogEntry redactionLogEntry : redactionLog.getRedactionLogEntry()) {
for (SectionText sectionText : text.getSectionTexts()) {
if (redactionLogEntry.isImage()) {
correctFound++;
continue loop;
@ -570,7 +669,7 @@ public class RedactionIntegrationTest {
}
}
}
assertThat(correctFound).isEqualTo(result.getRedactionLog().getRedactionLogEntry().size());
assertThat(correctFound).isEqualTo(redactionLog.getRedactionLogEntry().size());
dictionary.get(AUTHOR).add("properties");
reanlysisVersions.put("properties", 1L);
@ -581,25 +680,19 @@ public class RedactionIntegrationTest {
dictionary.get(VERTEBRATE).add("s-metolachlor");
reanlysisVersions.put("s-metolachlor", 3L);
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(3L);
when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(3L);
when(dictionaryClient.getDictionaryForType(VERTEBRATE, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(VERTEBRATE));
when(dictionaryClient.getDictionaryForType(VERTEBRATE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(VERTEBRATE, false));
start = System.currentTimeMillis();
ReanalyzeResult reanalyzeResult = redactionController.reanalyze(RenalyzeRequest.builder()
.redactionLog(result.getRedactionLog())
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
.text(result.getText())
.ruleSetId(TEST_RULESET_ID)
.build());
AnalyzeResult reanalyzeResult = reanalyzeService.reanalyze(request);
end = System.currentTimeMillis();
System.out.println("reanalysis analysis duration: " + (end - start));
AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder()
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
.redactionLog(reanalyzeResult.getRedactionLog())
.sectionGrid(result.getSectionGrid())
.projectId(TEST_PROJECT_ID)
.fileId(TEST_FILE_ID)
.build());
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Annotated.pdf")) {
@ -614,19 +707,13 @@ public class RedactionIntegrationTest {
System.out.println("testTableRedaction");
long start = System.currentTimeMillis();
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");
AnalyzeRequest request = AnalyzeRequest.builder()
.ruleSetId(TEST_RULESET_ID)
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
.build();
AnalyzeResult result = redactionController.analyze(request);
AnalyzeRequest request = prepareStorage("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");
AnalyzeResult result = reanalyzeService.analyze(request);
AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder()
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
.redactionLog(result.getRedactionLog())
.sectionGrid(result.getSectionGrid())
.projectId(TEST_PROJECT_ID)
.fileId(TEST_FILE_ID)
.build());
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Annotated.pdf")) {
@ -681,13 +768,10 @@ public class RedactionIntegrationTest {
// manualRedactions.getEntriesToAdd().add(manualRedactionEntry);
AnalyzeRequest request = AnalyzeRequest.builder()
.ruleSetId(TEST_RULESET_ID)
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
.manualRedactions(manualRedactions)
.build();
AnalyzeResult result = redactionController.analyze(request);
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
request.setManualRedactions(manualRedactions);
AnalyzeResult result = reanalyzeService.analyze(request);
manualRedactions.getEntriesToAdd().add(manualRedactionEntry);
manualRedactions.setIdsToRemove(Set.of(IdRemoval.builder()
@ -695,20 +779,15 @@ public class RedactionIntegrationTest {
.status(Status.APPROVED)
.build()));
ReanalyzeResult reanalyzeResult = redactionController.reanalyze(RenalyzeRequest.builder()
.redactionLog(result.getRedactionLog())
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
.manualRedactions(manualRedactions)
.text(result.getText())
.ruleSetId(TEST_RULESET_ID)
.build());
reanalyzeService.reanalyze(request);
AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder()
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
.redactionLog(reanalyzeResult.getRedactionLog())
.sectionGrid(result.getSectionGrid())
.projectId(TEST_PROJECT_ID)
.fileId(TEST_FILE_ID)
.build());
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Annotated.pdf")) {
fileOutputStream.write(annotateResponse.getDocument());
}
@ -725,11 +804,16 @@ public class RedactionIntegrationTest {
System.out.println("classificationTest");
ClassPathResource pdfFileResource = new ClassPathResource("files/Trinexapac/93 Trinexapac-ethyl_RAR_03_Volume_3CA_B-1_2017-03-31.pdf");
RedactionRequest request = RedactionRequest.builder()
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
RedactionRequest redactionRequest = RedactionRequest.builder()
.projectId(request.getProjectId())
.fileId(request.getFileId())
.ruleSetId(request.getRuleSetId())
.build();
RedactionResult result = redactionController.classify(request);
RedactionResult result = redactionController.classify(redactionRequest);
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Classified.pdf")) {
fileOutputStream.write(result.getDocument());
@ -743,11 +827,15 @@ public class RedactionIntegrationTest {
System.out.println("sectionsTest");
ClassPathResource pdfFileResource = new ClassPathResource("files/Fludioxonil/51 " + "Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf");
RedactionRequest request = RedactionRequest.builder()
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
RedactionRequest redactionRequest = RedactionRequest.builder()
.projectId(request.getProjectId())
.fileId(request.getFileId())
.ruleSetId(request.getRuleSetId())
.build();
RedactionResult result = redactionController.sections(request);
RedactionResult result = redactionController.sections(redactionRequest);
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Sections.pdf")) {
fileOutputStream.write(result.getDocument());
@ -761,11 +849,15 @@ public class RedactionIntegrationTest {
System.out.println("htmlTablesTest");
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");
RedactionRequest request = RedactionRequest.builder()
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
RedactionRequest redactionRequest = RedactionRequest.builder()
.projectId(request.getProjectId())
.fileId(request.getFileId())
.ruleSetId(request.getRuleSetId())
.build();
RedactionResult result = redactionController.htmlTables(request);
RedactionResult result = redactionController.htmlTables(redactionRequest);
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Tables.html")) {
fileOutputStream.write(result.getDocument());
@ -779,11 +871,15 @@ public class RedactionIntegrationTest {
System.out.println("htmlTableRotationTest");
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");
RedactionRequest request = RedactionRequest.builder()
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
RedactionRequest redactionRequest = RedactionRequest.builder()
.projectId(request.getProjectId())
.fileId(request.getFileId())
.ruleSetId(request.getRuleSetId())
.build();
RedactionResult result = redactionController.htmlTables(request);
RedactionResult result = redactionController.htmlTables(redactionRequest);
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Tables.html")) {
fileOutputStream.write(result.getDocument());
@ -796,20 +892,45 @@ public class RedactionIntegrationTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Phantom Cells.pdf");
AnalyzeRequest request = AnalyzeRequest.builder()
.ruleSetId(TEST_RULESET_ID)
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
.build();
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
AnalyzeResult result = redactionController.analyze(request);
AnalyzeResult result = reanalyzeService.analyze(request);
result.getRedactionLog().getRedactionLogEntry().forEach(entry -> {
var redactionLog = redactionStorageService.getRedactionLog(TEST_PROJECT_ID, TEST_FILE_ID);
redactionLog.getRedactionLogEntry().forEach(entry -> {
if (!entry.isHint()) {
assertThat(entry.getReason()).isEqualTo("Not redacted because row is not a vertebrate study");
}
});
}
@SneakyThrows
private AnalyzeRequest prepareStorage(String file) {
ClassPathResource pdfFileResource = new ClassPathResource(file);
return prepareStorage(pdfFileResource.getInputStream());
}
@SneakyThrows
private AnalyzeRequest prepareStorage(InputStream stream) {
AnalyzeRequest request = AnalyzeRequest.builder()
.ruleSetId(TEST_RULESET_ID)
.projectId(TEST_PROJECT_ID)
.fileId(TEST_FILE_ID)
.lastProcessed(OffsetDateTime.now())
.build();
var bytes = IOUtils.toByteArray(stream);
storageService.storeObject(RedactionStorageService.StorageIdUtils.getStorageId(TEST_PROJECT_ID, TEST_FILE_ID, FileType.ORIGIN), bytes);
return request;
}
@Test
public void sponsorCompanyTest() throws IOException {
@ -817,17 +938,14 @@ public class RedactionIntegrationTest {
long start = System.currentTimeMillis();
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/sponsor_companies.pdf");
AnalyzeRequest request = AnalyzeRequest.builder()
.ruleSetId(TEST_RULESET_ID)
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
.build();
AnalyzeResult result = redactionController.analyze(request);
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
AnalyzeResult result = reanalyzeService.analyze(request);
AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder()
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
.redactionLog(result.getRedactionLog())
.sectionGrid(result.getSectionGrid())
.projectId(TEST_PROJECT_ID)
.fileId(TEST_FILE_ID)
.build());
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Annotated.pdf")) {
@ -858,4 +976,4 @@ public class RedactionIntegrationTest {
}
}
}
}

View File

@ -1,12 +1,10 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import com.iqser.red.service.configuration.v1.api.model.Colors;
import com.iqser.red.service.configuration.v1.api.model.DictionaryEntry;
import com.iqser.red.service.configuration.v1.api.model.DictionaryResponse;
import com.iqser.red.service.configuration.v1.api.model.RulesResponse;
import com.iqser.red.service.configuration.v1.api.model.TypeResponse;
import com.iqser.red.service.configuration.v1.api.model.TypeResult;
import com.iqser.red.service.redaction.v1.model.RedactionRequest;
import com.amazonaws.services.s3.AmazonS3;
import com.iqser.red.service.configuration.v1.api.model.*;
import com.iqser.red.service.configuration.v1.api.resource.DictionaryResource;
import com.iqser.red.service.redaction.v1.server.Application;
import com.iqser.red.service.redaction.v1.server.FileSystemBackedStorageService;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
@ -14,8 +12,7 @@ import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader;
import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService;
import org.apache.commons.io.IOUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import com.iqser.red.storage.commons.service.StorageService;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;
@ -26,10 +23,14 @@ import org.kie.api.builder.KieFileSystem;
import org.kie.api.builder.KieModule;
import org.kie.api.runtime.KieContainer;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.boot.test.context.TestConfiguration;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.Import;
import org.springframework.context.annotation.Primary;
import org.springframework.core.io.ClassPathResource;
import org.springframework.test.context.junit4.SpringRunner;
@ -40,21 +41,15 @@ import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.*;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static org.assertj.core.api.Assertions.assertThat;
import static org.mockito.Mockito.when;
@SpringBootTest
@RunWith(SpringRunner.class)
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
@Import(EntityRedactionServiceTest.RedactionIntegrationTestConfiguration.class)
public class EntityRedactionServiceTest {
private static final String DEFAULT_RULES = loadFromClassPath("drools/rules.drl");
@ -80,9 +75,13 @@ public class EntityRedactionServiceTest {
@Autowired
private DroolsExecutionService droolsExecutionService;
@MockBean
private AmazonS3 amazonS3;
private final static String TEST_RULESET_ID = "123";
@TestConfiguration
@Configuration
@EnableAutoConfiguration(exclude = {RabbitAutoConfiguration.class})
public static class RedactionIntegrationTestConfiguration {
@Bean
@ -101,6 +100,13 @@ public class EntityRedactionServiceTest {
return kieServices.newKieContainer(kieModule.getReleaseId());
}
@Bean
@Primary
public StorageService inmemoryStorage() {
return new FileSystemBackedStorageService();
}
}
@ -108,8 +114,8 @@ public class EntityRedactionServiceTest {
public void testNestedEntitiesRemoval() {
Set<Entity> entities = new HashSet<>();
Entity nested = new Entity("nested", "fake type", 10, 16, "fake headline", 0, false);
Entity nesting = new Entity("nesting nested", "fake type", 2, 16, "fake headline", 0, false);
Entity nested = new Entity("nested", "fake type", 10, 16, "fake headline", 0, false, false);
Entity nesting = new Entity("nesting nested", "fake type", 2, 16, "fake headline", 0, false, false);
entities.add(nested);
entities.add(nesting);
EntitySearchUtils.removeEntitiesContainedInLarger(entities);
@ -125,31 +131,25 @@ public class EntityRedactionServiceTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Single Table.pdf");
RedactionRequest redactionRequest = RedactionRequest.builder()
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
.build();
DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
.entries(toDictionaryEntry(Arrays.asList("Casey, H.W.", "OLoughlin, C.K.", "Salamon, C.M.", "Smith, S.H.")))
.build();
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse);
when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(dictionaryResponse);
DictionaryResponse addressResponse = DictionaryResponse.builder()
.entries(toDictionaryEntry(Collections.singletonList("Toxigenics, Inc., Decatur, IL 62526, USA")))
.build();
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse);
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressResponse);
DictionaryResponse sponsorResponse = DictionaryResponse.builder()
.entries(Collections.emptyList())
.build();
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsorResponse);
try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(redactionRequest.getDocument()))) {
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1)).hasSize(7);// 3 author cells, 1 address, 1 Y and 2 N entities
}
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null, "dossierId");
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1)).hasSize(7);// 3 author cells, 1 address, 1 Y and 2 N entities
}
@ -158,30 +158,24 @@ public class EntityRedactionServiceTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/nested_redaction.pdf");
RedactionRequest redactionRequest = RedactionRequest.builder()
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
.build();
DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
.entries(toDictionaryEntry(Arrays.asList("Casey, H.W.", "OLoughlin, C.K.", "Salamon, C.M.", "Smith, S.H.")))
.build();
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse);
when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(dictionaryResponse);
DictionaryResponse addressResponse = DictionaryResponse.builder()
.entries(toDictionaryEntry(Collections.singletonList("Toxigenics, Inc., Decatur, IL 62526, USA")))
.build();
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse);
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressResponse);
DictionaryResponse sponsorResponse = DictionaryResponse.builder()
.entries(Collections.emptyList())
.build();
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsorResponse);
try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(redactionRequest.getDocument()))) {
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1)).hasSize(7);// 3 author cells, 1 address, 1 Y and 2 N entities
}
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null, "dossierId");
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1)).hasSize(7);// 3 author cells, 1 address, 1 Y and 2 N entities
}
@ -190,64 +184,58 @@ public class EntityRedactionServiceTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/Cyprodinil/40 Cyprodinil - EU AIR3 - LCA Section 1" +
" Supplement - Identity of the active substance - Reference list.pdf");
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_author.txt"))))
.build();
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse);
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(dictionaryResponse);
DictionaryResponse addressResponse = DictionaryResponse.builder()
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_address.txt"))))
.build();
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse);
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressResponse);
DictionaryResponse sponsorResponse = DictionaryResponse.builder()
.entries(Collections.emptyList())
.build();
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
assertThat(classifiedDoc.getEntities()
.entrySet()
.stream()
.noneMatch(entry -> entry.getValue().stream().anyMatch(e -> e.getMatchedRule() == 9))).isTrue();
}
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsorResponse);
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null, "dossierId");
assertThat(classifiedDoc.getEntities()
.entrySet()
.stream()
.noneMatch(entry -> entry.getValue().stream().anyMatch(e -> e.getMatchedRule() == 9))).isTrue();
pdfFileResource = new ClassPathResource("files/Compounds/27 A8637C - EU AIR3 - MCP Section 1 - Identity of " +
"the plant protection product.pdf");
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
assertThat(classifiedDoc.getEntities()
.entrySet()
.stream()
.noneMatch(entry -> entry.getValue().stream().anyMatch(e -> e.getMatchedRule() == 9))).isTrue();
}
classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null, "dossierId");
assertThat(classifiedDoc.getEntities()
.entrySet()
.stream()
.noneMatch(entry -> entry.getValue().stream().anyMatch(e -> e.getMatchedRule() == 9))).isTrue();
}
@Test
public void testFalsePositiveInWrongCell() throws IOException {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Row With Ambiguous Redaction.pdf");
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_author.txt"))))
.build();
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse);
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(dictionaryResponse);
DictionaryResponse addressResponse = DictionaryResponse.builder()
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_address.txt"))))
.build();
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse);
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressResponse);
DictionaryResponse sponsorResponse = DictionaryResponse.builder()
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_sponsor.txt"))))
.build();
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1).stream()
.filter(entity -> entity.getMatchedRule() == 9)
.count()).isEqualTo(10);
}
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsorResponse);
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null, "dossierId");
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1).stream()
.filter(entity -> entity.getMatchedRule() == 9)
.count()).isEqualTo(10);
}
@ -296,27 +284,25 @@ public class EntityRedactionServiceTest {
droolsExecutionService.updateRules(TEST_RULESET_ID);
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Applicant Producer Table.pdf");
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_author.txt"))))
.build();
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse);
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(dictionaryResponse);
DictionaryResponse addressResponse = DictionaryResponse.builder()
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_address.txt"))))
.build();
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse);
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressResponse);
DictionaryResponse sponsorResponse = DictionaryResponse.builder()
.entries(Collections.emptyList())
.build();
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1).stream()
.filter(entity -> entity.getMatchedRule() == 6)
.count()).isEqualTo(13);
}
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsorResponse);
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null, "dossierId");
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1).stream()
.filter(entity -> entity.getMatchedRule() == 6)
.count()).isEqualTo(13);
}
@ -337,27 +323,25 @@ public class EntityRedactionServiceTest {
droolsExecutionService.updateRules(TEST_RULESET_ID);
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/batches_new_line.pdf");
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
DictionaryResponse addressResponse = DictionaryResponse.builder()
.entries(Collections.emptyList())
.build();
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse);
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressResponse);
DictionaryResponse authorResponse = DictionaryResponse.builder()
.entries(Collections.emptyList())
.build();
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(authorResponse);
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(authorResponse);
DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_sponsor.txt"))))
.build();
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse);
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1).stream()
.filter(entity -> entity.getMatchedRule() == 11)
.count()).isEqualTo(1);
}
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(dictionaryResponse);
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null, "dossierId");
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1).stream()
.filter(entity -> entity.getMatchedRule() == 11)
.count()).isEqualTo(1);
}
@ -371,24 +355,22 @@ public class EntityRedactionServiceTest {
.entries(toDictionaryEntry(Arrays.asList("Bissig R.", "Thanei P.")))
.build();
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse);
when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(dictionaryResponse);
DictionaryResponse addressResponse = DictionaryResponse.builder()
.entries(toDictionaryEntry(Collections.singletonList("Novartis Crop Protection AG, Basel, Switzerland")))
.build();
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse);
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressResponse);
DictionaryResponse sponsorResponse = DictionaryResponse.builder()
.entries(Collections.emptyList())
.build();
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
assertThat(classifiedDoc.getEntities()).hasSize(2); // two pages
assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(8);
assertThat(classifiedDoc.getEntities().get(2).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(5); // 2 names, 1 address, 2 Y
}
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsorResponse);
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null, "dossierId");
assertThat(classifiedDoc.getEntities()).hasSize(2); // two pages
assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(8);
assertThat(classifiedDoc.getEntities().get(2).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(5); // 2 names, 1 address, 2 Y
pdfFileResource = new ClassPathResource("files/Minimal Examples/Header Propagation2.pdf");
@ -396,20 +378,18 @@ public class EntityRedactionServiceTest {
.entries(toDictionaryEntry(Arrays.asList("Tribolet, R.", "Muir, G.", "Kühne-Thu, H.", "Close, C.")))
.build();
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse);
when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(dictionaryResponse);
addressResponse = DictionaryResponse.builder()
.entries(toDictionaryEntry(Collections.singletonList("Novartis Crop Protection AG, Basel, Switzerland")))
.build();
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse);
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressResponse);
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(3);
assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 8).count()).isEqualTo(9);
}
classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null, "dossierId");
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(3);
assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 8).count()).isEqualTo(9);
}
@ -423,23 +403,21 @@ public class EntityRedactionServiceTest {
.entries(toDictionaryEntry(Collections.singletonList("Aldershof S.")))
.build();
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse);
when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(dictionaryResponse);
DictionaryResponse addressResponse = DictionaryResponse.builder()
.entries(toDictionaryEntry(Collections.singletonList("Novartis Crop Protection AG, Basel, Switzerland")))
.build();
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse);
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressResponse);
DictionaryResponse sponsorResponse = DictionaryResponse.builder()
.entries(Collections.emptyList())
.build();
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 8).count()).isEqualTo(6);
}
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsorResponse);
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null, "dossierId");
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 8).count()).isEqualTo(6);
}
@ -476,19 +454,19 @@ public class EntityRedactionServiceTest {
TypeResult.builder().ruleSetId(TEST_RULESET_ID).type(ADDRESS_CODE).hexColor("#ff00ff").build(),
TypeResult.builder().ruleSetId(TEST_RULESET_ID).type(SPONSOR_CODE).hexColor("#00ffff").build()))
.build();
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
when(dictionaryClient.getAllTypes(TEST_RULESET_ID)).thenReturn(typeResponse);
when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
when(dictionaryClient.getAllTypes(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(typeResponse);
// Default empty return to prevent NPEs
DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
.build();
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse);
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(dictionaryResponse);
DictionaryResponse addressResponse = DictionaryResponse.builder()
.build();
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse);
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressResponse);
DictionaryResponse sponsorResponse = DictionaryResponse.builder()
.build();
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsorResponse);
Colors colors = new Colors();
colors.setDefaultColor("#acfc00");
@ -518,7 +496,7 @@ public class EntityRedactionServiceTest {
}
}
private List<DictionaryEntry> toDictionaryEntry(List<String> entries){
private List<DictionaryEntry> toDictionaryEntry(List<String> entries) {
List<DictionaryEntry> dictionaryEntries = new ArrayList<>();
entries.forEach(entry -> {
dictionaryEntries.add(new DictionaryEntry(entry, 1L, false));
@ -526,4 +504,4 @@ public class EntityRedactionServiceTest {
return dictionaryEntries;
}
}
}

View File

@ -1,7 +1,31 @@
package com.iqser.red.service.redaction.v1.server.segmentation;
import static org.assertj.core.api.Assertions.assertThat;
import com.amazonaws.services.s3.AmazonS3;
import com.iqser.red.service.redaction.v1.server.Application;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService;
import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService;
import org.junit.Ignore;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.kie.api.runtime.KieContainer;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.Import;
import org.springframework.core.io.ClassPathResource;
import org.springframework.test.context.junit4.SpringRunner;
import javax.imageio.ImageIO;
import java.io.ByteArrayOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
@ -9,31 +33,12 @@ import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import javax.imageio.ImageIO;
import static org.assertj.core.api.Assertions.assertThat;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.junit.Ignore;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.kie.api.runtime.KieContainer;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.core.io.ClassPathResource;
import org.springframework.test.context.junit4.SpringRunner;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService;
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService;
import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService;
@SpringBootTest
@RunWith(SpringRunner.class)
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
@Import(PdfSegmentationServiceTest.TestConfiguration.class)
public class PdfSegmentationServiceTest {
@Autowired
@ -51,6 +56,28 @@ public class PdfSegmentationServiceTest {
@MockBean
private KieContainer kieContainer;
@MockBean
private AmazonS3 amazonS3;
@MockBean
private RabbitTemplate rabbitTemplate;
@Configuration
@EnableAutoConfiguration(exclude = { RabbitAutoConfiguration.class})
public static class TestConfiguration {
}
@Test
public void testMergeImages() throws IOException {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/270Rotated.pdf");
Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
assertThat(document.getPages().get(0).getImages().size()).isEqualTo(1);
assertThat(document.getPages().get(1).getImages().size()).isEqualTo(0);
}
@Test
@Ignore
@ -58,61 +85,78 @@ public class PdfSegmentationServiceTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/new/Single Study - Oral (Gavage) Mouse.pdf");
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
Document document = pdfSegmentationService.parseDocument(pdDocument);
int i = 0;
for (Page page : document.getPages()) {
for (PdfImage image : page.getImages()) {
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
ImageIO.write(image.getImage(), "png", baos);
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Image " + i + ".png")) {
fileOutputStream.write(baos.toByteArray());
}
Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
int i = 0;
for (Page page : document.getPages()) {
for (PdfImage image : page.getImages()) {
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
ImageIO.write(image.getImage(), "png", baos);
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Image " + i + ".png")) {
fileOutputStream.write(baos.toByteArray());
}
i++;
}
i++;
}
}
}
@Test
public void testPDFSegmentationWithComplexTable() throws IOException {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Spanning Cells.pdf");
Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
assertThat(document.getParagraphs()
.stream()
.flatMap(paragraph -> paragraph.getTables().stream())
.collect(Collectors.toList())).isNotEmpty();
Table table = document.getParagraphs()
.stream()
.flatMap(paragraph -> paragraph.getTables().stream())
.collect(Collectors.toList())
.get(0);
assertThat(table.getColCount()).isEqualTo(6);
assertThat(table.getRowCount()).isEqualTo(13);
assertThat(table.getRows().stream().mapToInt(List::size).sum()).isEqualTo(6 * 13);
}
/**
 * Verifies header propagation when a table is split in two: every cell of the
 * continuation (second) table must reference, per column, the corresponding
 * header cell of the first table's single header row.
 *
 * <p>Note: this method previously contained unmerged-conflict residue that
 * parsed the same PDF twice (once via the removed {@code PDDocument.load} API
 * and once via the {@code InputStream} API) and ran every assertion twice; it
 * now performs a single pass using the current InputStream-based API, matching
 * the other tests in this class.
 *
 * @throws IOException if the test PDF resource cannot be read
 */
@Test
public void testTableExtraction() throws IOException {
    ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Table.pdf");
    Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
    // Collect the paragraph->tables pipeline once and reuse it.
    List<Table> tables = document.getParagraphs()
            .stream()
            .flatMap(paragraph -> paragraph.getTables().stream())
            .collect(Collectors.toList());
    assertThat(tables).isNotEmpty();
    Table firstTable = tables.get(0);
    assertThat(firstTable.getColCount()).isEqualTo(8);
    assertThat(firstTable.getRowCount()).isEqualTo(1);
    Table secondTable = tables.get(1);
    assertThat(secondTable.getColCount()).isEqualTo(8);
    assertThat(secondTable.getRowCount()).isEqualTo(2);
    // Expected mapping: each header cell of the first table's only row, wrapped
    // as a one-element list, column by column.
    List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
            .get(0)
            .stream()
            .map(Collections::singletonList)
            .collect(Collectors.toList());
    // Every row of the continuation table must point back to those header cells.
    assertThat(secondTable.getRows()
            .stream()
            .allMatch(row -> row.stream()
                    .map(Cell::getHeaderCells)
                    .collect(Collectors.toList())
                    .equals(firstTableHeaderCells))).isTrue();
}
@ -121,38 +165,36 @@ public class PdfSegmentationServiceTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Multi Page Table.pdf");
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
Document document = pdfSegmentationService.parseDocument(pdDocument);
assertThat(document.getParagraphs()
.stream()
.flatMap(paragraph -> paragraph.getTables().stream())
.collect(Collectors.toList())).isNotEmpty();
Table firstTable = document.getParagraphs()
.stream()
.flatMap(paragraph -> paragraph.getTables().stream())
.collect(Collectors.toList())
.get(0);
assertThat(firstTable.getColCount()).isEqualTo(9);
assertThat(firstTable.getRowCount()).isEqualTo(5);
Table secondTable = document.getParagraphs()
.stream()
.flatMap(paragraph -> paragraph.getTables().stream())
.collect(Collectors.toList())
.get(1);
assertThat(secondTable.getColCount()).isEqualTo(9);
assertThat(secondTable.getRowCount()).isEqualTo(6);
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
.get(firstTable.getRowCount() - 1)
.stream()
.map(Cell::getHeaderCells)
.collect(Collectors.toList());
assertThat(secondTable.getRows()
.stream()
.allMatch(row -> row.stream()
.map(Cell::getHeaderCells)
.collect(Collectors.toList())
.equals(firstTableHeaderCells))).isTrue();
}
Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
assertThat(document.getParagraphs()
.stream()
.flatMap(paragraph -> paragraph.getTables().stream())
.collect(Collectors.toList())).isNotEmpty();
Table firstTable = document.getParagraphs()
.stream()
.flatMap(paragraph -> paragraph.getTables().stream())
.collect(Collectors.toList())
.get(0);
assertThat(firstTable.getColCount()).isEqualTo(9);
assertThat(firstTable.getRowCount()).isEqualTo(5);
Table secondTable = document.getParagraphs()
.stream()
.flatMap(paragraph -> paragraph.getTables().stream())
.collect(Collectors.toList())
.get(1);
assertThat(secondTable.getColCount()).isEqualTo(9);
assertThat(secondTable.getRowCount()).isEqualTo(6);
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
.get(firstTable.getRowCount() - 1)
.stream()
.map(Cell::getHeaderCells)
.collect(Collectors.toList());
assertThat(secondTable.getRows()
.stream()
.allMatch(row -> row.stream()
.map(Cell::getHeaderCells)
.collect(Collectors.toList())
.equals(firstTableHeaderCells))).isTrue();
}
@ -161,38 +203,36 @@ public class PdfSegmentationServiceTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Rotated Table Headers.pdf");
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
Document document = pdfSegmentationService.parseDocument(pdDocument);
assertThat(document.getParagraphs()
.stream()
.flatMap(paragraph -> paragraph.getTables().stream())
.collect(Collectors.toList())).isNotEmpty();
Table firstTable = document.getParagraphs()
.stream()
.flatMap(paragraph -> paragraph.getTables().stream())
.collect(Collectors.toList())
.get(0);
assertThat(firstTable.getColCount()).isEqualTo(8);
assertThat(firstTable.getRowCount()).isEqualTo(1);
Table secondTable = document.getParagraphs()
.stream()
.flatMap(paragraph -> paragraph.getTables().stream())
.collect(Collectors.toList())
.get(1);
assertThat(secondTable.getColCount()).isEqualTo(8);
assertThat(secondTable.getRowCount()).isEqualTo(6);
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
.get(0)
.stream()
.map(Collections::singletonList)
.collect(Collectors.toList());
assertThat(secondTable.getRows()
.stream()
.allMatch(row -> row.stream()
.map(Cell::getHeaderCells)
.collect(Collectors.toList())
.equals(firstTableHeaderCells))).isTrue();
}
Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
assertThat(document.getParagraphs()
.stream()
.flatMap(paragraph -> paragraph.getTables().stream())
.collect(Collectors.toList())).isNotEmpty();
Table firstTable = document.getParagraphs()
.stream()
.flatMap(paragraph -> paragraph.getTables().stream())
.collect(Collectors.toList())
.get(0);
assertThat(firstTable.getColCount()).isEqualTo(8);
assertThat(firstTable.getRowCount()).isEqualTo(1);
Table secondTable = document.getParagraphs()
.stream()
.flatMap(paragraph -> paragraph.getTables().stream())
.collect(Collectors.toList())
.get(1);
assertThat(secondTable.getColCount()).isEqualTo(8);
assertThat(secondTable.getRowCount()).isEqualTo(6);
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
.get(0)
.stream()
.map(Collections::singletonList)
.collect(Collectors.toList());
assertThat(secondTable.getRows()
.stream()
.allMatch(row -> row.stream()
.map(Cell::getHeaderCells)
.collect(Collectors.toList())
.equals(firstTableHeaderCells))).isTrue();
}
}

View File

@ -1,5 +1,6 @@
configuration-service.url: "http://configuration-service-v1:8080"
image-service.url: "http://image-service-v1:8080"
file-management-service.url: "http://file-management-service-v1:8080"
ribbon:
ConnectTimeout: 600000

Some files were not shown because too many files have changed in this diff Show More