Merge branch 'master' of ssh://git.iqser.com:2222/red/redaction-service into Test
Conflicts: redaction-service-v1/redaction-service-server-v1/pom.xml
This commit is contained in:
commit
a6f8ea0f92
@ -1,7 +1,5 @@
|
||||
package buildjob;
|
||||
|
||||
import static com.atlassian.bamboo.specs.builders.task.TestParserTask.createJUnitParserTask;
|
||||
|
||||
import com.atlassian.bamboo.specs.api.BambooSpec;
|
||||
import com.atlassian.bamboo.specs.api.builders.BambooKey;
|
||||
import com.atlassian.bamboo.specs.api.builders.docker.DockerConfiguration;
|
||||
@ -24,6 +22,8 @@ import com.atlassian.bamboo.specs.builders.trigger.BitbucketServerTrigger;
|
||||
import com.atlassian.bamboo.specs.model.task.InjectVariablesScope;
|
||||
import com.atlassian.bamboo.specs.util.BambooServer;
|
||||
|
||||
import static com.atlassian.bamboo.specs.builders.task.TestParserTask.createJUnitParserTask;
|
||||
|
||||
/**
|
||||
* Plan configuration for Bamboo.
|
||||
* Learn more on: <a href="https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs">https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs</a>
|
||||
@ -33,6 +33,8 @@ public class PlanSpec {
|
||||
|
||||
private static final String SERVICE_NAME = "redaction-service";
|
||||
|
||||
private static final String JVM_ARGS =" -Xmx4g -XX:+ExitOnOutOfMemoryError -XX:SurvivorRatio=2 -XX:NewRatio=1 -XX:InitialTenuringThreshold=16 -XX:MaxTenuringThreshold=16 -XX:InitiatingHeapOccupancyPercent=35 ";
|
||||
|
||||
private static final String SERVICE_KEY = SERVICE_NAME.toUpperCase().replaceAll("-", "");
|
||||
|
||||
/**
|
||||
@ -82,9 +84,12 @@ public class PlanSpec {
|
||||
.checkoutItems(new CheckoutItem().defaultRepository()),
|
||||
new ScriptTask()
|
||||
.description("Build")
|
||||
.environmentVariables("MAVEN_OPTS="+JVM_ARGS)
|
||||
.inlineBody("#!/bin/bash\n" +
|
||||
"set -e\n" +
|
||||
|
||||
"export MAVEN_OPTS=\"$MAVEN_OPTS "+JVM_ARGS +"\"\n" +
|
||||
|
||||
"if [[ \"${bamboo.version_tag}\" != \"dev\" ]]; then ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn --no-transfer-progress -f ${bamboo_build_working_directory}/" + SERVICE_NAME + "-v1/pom.xml versions:set -DnewVersion=${bamboo.version_tag}; fi\n" +
|
||||
"if [[ \"${bamboo.version_tag}\" != \"dev\" ]]; then ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn --no-transfer-progress -f ${bamboo_build_working_directory}/" + SERVICE_NAME + "-image-v1/pom.xml versions:set -DnewVersion=${bamboo.version_tag}; fi\n" +
|
||||
|
||||
|
||||
@ -5,7 +5,7 @@
|
||||
<parent>
|
||||
<artifactId>platform-dependency</artifactId>
|
||||
<groupId>com.iqser.red</groupId>
|
||||
<version>1.0.8</version>
|
||||
<version>1.1.2</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
@ -32,7 +32,7 @@
|
||||
<dependency>
|
||||
<groupId>com.iqser.red</groupId>
|
||||
<artifactId>platform-commons-dependency</artifactId>
|
||||
<version>1.2.5</version>
|
||||
<version>1.3.1</version>
|
||||
<scope>import</scope>
|
||||
<type>pom</type>
|
||||
</dependency>
|
||||
@ -52,4 +52,4 @@
|
||||
|
||||
</dependencyManagement>
|
||||
|
||||
</project>
|
||||
</project>
|
||||
|
||||
@ -5,13 +5,20 @@ import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
import java.time.OffsetDateTime;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class AnalyzeRequest {
|
||||
|
||||
private byte[] document;
|
||||
private String projectId;
|
||||
private String fileId;
|
||||
private String ruleSetId;
|
||||
private boolean reanalyseOnlyIfPossible;
|
||||
private ManualRedactions manualRedactions;
|
||||
private OffsetDateTime lastProcessed;
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -11,9 +11,19 @@ import lombok.NoArgsConstructor;
|
||||
@AllArgsConstructor
|
||||
public class AnalyzeResult {
|
||||
|
||||
private String projectId;
|
||||
private String fileId;
|
||||
private long duration;
|
||||
private int numberOfPages;
|
||||
private RedactionLog redactionLog;
|
||||
private SectionGrid sectionGrid;
|
||||
private Text text;
|
||||
private boolean hasHints;
|
||||
private boolean hasRequests;
|
||||
private boolean hasRedactions;
|
||||
private boolean hasImages;
|
||||
private boolean hasUpdates;
|
||||
private long dictionaryVersion;
|
||||
private long dossierDictionaryVersion;
|
||||
private long rulesVersion;
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -11,7 +11,6 @@ import lombok.NoArgsConstructor;
|
||||
@AllArgsConstructor
|
||||
public class AnnotateRequest {
|
||||
|
||||
private byte[] document;
|
||||
private RedactionLog redactionLog;
|
||||
private SectionGrid sectionGrid;
|
||||
private String projectId;
|
||||
private String fileId;
|
||||
}
|
||||
|
||||
@ -0,0 +1,5 @@
|
||||
package com.iqser.red.service.redaction.v1.model;
|
||||
|
||||
public enum ChangeType {
|
||||
ADDED, REMOVED
|
||||
}
|
||||
@ -1,12 +1,12 @@
|
||||
package com.iqser.red.service.redaction.v1.model;
|
||||
|
||||
import java.time.OffsetDateTime;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
import java.time.OffsetDateTime;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
package com.iqser.red.service.redaction.v1.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@ -27,4 +27,6 @@ public class ManualRedactionEntry {
|
||||
private String section;
|
||||
private int sectionNumber;
|
||||
|
||||
private boolean addToDossierDictionary;
|
||||
|
||||
}
|
||||
|
||||
@ -1,16 +1,16 @@
|
||||
package com.iqser.red.service.redaction.v1.model;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
|
||||
@ -1,25 +1,22 @@
|
||||
package com.iqser.red.service.redaction.v1.model;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class SectionText {
|
||||
@NoArgsConstructor
|
||||
public class RedactionChangeLog {
|
||||
|
||||
private int sectionNumber;
|
||||
private String text;
|
||||
private List<RedactionChangeLogEntry> redactionLogEntry = new ArrayList<>();
|
||||
|
||||
private boolean isTable;
|
||||
private String headline;
|
||||
private long dictionaryVersion = -1;
|
||||
private long rulesVersion = -1;
|
||||
|
||||
private List<SectionArea> sectionAreas = new ArrayList<>();
|
||||
private String ruleSetId;
|
||||
|
||||
}
|
||||
@ -0,0 +1,47 @@
|
||||
package com.iqser.red.service.redaction.v1.model;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class RedactionChangeLogEntry {
|
||||
|
||||
private String id;
|
||||
private String type;
|
||||
private String value;
|
||||
private String reason;
|
||||
private int matchedRule;
|
||||
private String legalBasis;
|
||||
private boolean redacted;
|
||||
private boolean isHint;
|
||||
private boolean isRecommendation;
|
||||
private String section;
|
||||
private float[] color;
|
||||
|
||||
@Builder.Default
|
||||
private List<Rectangle> positions = new ArrayList<>();
|
||||
private int sectionNumber;
|
||||
private boolean manual;
|
||||
private Status status;
|
||||
private ManualRedactionType manualRedactionType;
|
||||
private boolean isDictionaryEntry;
|
||||
|
||||
private String textBefore;
|
||||
private String textAfter;
|
||||
|
||||
@Builder.Default
|
||||
private List<Comment> comments = new ArrayList<>();
|
||||
|
||||
private ChangeType changeType;
|
||||
|
||||
private boolean isDossierDictionaryEntry;
|
||||
|
||||
}
|
||||
@ -1,13 +1,11 @@
|
||||
package com.iqser.red.service.redaction.v1.model;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
public class RedactionLog {
|
||||
|
||||
@ -17,15 +15,17 @@ public class RedactionLog {
|
||||
private long rulesVersion = -1;
|
||||
|
||||
private String ruleSetId;
|
||||
private String filename;
|
||||
|
||||
private long dossierDictionaryVersion = -1;
|
||||
|
||||
|
||||
public RedactionLog(List<RedactionLogEntry> redactionLogEntry, long dictionaryVersion, long rulesVersion, String ruleSetId) {
|
||||
public RedactionLog(List<RedactionLogEntry> redactionLogEntry, long dictionaryVersion, long rulesVersion, String ruleSetId, long dossierDictionaryVersion) {
|
||||
|
||||
this.redactionLogEntry = redactionLogEntry;
|
||||
this.dictionaryVersion = dictionaryVersion;
|
||||
this.rulesVersion = rulesVersion;
|
||||
this.ruleSetId = ruleSetId;
|
||||
this.dossierDictionaryVersion = dossierDictionaryVersion;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
package com.iqser.red.service.redaction.v1.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@ -45,4 +45,6 @@ public class RedactionLogEntry {
|
||||
|
||||
private boolean isImage;
|
||||
|
||||
private boolean isDossierDictionaryEntry;
|
||||
|
||||
}
|
||||
|
||||
@ -11,7 +11,8 @@ import lombok.NoArgsConstructor;
|
||||
@AllArgsConstructor
|
||||
public class RedactionRequest {
|
||||
|
||||
private byte[] document;
|
||||
private String projectId;
|
||||
private String fileId;
|
||||
private String ruleSetId;
|
||||
private ManualRedactions manualRedactions;
|
||||
}
|
||||
|
||||
@ -13,7 +13,5 @@ public class RedactionResult {
|
||||
|
||||
private byte[] document;
|
||||
private int numberOfPages;
|
||||
private RedactionLog redactionLog;
|
||||
private SectionGrid sectionGrid;
|
||||
|
||||
}
|
||||
|
||||
@ -1,22 +0,0 @@
|
||||
package com.iqser.red.service.redaction.v1.model;
|
||||
|
||||
import java.time.OffsetDateTime;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class RenalyzeRequest {
|
||||
|
||||
private byte[] document;
|
||||
private String ruleSetId;
|
||||
private ManualRedactions manualRedactions;
|
||||
private Text text;
|
||||
private RedactionLog redactionLog;
|
||||
private OffsetDateTime lastProcessed;
|
||||
}
|
||||
@ -27,7 +27,7 @@ public class SectionArea {
|
||||
private String header;
|
||||
|
||||
public boolean contains(Rectangle other) {
|
||||
return page == other.getPage() && this.topLeft.getX() <= other.getTopLeft().getX() && this.topLeft.getX() + this.getWidth() >= other.getTopLeft().getX() + other.getWidth() && this.getTopLeft().getY() <= other.getTopLeft().getY() && this.getTopLeft().getY() + this.getHeight() >= other.getTopLeft().getY() + other.getHeight();
|
||||
return page == other.getPage() && this.topLeft.getX() <= other.getTopLeft().getX() && this.topLeft.getX() + this.getWidth() >= other.getTopLeft().getX() + other.getWidth() && this.getTopLeft().getY() <= other.getTopLeft().getY() && this.getTopLeft().getY() + this.getHeight() >= other.getTopLeft().getY() + other.getHeight();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
package com.iqser.red.service.redaction.v1.model;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
package com.iqser.red.service.redaction.v1.model;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.NonNull;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
|
||||
@ -1,14 +1,6 @@
|
||||
package com.iqser.red.service.redaction.v1.resources;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.AnalyzeRequest;
|
||||
import com.iqser.red.service.redaction.v1.model.AnalyzeResult;
|
||||
import com.iqser.red.service.redaction.v1.model.AnnotateRequest;
|
||||
import com.iqser.red.service.redaction.v1.model.AnnotateResponse;
|
||||
import com.iqser.red.service.redaction.v1.model.ReanalyzeResult;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionRequest;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionResult;
|
||||
import com.iqser.red.service.redaction.v1.model.RenalyzeRequest;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.*;
|
||||
import org.springframework.http.MediaType;
|
||||
import org.springframework.web.bind.annotation.PathVariable;
|
||||
import org.springframework.web.bind.annotation.PostMapping;
|
||||
@ -21,11 +13,6 @@ public interface RedactionResource {
|
||||
String RULE_SET_PARAMETER_NAME = "ruleSetId";
|
||||
String RULE_SET_PATH_VARIABLE = "/{" + RULE_SET_PARAMETER_NAME + "}";
|
||||
|
||||
@PostMapping(value = "/analyze", produces = MediaType.APPLICATION_JSON_VALUE, consumes = MediaType.APPLICATION_JSON_VALUE)
|
||||
AnalyzeResult analyze(@RequestBody AnalyzeRequest analyzeRequest);
|
||||
|
||||
@PostMapping(value = "/reanalyze", produces = MediaType.APPLICATION_JSON_VALUE, consumes = MediaType.APPLICATION_JSON_VALUE)
|
||||
ReanalyzeResult reanalyze(@RequestBody RenalyzeRequest renalyzeRequest);
|
||||
|
||||
@PostMapping(value = "/annotate", produces = MediaType.APPLICATION_JSON_VALUE, consumes = MediaType.APPLICATION_JSON_VALUE)
|
||||
AnnotateResponse annotate(@RequestBody AnnotateRequest annotateRequest);
|
||||
@ -39,10 +26,10 @@ public interface RedactionResource {
|
||||
@PostMapping(value = "/debug/htmlTables", produces = MediaType.APPLICATION_JSON_VALUE, consumes = MediaType.APPLICATION_JSON_VALUE)
|
||||
RedactionResult htmlTables(@RequestBody RedactionRequest redactionRequest);
|
||||
|
||||
@PostMapping(value = "/rules/update"+RULE_SET_PATH_VARIABLE, consumes = MediaType.APPLICATION_JSON_VALUE)
|
||||
@PostMapping(value = "/rules/update" + RULE_SET_PATH_VARIABLE, consumes = MediaType.APPLICATION_JSON_VALUE)
|
||||
void updateRules(@PathVariable(RULE_SET_PARAMETER_NAME) String ruleSetId);
|
||||
|
||||
@PostMapping(value = "/rules/test", consumes = MediaType.APPLICATION_JSON_VALUE)
|
||||
void testRules(@RequestBody String rules);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -12,6 +12,10 @@
|
||||
<artifactId>redaction-service-server-v1</artifactId>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>com.iqser.red.commons</groupId>
|
||||
<artifactId>storage-commons</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.iqser.red.service</groupId>
|
||||
<artifactId>redaction-service-api-v1</artifactId>
|
||||
@ -20,7 +24,18 @@
|
||||
<dependency>
|
||||
<groupId>com.iqser.red.service</groupId>
|
||||
<artifactId>configuration-service-api-v1</artifactId>
|
||||
<version>2.2.9</version>
|
||||
<version>2.5.6</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.iqser.red.service</groupId>
|
||||
<artifactId>file-management-service-api-v1</artifactId>
|
||||
<version>2.7.4</version>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<groupId>com.iqser.red.service</groupId>
|
||||
<artifactId>redaction-service-api-v1</artifactId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.drools</groupId>
|
||||
@ -74,6 +89,12 @@
|
||||
<artifactId>spring-cloud-starter-openfeign</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-starter-amqp</artifactId>
|
||||
<version>2.3.1.RELEASE</version>
|
||||
</dependency>
|
||||
|
||||
<!-- test dependencies -->
|
||||
<dependency>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
@ -86,9 +107,9 @@
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
<version>4.12</version>
|
||||
<groupId>org.springframework.amqp</groupId>
|
||||
<artifactId>spring-rabbit-test</artifactId>
|
||||
<version>2.3.1</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
@ -1,5 +1,8 @@
|
||||
package com.iqser.red.service.redaction.v1.server;
|
||||
|
||||
import com.iqser.red.commons.spring.DefaultWebMvcConfiguration;
|
||||
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
|
||||
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
|
||||
import org.springframework.boot.SpringApplication;
|
||||
import org.springframework.boot.actuate.autoconfigure.security.servlet.ManagementWebSecurityAutoConfiguration;
|
||||
import org.springframework.boot.autoconfigure.SpringBootApplication;
|
||||
@ -8,10 +11,6 @@ import org.springframework.boot.context.properties.EnableConfigurationProperties
|
||||
import org.springframework.cloud.openfeign.EnableFeignClients;
|
||||
import org.springframework.context.annotation.Import;
|
||||
|
||||
import com.iqser.red.commons.spring.DefaultWebMvcConfiguration;
|
||||
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
|
||||
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
|
||||
|
||||
@Import({DefaultWebMvcConfiguration.class})
|
||||
@EnableFeignClients(basePackageClasses = RulesClient.class)
|
||||
@EnableConfigurationProperties(RedactionServiceSettings.class)
|
||||
@ -19,8 +18,9 @@ import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettin
|
||||
public class Application {
|
||||
|
||||
public static void main(String[] args) {
|
||||
System.setProperty("org.apache.pdfbox.rendering.UsePureJavaCMYKConversion", "true");
|
||||
SpringApplication.run(Application.class, args);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,20 +1,19 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionGrid;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryVersion;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionGrid;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionText;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
public class Document {
|
||||
@ -33,7 +32,7 @@ public class Document {
|
||||
|
||||
private List<RedactionLogEntry> redactionLogEntities = new ArrayList<>();
|
||||
private SectionGrid sectionGrid = new SectionGrid();
|
||||
private long dictionaryVersion;
|
||||
private DictionaryVersion dictionaryVersion;
|
||||
private long rulesVersion;
|
||||
|
||||
private List<SectionText> sectionText = new ArrayList<>();
|
||||
|
||||
@ -1,5 +1,7 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import lombok.Getter;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
@ -7,38 +9,35 @@ import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import lombok.Getter;
|
||||
|
||||
public class FloatFrequencyCounter
|
||||
{
|
||||
public class FloatFrequencyCounter {
|
||||
|
||||
@Getter
|
||||
Map<Float, Integer> countPerValue = new HashMap<>();
|
||||
|
||||
public void add(float value){
|
||||
if(!countPerValue.containsKey(value)){
|
||||
public void add(float value) {
|
||||
if (!countPerValue.containsKey(value)) {
|
||||
countPerValue.put(value, 1);
|
||||
} else {
|
||||
countPerValue.put(value, countPerValue.get(value) + 1);
|
||||
}
|
||||
}
|
||||
|
||||
public void addAll(Map<Float, Integer> otherCounter){
|
||||
for(Map.Entry<Float, Integer> entry: otherCounter.entrySet()){
|
||||
if(countPerValue.containsKey(entry.getKey())){
|
||||
countPerValue.put(entry.getKey(), countPerValue.get(entry.getKey())+ entry.getValue());
|
||||
public void addAll(Map<Float, Integer> otherCounter) {
|
||||
for (Map.Entry<Float, Integer> entry : otherCounter.entrySet()) {
|
||||
if (countPerValue.containsKey(entry.getKey())) {
|
||||
countPerValue.put(entry.getKey(), countPerValue.get(entry.getKey()) + entry.getValue());
|
||||
} else {
|
||||
countPerValue.put(entry.getKey(), entry.getValue());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public Float getMostPopular(){
|
||||
public Float getMostPopular() {
|
||||
Map.Entry<Float, Integer> mostPopular = null;
|
||||
for(Map.Entry<Float, Integer> entry: countPerValue.entrySet()){
|
||||
if(mostPopular == null){
|
||||
for (Map.Entry<Float, Integer> entry : countPerValue.entrySet()) {
|
||||
if (mostPopular == null) {
|
||||
mostPopular = entry;
|
||||
} else if(entry.getValue() >= mostPopular.getValue()){
|
||||
} else if (entry.getValue() >= mostPopular.getValue()) {
|
||||
mostPopular = entry;
|
||||
}
|
||||
}
|
||||
@ -46,12 +45,11 @@ public class FloatFrequencyCounter
|
||||
}
|
||||
|
||||
|
||||
|
||||
public List<Float> getHighterThanMostPopular(){
|
||||
public List<Float> getHighterThanMostPopular() {
|
||||
Float mostPopular = getMostPopular();
|
||||
List<Float> higher = new ArrayList<>();
|
||||
for(Float value: countPerValue.keySet()){
|
||||
if(value > mostPopular){
|
||||
for (Float value : countPerValue.keySet()) {
|
||||
if (value > mostPopular) {
|
||||
higher.add(value);
|
||||
}
|
||||
}
|
||||
@ -60,12 +58,12 @@ public class FloatFrequencyCounter
|
||||
}
|
||||
|
||||
|
||||
public Float getHighest(){
|
||||
public Float getHighest() {
|
||||
Float highest = null;
|
||||
for(Float value: countPerValue.keySet()){
|
||||
if (highest == null){
|
||||
for (Float value : countPerValue.keySet()) {
|
||||
if (highest == null) {
|
||||
highest = value;
|
||||
} else if(value > highest){
|
||||
} else if (value > highest) {
|
||||
highest = value;
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,19 +1,19 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
public class Footer {
|
||||
|
||||
private List<TextBlock> textBlocks;
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public SearchableText getSearchableText() {
|
||||
|
||||
SearchableText searchableText = new SearchableText();
|
||||
@ -21,4 +21,4 @@ public class Footer {
|
||||
return searchableText;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,19 +1,19 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
public class Header {
|
||||
|
||||
private List<TextBlock> textBlocks;
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public SearchableText getSearchableText() {
|
||||
|
||||
SearchableText searchableText = new SearchableText();
|
||||
@ -21,4 +21,4 @@ public class Header {
|
||||
return searchableText;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,15 +1,14 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NonNull;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@Data
|
||||
@RequiredArgsConstructor
|
||||
public class Page {
|
||||
@ -37,4 +36,4 @@ public class Page {
|
||||
return rotation != 0;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,19 +1,18 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
public class Paragraph implements Comparable{
|
||||
public class Paragraph implements Comparable {
|
||||
|
||||
private List<AbstractTextContainer> pageBlocks = new ArrayList<>();
|
||||
private List<PdfImage> images = new ArrayList<>();
|
||||
@ -62,4 +61,4 @@ public class Paragraph implements Comparable{
|
||||
return 0;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -0,0 +1,52 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionArea;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class SectionText {
|
||||
|
||||
private int sectionNumber;
|
||||
private String text;
|
||||
|
||||
private boolean isTable;
|
||||
private String headline;
|
||||
|
||||
private List<SectionArea> sectionAreas = new ArrayList<>();
|
||||
private Set<Image> images = new HashSet<>();
|
||||
|
||||
private List<TextBlock> textBlocks = new ArrayList<>();
|
||||
private Map<String, CellValue> tabularData = new HashMap<>();
|
||||
private List<Integer> cellStarts = new ArrayList<>();
|
||||
|
||||
|
||||
public void setTabularData(Map<String, CellValue> tabularData) {
|
||||
tabularData.remove(null);
|
||||
this.tabularData = tabularData;
|
||||
}
|
||||
|
||||
@JsonIgnore
|
||||
public SearchableText getSearchableText() {
|
||||
|
||||
SearchableText searchableText = new SearchableText();
|
||||
textBlocks.forEach(block -> {
|
||||
if (block != null) {
|
||||
searchableText.addAll(block.getSequences());
|
||||
}
|
||||
});
|
||||
return searchableText;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,10 +1,10 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import lombok.Getter;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import lombok.Getter;
|
||||
|
||||
public class StringFrequencyCounter {
|
||||
|
||||
@Getter
|
||||
@ -46,4 +46,4 @@ public class StringFrequencyCounter {
|
||||
return mostPopular != null ? mostPopular.getKey() : null;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,17 +1,18 @@
|
||||
package com.iqser.red.service.redaction.v1.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class Text {
|
||||
|
||||
private int numberOfPages;
|
||||
private List<SectionText> sectionTexts = new ArrayList<>();
|
||||
|
||||
}
|
||||
@ -1,19 +1,21 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
|
||||
@AllArgsConstructor
|
||||
@Builder
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
public class TextBlock extends AbstractTextContainer {
|
||||
|
||||
@Builder.Default
|
||||
@ -98,7 +100,6 @@ public class TextBlock extends AbstractTextContainer {
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
@ -118,6 +119,7 @@ public class TextBlock extends AbstractTextContainer {
|
||||
}
|
||||
|
||||
@Override
|
||||
@JsonIgnore
|
||||
public String getText() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
@ -139,4 +141,4 @@ public class TextBlock extends AbstractTextContainer {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,19 +1,19 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
public class UnclassifiedText {
|
||||
|
||||
private List<TextBlock> textBlocks;
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public SearchableText getSearchableText() {
|
||||
|
||||
SearchableText searchableText = new SearchableText();
|
||||
@ -21,4 +21,4 @@ public class UnclassifiedText {
|
||||
return searchableText;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,21 +1,20 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.utils.PositionUtils;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.FloatFrequencyCounter;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.StringFrequencyCounter;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.utils.PositionUtils;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
@Service
|
||||
@SuppressWarnings("all")
|
||||
|
||||
@ -1,19 +1,17 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.service;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.utils.PositionUtils;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
|
||||
@ -2,7 +2,6 @@ package com.iqser.red.service.redaction.v1.server.classification.utils;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
|
||||
@ -0,0 +1,9 @@
|
||||
package com.iqser.red.service.redaction.v1.server.client;
|
||||
|
||||
|
||||
import com.iqser.red.service.file.management.v1.api.resources.FileStatusProcessingUpdateResource;
|
||||
import org.springframework.cloud.openfeign.FeignClient;
|
||||
|
||||
/**
 * Feign client for the {@code FileStatusProcessingUpdateResource} API.
 * It declares no methods of its own; everything callable comes from the extended
 * resource interface. The base URL is resolved from the
 * {@code file-management-service.url} property.
 */
@FeignClient(name = "FileStatusProcessingUpdateResource", url = "${file-management-service.url}")
|
||||
public interface FileStatusProcessingUpdateClient extends FileStatusProcessingUpdateResource {
|
||||
}
|
||||
@ -1,16 +1,16 @@
|
||||
package com.iqser.red.service.redaction.v1.server.client;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
import org.springframework.lang.NonNull;
|
||||
import org.springframework.lang.Nullable;
|
||||
import org.springframework.util.Assert;
|
||||
import org.springframework.util.FileCopyUtils;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
public class MockMultipartFile implements MultipartFile {
|
||||
|
||||
private final String name;
|
||||
@ -22,13 +22,13 @@ public class MockMultipartFile implements MultipartFile {
|
||||
|
||||
public MockMultipartFile(String name, @Nullable byte[] content) {
|
||||
|
||||
this(name, "", (String) null, (byte[]) content);
|
||||
this(name, "", null, content);
|
||||
}
|
||||
|
||||
|
||||
public MockMultipartFile(String name, InputStream contentStream) throws IOException {
|
||||
|
||||
this(name, "", (String) null, (byte[]) FileCopyUtils.copyToByteArray(contentStream));
|
||||
this(name, "", null, FileCopyUtils.copyToByteArray(contentStream));
|
||||
}
|
||||
|
||||
|
||||
@ -78,7 +78,7 @@ public class MockMultipartFile implements MultipartFile {
|
||||
|
||||
public long getSize() {
|
||||
|
||||
return (long) this.content.length;
|
||||
return this.content.length;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -3,6 +3,6 @@ package com.iqser.red.service.redaction.v1.server.client;
|
||||
import com.iqser.red.service.configuration.v1.api.resource.RulesResource;
|
||||
import org.springframework.cloud.openfeign.FeignClient;
|
||||
|
||||
@FeignClient(name = RulesResource.SERVICE_NAME, url = "${configuration-service.url}")
|
||||
@FeignClient(name = "RulesResource", url = "${configuration-service.url}")
|
||||
public interface RulesClient extends RulesResource {
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,17 +1,15 @@
|
||||
package com.iqser.red.service.redaction.v1.server.controller;
|
||||
|
||||
import java.time.OffsetDateTime;
|
||||
|
||||
import com.iqser.red.commons.spring.ErrorMessage;
|
||||
import com.iqser.red.service.redaction.v1.server.exception.RulesValidationException;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.http.HttpStatus;
|
||||
import org.springframework.web.bind.annotation.ExceptionHandler;
|
||||
import org.springframework.web.bind.annotation.ResponseBody;
|
||||
import org.springframework.web.bind.annotation.ResponseStatus;
|
||||
import org.springframework.web.bind.annotation.RestControllerAdvice;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.exception.RulesValidationException;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import java.time.OffsetDateTime;
|
||||
|
||||
@Slf4j
|
||||
@RestControllerAdvice
|
||||
@ -38,4 +36,4 @@ public class ControllerAdvice {
|
||||
return new ErrorMessage(OffsetDateTime.now(), e.getMessage());
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,17 +1,10 @@
|
||||
package com.iqser.red.service.redaction.v1.server.controller;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.AnalyzeRequest;
|
||||
import com.iqser.red.service.redaction.v1.model.AnalyzeResult;
|
||||
import com.iqser.red.service.file.management.v1.api.model.FileType;
|
||||
import com.iqser.red.service.redaction.v1.model.AnnotateRequest;
|
||||
import com.iqser.red.service.redaction.v1.model.AnnotateResponse;
|
||||
import com.iqser.red.service.redaction.v1.model.ReanalyzeResult;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionLog;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionRequest;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionResult;
|
||||
import com.iqser.red.service.redaction.v1.model.RenalyzeRequest;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionGrid;
|
||||
import com.iqser.red.service.redaction.v1.model.Text;
|
||||
import com.iqser.red.service.redaction.v1.resources.RedactionResource;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
||||
@ -19,27 +12,21 @@ import com.iqser.red.service.redaction.v1.server.exception.RedactionException;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.service.AnnotationService;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.service.DictionaryService;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.service.DroolsExecutionService;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.service.EntityRedactionService;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.service.ImageClassificationService;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.service.ReanalyzeService;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.service.RedactionLogCreatorService;
|
||||
import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService;
|
||||
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
||||
import com.iqser.red.service.redaction.v1.server.visualization.service.PdfVisualisationService;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import org.apache.pdfbox.io.MemoryUsageSetting;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.springframework.web.bind.annotation.PathVariable;
|
||||
import org.springframework.web.bind.annotation.RequestBody;
|
||||
import org.springframework.web.bind.annotation.RestController;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
@Slf4j
|
||||
@RestController
|
||||
@ -47,61 +34,24 @@ import java.util.List;
|
||||
public class RedactionController implements RedactionResource {
|
||||
|
||||
private final PdfVisualisationService pdfVisualisationService;
|
||||
private final PdfSegmentationService pdfSegmentationService;
|
||||
private final RedactionLogCreatorService redactionLogCreatorService;
|
||||
private final EntityRedactionService entityRedactionService;
|
||||
private final DroolsExecutionService droolsExecutionService;
|
||||
private final DictionaryService dictionaryService;
|
||||
private final AnnotationService annotationService;
|
||||
private final ReanalyzeService reanalyzeService;
|
||||
private final ImageClassificationService imageClassificationService;
|
||||
|
||||
|
||||
@Override
|
||||
public AnalyzeResult analyze(@RequestBody AnalyzeRequest analyzeRequest) {
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(analyzeRequest.getDocument()))) {
|
||||
pdDocument.setAllSecurityToBeRemoved(true);
|
||||
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
|
||||
log.info("Document structure analysis successful, starting redaction analysis...");
|
||||
|
||||
imageClassificationService.classifyImages(classifiedDoc);
|
||||
entityRedactionService.processDocument(classifiedDoc, analyzeRequest.getRuleSetId(), analyzeRequest.getManualRedactions());
|
||||
redactionLogCreatorService.createRedactionLog(classifiedDoc, pdDocument.getNumberOfPages(), analyzeRequest.getManualRedactions(), analyzeRequest
|
||||
.getRuleSetId());
|
||||
|
||||
log.info("Redaction analysis successful...");
|
||||
|
||||
return AnalyzeResult.builder()
|
||||
.sectionGrid(classifiedDoc.getSectionGrid())
|
||||
.redactionLog(new RedactionLog(classifiedDoc.getRedactionLogEntities(), classifiedDoc.getDictionaryVersion(), classifiedDoc
|
||||
.getRulesVersion(), analyzeRequest.getRuleSetId()))
|
||||
.numberOfPages(classifiedDoc.getPages().size())
|
||||
.text(new Text(classifiedDoc.getSectionText()))
|
||||
.build();
|
||||
|
||||
} catch (Exception e) {
|
||||
throw new RedactionException(e);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
public ReanalyzeResult reanalyze(@RequestBody RenalyzeRequest renalyzeRequest) {
|
||||
|
||||
return reanalyzeService.reanalyze(renalyzeRequest);
|
||||
}
|
||||
private final PdfSegmentationService pdfSegmentationService;
|
||||
private final RedactionStorageService redactionStorageService;
|
||||
|
||||
|
||||
public AnnotateResponse annotate(@RequestBody AnnotateRequest annotateRequest) {
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(annotateRequest.getDocument()))) {
|
||||
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(annotateRequest.getProjectId(), annotateRequest.getFileId(), FileType.ORIGIN));
|
||||
var redactionLog = redactionStorageService.getRedactionLog(annotateRequest.getProjectId(), annotateRequest.getFileId());
|
||||
var sectionsGrid = redactionStorageService.getSectionGrid(annotateRequest.getProjectId(), annotateRequest.getFileId());
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(storedObjectStream, MemoryUsageSetting.setupTempFileOnly())) {
|
||||
pdDocument.setAllSecurityToBeRemoved(true);
|
||||
dictionaryService.updateDictionary(annotateRequest.getRedactionLog().getRuleSetId());
|
||||
annotationService.annotate(pdDocument, annotateRequest.getRedactionLog(), annotateRequest.getSectionGrid());
|
||||
|
||||
dictionaryService.updateDictionary(redactionLog.getRuleSetId(), annotateRequest.getProjectId());
|
||||
annotationService.annotate(pdDocument, redactionLog, sectionsGrid);
|
||||
|
||||
try (ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream()) {
|
||||
pdDocument.save(byteArrayOutputStream);
|
||||
@ -115,65 +65,80 @@ public class RedactionController implements RedactionResource {
|
||||
|
||||
|
||||
@Override
|
||||
public RedactionResult classify(@RequestBody RedactionRequest pdfSegmentationRequest) {
|
||||
public RedactionResult classify(@RequestBody RedactionRequest redactionRequest) {
|
||||
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN));
|
||||
try {
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream);
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(pdfSegmentationRequest.getDocument()))) {
|
||||
pdDocument.setAllSecurityToBeRemoved(true);
|
||||
storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN));
|
||||
try (PDDocument pdDocument = PDDocument.load(storedObjectStream)) {
|
||||
pdDocument.setAllSecurityToBeRemoved(true);
|
||||
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
pdfVisualisationService.visualizeClassifications(classifiedDoc, pdDocument);
|
||||
pdfVisualisationService.visualizeClassifications(classifiedDoc, pdDocument);
|
||||
|
||||
return convert(pdDocument, classifiedDoc.getPages().size(), pdfSegmentationRequest.getRuleSetId());
|
||||
return convert(pdDocument, classifiedDoc.getPages().size());
|
||||
|
||||
} catch (IOException e) {
|
||||
throw new RedactionException(e);
|
||||
}
|
||||
|
||||
} catch (IOException e) {
|
||||
throw new RedactionException(e);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public RedactionResult sections(@RequestBody RedactionRequest redactionRequest) {
|
||||
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN));
|
||||
try {
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream);
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(redactionRequest.getDocument()))) {
|
||||
pdDocument.setAllSecurityToBeRemoved(true);
|
||||
storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN));
|
||||
try (PDDocument pdDocument = PDDocument.load(storedObjectStream)) {
|
||||
pdDocument.setAllSecurityToBeRemoved(true);
|
||||
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
pdfVisualisationService.visualizeParagraphs(classifiedDoc, pdDocument);
|
||||
pdfVisualisationService.visualizeParagraphs(classifiedDoc, pdDocument);
|
||||
return convert(pdDocument, classifiedDoc.getPages().size());
|
||||
|
||||
return convert(pdDocument, classifiedDoc.getPages().size(), redactionRequest.getRuleSetId());
|
||||
} catch (IOException e) {
|
||||
throw new RedactionException(e);
|
||||
}
|
||||
|
||||
} catch (IOException e) {
|
||||
throw new RedactionException(e);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public RedactionResult htmlTables(@RequestBody RedactionRequest redactionRequest) {
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(redactionRequest.getDocument()))) {
|
||||
pdDocument.setAllSecurityToBeRemoved(true);
|
||||
Document classifiedDoc;
|
||||
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (Page page : classifiedDoc.getPages()) {
|
||||
for (AbstractTextContainer textContainer : page.getTextBlocks()) {
|
||||
if (textContainer instanceof Table) {
|
||||
Table table = (Table) textContainer;
|
||||
sb.append(table.getTextAsHtml()).append("<br />").append("<br />");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return RedactionResult.builder().document(sb.toString().getBytes()).build();
|
||||
|
||||
} catch (IOException e) {
|
||||
try {
|
||||
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN));
|
||||
classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream, true);
|
||||
} catch (Exception e) {
|
||||
throw new RedactionException(e);
|
||||
}
|
||||
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (Page page : classifiedDoc.getPages()) {
|
||||
for (AbstractTextContainer textContainer : page.getTextBlocks()) {
|
||||
if (textContainer instanceof Table) {
|
||||
Table table = (Table) textContainer;
|
||||
sb.append(table.getTextAsHtml()).append("<br />").append("<br />");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return RedactionResult.builder().document(sb.toString().getBytes()).build();
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -191,26 +156,17 @@ public class RedactionController implements RedactionResource {
|
||||
}
|
||||
|
||||
|
||||
private RedactionResult convert(PDDocument document, int numberOfPages, String ruleSetId) throws IOException {
|
||||
|
||||
return convert(document, numberOfPages, null, null, 0, 0, ruleSetId);
|
||||
}
|
||||
|
||||
|
||||
private RedactionResult convert(PDDocument document, int numberOfPages,
|
||||
List<RedactionLogEntry> redactionLogEntities, SectionGrid sectionGrid,
|
||||
long dictionaryVersion, long rulesVersion, String ruleSetId) throws IOException {
|
||||
private RedactionResult convert(PDDocument document, int numberOfPages) throws IOException {
|
||||
|
||||
try (ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream()) {
|
||||
document.save(byteArrayOutputStream);
|
||||
return RedactionResult.builder()
|
||||
.document(byteArrayOutputStream.toByteArray())
|
||||
.numberOfPages(numberOfPages)
|
||||
.redactionLog(new RedactionLog(redactionLogEntities, dictionaryVersion, rulesVersion, ruleSetId))
|
||||
.sectionGrid(sectionGrid)
|
||||
.build();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,52 @@
|
||||
package com.iqser.red.service.redaction.v1.server.memory;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import java.text.CharacterIterator;
|
||||
import java.text.StringCharacterIterator;
|
||||
|
||||
@Slf4j
|
||||
public class MemoryStats {
|
||||
|
||||
|
||||
public static void printMemoryStats() {
|
||||
log.info("\n\n ------------------------------ \n" +
|
||||
" Used Memory: " + humanReadableByteCountBin(getUsedMemory()) + "\n" +
|
||||
" Free Memory: " + humanReadableByteCountBin(getFreeMemory()) + "\n" +
|
||||
" Total Memory: " + humanReadableByteCountBin(getTotalMemory()) + "\n" +
|
||||
" Max Memory: " + humanReadableByteCountBin(getMaxMemory()) + "\n" +
|
||||
"\n ------------------------------ \n");
|
||||
}
|
||||
|
||||
|
||||
public static String humanReadableByteCountBin(long bytes) {
|
||||
long absB = bytes == Long.MIN_VALUE ? Long.MAX_VALUE : Math.abs(bytes);
|
||||
if (absB < 1024) {
|
||||
return bytes + " B";
|
||||
}
|
||||
long value = absB;
|
||||
CharacterIterator ci = new StringCharacterIterator("KMGTPE");
|
||||
for (int i = 40; i >= 0 && absB > 0xfffccccccccccccL >> i; i -= 10) {
|
||||
value >>= 10;
|
||||
ci.next();
|
||||
}
|
||||
value *= Long.signum(bytes);
|
||||
return String.format("%.1f %ciB", value / 1024.0, ci.current());
|
||||
}
|
||||
|
||||
private static long getMaxMemory() {
|
||||
return Runtime.getRuntime().maxMemory();
|
||||
}
|
||||
|
||||
private static long getUsedMemory() {
|
||||
return getMaxMemory() - getFreeMemory();
|
||||
}
|
||||
|
||||
private static long getTotalMemory() {
|
||||
return Runtime.getRuntime().totalMemory();
|
||||
}
|
||||
|
||||
private static long getFreeMemory() {
|
||||
return Runtime.getRuntime().freeMemory();
|
||||
}
|
||||
}
|
||||
@ -1,17 +1,15 @@
|
||||
package com.iqser.red.service.redaction.v1.server.parsing;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import org.apache.pdfbox.text.PDFTextStripperByArea;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.pdfbox.text.PDFTextStripperByArea;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
|
||||
public class PDFAreaTextStripper extends PDFTextStripperByArea {
|
||||
|
||||
@Getter
|
||||
@ -76,7 +74,7 @@ public class PDFAreaTextStripper extends PDFTextStripperByArea {
|
||||
}
|
||||
|
||||
|
||||
public void clearPositions(){
|
||||
public void clearPositions() {
|
||||
textPositionSequences = new ArrayList<>();
|
||||
}
|
||||
|
||||
|
||||
@ -1,33 +1,16 @@
|
||||
package com.iqser.red.service.redaction.v1.server.parsing;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.commons.lang3.reflect.FieldUtils;
|
||||
import org.apache.pdfbox.contentstream.operator.Operator;
|
||||
import org.apache.pdfbox.contentstream.operator.OperatorName;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColor;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorN;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorSpace;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceCMYKColor;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceGrayColor;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceRGBColor;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetStrokingColor;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorN;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorSpace;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceCMYKColor;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceGrayColor;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceRGBColor;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetFlatness;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetLineCapStyle;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetLineDashPattern;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetLineJoinStyle;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetLineMiterLimit;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetLineWidth;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetRenderingIntent;
|
||||
import org.apache.pdfbox.contentstream.operator.color.*;
|
||||
import org.apache.pdfbox.contentstream.operator.state.*;
|
||||
import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize;
|
||||
import org.apache.pdfbox.cos.COSBase;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
@ -40,40 +23,31 @@ import org.apache.pdfbox.text.PDFTextStripper;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
@Slf4j
|
||||
public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
|
||||
@Setter
|
||||
protected PDPage pdpage;
|
||||
|
||||
@Getter
|
||||
private int minCharWidth;
|
||||
|
||||
@Getter
|
||||
private int maxCharWidth;
|
||||
|
||||
@Getter
|
||||
private int minCharHeight;
|
||||
|
||||
@Getter
|
||||
private int maxCharHeight;
|
||||
|
||||
@Getter
|
||||
private final List<TextPositionSequence> textPositionSequences = new ArrayList<>();
|
||||
|
||||
@Getter
|
||||
private final List<Ruling> rulings = new ArrayList<>();
|
||||
|
||||
private final List<Ruling> graphicsPath = new ArrayList<>();
|
||||
|
||||
@Setter
|
||||
protected PDPage pdpage;
|
||||
@Getter
|
||||
private int minCharWidth;
|
||||
@Getter
|
||||
private int maxCharWidth;
|
||||
@Getter
|
||||
private int minCharHeight;
|
||||
@Getter
|
||||
private int maxCharHeight;
|
||||
@Getter
|
||||
private List<PdfImage> images = new ArrayList<>();
|
||||
|
||||
@ -222,6 +196,9 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
Rectangle2D rect = new Rectangle2D.Float((float) imageBounds.getX(), (float) imageBounds.getY(), (float) imageBounds
|
||||
.getWidth(), (float) imageBounds.getHeight());
|
||||
|
||||
// Memory hack - SoftReference kills me
|
||||
FieldUtils.writeField(pdfImage, "cachedImageSubsampling", -1, true);
|
||||
|
||||
if (rect.getHeight() > 2 && rect.getWidth() > 2) {
|
||||
this.images.add(new PdfImage(pdfImage.getImage(), rect, pageNumber));
|
||||
}
|
||||
@ -369,4 +346,4 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -0,0 +1,52 @@
|
||||
package com.iqser.red.service.redaction.v1.server.parsing.model;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
import org.springframework.beans.BeanUtils;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
public class RedTextPosition {
|
||||
|
||||
private String textMatrix;
|
||||
private int rotation;
|
||||
private float y;
|
||||
private float pageHeight;
|
||||
private float pageWidth;
|
||||
private String unicode;
|
||||
private float XDirAdj;
|
||||
private float YDirAdj;
|
||||
private float width;
|
||||
private float heightDir;
|
||||
|
||||
// not used in reanalysis
|
||||
@JsonIgnore
|
||||
private float widthOfSpace;
|
||||
|
||||
// not used in reanalysis
|
||||
@JsonIgnore
|
||||
private float fontSizeInPt;
|
||||
|
||||
// not used in reanalysis
|
||||
@JsonIgnore
|
||||
private String fontName;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static RedTextPosition fromTextPosition(TextPosition textPosition) {
|
||||
var pos = new RedTextPosition();
|
||||
BeanUtils.copyProperties(textPosition, pos);
|
||||
pos.setFontName(textPosition.getFont().getName());
|
||||
|
||||
pos.setFontSizeInPt(textPosition.getFontSizeInPt());
|
||||
|
||||
pos.setTextMatrix(textPosition.getTextMatrix().toString());
|
||||
|
||||
return pos;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
@ -1,32 +1,52 @@
|
||||
package com.iqser.red.service.redaction.v1.server.parsing.model;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
|
||||
import com.iqser.red.service.redaction.v1.model.Point;
|
||||
import com.iqser.red.service.redaction.v1.model.Rectangle;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.Point;
|
||||
import com.iqser.red.service.redaction.v1.model.Rectangle;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Data
|
||||
@RequiredArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@JsonIgnoreProperties({ "empty" })
|
||||
public class TextPositionSequence implements CharSequence {
|
||||
|
||||
private List<TextPosition> textPositions = new ArrayList<>();
|
||||
private int page;
|
||||
private List<RedTextPosition> textPositions = new ArrayList<>();
|
||||
|
||||
private final int page;
|
||||
private float x1;
|
||||
private float x2;
|
||||
|
||||
public TextPositionSequence(int page) {
|
||||
this.page = page;
|
||||
}
|
||||
|
||||
|
||||
public static TextPositionSequence fromData(List<RedTextPosition> textPositions, int page) {
|
||||
var textPositionSequence = new TextPositionSequence();
|
||||
textPositionSequence.textPositions = textPositions;
|
||||
textPositionSequence.page = page;
|
||||
|
||||
return textPositionSequence;
|
||||
}
|
||||
|
||||
|
||||
public TextPositionSequence(List<TextPosition> textPositions, int page) {
|
||||
|
||||
this.textPositions = textPositions;
|
||||
this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList());
|
||||
this.page = page;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@Override
|
||||
public int length() {
|
||||
|
||||
@ -37,7 +57,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
@Override
|
||||
public char charAt(int index) {
|
||||
|
||||
TextPosition textPosition = textPositionAt(index);
|
||||
RedTextPosition textPosition = textPositionAt(index);
|
||||
String text = textPosition.getUnicode();
|
||||
return text.charAt(0);
|
||||
}
|
||||
@ -45,7 +65,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
|
||||
public char charAt(int index, boolean caseInSensitive) {
|
||||
|
||||
TextPosition textPosition = textPositionAt(index);
|
||||
RedTextPosition textPosition = textPositionAt(index);
|
||||
String text = textPosition.getUnicode();
|
||||
return caseInSensitive ? text.toLowerCase().charAt(0) : text.charAt(0);
|
||||
}
|
||||
@ -54,7 +74,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
@Override
|
||||
public TextPositionSequence subSequence(int start, int end) {
|
||||
|
||||
return new TextPositionSequence(textPositions.subList(start, end), page);
|
||||
return fromData(textPositions.subList(start, end), page);
|
||||
}
|
||||
|
||||
|
||||
@ -69,18 +89,25 @@ public class TextPositionSequence implements CharSequence {
|
||||
}
|
||||
|
||||
|
||||
public TextPosition textPositionAt(int index) {
|
||||
public RedTextPosition textPositionAt(int index) {
|
||||
|
||||
return textPositions.get(index);
|
||||
}
|
||||
|
||||
|
||||
public void add(TextPosition textPosition) {
|
||||
public void add(RedTextPosition textPosition) {
|
||||
|
||||
this.textPositions.add(textPosition);
|
||||
}
|
||||
|
||||
|
||||
public void add(TextPosition textPosition) {
|
||||
|
||||
this.textPositions.add(RedTextPosition.fromTextPosition(textPosition));
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getX1() {
|
||||
|
||||
if (textPositions.get(0).getRotation() == 90) {
|
||||
@ -91,6 +118,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getX2() {
|
||||
|
||||
if (textPositions.get(0).getRotation() == 90) {
|
||||
@ -101,13 +129,14 @@ public class TextPositionSequence implements CharSequence {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getRotationAdjustedY() {
|
||||
|
||||
return textPositions.get(0).getY();
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getY1() {
|
||||
|
||||
if (textPositions.get(0).getRotation() == 90) {
|
||||
@ -118,6 +147,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getY2() {
|
||||
|
||||
if (textPositions.get(0).getRotation() == 90) {
|
||||
@ -128,38 +158,40 @@ public class TextPositionSequence implements CharSequence {
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getTextHeight() {
|
||||
|
||||
return textPositions.get(0).getHeightDir() + 2;
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getHeight() {
|
||||
|
||||
return getY2() - getY1();
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getWidth() {
|
||||
|
||||
return getX2() - getX1();
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public String getFont() {
|
||||
|
||||
return textPositions.get(0)
|
||||
.getFont()
|
||||
.toString()
|
||||
return textPositions.get(0).getFontName()
|
||||
.toLowerCase()
|
||||
.replaceAll(",bold", "")
|
||||
.replaceAll(",italic", "");
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public String getFontStyle() {
|
||||
|
||||
String lowercaseFontName = textPositions.get(0).getFont().toString().toLowerCase();
|
||||
String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase();
|
||||
|
||||
if (lowercaseFontName.contains("bold") && lowercaseFontName.contains("italic")) {
|
||||
return "bold, italic";
|
||||
@ -173,25 +205,25 @@ public class TextPositionSequence implements CharSequence {
|
||||
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getFontSize() {
|
||||
|
||||
return textPositions.get(0).getFontSizeInPt();
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getSpaceWidth() {
|
||||
|
||||
return textPositions.get(0).getWidthOfSpace();
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public int getRotation() {
|
||||
|
||||
return textPositions.get(0).getRotation();
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public Rectangle getRectangle() {
|
||||
|
||||
float height = getTextHeight();
|
||||
@ -223,4 +255,4 @@ public class TextPositionSequence implements CharSequence {
|
||||
return new Rectangle(new Point(posXInit, posYInit), posXEnd - posXInit, posYEnd - posYInit + height, page);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -0,0 +1,35 @@
|
||||
package com.iqser.red.service.redaction.v1.server.queue;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.springframework.amqp.core.Queue;
|
||||
import org.springframework.amqp.core.QueueBuilder;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
|
||||
@Configuration
|
||||
@RequiredArgsConstructor
|
||||
public class MessagingConfiguration {
|
||||
|
||||
|
||||
public static final String REDACTION_QUEUE = "redactionQueue";
|
||||
|
||||
public static final String REDACTION_DQL = "redactionDQL";
|
||||
|
||||
|
||||
@Bean
|
||||
public Queue redactionQueue() {
|
||||
|
||||
return QueueBuilder.durable(REDACTION_QUEUE)
|
||||
.withArgument("x-dead-letter-exchange", "")
|
||||
.withArgument("x-dead-letter-routing-key", REDACTION_QUEUE)
|
||||
.maxPriority(2)
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
@Bean
|
||||
public Queue redactionDeadLetterQueue() {
|
||||
|
||||
return QueueBuilder.durable(REDACTION_DQL).build();
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,54 @@
|
||||
package com.iqser.red.service.redaction.v1.server.queue;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.iqser.red.service.redaction.v1.model.AnalyzeRequest;
|
||||
import com.iqser.red.service.redaction.v1.model.AnalyzeResult;
|
||||
import com.iqser.red.service.redaction.v1.server.client.FileStatusProcessingUpdateClient;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.service.ReanalyzeService;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.amqp.rabbit.annotation.RabbitHandler;
|
||||
import org.springframework.amqp.rabbit.annotation.RabbitListener;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import static com.iqser.red.service.redaction.v1.server.queue.MessagingConfiguration.REDACTION_DQL;
|
||||
import static com.iqser.red.service.redaction.v1.server.queue.MessagingConfiguration.REDACTION_QUEUE;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class RedactionMessageReceiver {
|
||||
|
||||
private final ObjectMapper objectMapper;
|
||||
private final ReanalyzeService reanalyzeService;
|
||||
private final FileStatusProcessingUpdateClient fileStatusProcessingUpdateClient;
|
||||
|
||||
@RabbitHandler
|
||||
@RabbitListener(queues = REDACTION_QUEUE)
|
||||
public void receiveAnalyzeRequest(String in) throws JsonProcessingException {
|
||||
|
||||
var analyzeRequest = objectMapper.readValue(in, AnalyzeRequest.class);
|
||||
log.info("Processing analyze request: {}", analyzeRequest);
|
||||
AnalyzeResult result;
|
||||
if (analyzeRequest.isReanalyseOnlyIfPossible()) {
|
||||
result = reanalyzeService.reanalyze(analyzeRequest);
|
||||
} else {
|
||||
result = reanalyzeService.analyze(analyzeRequest);
|
||||
}
|
||||
log.info("Successfully analyzed {}", analyzeRequest);
|
||||
|
||||
fileStatusProcessingUpdateClient.analysisSuccessful(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), result);
|
||||
}
|
||||
|
||||
@RabbitHandler
|
||||
@RabbitListener(queues = REDACTION_DQL)
|
||||
public void receiveAnalyzeRequestDQL(String in) throws JsonProcessingException {
|
||||
|
||||
var analyzeRequest = objectMapper.readValue(in, AnalyzeRequest.class);
|
||||
log.info("Failed to process analyze request: {}", analyzeRequest);
|
||||
|
||||
fileStatusProcessingUpdateClient.analysisFailed(analyzeRequest.getProjectId(), analyzeRequest.getFileId());
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,22 +1,25 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
import lombok.Value;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
@Value
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class CellValue {
|
||||
|
||||
private List<TextBlock> textBlocks;
|
||||
private List<TextBlock> textBlocks = new ArrayList<>();
|
||||
|
||||
private int rowSpanStart;
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
@ -47,4 +50,4 @@ public class CellValue {
|
||||
.replaceAll(" {2}", " ");
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.Getter;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.Getter;
|
||||
|
||||
@Data
|
||||
public class Dictionary {
|
||||
|
||||
@ -18,18 +18,18 @@ public class Dictionary {
|
||||
private Map<String, DictionaryModel> localAccessMap = new HashMap<>();
|
||||
|
||||
@Getter
|
||||
private long version;
|
||||
private DictionaryVersion version;
|
||||
|
||||
|
||||
public Dictionary(List<DictionaryModel> dictionaryModels, long dictionaryVersion){
|
||||
public Dictionary(List<DictionaryModel> dictionaryModels, DictionaryVersion version) {
|
||||
this.dictionaryModels = dictionaryModels;
|
||||
this.dictionaryModels.forEach(dm -> localAccessMap.put(dm.getType(), dm));
|
||||
this.version = dictionaryVersion;
|
||||
this.version = version;
|
||||
}
|
||||
|
||||
|
||||
public int getDictionaryRank(String type){
|
||||
if(!localAccessMap.containsKey(type)){
|
||||
public int getDictionaryRank(String type) {
|
||||
if (!localAccessMap.containsKey(type)) {
|
||||
return 0;
|
||||
}
|
||||
return localAccessMap.get(type).getRank();
|
||||
@ -60,7 +60,7 @@ public class Dictionary {
|
||||
|
||||
public boolean containsValue(String type, String value) {
|
||||
|
||||
if (localAccessMap.containsKey(type) && localAccessMap.get(type)
|
||||
return localAccessMap.containsKey(type) && localAccessMap.get(type)
|
||||
.getEntries()
|
||||
.contains(value) || localAccessMap.containsKey(type) && localAccessMap.get(type)
|
||||
.getLocalEntries()
|
||||
@ -68,10 +68,7 @@ public class Dictionary {
|
||||
.getEntries()
|
||||
.contains(value) || localAccessMap.containsKey(RECOMMENDATION_PREFIX + type) && localAccessMap.get(RECOMMENDATION_PREFIX + type)
|
||||
.getLocalEntries()
|
||||
.contains(value)) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
.contains(value);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -1,15 +1,15 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
import java.util.Set;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
|
||||
import java.util.Set;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
public class DictionaryIncrement {
|
||||
|
||||
private Set<DictionaryIncrementValue> values;
|
||||
private long dictionaryVersion;
|
||||
private DictionaryVersion dictionaryVersion;
|
||||
|
||||
}
|
||||
|
||||
@ -1,15 +1,14 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
|
||||
import com.iqser.red.service.configuration.v1.api.model.DictionaryEntry;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.iqser.red.service.configuration.v1.api.model.DictionaryEntry;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
public class DictionaryModel implements Serializable {
|
||||
@ -22,9 +21,10 @@ public class DictionaryModel implements Serializable {
|
||||
private boolean recommendation;
|
||||
private Set<DictionaryEntry> entries;
|
||||
private Set<String> localEntries;
|
||||
private boolean isDossierDictionary;
|
||||
|
||||
public Set<String> getValues(boolean local){
|
||||
return local ? localEntries : entries.stream().filter(e -> !e.isDeleted()).map(e-> e.getValue()).collect(Collectors
|
||||
public Set<String> getValues(boolean local) {
|
||||
return local ? localEntries : entries.stream().filter(e -> !e.isDeleted()).map(e -> e.getValue()).collect(Collectors
|
||||
.toSet());
|
||||
}
|
||||
|
||||
|
||||
@ -20,5 +20,4 @@ public class DictionaryRepresentation {
|
||||
private Map<String, DictionaryModel> localAccessMap = new HashMap<>();
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,16 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class DictionaryVersion {
|
||||
|
||||
long rulesetVersion;
|
||||
long dossierVersion;
|
||||
}
|
||||
@ -1,13 +1,12 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
|
||||
@Data
|
||||
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
|
||||
public class Entity {
|
||||
@ -38,8 +37,10 @@ public class Entity {
|
||||
private String textBefore;
|
||||
private String textAfter;
|
||||
|
||||
private boolean isDossierDictionaryEntry;
|
||||
|
||||
public Entity(String word, String type, boolean redaction, String redactionReason, List<EntityPositionSequence> positionSequences, String headline, int matchedRule, int sectionNumber, String legalBasis, boolean isDictionaryEntry, String textBefore, String textAfter, Integer start, Integer end) {
|
||||
|
||||
public Entity(String word, String type, boolean redaction, String redactionReason, List<EntityPositionSequence> positionSequences, String headline, int matchedRule, int sectionNumber, String legalBasis, boolean isDictionaryEntry, String textBefore, String textAfter, Integer start, Integer end, boolean isDossierDictionaryEntry) {
|
||||
|
||||
this.word = word;
|
||||
this.type = type;
|
||||
@ -55,10 +56,11 @@ public class Entity {
|
||||
this.textAfter = textAfter;
|
||||
this.start = start;
|
||||
this.end = end;
|
||||
this.isDossierDictionaryEntry = isDossierDictionaryEntry;
|
||||
}
|
||||
|
||||
|
||||
public Entity(String word, String type, Integer start, Integer end, String headline, int sectionNumber, boolean isDictionaryEntry) {
|
||||
public Entity(String word, String type, Integer start, Integer end, String headline, int sectionNumber, boolean isDictionaryEntry, boolean isDossierDictionaryEntry) {
|
||||
|
||||
this.word = word;
|
||||
this.type = type;
|
||||
@ -67,6 +69,7 @@ public class Entity {
|
||||
this.headline = headline;
|
||||
this.sectionNumber = sectionNumber;
|
||||
this.isDictionaryEntry = isDictionaryEntry;
|
||||
this.isDossierDictionaryEntry = isDossierDictionaryEntry;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,24 +1,23 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
@Data
|
||||
@RequiredArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@EqualsAndHashCode
|
||||
public class EntityPositionSequence {
|
||||
|
||||
private final String id;
|
||||
@EqualsAndHashCode.Exclude
|
||||
private List<TextPositionSequence> sequences = new ArrayList<>();
|
||||
private int pageNumber;
|
||||
private final String id;
|
||||
|
||||
}
|
||||
|
||||
@ -1,7 +1,5 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
@ -14,7 +12,7 @@ import lombok.NoArgsConstructor;
|
||||
public class Image {
|
||||
|
||||
private String type;
|
||||
private Rectangle2D position;
|
||||
private RedRectangle2D position;
|
||||
private boolean redaction;
|
||||
private String redactionReason;
|
||||
private String legalBasis;
|
||||
|
||||
@ -1,28 +1,31 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import lombok.Data;
|
||||
import lombok.NonNull;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.awt.image.BufferedImage;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.NonNull;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@RequiredArgsConstructor
|
||||
public class PdfImage {
|
||||
|
||||
@NonNull
|
||||
@JsonIgnore
|
||||
private BufferedImage image;
|
||||
@NonNull
|
||||
private Rectangle2D position;
|
||||
private RedRectangle2D position;
|
||||
private ImageType imageType;
|
||||
private boolean isAppendedToParagraph;
|
||||
|
||||
@NonNull
|
||||
private int page;
|
||||
|
||||
}
|
||||
public PdfImage(BufferedImage image, Rectangle2D position, int page) {
|
||||
this.image = image;
|
||||
this.position = new RedRectangle2D(position.getX(), position.getY(), position.getWidth(), position.getHeight());
|
||||
this.page = page;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,37 +0,0 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
public class ReanalysisSection {
|
||||
|
||||
private int sectionNumber;
|
||||
private String headline;
|
||||
private List<TextBlock> textBlocks;
|
||||
private Map<String, CellValue> tabularData = new HashMap<>();
|
||||
private List<Integer> cellStarts;
|
||||
private Set<Image> images = new HashSet<>();
|
||||
|
||||
|
||||
public SearchableText getSearchableText() {
|
||||
|
||||
SearchableText searchableText = new SearchableText();
|
||||
textBlocks.forEach(block -> {
|
||||
if (block instanceof TextBlock) {
|
||||
searchableText.addAll(block.getSequences());
|
||||
}
|
||||
});
|
||||
return searchableText;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,35 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class RedRectangle2D {
|
||||
|
||||
private double x;
|
||||
private double y;
|
||||
private double width;
|
||||
private double height;
|
||||
|
||||
@JsonIgnore
|
||||
public boolean isEmpty() {
|
||||
return width <= 0.0f || height <= 0.0f;
|
||||
}
|
||||
|
||||
public boolean contains(double x, double y, double w, double h) {
|
||||
if (isEmpty() || w <= 0 || h <= 0) {
|
||||
return false;
|
||||
}
|
||||
double x0 = getX();
|
||||
double y0 = getY();
|
||||
return x >= x0 &&
|
||||
y >= y0 &&
|
||||
(x + w) <= x0 + getWidth() &&
|
||||
(y + h) <= y0 + getHeight();
|
||||
}
|
||||
}
|
||||
@ -1,14 +1,14 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
||||
|
||||
public class SearchableText {
|
||||
|
||||
private final List<TextPositionSequence> sequences = new ArrayList<>();
|
||||
@ -232,4 +232,4 @@ public class SearchableText {
|
||||
return sb.append("\n").toString();
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,6 +1,12 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
import static com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary.RECOMMENDATION_PREFIX;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.Patterns;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
@ -11,15 +17,7 @@ import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.Patterns;
|
||||
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import static com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary.RECOMMENDATION_PREFIX;
|
||||
|
||||
@Data
|
||||
@Slf4j
|
||||
@ -413,7 +411,7 @@ public class Section {
|
||||
String text = caseInsensitive ? searchText.toLowerCase() : searchText;
|
||||
String searchValue = caseInsensitive ? value.toLowerCase() : value;
|
||||
|
||||
Set<Entity> found = EntitySearchUtils.find(text, Set.of(searchValue), asType, headline, sectionNumber, true);
|
||||
Set<Entity> found = EntitySearchUtils.find(text, Set.of(searchValue), asType, headline, sectionNumber, true, false);
|
||||
|
||||
found.forEach(entity -> {
|
||||
if (redacted) {
|
||||
@ -439,7 +437,7 @@ public class Section {
|
||||
} else {
|
||||
String word = value.toString();
|
||||
|
||||
Entity entity = new Entity(word, type, value.getRowSpanStart(), value.getRowSpanStart() + word.length(), headline, sectionNumber, false);
|
||||
Entity entity = new Entity(word, type, value.getRowSpanStart(), value.getRowSpanStart() + word.length(), headline, sectionNumber, false, false);
|
||||
entity.setRedaction(redact);
|
||||
entity.setMatchedRule(ruleNumber);
|
||||
entity.setRedactionReason(reason);
|
||||
|
||||
@ -0,0 +1,48 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.AnalyzeResult;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionChangeLog;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionLog;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
@Service
|
||||
public class AnalyzeResponseService {
|
||||
|
||||
public AnalyzeResult createAnalyzeResponse(String projectId, String fileId, long duration, int pageCount, RedactionLog redactionLog, RedactionChangeLog redactionChangeLog) {
|
||||
boolean hasHints = redactionLog.getRedactionLogEntry().stream().anyMatch(RedactionLogEntry::isHint);
|
||||
|
||||
boolean hasRequests = redactionLog.getRedactionLogEntry()
|
||||
.stream()
|
||||
.anyMatch(entry -> entry.isManual() && entry.getStatus()
|
||||
.equals(com.iqser.red.service.redaction.v1.model.Status.REQUESTED));
|
||||
|
||||
boolean hasRedactions = redactionLog.getRedactionLogEntry()
|
||||
.stream()
|
||||
.anyMatch(entry -> entry.isRedacted() && !entry.isManual() || entry.isManual() && entry.getStatus()
|
||||
.equals(com.iqser.red.service.redaction.v1.model.Status.APPROVED));
|
||||
|
||||
boolean hasImages = redactionLog.getRedactionLogEntry()
|
||||
.stream()
|
||||
.anyMatch(entry -> entry.isHint() && entry.getType().equals("image"));
|
||||
|
||||
boolean hasUpdates = redactionChangeLog != null && redactionChangeLog.getRedactionLogEntry() != null && !redactionChangeLog
|
||||
.getRedactionLogEntry()
|
||||
.isEmpty() && redactionChangeLog.getRedactionLogEntry().stream().anyMatch(entry -> !entry.getType().equals("false_positive"));
|
||||
|
||||
return AnalyzeResult.builder()
|
||||
.projectId(projectId)
|
||||
.fileId(fileId)
|
||||
.duration(duration)
|
||||
.numberOfPages(pageCount)
|
||||
.hasHints(hasHints)
|
||||
.hasRedactions(hasRedactions)
|
||||
.hasRequests(hasRequests)
|
||||
.hasImages(hasImages)
|
||||
.hasUpdates(hasUpdates)
|
||||
.rulesVersion(redactionLog.getRulesVersion())
|
||||
.dictionaryVersion(redactionLog.getDictionaryVersion())
|
||||
.dossierDictionaryVersion(redactionLog.getDossierDictionaryVersion())
|
||||
.build();
|
||||
}
|
||||
}
|
||||
@ -1,14 +1,7 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.service;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.GregorianCalendar;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.*;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDPageContentStream;
|
||||
@ -21,15 +14,14 @@ import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationText;
|
||||
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationTextMarkup;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.CellRectangle;
|
||||
import com.iqser.red.service.redaction.v1.model.Comment;
|
||||
import com.iqser.red.service.redaction.v1.model.Rectangle;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionLog;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionGrid;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionRectangle;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import java.awt.Color;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.GregorianCalendar;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
|
||||
@ -1,19 +1,6 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.service;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
import org.apache.commons.lang3.SerializationUtils;
|
||||
import org.springframework.stereotype.Service;
|
||||
import static com.iqser.red.service.configuration.v1.api.resource.DictionaryResource.GLOBAL_DOSSIER;
|
||||
|
||||
import com.iqser.red.service.configuration.v1.api.model.Colors;
|
||||
import com.iqser.red.service.configuration.v1.api.model.DictionaryEntry;
|
||||
@ -25,10 +12,18 @@ import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncre
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncrementValue;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryModel;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryRepresentation;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryVersion;
|
||||
|
||||
import feign.FeignException;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
import org.apache.commons.lang3.SerializationUtils;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@ -37,53 +32,69 @@ public class DictionaryService {
|
||||
|
||||
private final DictionaryClient dictionaryClient;
|
||||
|
||||
private Map<String, DictionaryRepresentation> dictionariesByRuleSets = new HashMap<>();
|
||||
private final Map<String, DictionaryRepresentation> dictionariesByRuleSets = new HashMap<>();
|
||||
private final Map<String, DictionaryRepresentation> dictionariesByDossier = new HashMap<>();
|
||||
|
||||
|
||||
public long updateDictionary(String ruleSetId) {
|
||||
public DictionaryVersion updateDictionary(String ruleSetId, String dossierId) {
|
||||
|
||||
long version = dictionaryClient.getVersion(ruleSetId);
|
||||
|
||||
var foundDictionary = dictionariesByRuleSets.get(ruleSetId);
|
||||
|
||||
if (foundDictionary == null || version > foundDictionary.getDictionaryVersion()) {
|
||||
updateDictionaryEntry(ruleSetId, version);
|
||||
long rulesetDictionaryVersion = dictionaryClient.getVersion(ruleSetId, GLOBAL_DOSSIER);
|
||||
var rulesetDictionary = dictionariesByRuleSets.get(ruleSetId);
|
||||
if (rulesetDictionary == null || rulesetDictionaryVersion > rulesetDictionary.getDictionaryVersion()) {
|
||||
updateDictionaryEntry(ruleSetId, rulesetDictionaryVersion, GLOBAL_DOSSIER);
|
||||
}
|
||||
|
||||
return version;
|
||||
long dossierDictionaryVersion = dictionaryClient.getVersion(ruleSetId, dossierId);
|
||||
var dossierDictionary = dictionariesByDossier.get(dossierId);
|
||||
if (dossierDictionary == null || dossierDictionaryVersion > dossierDictionary.getDictionaryVersion()) {
|
||||
updateDictionaryEntry(ruleSetId, dossierDictionaryVersion, dossierId);
|
||||
}
|
||||
|
||||
return DictionaryVersion.builder().rulesetVersion(rulesetDictionaryVersion).dossierVersion(dossierDictionaryVersion).build();
|
||||
}
|
||||
|
||||
|
||||
public DictionaryIncrement getDictionaryIncrements(String ruleSetId, long fromVersion) {
|
||||
public DictionaryIncrement getDictionaryIncrements(String ruleSetId, DictionaryVersion fromVersion, String dossierId) {
|
||||
|
||||
long version = updateDictionary(ruleSetId);
|
||||
DictionaryVersion version = updateDictionary(ruleSetId, dossierId);
|
||||
|
||||
Set<DictionaryIncrementValue> newValues = new HashSet<>();
|
||||
List<DictionaryModel> dictionaryModels = dictionariesByRuleSets.get(ruleSetId).getDictionary();
|
||||
dictionaryModels.forEach(dictionaryModel -> {
|
||||
dictionaryModel.getEntries().forEach(dictionaryEntry -> {
|
||||
if (dictionaryEntry.getVersion() > fromVersion) {
|
||||
if (dictionaryEntry.getVersion() > fromVersion.getRulesetVersion()) {
|
||||
newValues.add(new DictionaryIncrementValue(dictionaryEntry.getValue(), dictionaryModel.isCaseInsensitive()));
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
if(dictionariesByDossier.containsKey(dossierId)) {
|
||||
dictionaryModels = dictionariesByDossier.get(dossierId).getDictionary();
|
||||
dictionaryModels.forEach(dictionaryModel -> {
|
||||
dictionaryModel.getEntries().forEach(dictionaryEntry -> {
|
||||
if (dictionaryEntry.getVersion() > fromVersion.getDossierVersion()) {
|
||||
newValues.add(new DictionaryIncrementValue(dictionaryEntry.getValue(), dictionaryModel.isCaseInsensitive()));
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
return new DictionaryIncrement(newValues, version);
|
||||
}
|
||||
|
||||
|
||||
private void updateDictionaryEntry(String ruleSetId, long version) {
|
||||
private void updateDictionaryEntry(String ruleSetId, long version, String dossierId) {
|
||||
|
||||
try {
|
||||
DictionaryRepresentation dictionaryRepresentation = new DictionaryRepresentation();
|
||||
|
||||
TypeResponse typeResponse = dictionaryClient.getAllTypes(ruleSetId);
|
||||
TypeResponse typeResponse = dictionaryClient.getAllTypes(ruleSetId, dossierId);
|
||||
if (typeResponse != null && CollectionUtils.isNotEmpty(typeResponse.getTypes())) {
|
||||
|
||||
List<DictionaryModel> dictionary = typeResponse.getTypes()
|
||||
.stream()
|
||||
.map(t -> new DictionaryModel(t.getType(), t.getRank(), convertColor(t.getHexColor()), t.isCaseInsensitive(), t
|
||||
.isHint(), t.isRecommendation(), convertEntries(t), new HashSet<>()))
|
||||
.isHint(), t.isRecommendation(), convertEntries(t, dossierId), new HashSet<>(),dossierId.equals(GLOBAL_DOSSIER) ? false : true))
|
||||
.sorted(Comparator.comparingInt(DictionaryModel::getRank).reversed())
|
||||
.collect(Collectors.toList());
|
||||
|
||||
@ -99,7 +110,11 @@ public class DictionaryService {
|
||||
dictionaryRepresentation.setDictionaryVersion(version);
|
||||
dictionaryRepresentation.setDictionary(dictionary);
|
||||
|
||||
dictionariesByRuleSets.put(ruleSetId, dictionaryRepresentation);
|
||||
if(dossierId.equals(GLOBAL_DOSSIER)) {
|
||||
dictionariesByRuleSets.put(ruleSetId, dictionaryRepresentation);
|
||||
} else {
|
||||
dictionariesByDossier.put(dossierId, dictionaryRepresentation);
|
||||
}
|
||||
}
|
||||
} catch (FeignException e) {
|
||||
log.warn("Got some unknown feignException", e);
|
||||
@ -112,19 +127,19 @@ public class DictionaryService {
|
||||
|
||||
dictionary.getDictionaryModels().forEach(dm -> {
|
||||
if (dm.isRecommendation() && !dm.getLocalEntries().isEmpty()) {
|
||||
dictionaryClient.addEntries(dm.getType(), ruleSetId, new ArrayList<>(dm.getLocalEntries()), false);
|
||||
long externalVersion = dictionaryClient.getVersion(ruleSetId);
|
||||
if (externalVersion == dictionary.getVersion() + 1) {
|
||||
dictionary.setVersion(externalVersion);
|
||||
dictionaryClient.addEntries(dm.getType(), ruleSetId, new ArrayList<>(dm.getLocalEntries()), false, GLOBAL_DOSSIER);
|
||||
long externalVersion = dictionaryClient.getVersion(ruleSetId, GLOBAL_DOSSIER);
|
||||
if (externalVersion == dictionary.getVersion().getRulesetVersion() + 1) {
|
||||
dictionary.getVersion().setRulesetVersion(externalVersion);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
private Set<DictionaryEntry> convertEntries(TypeResult t) {
|
||||
private Set<DictionaryEntry> convertEntries(TypeResult t, String dossierId) {
|
||||
|
||||
Set<DictionaryEntry> entries = new HashSet<>(dictionaryClient.getDictionaryForType(t.getType(), t.getRuleSetId())
|
||||
Set<DictionaryEntry> entries = new HashSet<>(dictionaryClient.getDictionaryForType(t.getType(), t.getRuleSetId(), dossierId)
|
||||
.getEntries());
|
||||
|
||||
if (t.isCaseInsensitive()) {
|
||||
@ -181,17 +196,26 @@ public class DictionaryService {
|
||||
}
|
||||
|
||||
|
||||
public Dictionary getDeepCopyDictionary(String ruleSetId) {
|
||||
public Dictionary getDeepCopyDictionary(String ruleSetId, String dossierId) {
|
||||
|
||||
List<DictionaryModel> copy = new ArrayList<>();
|
||||
|
||||
var representation = dictionariesByRuleSets.get(ruleSetId);
|
||||
var dictionary = dictionariesByRuleSets.get(ruleSetId).getDictionary();
|
||||
dictionary.forEach(dm -> {
|
||||
var rulesetRepresentation = dictionariesByRuleSets.get(ruleSetId);
|
||||
rulesetRepresentation.getDictionary().forEach(dm -> {
|
||||
copy.add(SerializationUtils.clone(dm));
|
||||
});
|
||||
|
||||
return new Dictionary(copy, representation.getDictionaryVersion());
|
||||
//TODO merge dictionaries if they have the same names
|
||||
long dossierDictionaryVersion = -1;
|
||||
if(dictionariesByDossier.containsKey(dossierId)) {
|
||||
var dossierRepresentation = dictionariesByDossier.get(dossierId);
|
||||
dossierRepresentation.getDictionary().forEach(dm -> {
|
||||
copy.add(SerializationUtils.clone(dm));
|
||||
});
|
||||
dossierDictionaryVersion = dossierRepresentation.getDictionaryVersion();
|
||||
}
|
||||
|
||||
return new Dictionary(copy, DictionaryVersion.builder().rulesetVersion(rulesetRepresentation.getDictionaryVersion()).dossierVersion(dossierDictionaryVersion).build());
|
||||
}
|
||||
|
||||
|
||||
@ -212,4 +236,4 @@ public class DictionaryService {
|
||||
return dictionariesByRuleSets.get(ruleSetId).getRequestAddColor();
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,11 +1,10 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.service;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.InputStream;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import com.iqser.red.service.configuration.v1.api.model.RulesResponse;
|
||||
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
|
||||
import com.iqser.red.service.redaction.v1.server.exception.RulesValidationException;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Section;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.kie.api.KieServices;
|
||||
import org.kie.api.builder.KieBuilder;
|
||||
@ -15,12 +14,11 @@ import org.kie.api.runtime.KieContainer;
|
||||
import org.kie.api.runtime.KieSession;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.configuration.v1.api.model.RulesResponse;
|
||||
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
|
||||
import com.iqser.red.service.redaction.v1.server.exception.RulesValidationException;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Section;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.InputStream;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@ -28,9 +26,9 @@ public class DroolsExecutionService {
|
||||
|
||||
private final RulesClient rulesClient;
|
||||
|
||||
private Map<String, KieContainer> kieContainers = new HashMap<>();
|
||||
private final Map<String, KieContainer> kieContainers = new HashMap<>();
|
||||
|
||||
private Map<String, Long> rulesVersionPerRuleSetId = new HashMap<>();
|
||||
private final Map<String, Long> rulesVersionPerRuleSetId = new HashMap<>();
|
||||
|
||||
|
||||
public KieContainer getKieContainer(String ruleSetId) {
|
||||
@ -133,4 +131,4 @@ public class DroolsExecutionService {
|
||||
return rulesVersion.longValue();
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,50 +1,27 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.kie.api.runtime.KieContainer;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry;
|
||||
import com.iqser.red.service.redaction.v1.model.ManualRedactions;
|
||||
import com.iqser.red.service.redaction.v1.model.Point;
|
||||
import com.iqser.red.service.redaction.v1.model.Rectangle;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionArea;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionText;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Footer;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Header;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.UnclassifiedText;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.*;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryModel;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.ImageType;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Section;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SectionSearchableTextPair;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.*;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.kie.api.runtime.KieContainer;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@ -56,13 +33,13 @@ public class EntityRedactionService {
|
||||
private final SurroundingWordsService surroundingWordsService;
|
||||
|
||||
|
||||
public void processDocument(Document classifiedDoc, String ruleSetId, ManualRedactions manualRedactions) {
|
||||
public void processDocument(Document classifiedDoc, String ruleSetId, ManualRedactions manualRedactions, String dossierId) {
|
||||
|
||||
dictionaryService.updateDictionary(ruleSetId);
|
||||
dictionaryService.updateDictionary(ruleSetId, dossierId);
|
||||
KieContainer container = droolsExecutionService.updateRules(ruleSetId);
|
||||
long rulesVersion = droolsExecutionService.getRulesVersion(ruleSetId);
|
||||
|
||||
Dictionary dictionary = dictionaryService.getDeepCopyDictionary(ruleSetId);
|
||||
Dictionary dictionary = dictionaryService.getDeepCopyDictionary(ruleSetId, dossierId);
|
||||
|
||||
Set<Entity> documentEntities = new HashSet<>(findEntities(classifiedDoc, container, manualRedactions, dictionary, false, null));
|
||||
|
||||
@ -94,7 +71,7 @@ public class EntityRedactionService {
|
||||
.add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry
|
||||
.getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber(), entity
|
||||
.getLegalBasis(), entity.isDictionaryEntry(), entity.getTextBefore(), entity.getTextAfter(), entity
|
||||
.getStart(), entity.getEnd()));
|
||||
.getStart(), entity.getEnd(), entity.isDossierDictionaryEntry()));
|
||||
}
|
||||
}
|
||||
|
||||
@ -210,6 +187,7 @@ public class EntityRedactionService {
|
||||
.get(0)
|
||||
.getPage());
|
||||
sectionText.getSectionAreas().add(sectionArea);
|
||||
sectionText.getTextBlocks().addAll(cell.getTextBlocks());
|
||||
|
||||
addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber.intValue());
|
||||
int cellStart = start;
|
||||
@ -258,6 +236,8 @@ public class EntityRedactionService {
|
||||
sectionText.setHeadline(table.getHeadline());
|
||||
sectionText.setSectionNumber(sectionNumber.intValue());
|
||||
sectionText.setTable(true);
|
||||
sectionText.setTabularData(tabularData);
|
||||
sectionText.setCellStarts(cellStarts);
|
||||
classifiedDoc.getSectionText().add(sectionText);
|
||||
}
|
||||
|
||||
@ -290,6 +270,7 @@ public class EntityRedactionService {
|
||||
.getSequences()
|
||||
.get(0)
|
||||
.getPage());
|
||||
sectionText.getTextBlocks().addAll(cell.getTextBlocks());
|
||||
sectionText.getSectionAreas().add(sectionArea);
|
||||
}
|
||||
|
||||
@ -348,6 +329,10 @@ public class EntityRedactionService {
|
||||
sectionText.setHeadline(headline);
|
||||
sectionText.setSectionNumber(sectionNumber.intValue());
|
||||
sectionText.setTable(false);
|
||||
sectionText.setImages(images.stream()
|
||||
.map(image -> convert(image, sectionNumber.intValue(), headline))
|
||||
.collect(Collectors.toSet()));
|
||||
sectionText.setTextBlocks(paragraphTextBlocks);
|
||||
classifiedDoc.getSectionText().add(sectionText);
|
||||
}
|
||||
|
||||
@ -386,9 +371,9 @@ public class EntityRedactionService {
|
||||
String lowercaseInputString = searchableString.toLowerCase();
|
||||
for (DictionaryModel model : dictionary.getDictionaryModels()) {
|
||||
if (model.isCaseInsensitive()) {
|
||||
found.addAll(EntitySearchUtils.find(lowercaseInputString, model.getValues(local), model.getType(), headline, sectionNumber, local));
|
||||
found.addAll(EntitySearchUtils.find(lowercaseInputString, model.getValues(local), model.getType(), headline, sectionNumber, local, model.isDossierDictionary()));
|
||||
} else {
|
||||
found.addAll(EntitySearchUtils.find(searchableString, model.getValues(local), model.getType(), headline, sectionNumber, local));
|
||||
found.addAll(EntitySearchUtils.find(searchableString, model.getValues(local), model.getType(), headline, sectionNumber, local, model.isDossierDictionary()));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -1,21 +1,17 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.service;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.client.ImageClassificationClient;
|
||||
import com.iqser.red.service.redaction.v1.server.client.ImageClassificationResponse;
|
||||
import com.iqser.red.service.redaction.v1.server.client.MockMultipartFile;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.ImageType;
|
||||
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@ -26,37 +22,41 @@ public class ImageClassificationService {
|
||||
private final RedactionServiceSettings settings;
|
||||
|
||||
|
||||
public void classifyImages(Document classifiedDoc) {
|
||||
public void classifyImages(Page page) {
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
classifiedDoc.getPages().forEach(page -> {
|
||||
page.getImages().forEach(image -> {
|
||||
page.getImages().forEach(image -> {
|
||||
|
||||
if (settings.isEnableImageClassification()) {
|
||||
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
|
||||
ImageIO.write(image.getImage(), "png", baos);
|
||||
ImageClassificationResponse response = imageClassificationClient.classify(new MockMultipartFile("file", "Image.png", "image/png", baos
|
||||
.toByteArray()));
|
||||
image.setImageType(ImageType.valueOf(response.getCategory()));
|
||||
if (settings.isEnableImageClassification()) {
|
||||
|
||||
} catch (IOException e) {
|
||||
log.error("Could not classify image", e);
|
||||
}
|
||||
} else {
|
||||
long start = System.currentTimeMillis();
|
||||
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
|
||||
ImageIO.write(image.getImage(), "png", baos);
|
||||
var mockFile = new MockMultipartFile("file", "Image.png", "image/png", baos.toByteArray());
|
||||
ImageClassificationResponse response = imageClassificationClient.classify(mockFile);
|
||||
image.setImageType(ImageType.valueOf(response.getCategory()));
|
||||
} catch (Exception e) {
|
||||
log.error("Could not classify image", e);
|
||||
image.setImageType(ImageType.OTHER);
|
||||
}
|
||||
|
||||
if (image.getImageType().equals(ImageType.OTHER)) {
|
||||
page.getTextBlocks().forEach(textblock -> {
|
||||
if (image.getPosition()
|
||||
.contains(textblock.getMinX(), textblock.getMinY(), textblock.getWidth(), textblock.getHeight())) {
|
||||
image.setImageType(ImageType.OCR);
|
||||
}
|
||||
});
|
||||
}
|
||||
});
|
||||
log.info("Image classification took: " + (System.currentTimeMillis() - start));
|
||||
} else {
|
||||
image.setImageType(ImageType.OTHER);
|
||||
}
|
||||
|
||||
image.getImage().flush();
|
||||
image.setImage(null);
|
||||
|
||||
if (image.getImageType().equals(ImageType.OTHER)) {
|
||||
page.getTextBlocks().forEach(textblock -> {
|
||||
if (image.getPosition()
|
||||
.contains(textblock.getMinX(), textblock.getMinY(), textblock.getWidth(), textblock.getHeight())) {
|
||||
image.setImageType(ImageType.OCR);
|
||||
}
|
||||
});
|
||||
}
|
||||
});
|
||||
log.info("Image classification took: " + (System.currentTimeMillis() - start));
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,53 +1,30 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.service;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
import com.iqser.red.service.file.management.v1.api.model.FileType;
|
||||
import com.iqser.red.service.redaction.v1.model.*;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.SectionText;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Text;
|
||||
import com.iqser.red.service.redaction.v1.server.exception.RedactionException;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.*;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
|
||||
import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService;
|
||||
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.kie.api.runtime.KieContainer;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.web.bind.annotation.RequestBody;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.Comment;
|
||||
import com.iqser.red.service.redaction.v1.model.IdRemoval;
|
||||
import com.iqser.red.service.redaction.v1.model.ManualForceRedact;
|
||||
import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry;
|
||||
import com.iqser.red.service.redaction.v1.model.ManualRedactions;
|
||||
import com.iqser.red.service.redaction.v1.model.ReanalyzeResult;
|
||||
import com.iqser.red.service.redaction.v1.model.Rectangle;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
|
||||
import com.iqser.red.service.redaction.v1.model.RenalyzeRequest;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionArea;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionText;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.exception.RedactionException;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.PDFAreaTextStripper;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncrement;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.ReanalysisSection;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Section;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SectionSearchableTextPair;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class ReanalyzeService {
|
||||
@ -57,39 +34,98 @@ public class ReanalyzeService {
|
||||
private final SurroundingWordsService surroundingWordsService;
|
||||
private final EntityRedactionService entityRedactionService;
|
||||
private final RedactionLogCreatorService redactionLogCreatorService;
|
||||
private final RedactionStorageService redactionStorageService;
|
||||
private final PdfSegmentationService pdfSegmentationService;
|
||||
private final RedactionChangeLogService redactionChangeLogService;
|
||||
private final AnalyzeResponseService analyzeResponseService;
|
||||
|
||||
|
||||
public ReanalyzeResult reanalyze(@RequestBody RenalyzeRequest renalyzeRequest) {
|
||||
public AnalyzeResult analyze(AnalyzeRequest analyzeRequest) {
|
||||
|
||||
DictionaryIncrement dictionaryIncrement = dictionaryService.getDictionaryIncrements(renalyzeRequest.getRuleSetId(), renalyzeRequest
|
||||
.getRedactionLog()
|
||||
.getDictionaryVersion());
|
||||
long startTime = System.currentTimeMillis();
|
||||
|
||||
Set<String> manualForceAndRemoveIds = getForceAndRemoveIds(renalyzeRequest.getManualRedactions());
|
||||
var pageCount = 0;
|
||||
Document classifiedDoc;
|
||||
|
||||
try {
|
||||
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(analyzeRequest
|
||||
.getProjectId(), analyzeRequest.getFileId(), FileType.ORIGIN));
|
||||
classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream);
|
||||
pageCount = classifiedDoc.getPages().size();
|
||||
} catch (Exception e) {
|
||||
throw new RedactionException(e);
|
||||
}
|
||||
log.info("Document structure analysis successful, starting redaction analysis...");
|
||||
|
||||
entityRedactionService.processDocument(classifiedDoc, analyzeRequest.getRuleSetId(), analyzeRequest.getManualRedactions(), analyzeRequest
|
||||
.getProjectId());
|
||||
redactionLogCreatorService.createRedactionLog(classifiedDoc, pageCount, analyzeRequest.getManualRedactions(), analyzeRequest
|
||||
.getRuleSetId());
|
||||
|
||||
log.info("Redaction analysis successful...");
|
||||
|
||||
var redactionLog = new RedactionLog(classifiedDoc.getRedactionLogEntities(), classifiedDoc.getDictionaryVersion()
|
||||
.getRulesetVersion(), classifiedDoc.getRulesVersion(), analyzeRequest.getRuleSetId(), classifiedDoc.getDictionaryVersion()
|
||||
.getDossierVersion());
|
||||
|
||||
log.info("Analyzed with rules {} and dictionary {} for ruleSet: {}", classifiedDoc.getRulesVersion(), classifiedDoc
|
||||
.getDictionaryVersion(), analyzeRequest.getRuleSetId());
|
||||
|
||||
// first create changelog - this only happens when we migrate files analyzed via the old process and we don't want to lose changeLog data
|
||||
var changeLog = redactionChangeLogService.createAndStoreChangeLog(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), redactionLog);
|
||||
// store redactionLog
|
||||
redactionStorageService.storeObject(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog);
|
||||
redactionStorageService.storeObject(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.TEXT, new Text(pageCount, classifiedDoc
|
||||
.getSectionText()));
|
||||
redactionStorageService.storeObject(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.SECTION_GRID, classifiedDoc
|
||||
.getSectionGrid());
|
||||
|
||||
long duration = System.currentTimeMillis() - startTime;
|
||||
return analyzeResponseService.createAnalyzeResponse(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), duration, pageCount, redactionLog, changeLog);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public AnalyzeResult reanalyze(@RequestBody AnalyzeRequest analyzeRequest) {
|
||||
|
||||
long startTime = System.currentTimeMillis();
|
||||
|
||||
var redactionLog = redactionStorageService.getRedactionLog(analyzeRequest.getProjectId(), analyzeRequest.getFileId());
|
||||
var text = redactionStorageService.getText(analyzeRequest.getProjectId(), analyzeRequest.getFileId());
|
||||
|
||||
// not yet ready for reanalysis
|
||||
if (redactionLog == null || text == null || text.getNumberOfPages() == 0) {
|
||||
return analyze(analyzeRequest);
|
||||
}
|
||||
|
||||
DictionaryIncrement dictionaryIncrement = dictionaryService.getDictionaryIncrements(analyzeRequest.getRuleSetId(), new DictionaryVersion(redactionLog
|
||||
.getDictionaryVersion(), redactionLog.getDossierDictionaryVersion()), analyzeRequest.getProjectId());
|
||||
|
||||
Set<String> manualForceAndRemoveIds = getForceAndRemoveIds(analyzeRequest.getManualRedactions());
|
||||
Map<String, List<Comment>> comments = null;
|
||||
Set<ManualRedactionEntry> manualAdds = null;
|
||||
|
||||
if (renalyzeRequest.getManualRedactions() != null) {
|
||||
if (analyzeRequest.getManualRedactions() != null) {
|
||||
// TODO comments will be removed from redactionLog, so we ignore this for now.
|
||||
comments = renalyzeRequest.getManualRedactions().getComments();
|
||||
manualAdds = renalyzeRequest.getManualRedactions().getEntriesToAdd();
|
||||
comments = analyzeRequest.getManualRedactions().getComments();
|
||||
manualAdds = analyzeRequest.getManualRedactions().getEntriesToAdd();
|
||||
}
|
||||
|
||||
Set<Integer> sectionsToReanaylse = new HashSet<>();
|
||||
Set<Integer> sectionsToReanalyse = new HashSet<>();
|
||||
Map<Integer, Set<Image>> imageEntries = new HashMap<>();
|
||||
for (RedactionLogEntry entry : renalyzeRequest.getRedactionLog().getRedactionLogEntry()) {
|
||||
for (RedactionLogEntry entry : redactionLog.getRedactionLogEntry()) {
|
||||
if (entry.isManual() || manualForceAndRemoveIds.contains(entry.getId())) {
|
||||
sectionsToReanaylse.add(entry.getSectionNumber());
|
||||
sectionsToReanalyse.add(entry.getSectionNumber());
|
||||
}
|
||||
if (entry.isImage() || entry.getType().equals("image")) {
|
||||
imageEntries.computeIfAbsent(entry.getSectionNumber(), x -> new HashSet<>()).add(convert(entry));
|
||||
}
|
||||
}
|
||||
|
||||
for (SectionText sectionText : renalyzeRequest.getText().getSectionTexts()) {
|
||||
for (SectionText sectionText : text.getSectionTexts()) {
|
||||
|
||||
if (EntitySearchUtils.sectionContainsAny(sectionText.getText(), dictionaryIncrement.getValues())) {
|
||||
sectionsToReanaylse.add(sectionText.getSectionNumber());
|
||||
sectionsToReanalyse.add(sectionText.getSectionNumber());
|
||||
}
|
||||
|
||||
if (manualAdds != null) {
|
||||
@ -106,187 +142,123 @@ public class ReanalyzeService {
|
||||
}
|
||||
}
|
||||
|
||||
if (sectionsToReanaylse.isEmpty() && (manualAdds == null || manualAdds.isEmpty())) {
|
||||
renalyzeRequest.getRedactionLog().setDictionaryVersion(dictionaryIncrement.getDictionaryVersion());
|
||||
return ReanalyzeResult.builder().redactionLog(renalyzeRequest.getRedactionLog()).build();
|
||||
log.info("Should reanalyze {} sections for request: {}", sectionsToReanalyse.size(), analyzeRequest);
|
||||
|
||||
if (sectionsToReanalyse.isEmpty() && (manualAdds == null || manualAdds.isEmpty())) {
|
||||
return finalizeAnalysis(analyzeRequest, startTime, redactionLog, text, dictionaryIncrement);
|
||||
}
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(renalyzeRequest.getDocument()))) {
|
||||
List<SectionText> reanalysisSections = new ArrayList<>();
|
||||
|
||||
List<ReanalysisSection> reanalysisSections = new ArrayList<>();
|
||||
for (SectionText sectionText : renalyzeRequest.getText().getSectionTexts()) {
|
||||
for (SectionText sectionText : text.getSectionTexts()) {
|
||||
|
||||
if (!sectionsToReanaylse.contains(sectionText.getSectionNumber())) {
|
||||
continue;
|
||||
}
|
||||
if (sectionsToReanalyse.contains(sectionText.getSectionNumber())) {
|
||||
reanalysisSections.add(sectionText);
|
||||
}
|
||||
}
|
||||
|
||||
ReanalysisSection reanalysisSection = new ReanalysisSection();
|
||||
reanalysisSection.setHeadline(sectionText.getHeadline());
|
||||
reanalysisSection.setSectionNumber(sectionText.getSectionNumber());
|
||||
List<TextBlock> textBlocks = new ArrayList<>();
|
||||
//--
|
||||
|
||||
Map<Integer, List<SectionArea>> sectionAreasPerPage = new HashMap<>();
|
||||
for (SectionArea sectionArea : sectionText.getSectionAreas()) {
|
||||
sectionAreasPerPage.computeIfAbsent(sectionArea.getPage(), (x) -> new ArrayList<>())
|
||||
.add(sectionArea);
|
||||
}
|
||||
KieContainer kieContainer = droolsExecutionService.updateRules(analyzeRequest.getRuleSetId());
|
||||
|
||||
Map<String, CellValue> tabularData = new HashMap<>();
|
||||
List<Integer> cellStarts = new ArrayList<>();
|
||||
for (Integer page : sectionAreasPerPage.keySet()) {
|
||||
List<SectionArea> areasOnPage = sectionAreasPerPage.get(page);
|
||||
Dictionary dictionary = dictionaryService.getDeepCopyDictionary(analyzeRequest.getRuleSetId(), analyzeRequest.getProjectId());
|
||||
|
||||
PDPage pdPage = pdDocument.getPage(page - 1);
|
||||
PDRectangle cropBox = pdPage.getCropBox();
|
||||
PDFAreaTextStripper textStripper = new PDFAreaTextStripper();
|
||||
textStripper.setPageNumber(page);
|
||||
List<SectionSearchableTextPair> sectionSearchableTextPairs = new ArrayList<>();
|
||||
for (SectionText reanalysisSection : reanalysisSections) {
|
||||
|
||||
int cellStart = 0;
|
||||
for (SectionArea sectionArea : areasOnPage) {
|
||||
|
||||
Rectangle2D rect = null;
|
||||
if (pdPage.getRotation() == 90) {
|
||||
rect = new Rectangle2D.Float(sectionArea.getTopLeft().getY(), sectionArea.getTopLeft()
|
||||
.getX(), sectionArea.getHeight(), sectionArea.getWidth() + 0.001f);
|
||||
} else {
|
||||
rect = new Rectangle2D.Float(sectionArea.getTopLeft().getX(), -sectionArea.getTopLeft()
|
||||
.getY() + cropBox.getUpperRightY() - sectionArea.getHeight(), sectionArea.getWidth(), sectionArea
|
||||
.getHeight() + 0.001f);
|
||||
}
|
||||
|
||||
textStripper.addRegion(String.valueOf(1), rect);
|
||||
textStripper.extractRegions(pdPage);
|
||||
textStripper.getTextForRegion(String.valueOf(1));
|
||||
List<TextPositionSequence> positions = textStripper.getTextPositionSequences();
|
||||
|
||||
TextBlock textBlock = new TextBlock(sectionArea.getTopLeft().getX(), sectionArea.getTopLeft()
|
||||
.getX() + sectionArea.getWidth(), sectionArea.getTopLeft()
|
||||
.getY(), sectionArea.getTopLeft().getY() + sectionArea.getHeight(), positions, 0);
|
||||
|
||||
if (sectionText.isTable()) {
|
||||
Cell cell = new Cell();
|
||||
cell.addTextBlock(textBlock);
|
||||
tabularData.put(sectionArea.getHeader(), new CellValue(cell.getTextBlocks(), cellStart));
|
||||
cellStarts.add(cellStart);
|
||||
cellStart = cellStart + cell.toString().trim().length() + 1;
|
||||
}
|
||||
|
||||
textBlocks.add(textBlock);
|
||||
textStripper.clearPositions();
|
||||
}
|
||||
|
||||
}
|
||||
reanalysisSection.setTextBlocks(textBlocks);
|
||||
reanalysisSection.setTabularData(tabularData);
|
||||
|
||||
if (sectionText.isTable()) {
|
||||
reanalysisSection.setCellStarts(cellStarts);
|
||||
}
|
||||
if (imageEntries.containsKey(sectionText.getSectionNumber())) {
|
||||
reanalysisSection.getImages().addAll(imageEntries.get(sectionText.getSectionNumber()));
|
||||
}
|
||||
|
||||
reanalysisSections.add(reanalysisSection);
|
||||
Set<Entity> entities = entityRedactionService.findEntities(reanalysisSection.getSearchableText(), reanalysisSection
|
||||
.getHeadline(), reanalysisSection.getSectionNumber(), dictionary, false);
|
||||
if (reanalysisSection.getCellStarts() != null) {
|
||||
surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary, reanalysisSection
|
||||
.getCellStarts());
|
||||
} else {
|
||||
surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary);
|
||||
}
|
||||
|
||||
//--
|
||||
sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder()
|
||||
.isLocal(false)
|
||||
.dictionaryTypes(dictionary.getTypes())
|
||||
.entities(entities)
|
||||
.text(reanalysisSection.getSearchableText().getAsStringWithLinebreaks())
|
||||
.searchText(reanalysisSection.getSearchableText().toString())
|
||||
.headline(reanalysisSection.getHeadline())
|
||||
.sectionNumber(reanalysisSection.getSectionNumber())
|
||||
.tabularData(reanalysisSection.getTabularData())
|
||||
.searchableText(reanalysisSection.getSearchableText())
|
||||
.dictionary(dictionary)
|
||||
.images(reanalysisSection.getImages())
|
||||
.build(), reanalysisSection.getSearchableText()));
|
||||
}
|
||||
|
||||
KieContainer kieContainer = droolsExecutionService.updateRules(renalyzeRequest.getRuleSetId());
|
||||
Set<Entity> entities = new HashSet<>();
|
||||
Map<Integer, Set<Image>> imagesPerPage = new HashMap<>();
|
||||
sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> {
|
||||
Section analysedRowSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair.getSection());
|
||||
entities.addAll(analysedRowSection.getEntities());
|
||||
EntitySearchUtils.removeEntitiesContainedInLarger(entities);
|
||||
|
||||
Dictionary dictionary = dictionaryService.getDeepCopyDictionary(renalyzeRequest.getRuleSetId());
|
||||
|
||||
List<SectionSearchableTextPair> sectionSearchableTextPairs = new ArrayList<>();
|
||||
for (ReanalysisSection reanalysisSection : reanalysisSections) {
|
||||
|
||||
Set<Entity> entities = entityRedactionService.findEntities(reanalysisSection.getSearchableText(), reanalysisSection
|
||||
.getHeadline(), reanalysisSection.getSectionNumber(), dictionary, false);
|
||||
if (reanalysisSection.getCellStarts() != null) {
|
||||
surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary, reanalysisSection
|
||||
.getCellStarts());
|
||||
} else {
|
||||
surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary);
|
||||
}
|
||||
|
||||
sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder()
|
||||
.isLocal(false)
|
||||
.dictionaryTypes(dictionary.getTypes())
|
||||
.entities(entities)
|
||||
.text(reanalysisSection.getSearchableText().getAsStringWithLinebreaks())
|
||||
.searchText(reanalysisSection.getSearchableText().toString())
|
||||
.headline(reanalysisSection.getHeadline())
|
||||
.sectionNumber(reanalysisSection.getSectionNumber())
|
||||
.tabularData(reanalysisSection.getTabularData())
|
||||
.searchableText(reanalysisSection.getSearchableText())
|
||||
.dictionary(dictionary)
|
||||
.images(reanalysisSection.getImages())
|
||||
.build(), reanalysisSection.getSearchableText()));
|
||||
for (Image image : analysedRowSection.getImages()) {
|
||||
imagesPerPage.computeIfAbsent(image.getPage(), (a) -> new HashSet<>()).add(image);
|
||||
}
|
||||
|
||||
Set<Entity> entities = new HashSet<>();
|
||||
Map<Integer, Set<Image>> imagesPerPage = new HashMap<>();
|
||||
sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> {
|
||||
Section analysedRowSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair
|
||||
.getSection());
|
||||
entities.addAll(analysedRowSection.getEntities());
|
||||
EntitySearchUtils.removeEntitiesContainedInLarger(entities);
|
||||
});
|
||||
|
||||
for (Image image : analysedRowSection.getImages()) {
|
||||
imagesPerPage.computeIfAbsent(image.getPage(), (a) -> new HashSet<>()).add(image);
|
||||
}
|
||||
|
||||
});
|
||||
|
||||
Map<Integer, List<Entity>> entitiesPerPage = new HashMap<>();
|
||||
for (Entity entity : entities) {
|
||||
Map<Integer, List<EntityPositionSequence>> sequenceOnPage = new HashMap<>();
|
||||
for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) {
|
||||
sequenceOnPage.computeIfAbsent(entityPositionSequence.getPageNumber(), (x) -> new ArrayList<>())
|
||||
.add(entityPositionSequence);
|
||||
}
|
||||
|
||||
for (Map.Entry<Integer, List<EntityPositionSequence>> entry : sequenceOnPage.entrySet()) {
|
||||
entitiesPerPage.computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>())
|
||||
.add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry
|
||||
.getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber(), entity
|
||||
.getLegalBasis(), entity.isDictionaryEntry(), entity.getTextBefore(), entity.getTextAfter(), entity
|
||||
.getStart(), entity.getEnd()));
|
||||
}
|
||||
Map<Integer, List<Entity>> entitiesPerPage = new HashMap<>();
|
||||
for (Entity entity : entities) {
|
||||
Map<Integer, List<EntityPositionSequence>> sequenceOnPage = new HashMap<>();
|
||||
for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) {
|
||||
sequenceOnPage.computeIfAbsent(entityPositionSequence.getPageNumber(), (x) -> new ArrayList<>())
|
||||
.add(entityPositionSequence);
|
||||
}
|
||||
|
||||
List<RedactionLogEntry> newRedactionLogEntries = new ArrayList<>();
|
||||
for (int page = 1; page <= pdDocument.getNumberOfPages(); page++) {
|
||||
if (entitiesPerPage.get(page) != null) {
|
||||
newRedactionLogEntries.addAll(redactionLogCreatorService.addEntries(entitiesPerPage, renalyzeRequest
|
||||
.getManualRedactions(), page, renalyzeRequest.getRuleSetId()));
|
||||
}
|
||||
for (Map.Entry<Integer, List<EntityPositionSequence>> entry : sequenceOnPage.entrySet()) {
|
||||
entitiesPerPage.computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>())
|
||||
.add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry
|
||||
.getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber(), entity
|
||||
.getLegalBasis(), entity.isDictionaryEntry(), entity.getTextBefore(), entity.getTextAfter(), entity
|
||||
.getStart(), entity.getEnd(), entity.isDossierDictionaryEntry()));
|
||||
}
|
||||
}
|
||||
|
||||
if (imagesPerPage.get(page) != null) {
|
||||
newRedactionLogEntries.addAll(redactionLogCreatorService.addImageEntries(imagesPerPage, renalyzeRequest
|
||||
.getManualRedactions(), page, renalyzeRequest.getRuleSetId()));
|
||||
}
|
||||
|
||||
newRedactionLogEntries.addAll(redactionLogCreatorService.addManualAddEntries(manualAdds, comments, page, renalyzeRequest
|
||||
List<RedactionLogEntry> newRedactionLogEntries = new ArrayList<>();
|
||||
for (int page = 1; page <= text.getNumberOfPages(); page++) {
|
||||
if (entitiesPerPage.get(page) != null) {
|
||||
newRedactionLogEntries.addAll(redactionLogCreatorService.addEntries(entitiesPerPage, analyzeRequest.getManualRedactions(), page, analyzeRequest
|
||||
.getRuleSetId()));
|
||||
}
|
||||
|
||||
Iterator<RedactionLogEntry> itty = renalyzeRequest.getRedactionLog().getRedactionLogEntry().iterator();
|
||||
while (itty.hasNext()) {
|
||||
RedactionLogEntry entry = itty.next();
|
||||
if (sectionsToReanaylse.contains(entry.getSectionNumber())) {
|
||||
itty.remove();
|
||||
}
|
||||
if (imagesPerPage.get(page) != null) {
|
||||
newRedactionLogEntries.addAll(redactionLogCreatorService.addImageEntries(imagesPerPage, analyzeRequest.getManualRedactions(), page, analyzeRequest
|
||||
.getRuleSetId()));
|
||||
}
|
||||
|
||||
renalyzeRequest.getRedactionLog().getRedactionLogEntry().addAll(newRedactionLogEntries);
|
||||
|
||||
renalyzeRequest.getRedactionLog().setDictionaryVersion(dictionaryIncrement.getDictionaryVersion());
|
||||
|
||||
return ReanalyzeResult.builder().redactionLog(renalyzeRequest.getRedactionLog()).build();
|
||||
|
||||
} catch (Exception e) {
|
||||
throw new RedactionException(e);
|
||||
newRedactionLogEntries.addAll(redactionLogCreatorService.addManualAddEntries(manualAdds, comments, page, analyzeRequest
|
||||
.getRuleSetId()));
|
||||
}
|
||||
|
||||
redactionLog.getRedactionLogEntry()
|
||||
.removeIf(entry -> sectionsToReanalyse.contains(entry.getSectionNumber()) && !entry.isImage() || entry.getSectionNumber() == 0 && !entry
|
||||
.isImage());
|
||||
redactionLog.getRedactionLogEntry().addAll(newRedactionLogEntries);
|
||||
return finalizeAnalysis(analyzeRequest, startTime, redactionLog, text, dictionaryIncrement);
|
||||
|
||||
}
|
||||
|
||||
|
||||
private AnalyzeResult finalizeAnalysis(@RequestBody AnalyzeRequest analyzeRequest, long startTime,
|
||||
RedactionLog redactionLog, Text text,
|
||||
DictionaryIncrement dictionaryIncrement) {
|
||||
|
||||
redactionLog.setDictionaryVersion(dictionaryIncrement.getDictionaryVersion().getRulesetVersion());
|
||||
redactionLog.setDossierDictionaryVersion(dictionaryIncrement.getDictionaryVersion().getDossierVersion());
|
||||
|
||||
var changeLog = redactionChangeLogService.createAndStoreChangeLog(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), redactionLog);
|
||||
redactionStorageService.storeObject(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog);
|
||||
|
||||
long duration = System.currentTimeMillis() - startTime;
|
||||
|
||||
return analyzeResponseService.createAnalyzeResponse(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), duration, text
|
||||
.getNumberOfPages(), redactionLog, changeLog);
|
||||
}
|
||||
|
||||
|
||||
@ -309,7 +281,7 @@ public class ReanalyzeService {
|
||||
|
||||
return Image.builder()
|
||||
.type(entry.getType())
|
||||
.position(new Rectangle2D.Float(position.getTopLeft().getX(), position.getTopLeft()
|
||||
.position(new RedRectangle2D(position.getTopLeft().getX(), position.getTopLeft()
|
||||
.getY(), position.getWidth(), position.getHeight()))
|
||||
.sectionNumber(entry.getSectionNumber())
|
||||
.section(entry.getSection())
|
||||
|
||||
@ -0,0 +1,93 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.service;
|
||||
|
||||
import com.iqser.red.service.file.management.v1.api.model.FileType;
|
||||
import com.iqser.red.service.redaction.v1.model.ChangeType;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionChangeLog;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionChangeLogEntry;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionLog;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
|
||||
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class RedactionChangeLogService {
|
||||
|
||||
private final RedactionStorageService redactionStorageService;
|
||||
|
||||
public RedactionChangeLog createAndStoreChangeLog(String projectId, String fileId, RedactionLog currentRedactionLog) {
|
||||
|
||||
try {
|
||||
RedactionLog previousRedactionLog = redactionStorageService.getRedactionLog(projectId, fileId);
|
||||
var changeLog = createChangeLog(currentRedactionLog, previousRedactionLog);
|
||||
redactionStorageService.storeObject(projectId, fileId, FileType.REDACTION_CHANGELOG, changeLog);
|
||||
return changeLog;
|
||||
} catch (Exception e) {
|
||||
log.debug("Previous redaction log not available");
|
||||
return null;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private RedactionChangeLog createChangeLog(RedactionLog currentRedactionLog, RedactionLog previousRedactionLog) {
|
||||
|
||||
|
||||
if (previousRedactionLog == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
List<RedactionLogEntry> added = new ArrayList<>(currentRedactionLog.getRedactionLogEntry());
|
||||
added.removeAll(previousRedactionLog.getRedactionLogEntry());
|
||||
|
||||
List<RedactionLogEntry> removed = new ArrayList<>(previousRedactionLog.getRedactionLogEntry());
|
||||
removed.removeAll(currentRedactionLog.getRedactionLogEntry());
|
||||
|
||||
List<RedactionChangeLogEntry> changeLogEntries = added.stream()
|
||||
.map(entry -> convert(entry, ChangeType.ADDED))
|
||||
.collect(Collectors.toList());
|
||||
changeLogEntries.addAll(removed.stream()
|
||||
.map(entry -> convert(entry, ChangeType.REMOVED))
|
||||
.collect(Collectors.toList()));
|
||||
|
||||
return new RedactionChangeLog(changeLogEntries, currentRedactionLog.getDictionaryVersion(), currentRedactionLog.getRulesVersion(), currentRedactionLog
|
||||
.getRuleSetId());
|
||||
}
|
||||
|
||||
|
||||
private RedactionChangeLogEntry convert(RedactionLogEntry entry, ChangeType changeType) {
|
||||
|
||||
return RedactionChangeLogEntry.builder()
|
||||
.id(entry.getId())
|
||||
.type(entry.getType())
|
||||
.value(entry.getValue())
|
||||
.reason(entry.getReason())
|
||||
.matchedRule(entry.getMatchedRule())
|
||||
.legalBasis(entry.getLegalBasis())
|
||||
.redacted(entry.isRedacted())
|
||||
.isHint(entry.isHint())
|
||||
.isRecommendation(entry.isRecommendation())
|
||||
.section(entry.getSection())
|
||||
.color(entry.getColor())
|
||||
.positions(entry.getPositions())
|
||||
.sectionNumber(entry.getSectionNumber())
|
||||
.manual(entry.isManual())
|
||||
.status(entry.getStatus())
|
||||
.manualRedactionType(entry.getManualRedactionType())
|
||||
.isDictionaryEntry(entry.isDictionaryEntry())
|
||||
.textBefore(entry.getTextBefore())
|
||||
.textAfter(entry.getTextAfter())
|
||||
.comments(entry.getComments())
|
||||
.changeType(changeType)
|
||||
.isDossierDictionaryEntry(entry.isDossierDictionaryEntry())
|
||||
.build();
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,31 +1,10 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.CellRectangle;
|
||||
import com.iqser.red.service.redaction.v1.model.Comment;
|
||||
import com.iqser.red.service.redaction.v1.model.IdRemoval;
|
||||
import com.iqser.red.service.redaction.v1.model.ManualForceRedact;
|
||||
import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry;
|
||||
import com.iqser.red.service.redaction.v1.model.ManualRedactionType;
|
||||
import com.iqser.red.service.redaction.v1.model.ManualRedactions;
|
||||
import com.iqser.red.service.redaction.v1.model.Point;
|
||||
import com.iqser.red.service.redaction.v1.model.Rectangle;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionRectangle;
|
||||
import com.iqser.red.service.redaction.v1.model.Status;
|
||||
import com.iqser.red.service.redaction.v1.model.*;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.RedTextPosition;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
|
||||
@ -34,8 +13,16 @@ import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@ -285,24 +272,24 @@ public class RedactionLogCreatorService {
|
||||
}
|
||||
|
||||
|
||||
private List<Rectangle> getRectanglesPerLine(List<TextPosition> textPositions, int page) {
|
||||
private List<Rectangle> getRectanglesPerLine(List<RedTextPosition> textPositions, int page) {
|
||||
|
||||
List<Rectangle> rectangles = new ArrayList<>();
|
||||
if (textPositions.size() == 1) {
|
||||
rectangles.add(new TextPositionSequence(textPositions, page).getRectangle());
|
||||
rectangles.add( TextPositionSequence.fromData(textPositions, page).getRectangle());
|
||||
} else {
|
||||
float y = textPositions.get(0).getYDirAdj();
|
||||
int startIndex = 0;
|
||||
for (int i = 1; i < textPositions.size(); i++) {
|
||||
float yDirAdj = textPositions.get(i).getYDirAdj();
|
||||
if (yDirAdj != y) {
|
||||
rectangles.add(new TextPositionSequence(textPositions.subList(startIndex, i), page).getRectangle());
|
||||
rectangles.add( TextPositionSequence.fromData(textPositions.subList(startIndex, i), page).getRectangle());
|
||||
y = yDirAdj;
|
||||
startIndex = i;
|
||||
}
|
||||
}
|
||||
if (startIndex != textPositions.size()) {
|
||||
rectangles.add(new TextPositionSequence(textPositions.subList(startIndex, textPositions.size()), page).getRectangle());
|
||||
rectangles.add( TextPositionSequence.fromData(textPositions.subList(startIndex, textPositions.size()), page).getRectangle());
|
||||
}
|
||||
}
|
||||
|
||||
@ -368,6 +355,7 @@ public class RedactionLogCreatorService {
|
||||
.status(manualRedactionEntry.getStatus())
|
||||
.manualRedactionType(ManualRedactionType.ADD)
|
||||
.isDictionaryEntry(false)
|
||||
.isDossierDictionaryEntry(manualRedactionEntry.isAddToDossierDictionary())
|
||||
.build();
|
||||
}
|
||||
|
||||
@ -391,6 +379,7 @@ public class RedactionLogCreatorService {
|
||||
.textBefore(entity.getTextBefore())
|
||||
.startOffset(entity.getStart())
|
||||
.endOffset(entity.getEnd())
|
||||
.isDossierDictionaryEntry(entity.isDossierDictionaryEntry())
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
@ -1,25 +1,17 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.utils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncrementValue;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Slf4j
|
||||
@UtilityClass
|
||||
@SuppressWarnings("PMD")
|
||||
@ -46,7 +38,7 @@ public class EntitySearchUtils {
|
||||
|
||||
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString
|
||||
.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
|
||||
if(value.isCaseinsensitive() || !value.isCaseinsensitive() && sectionText.substring(startIndex, stopIndex).equals(value.getValue())){
|
||||
if (value.isCaseinsensitive() || !value.isCaseinsensitive() && sectionText.substring(startIndex, stopIndex).equals(value.getValue())) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@ -57,7 +49,7 @@ public class EntitySearchUtils {
|
||||
|
||||
|
||||
public Set<Entity> find(String inputString, Set<String> values, String type, String headline, int sectionNumber,
|
||||
boolean local) {
|
||||
boolean local, boolean isDossierDictionary) {
|
||||
|
||||
Set<Entity> found = new HashSet<>();
|
||||
|
||||
@ -77,7 +69,7 @@ public class EntitySearchUtils {
|
||||
|
||||
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString
|
||||
.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
|
||||
found.add(new Entity(inputString.substring(startIndex, stopIndex), type, startIndex, stopIndex, headline, sectionNumber, !local));
|
||||
found.add(new Entity(inputString.substring(startIndex, stopIndex), type, startIndex, stopIndex, headline, sectionNumber, !local, isDossierDictionary));
|
||||
}
|
||||
} while (startIndex > -1);
|
||||
}
|
||||
@ -147,16 +139,16 @@ public class EntitySearchUtils {
|
||||
|
||||
public void addEntitiesWithHigherRank(Set<Entity> entities, Entity found, Dictionary dictionary) {
|
||||
|
||||
if(entities.contains(found)){
|
||||
if (entities.contains(found)) {
|
||||
Entity existing = entities.stream().filter(entity -> entity.equals(found)).findFirst().get();
|
||||
if (dictionary.getDictionaryRank(existing.getType()) <= dictionary.getDictionaryRank(found.getType())){
|
||||
if (dictionary.getDictionaryRank(existing.getType()) <= dictionary.getDictionaryRank(found.getType())) {
|
||||
entities.remove(found);
|
||||
}
|
||||
}
|
||||
entities.add(found);
|
||||
}
|
||||
|
||||
public void addEntitiesIgnoreRank(Set<Entity> entities, Set<Entity> found){
|
||||
public void addEntitiesIgnoreRank(Set<Entity> entities, Set<Entity> found) {
|
||||
// HashSet keeps old value but we want the new.
|
||||
entities.removeAll(found);
|
||||
entities.addAll(found);
|
||||
|
||||
@ -1,15 +1,14 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.utils;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.List;
|
||||
|
||||
import com.google.common.hash.HashFunction;
|
||||
import com.google.common.hash.Hashing;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.RedRectangle2D;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.List;
|
||||
|
||||
@UtilityClass
|
||||
public class IdBuilder {
|
||||
|
||||
@ -26,14 +25,9 @@ public class IdBuilder {
|
||||
}
|
||||
|
||||
|
||||
public String buildId(Rectangle2D rectangle2D, int page){
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append("x").append(rectangle2D.getX()).append("y").append(rectangle2D.getY()).append("h").append(rectangle2D.getHeight()).append("w").append(rectangle2D.getWidth()).append("p").append(page);
|
||||
|
||||
return hashFunction.hashString(sb.toString(), StandardCharsets.UTF_8).toString();
|
||||
public String buildId(RedRectangle2D rectangle2D, int page) {
|
||||
return hashFunction.hashString("x" + rectangle2D.getX() + "y" + rectangle2D.getY() + "h" + rectangle2D.getHeight() + "w" + rectangle2D.getWidth() + "p" + page, StandardCharsets.UTF_8).toString();
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
@ -1,5 +1,7 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.utils;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
@ -8,8 +10,6 @@ import java.nio.charset.StandardCharsets;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class ResourceLoader {
|
||||
|
||||
@ -27,4 +27,4 @@ public class ResourceLoader {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -7,6 +7,7 @@ public class TextNormalizationUtilities {
|
||||
|
||||
/**
|
||||
* Revert hyphenation due to line breaks.
|
||||
*
|
||||
* @param text Text to be processed.
|
||||
* @return Text without line-break hyphenation.
|
||||
*/
|
||||
@ -14,4 +15,4 @@ public class TextNormalizationUtilities {
|
||||
return text.replaceAll("([^\\s\\d\\-]{2,})[\\-\\u00AD]\\R|\n\r(.+ )", "$1$2");
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,88 +1,275 @@
|
||||
package com.iqser.red.service.redaction.v1.server.segmentation;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.service.ClassificationService;
|
||||
import com.iqser.red.service.redaction.v1.server.memory.MemoryStats;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.PDFLinesTextStripper;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.service.ImageClassificationService;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.pdfbox.io.MemoryUsageSetting;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.awt.Graphics;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class PdfSegmentationService {
|
||||
|
||||
private final static int MAX_PAGES_BEFORE_GC = 250;
|
||||
|
||||
private final RulingCleaningService rulingCleaningService;
|
||||
private final TableExtractionService tableExtractionService;
|
||||
private final BlockificationService blockificationService;
|
||||
private final ClassificationService classificationService;
|
||||
private final SectionsBuilderService sectionsBuilderService;
|
||||
private final ImageClassificationService imageClassificationService;
|
||||
|
||||
|
||||
public Document parseDocument(PDDocument pdDocument) throws IOException {
|
||||
public Document parseDocument(InputStream documentInputStream) throws IOException {
|
||||
return parseDocument(documentInputStream, false);
|
||||
}
|
||||
|
||||
Document document = new Document();
|
||||
public Document parseDocument(InputStream documentInputStream, boolean ignoreImages) throws IOException {
|
||||
PDDocument pdDocument = null;
|
||||
try {
|
||||
//create tempFile
|
||||
File tempFile = File.createTempFile("document", ".pdf");
|
||||
IOUtils.copy(documentInputStream, new FileOutputStream(tempFile));
|
||||
|
||||
List<Page> pages = new ArrayList<>();
|
||||
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
|
||||
for (int pageNumber = 1; pageNumber <= pdDocument.getNumberOfPages(); pageNumber++) {
|
||||
PDPage pdPage = pdDocument.getPage(pageNumber - 1);
|
||||
stripper.setPageNumber(pageNumber);
|
||||
stripper.setStartPage(pageNumber);
|
||||
stripper.setEndPage(pageNumber);
|
||||
stripper.setPdpage(pdPage);
|
||||
stripper.getText(pdDocument);
|
||||
// initialize required variables
|
||||
Document document = new Document();
|
||||
List<Page> pages = new ArrayList<>();
|
||||
|
||||
PDRectangle pdr = pdPage.getMediaBox();
|
||||
boolean isLandscape = pdr.getWidth() > pdr.getHeight();
|
||||
|
||||
int rotation = pdPage.getRotation();
|
||||
boolean isRotated = rotation != 0 && rotation != 360;
|
||||
pdDocument = reinitializePDDocument(tempFile, null);
|
||||
long pageCount = pdDocument.getNumberOfPages();
|
||||
|
||||
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(stripper.getRulings(), stripper.getMinCharWidth(), stripper
|
||||
.getMaxCharHeight());
|
||||
for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
|
||||
|
||||
Page page = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings
|
||||
.getVertical());
|
||||
page.setRotation(rotation);
|
||||
if (pageNumber % MAX_PAGES_BEFORE_GC == 0) {
|
||||
pdDocument = reinitializePDDocument(tempFile, pdDocument);
|
||||
}
|
||||
|
||||
tableExtractionService.extractTables(cleanRulings, page);
|
||||
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
|
||||
PDPage pdPage = pdDocument.getPage(pageNumber - 1);
|
||||
stripper.setPageNumber(pageNumber);
|
||||
stripper.setStartPage(pageNumber);
|
||||
stripper.setEndPage(pageNumber);
|
||||
stripper.setPdpage(pdPage);
|
||||
stripper.getText(pdDocument);
|
||||
|
||||
buildPageStatistics(page);
|
||||
PDRectangle pdr = pdPage.getMediaBox();
|
||||
boolean isLandscape = pdr.getWidth() > pdr.getHeight();
|
||||
|
||||
page.setLandscape(isLandscape || isRotated);
|
||||
int rotation = pdPage.getRotation();
|
||||
boolean isRotated = rotation != 0 && rotation != 360;
|
||||
|
||||
page.setPageNumber(pageNumber);
|
||||
increaseDocumentStatistics(page, document);
|
||||
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(stripper.getRulings(), stripper.getMinCharWidth(), stripper
|
||||
.getMaxCharHeight());
|
||||
|
||||
page.setImages(stripper.getImages());
|
||||
pages.add(page);
|
||||
Page page = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings
|
||||
.getVertical());
|
||||
|
||||
page.setRotation(rotation);
|
||||
page.setLandscape(isLandscape || isRotated);
|
||||
page.setPageNumber(pageNumber);
|
||||
List<PdfImage> mergedList = processImages(stripper.getImages());
|
||||
page.setImages(mergedList);
|
||||
|
||||
tableExtractionService.extractTables(cleanRulings, page);
|
||||
buildPageStatistics(page);
|
||||
increaseDocumentStatistics(page, document);
|
||||
|
||||
|
||||
if (!ignoreImages) {
|
||||
imageClassificationService.classifyImages(page);
|
||||
}
|
||||
|
||||
pages.add(page);
|
||||
|
||||
|
||||
}
|
||||
|
||||
document.setPages(pages);
|
||||
|
||||
classificationService.classifyDocument(document);
|
||||
sectionsBuilderService.buildSections(document);
|
||||
sectionsBuilderService.addImagesToSections(document);
|
||||
|
||||
pdDocument = reinitializePDDocument(tempFile, pdDocument);
|
||||
|
||||
IOUtils.close(pdDocument);
|
||||
|
||||
tempFile.delete();
|
||||
|
||||
return document;
|
||||
} finally {
|
||||
if (pdDocument != null) {
|
||||
pdDocument.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private PDDocument reinitializePDDocument(File tempFile, PDDocument pdDocument) throws IOException {
|
||||
if (pdDocument != null) {
|
||||
pdDocument.close();
|
||||
}
|
||||
System.runFinalization();
|
||||
System.gc();
|
||||
|
||||
MemoryStats.printMemoryStats();
|
||||
|
||||
var newPDDocument = PDDocument.load(tempFile, MemoryUsageSetting.setupTempFileOnly());
|
||||
newPDDocument.setAllSecurityToBeRemoved(true);
|
||||
|
||||
return newPDDocument;
|
||||
}
|
||||
|
||||
//merge images, if they are separated during pdf import, return new list of Pdfimages
|
||||
private List<PdfImage> processImages(List<PdfImage> imageList) {
|
||||
if (imageList.size() > 1) {
|
||||
List<PdfImage> mergedList = new ArrayList<>();
|
||||
int countElementsInList = 0;
|
||||
boolean beginImage = true;
|
||||
|
||||
// a List of Boolean, true = candidate for merging, false = no merging
|
||||
List<Boolean> candidatesList = getCandidatesList(imageList);
|
||||
|
||||
// loop through list, if there are candidates for merging (true), merge images and add it to mergedList
|
||||
for (int i = 0; i < candidatesList.size(); i++) {
|
||||
if (candidatesList.get(i)) {
|
||||
if (beginImage) {
|
||||
//begin of image, merge two parts of imageList
|
||||
PdfImage mergedImage = mergeTwoImages(imageList.get(i), imageList.get(i + 1));
|
||||
// image merge successful
|
||||
if (mergedImage != null) {
|
||||
mergedList.add(mergedImage);
|
||||
countElementsInList++;
|
||||
}
|
||||
} else {
|
||||
//middle of an image, merge current piece auf mergedList with image of imageList
|
||||
PdfImage mergedImage = mergeTwoImages(mergedList.get(countElementsInList - 1), imageList.get(i + 1));
|
||||
// image merge successful
|
||||
if (mergedImage != null) {
|
||||
mergedList.set(countElementsInList - 1, mergedImage);
|
||||
}
|
||||
}
|
||||
beginImage = false;
|
||||
} else {
|
||||
// if the last candidate is false, then both images i and i+1 must be added
|
||||
if (i == candidatesList.size() - 1) {
|
||||
if (countElementsInList > 0 && mergedList.get(countElementsInList - 1) == imageList.get(i)) {
|
||||
mergedList.add(imageList.get(i + 1));
|
||||
} else {
|
||||
mergedList.add(imageList.get(i));
|
||||
mergedList.add(imageList.get(i + 1));
|
||||
}
|
||||
} else {
|
||||
//first image is not splitted, add i to resultlist
|
||||
if (beginImage) {
|
||||
mergedList.add(imageList.get(i));
|
||||
countElementsInList++;
|
||||
} else {
|
||||
// i is the end of an image, add begin of new image
|
||||
mergedList.add(imageList.get(i + 1));
|
||||
countElementsInList++;
|
||||
beginImage = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return mergedList;
|
||||
} else {
|
||||
return imageList;
|
||||
}
|
||||
}
|
||||
|
||||
private PdfImage mergeTwoImages(PdfImage image1, PdfImage image2) {
|
||||
|
||||
// diese Angaben von getPosition scheinen nicht richtig zu sein, damit werden teile des Bildes abgeschnitten
|
||||
double width = image1.getPosition().getWidth();
|
||||
double height1 = image1.getPosition().getHeight();
|
||||
double height2 = image2.getPosition().getHeight();
|
||||
// mit den Werten, die unter Image gespeichert sind, funktioniert es
|
||||
double img1height = image1.getImage().getHeight();
|
||||
double img1width = image1.getImage().getWidth();
|
||||
double img2height = image2.getImage().getHeight();
|
||||
|
||||
BufferedImage mergedImage = new BufferedImage((int) img1width, (int) (img1height + img2height), BufferedImage.TYPE_INT_RGB);
|
||||
Graphics mergedImageGraphics = mergedImage.getGraphics();
|
||||
try {
|
||||
mergedImageGraphics.drawImage(image1.getImage(), 0, 0, null);
|
||||
mergedImageGraphics.drawImage(image2.getImage(), 0, (int) (img1height), null);
|
||||
|
||||
// set Image, Position and type for merged Image
|
||||
//set position for merged image with values of image1 and the height of both
|
||||
Rectangle2D pos = new Rectangle2D.Float();
|
||||
pos.setRect(image1.getPosition().getX(), image2.getPosition().getY(), width, height1 + height2);
|
||||
PdfImage newPdfImage = new PdfImage(mergedImage, pos, image1.getPage());
|
||||
// Graphics need to be disposed
|
||||
|
||||
image1.getImage().flush();
|
||||
image2.getImage().flush();
|
||||
|
||||
mergedImage.flush();
|
||||
mergedImageGraphics.dispose();
|
||||
|
||||
return newPdfImage;
|
||||
} catch (Exception e) {
|
||||
// failed to merge image
|
||||
log.error("Failed to merge image", e);
|
||||
return null;
|
||||
}
|
||||
|
||||
document.setPages(pages);
|
||||
|
||||
classificationService.classifyDocument(document);
|
||||
sectionsBuilderService.buildSections(document);
|
||||
sectionsBuilderService.addImagesToSections(document);
|
||||
}
|
||||
|
||||
return document;
|
||||
//make a list of true and false, if the image is a candidate for merging
|
||||
private List<Boolean> getCandidatesList(List<PdfImage> imageList) {
|
||||
List<Boolean> candidatesList = new ArrayList<>();
|
||||
for (int i = 0; i < imageList.size(); i++) {
|
||||
if (i >= 1) {
|
||||
candidatesList.add(isCandidateForMerging(imageList.get(i - 1), imageList.get(i)));
|
||||
}
|
||||
}
|
||||
return candidatesList;
|
||||
}
|
||||
|
||||
// evaluate if two images are candidates for merging, depending on their coordinates, width and height
|
||||
private boolean isCandidateForMerging(PdfImage image1, PdfImage image2) {
|
||||
double x1 = image1.getPosition().getX();
|
||||
double y1 = image1.getPosition().getY();
|
||||
double width1 = image1.getPosition().getWidth();
|
||||
double x2 = image2.getPosition().getX();
|
||||
double y2 = image2.getPosition().getY();
|
||||
double width2 = image2.getPosition().getWidth();
|
||||
double height2 = image2.getPosition().getHeight();
|
||||
//if the x-coordinates and widths of images are equal and the height is equal to difference between y-coordinates,
|
||||
// then it is the same picture and has to be merged -> return true
|
||||
return x1 == x2 && width1 == width2 && Math.ceil(height2) == Math.ceil(y1 - y2) && width2 > (height2 / 6);
|
||||
}
|
||||
|
||||
|
||||
@ -116,4 +303,5 @@ public class PdfSegmentationService {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,29 +1,15 @@
|
||||
package com.iqser.red.service.redaction.v1.server.segmentation;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.SortedSet;
|
||||
import java.util.TreeSet;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Footer;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Header;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.UnclassifiedText;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.*;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Service
|
||||
public class SectionsBuilderService {
|
||||
@ -121,6 +107,20 @@ public class SectionsBuilderService {
|
||||
}
|
||||
}
|
||||
|
||||
if (paragraphMap.isEmpty()) {
|
||||
Paragraph paragraph = new Paragraph();
|
||||
document.getParagraphs().add(paragraph);
|
||||
paragraphMap.computeIfAbsent(1, x -> new TreeSet<>()).add(paragraph);
|
||||
}
|
||||
|
||||
// first page is always a paragraph, else we can't process pages 1..N,
|
||||
// where N is the first found page with a paragraph
|
||||
if (paragraphMap.get(1) == null) {
|
||||
Paragraph paragraph = new Paragraph();
|
||||
document.getParagraphs().add(paragraph);
|
||||
paragraphMap.computeIfAbsent(1, x -> new TreeSet<>()).add(paragraph);
|
||||
}
|
||||
|
||||
for (Page page : document.getPages()) {
|
||||
for (PdfImage image : page.getImages()) {
|
||||
SortedSet<Paragraph> paragraphsOnPage = paragraphMap.get(page.getPageNumber());
|
||||
@ -296,4 +296,4 @@ public class SectionsBuilderService {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,17 +1,16 @@
|
||||
package com.iqser.red.service.redaction.v1.server.settings;
|
||||
|
||||
import org.springframework.boot.context.properties.ConfigurationProperties;
|
||||
|
||||
import lombok.Data;
|
||||
import org.springframework.boot.context.properties.ConfigurationProperties;
|
||||
|
||||
@Data
|
||||
@ConfigurationProperties("redaction-service")
|
||||
public class RedactionServiceSettings {
|
||||
|
||||
|
||||
private int numberOfSurroundingWords = 3;
|
||||
|
||||
private int surroundingWordsOffsetWindow = 100;
|
||||
|
||||
private boolean enableImageClassification = true;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -0,0 +1,103 @@
|
||||
package com.iqser.red.service.redaction.v1.server.storage;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.iqser.red.service.file.management.v1.api.model.FileType;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionLog;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionGrid;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Text;
|
||||
import com.iqser.red.storage.commons.exception.StorageObjectDoesNotExist;
|
||||
import com.iqser.red.storage.commons.service.StorageService;
|
||||
import lombok.Getter;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.core.io.InputStreamResource;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class RedactionStorageService {
|
||||
|
||||
private final ObjectMapper objectMapper;
|
||||
private final StorageService storageService;
|
||||
|
||||
@SneakyThrows
|
||||
public InputStream getStoredObject(String storageId) {
|
||||
return storageService.getObject(storageId).getInputStream();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void storeObject(String projectId, String fileId, FileType fileType, Object any) {
|
||||
storageService.storeObject(StorageIdUtils.getStorageId(projectId, fileId, fileType), objectMapper.writeValueAsBytes(any));
|
||||
}
|
||||
|
||||
|
||||
public RedactionLog getRedactionLog(String projectId, String fileId) {
|
||||
|
||||
InputStreamResource inputStreamResource;
|
||||
try {
|
||||
inputStreamResource = storageService.getObject(StorageIdUtils.getStorageId(projectId, fileId, FileType.REDACTION_LOG));
|
||||
} catch (StorageObjectDoesNotExist e) {
|
||||
log.debug("Text not available.");
|
||||
return null;
|
||||
}
|
||||
|
||||
try {
|
||||
return objectMapper.readValue(inputStreamResource.getInputStream(), RedactionLog.class);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("Could not convert RedactionLog", e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public Text getText(String projectId, String fileId) {
|
||||
|
||||
InputStreamResource inputStreamResource;
|
||||
try {
|
||||
inputStreamResource = storageService.getObject(StorageIdUtils.getStorageId(projectId, fileId, FileType.TEXT));
|
||||
} catch (StorageObjectDoesNotExist e) {
|
||||
log.debug("Text not available.");
|
||||
return null;
|
||||
}
|
||||
|
||||
try {
|
||||
return objectMapper.readValue(inputStreamResource.getInputStream(), Text.class);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("Could not convert Text", e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public SectionGrid getSectionGrid(String projectId, String fileId) {
|
||||
|
||||
var sectionGrid = storageService.getObject(StorageIdUtils.getStorageId(projectId, fileId, FileType.SECTION_GRID));
|
||||
try {
|
||||
return objectMapper.readValue(sectionGrid.getInputStream(), SectionGrid.class);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("Could not convert RedactionLog", e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@RequiredArgsConstructor
|
||||
public enum StorageType {
|
||||
PARSED_DOCUMENT(".json");
|
||||
|
||||
@Getter
|
||||
private final String extension;
|
||||
|
||||
}
|
||||
|
||||
public static class StorageIdUtils {
|
||||
|
||||
public static String getStorageId(String projectId, String fileId, FileType fileType) {
|
||||
return projectId + "/" + fileId + "." + fileType.name() + fileType.getExtension();
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
@ -1,7 +1,7 @@
|
||||
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.iqser.red.service.redaction.v1.model.Rectangle;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
@ -25,15 +25,17 @@ public abstract class AbstractTextContainer {
|
||||
}
|
||||
|
||||
public boolean contains(Rectangle other) {
|
||||
return page == other.getPage() && this.minX <= other.getTopLeft().getX() && this.maxX >= other.getTopLeft().getX() + other.getWidth() && this.minY <= other.getTopLeft().getY() && this.maxY >= other.getTopLeft().getY() + other.getHeight();
|
||||
return page == other.getPage() && this.minX <= other.getTopLeft().getX() && this.maxX >= other.getTopLeft().getX() + other.getWidth() && this.minY <= other.getTopLeft().getY() && this.maxY >= other.getTopLeft().getY() + other.getHeight();
|
||||
}
|
||||
|
||||
@JsonIgnore
|
||||
public float getHeight() {
|
||||
return maxY - minY;
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getWidth() {
|
||||
return maxX - minX;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,18 +1,17 @@
|
||||
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@SuppressWarnings("serial")
|
||||
@Data
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
@ -71,7 +70,4 @@ public class Cell extends Rectangle {
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,10 +1,10 @@
|
||||
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
public class CleanRulings {
|
||||
|
||||
@ -8,170 +8,171 @@ import java.util.List;
|
||||
@SuppressWarnings("all")
|
||||
public class Rectangle extends Rectangle2D.Float {
|
||||
|
||||
/**
|
||||
* Ill-defined comparator, from when Rectangle was Comparable.
|
||||
*
|
||||
* see https://github.com/tabulapdf/tabula-java/issues/116
|
||||
* @deprecated with no replacement
|
||||
*/
|
||||
@Deprecated
|
||||
public static final Comparator<Rectangle> ILL_DEFINED_ORDER = new Comparator<Rectangle>() {
|
||||
@Override public int compare(Rectangle o1, Rectangle o2) {
|
||||
if (o1.equals(o2)) return 0;
|
||||
if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD) {
|
||||
return o1.isLtrDominant() == -1 && o2.isLtrDominant() == -1
|
||||
? - java.lang.Double.compare(o1.getX(), o2.getX())
|
||||
: java.lang.Double.compare(o1.getX(), o2.getX());
|
||||
} else {
|
||||
return java.lang.Float.compare(o1.getBottom(), o2.getBottom());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;
|
||||
protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;
|
||||
/**
|
||||
* Ill-defined comparator, from when Rectangle was Comparable.
|
||||
* <p>
|
||||
* see https://github.com/tabulapdf/tabula-java/issues/116
|
||||
*
|
||||
* @deprecated with no replacement
|
||||
*/
|
||||
@Deprecated
|
||||
public static final Comparator<Rectangle> ILL_DEFINED_ORDER = new Comparator<Rectangle>() {
|
||||
@Override
|
||||
public int compare(Rectangle o1, Rectangle o2) {
|
||||
if (o1.equals(o2)) return 0;
|
||||
if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD) {
|
||||
return o1.isLtrDominant() == -1 && o2.isLtrDominant() == -1
|
||||
? -java.lang.Double.compare(o1.getX(), o2.getX())
|
||||
: java.lang.Double.compare(o1.getX(), o2.getX());
|
||||
} else {
|
||||
return java.lang.Float.compare(o1.getBottom(), o2.getBottom());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
public Rectangle() {
|
||||
super();
|
||||
}
|
||||
public Rectangle() {
|
||||
super();
|
||||
}
|
||||
|
||||
public Rectangle(float top, float left, float width, float height) {
|
||||
super();
|
||||
this.setRect(left, top, width, height);
|
||||
}
|
||||
public Rectangle(float top, float left, float width, float height) {
|
||||
super();
|
||||
this.setRect(left, top, width, height);
|
||||
}
|
||||
|
||||
public int compareTo(Rectangle other) {
|
||||
return ILL_DEFINED_ORDER.compare(this, other);
|
||||
}
|
||||
/**
|
||||
* @param rectangles
|
||||
* @return minimum bounding box that contains all the rectangles
|
||||
*/
|
||||
public static Rectangle boundingBoxOf(List<? extends Rectangle> rectangles) {
|
||||
float minx = java.lang.Float.MAX_VALUE;
|
||||
float miny = java.lang.Float.MAX_VALUE;
|
||||
float maxx = java.lang.Float.MIN_VALUE;
|
||||
float maxy = java.lang.Float.MIN_VALUE;
|
||||
|
||||
// I'm bad at Java and need this for fancy sorting in
|
||||
// technology.tabula.TextChunk.
|
||||
public int isLtrDominant() {
|
||||
return 0;
|
||||
}
|
||||
for (Rectangle r : rectangles) {
|
||||
minx = (float) Math.min(r.getMinX(), minx);
|
||||
miny = (float) Math.min(r.getMinY(), miny);
|
||||
maxx = (float) Math.max(r.getMaxX(), maxx);
|
||||
maxy = (float) Math.max(r.getMaxY(), maxy);
|
||||
}
|
||||
return new Rectangle(miny, minx, maxx - minx, maxy - miny);
|
||||
}
|
||||
|
||||
public float getArea() {
|
||||
return this.width * this.height;
|
||||
}
|
||||
public int compareTo(Rectangle other) {
|
||||
return ILL_DEFINED_ORDER.compare(this, other);
|
||||
}
|
||||
|
||||
public float verticalOverlap(Rectangle other) {
|
||||
return Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
|
||||
}
|
||||
// I'm bad at Java and need this for fancy sorting in
|
||||
// technology.tabula.TextChunk.
|
||||
public int isLtrDominant() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
public boolean verticallyOverlaps(Rectangle other) {
|
||||
return verticalOverlap(other) > 0;
|
||||
}
|
||||
public float getArea() {
|
||||
return this.width * this.height;
|
||||
}
|
||||
|
||||
public float horizontalOverlap(Rectangle other) {
|
||||
return Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
|
||||
}
|
||||
public float verticalOverlap(Rectangle other) {
|
||||
return Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
|
||||
}
|
||||
|
||||
public boolean horizontallyOverlaps(Rectangle other) {
|
||||
return horizontalOverlap(other) > 0;
|
||||
}
|
||||
public boolean verticallyOverlaps(Rectangle other) {
|
||||
return verticalOverlap(other) > 0;
|
||||
}
|
||||
|
||||
public float verticalOverlapRatio(Rectangle other) {
|
||||
float rv = 0, delta = Math.min(this.getBottom() - this.getTop(), other.getBottom() - other.getTop());
|
||||
public float horizontalOverlap(Rectangle other) {
|
||||
return Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
|
||||
}
|
||||
|
||||
if (other.getTop() <= this.getTop() && this.getTop() <= other.getBottom()
|
||||
&& other.getBottom() <= this.getBottom()) {
|
||||
rv = (other.getBottom() - this.getTop()) / delta;
|
||||
} else if (this.getTop() <= other.getTop() && other.getTop() <= this.getBottom()
|
||||
&& this.getBottom() <= other.getBottom()) {
|
||||
rv = (this.getBottom() - other.getTop()) / delta;
|
||||
} else if (this.getTop() <= other.getTop() && other.getTop() <= other.getBottom()
|
||||
&& other.getBottom() <= this.getBottom()) {
|
||||
rv = (other.getBottom() - other.getTop()) / delta;
|
||||
} else if (other.getTop() <= this.getTop() && this.getTop() <= this.getBottom()
|
||||
&& this.getBottom() <= other.getBottom()) {
|
||||
rv = (this.getBottom() - this.getTop()) / delta;
|
||||
}
|
||||
public boolean horizontallyOverlaps(Rectangle other) {
|
||||
return horizontalOverlap(other) > 0;
|
||||
}
|
||||
|
||||
return rv;
|
||||
public float verticalOverlapRatio(Rectangle other) {
|
||||
float rv = 0, delta = Math.min(this.getBottom() - this.getTop(), other.getBottom() - other.getTop());
|
||||
|
||||
}
|
||||
if (other.getTop() <= this.getTop() && this.getTop() <= other.getBottom()
|
||||
&& other.getBottom() <= this.getBottom()) {
|
||||
rv = (other.getBottom() - this.getTop()) / delta;
|
||||
} else if (this.getTop() <= other.getTop() && other.getTop() <= this.getBottom()
|
||||
&& this.getBottom() <= other.getBottom()) {
|
||||
rv = (this.getBottom() - other.getTop()) / delta;
|
||||
} else if (this.getTop() <= other.getTop() && other.getTop() <= other.getBottom()
|
||||
&& other.getBottom() <= this.getBottom()) {
|
||||
rv = (other.getBottom() - other.getTop()) / delta;
|
||||
} else if (other.getTop() <= this.getTop() && this.getTop() <= this.getBottom()
|
||||
&& this.getBottom() <= other.getBottom()) {
|
||||
rv = (this.getBottom() - this.getTop()) / delta;
|
||||
}
|
||||
|
||||
public float overlapRatio(Rectangle other) {
|
||||
double intersectionWidth = Math.max(0,
|
||||
Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
|
||||
double intersectionHeight = Math.max(0,
|
||||
Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
|
||||
double intersectionArea = Math.max(0, intersectionWidth * intersectionHeight);
|
||||
double unionArea = this.getArea() + other.getArea() - intersectionArea;
|
||||
return rv;
|
||||
|
||||
return (float) (intersectionArea / unionArea);
|
||||
}
|
||||
}
|
||||
|
||||
public Rectangle merge(Rectangle other) {
|
||||
this.setRect(this.createUnion(other));
|
||||
return this;
|
||||
}
|
||||
public float overlapRatio(Rectangle other) {
|
||||
double intersectionWidth = Math.max(0,
|
||||
Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
|
||||
double intersectionHeight = Math.max(0,
|
||||
Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
|
||||
double intersectionArea = Math.max(0, intersectionWidth * intersectionHeight);
|
||||
double unionArea = this.getArea() + other.getArea() - intersectionArea;
|
||||
|
||||
public float getTop() {
|
||||
return (float) this.getMinY();
|
||||
}
|
||||
return (float) (intersectionArea / unionArea);
|
||||
}
|
||||
|
||||
public void setTop(float top) {
|
||||
float deltaHeight = top - this.y;
|
||||
this.setRect(this.x, top, this.width, this.height - deltaHeight);
|
||||
}
|
||||
public Rectangle merge(Rectangle other) {
|
||||
this.setRect(this.createUnion(other));
|
||||
return this;
|
||||
}
|
||||
|
||||
public float getRight() {
|
||||
return (float) this.getMaxX();
|
||||
}
|
||||
public float getTop() {
|
||||
return (float) this.getMinY();
|
||||
}
|
||||
|
||||
public void setRight(float right) {
|
||||
this.setRect(this.x, this.y, right - this.x, this.height);
|
||||
}
|
||||
public void setTop(float top) {
|
||||
float deltaHeight = top - this.y;
|
||||
this.setRect(this.x, top, this.width, this.height - deltaHeight);
|
||||
}
|
||||
|
||||
public float getLeft() {
|
||||
return (float) this.getMinX();
|
||||
}
|
||||
public float getRight() {
|
||||
return (float) this.getMaxX();
|
||||
}
|
||||
|
||||
public void setLeft(float left) {
|
||||
float deltaWidth = left - this.x;
|
||||
this.setRect(left, this.y, this.width - deltaWidth, this.height);
|
||||
}
|
||||
public void setRight(float right) {
|
||||
this.setRect(this.x, this.y, right - this.x, this.height);
|
||||
}
|
||||
|
||||
public float getBottom() {
|
||||
return (float) this.getMaxY();
|
||||
}
|
||||
public float getLeft() {
|
||||
return (float) this.getMinX();
|
||||
}
|
||||
|
||||
public void setBottom(float bottom) {
|
||||
this.setRect(this.x, this.y, this.width, bottom - this.y);
|
||||
}
|
||||
public void setLeft(float left) {
|
||||
float deltaWidth = left - this.x;
|
||||
this.setRect(left, this.y, this.width - deltaWidth, this.height);
|
||||
}
|
||||
|
||||
public Point2D[] getPoints() {
|
||||
return new Point2D[] { new Point2D.Float(this.getLeft(), this.getTop()),
|
||||
new Point2D.Float(this.getRight(), this.getTop()), new Point2D.Float(this.getRight(), this.getBottom()),
|
||||
new Point2D.Float(this.getLeft(), this.getBottom()) };
|
||||
}
|
||||
public float getBottom() {
|
||||
return (float) this.getMaxY();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
String s = super.toString();
|
||||
sb.append(s.substring(0, s.length() - 1));
|
||||
sb.append(String.format(",bottom=%f,right=%f]", this.getBottom(), this.getRight()));
|
||||
return sb.toString();
|
||||
}
|
||||
public void setBottom(float bottom) {
|
||||
this.setRect(this.x, this.y, this.width, bottom - this.y);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param rectangles
|
||||
* @return minimum bounding box that contains all the rectangles
|
||||
*/
|
||||
public static Rectangle boundingBoxOf(List<? extends Rectangle> rectangles) {
|
||||
float minx = java.lang.Float.MAX_VALUE;
|
||||
float miny = java.lang.Float.MAX_VALUE;
|
||||
float maxx = java.lang.Float.MIN_VALUE;
|
||||
float maxy = java.lang.Float.MIN_VALUE;
|
||||
public Point2D[] getPoints() {
|
||||
return new Point2D[]{new Point2D.Float(this.getLeft(), this.getTop()),
|
||||
new Point2D.Float(this.getRight(), this.getTop()), new Point2D.Float(this.getRight(), this.getBottom()),
|
||||
new Point2D.Float(this.getLeft(), this.getBottom())};
|
||||
}
|
||||
|
||||
for (Rectangle r : rectangles) {
|
||||
minx = (float) Math.min(r.getMinX(), minx);
|
||||
miny = (float) Math.min(r.getMinY(), miny);
|
||||
maxx = (float) Math.max(r.getMaxX(), maxx);
|
||||
maxy = (float) Math.max(r.getMaxY(), maxy);
|
||||
}
|
||||
return new Rectangle(miny, minx, maxx - minx, maxy - miny);
|
||||
}
|
||||
@Override
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
String s = super.toString();
|
||||
sb.append(s.substring(0, s.length() - 1));
|
||||
sb.append(String.format(",bottom=%f,right=%f]", this.getBottom(), this.getRight()));
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,12 +1,11 @@
|
||||
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
|
||||
import org.locationtech.jts.geom.Envelope;
|
||||
import org.locationtech.jts.index.strtree.STRtree;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
@SuppressWarnings("all")
|
||||
public class RectangleSpatialIndex<T extends Rectangle> {
|
||||
|
||||
@ -1,20 +1,13 @@
|
||||
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.CohenSutherlandClipping;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.Formatter;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.CohenSutherlandClipping;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
@Slf4j
|
||||
@ -23,13 +16,127 @@ public class Ruling extends Line2D.Float {
|
||||
|
||||
private static int PERPENDICULAR_PIXEL_EXPAND_AMOUNT = 2;
|
||||
|
||||
private enum SOType {VERTICAL, HRIGHT, HLEFT}
|
||||
|
||||
|
||||
public Ruling(Point2D p1, Point2D p2) {
|
||||
super(p1, p2);
|
||||
}
|
||||
|
||||
public static List<Ruling> cropRulingsToArea(List<Ruling> rulings, Rectangle2D area) {
|
||||
ArrayList<Ruling> rv = new ArrayList<>();
|
||||
for (Ruling r : rulings) {
|
||||
if (r.intersects(area)) {
|
||||
rv.add(r.intersect(area));
|
||||
}
|
||||
}
|
||||
return rv;
|
||||
}
|
||||
|
||||
// log(n) implementation of find_intersections
|
||||
// based on http://people.csail.mit.edu/indyk/6.838-old/handouts/lec2.pdf
|
||||
public static Map<Point2D, Ruling[]> findIntersections(List<Ruling> horizontals, List<Ruling> verticals) {
|
||||
|
||||
class SortObject {
|
||||
protected SOType type;
|
||||
protected float position;
|
||||
protected Ruling ruling;
|
||||
|
||||
public SortObject(SOType type, float position, Ruling ruling) {
|
||||
this.type = type;
|
||||
this.position = position;
|
||||
this.ruling = ruling;
|
||||
}
|
||||
}
|
||||
|
||||
List<SortObject> sos = new ArrayList<>();
|
||||
|
||||
TreeMap<Ruling, Boolean> tree = new TreeMap<>(new Comparator<Ruling>() {
|
||||
@Override
|
||||
public int compare(Ruling o1, Ruling o2) {
|
||||
return java.lang.Double.compare(o1.getTop(), o2.getTop());
|
||||
}
|
||||
});
|
||||
|
||||
TreeMap<Point2D, Ruling[]> rv = new TreeMap<>(new Comparator<Point2D>() {
|
||||
@Override
|
||||
public int compare(Point2D o1, Point2D o2) {
|
||||
if (o1.getY() > o2.getY()) {
|
||||
return 1;
|
||||
}
|
||||
if (o1.getY() < o2.getY()) {
|
||||
return -1;
|
||||
}
|
||||
if (o1.getX() > o2.getX()) {
|
||||
return 1;
|
||||
}
|
||||
if (o1.getX() < o2.getX()) {
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
});
|
||||
|
||||
for (Ruling h : horizontals) {
|
||||
sos.add(new SortObject(SOType.HLEFT, h.getLeft() - PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h));
|
||||
sos.add(new SortObject(SOType.HRIGHT, h.getRight() + PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h));
|
||||
}
|
||||
|
||||
for (Ruling v : verticals) {
|
||||
sos.add(new SortObject(SOType.VERTICAL, v.getLeft(), v));
|
||||
}
|
||||
|
||||
Collections.sort(sos, new Comparator<SortObject>() {
|
||||
@Override
|
||||
public int compare(SortObject a, SortObject b) {
|
||||
int rv;
|
||||
if (Utils.feq(a.position, b.position)) {
|
||||
if (a.type == SOType.VERTICAL && b.type == SOType.HLEFT) {
|
||||
rv = 1;
|
||||
} else if (a.type == SOType.VERTICAL && b.type == SOType.HRIGHT) {
|
||||
rv = -1;
|
||||
} else if (a.type == SOType.HLEFT && b.type == SOType.VERTICAL) {
|
||||
rv = -1;
|
||||
} else if (a.type == SOType.HRIGHT && b.type == SOType.VERTICAL) {
|
||||
rv = 1;
|
||||
} else {
|
||||
rv = java.lang.Double.compare(a.position, b.position);
|
||||
}
|
||||
} else {
|
||||
return java.lang.Double.compare(a.position, b.position);
|
||||
}
|
||||
return rv;
|
||||
}
|
||||
});
|
||||
|
||||
for (SortObject so : sos) {
|
||||
switch (so.type) {
|
||||
case VERTICAL:
|
||||
for (Map.Entry<Ruling, Boolean> h : tree.entrySet()) {
|
||||
try {
|
||||
Point2D i = h.getKey().intersectionPoint(so.ruling);
|
||||
if (i == null) {
|
||||
continue;
|
||||
}
|
||||
rv.put(i,
|
||||
new Ruling[]{h.getKey().expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT),
|
||||
so.ruling.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT)});
|
||||
} catch (UnsupportedOperationException e) {
|
||||
log.info("Some line are oblique, ignoring...");
|
||||
continue;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case HRIGHT:
|
||||
tree.remove(so.ruling);
|
||||
break;
|
||||
case HLEFT:
|
||||
tree.put(so.ruling, true);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return rv;
|
||||
|
||||
}
|
||||
|
||||
public boolean vertical() {
|
||||
return this.length() > 0 && Utils.feq(this.x1, this.x2); //diff < ORIENTATION_CHECK_THRESHOLD;
|
||||
}
|
||||
@ -38,13 +145,13 @@ public class Ruling extends Line2D.Float {
|
||||
return this.length() > 0 && Utils.feq(this.y1, this.y2); //diff < ORIENTATION_CHECK_THRESHOLD;
|
||||
}
|
||||
|
||||
// attributes that make sense only for non-oblique lines
|
||||
// these are used to have a single collapse method (in page, currently)
|
||||
|
||||
public boolean oblique() {
|
||||
return !(this.vertical() || this.horizontal());
|
||||
}
|
||||
|
||||
// attributes that make sense only for non-oblique lines
|
||||
// these are used to have a single collapse method (in page, currently)
|
||||
|
||||
public float getPosition() {
|
||||
if (this.oblique()) {
|
||||
throw new UnsupportedOperationException();
|
||||
@ -52,7 +159,6 @@ public class Ruling extends Line2D.Float {
|
||||
return this.vertical() ? this.getLeft() : this.getTop();
|
||||
}
|
||||
|
||||
|
||||
public float getStart() {
|
||||
if (this.oblique()) {
|
||||
throw new UnsupportedOperationException();
|
||||
@ -102,12 +208,10 @@ public class Ruling extends Line2D.Float {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public boolean perpendicularTo(Ruling other) {
|
||||
return this.vertical() == other.horizontal();
|
||||
}
|
||||
|
||||
|
||||
public boolean nearlyIntersects(Ruling another, int colinearOrParallelExpandAmount) {
|
||||
if (this.intersectsLine(another)) {
|
||||
return true;
|
||||
@ -238,7 +342,6 @@ public class Ruling extends Line2D.Float {
|
||||
return angle;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
@ -248,122 +351,7 @@ public class Ruling extends Line2D.Float {
|
||||
return rv;
|
||||
}
|
||||
|
||||
public static List<Ruling> cropRulingsToArea(List<Ruling> rulings, Rectangle2D area) {
|
||||
ArrayList<Ruling> rv = new ArrayList<>();
|
||||
for (Ruling r : rulings) {
|
||||
if (r.intersects(area)) {
|
||||
rv.add(r.intersect(area));
|
||||
}
|
||||
}
|
||||
return rv;
|
||||
}
|
||||
|
||||
// log(n) implementation of find_intersections
|
||||
// based on http://people.csail.mit.edu/indyk/6.838-old/handouts/lec2.pdf
|
||||
public static Map<Point2D, Ruling[]> findIntersections(List<Ruling> horizontals, List<Ruling> verticals) {
|
||||
|
||||
class SortObject {
|
||||
protected SOType type;
|
||||
protected float position;
|
||||
protected Ruling ruling;
|
||||
|
||||
public SortObject(SOType type, float position, Ruling ruling) {
|
||||
this.type = type;
|
||||
this.position = position;
|
||||
this.ruling = ruling;
|
||||
}
|
||||
}
|
||||
|
||||
List<SortObject> sos = new ArrayList<>();
|
||||
|
||||
TreeMap<Ruling, Boolean> tree = new TreeMap<>(new Comparator<Ruling>() {
|
||||
@Override
|
||||
public int compare(Ruling o1, Ruling o2) {
|
||||
return java.lang.Double.compare(o1.getTop(), o2.getTop());
|
||||
}
|
||||
});
|
||||
|
||||
TreeMap<Point2D, Ruling[]> rv = new TreeMap<>(new Comparator<Point2D>() {
|
||||
@Override
|
||||
public int compare(Point2D o1, Point2D o2) {
|
||||
if (o1.getY() > o2.getY()) {
|
||||
return 1;
|
||||
}
|
||||
if (o1.getY() < o2.getY()) {
|
||||
return -1;
|
||||
}
|
||||
if (o1.getX() > o2.getX()) {
|
||||
return 1;
|
||||
}
|
||||
if (o1.getX() < o2.getX()) {
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
});
|
||||
|
||||
for (Ruling h : horizontals) {
|
||||
sos.add(new SortObject(SOType.HLEFT, h.getLeft() - PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h));
|
||||
sos.add(new SortObject(SOType.HRIGHT, h.getRight() + PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h));
|
||||
}
|
||||
|
||||
for (Ruling v : verticals) {
|
||||
sos.add(new SortObject(SOType.VERTICAL, v.getLeft(), v));
|
||||
}
|
||||
|
||||
Collections.sort(sos, new Comparator<SortObject>() {
|
||||
@Override
|
||||
public int compare(SortObject a, SortObject b) {
|
||||
int rv;
|
||||
if (Utils.feq(a.position, b.position)) {
|
||||
if (a.type == SOType.VERTICAL && b.type == SOType.HLEFT) {
|
||||
rv = 1;
|
||||
} else if (a.type == SOType.VERTICAL && b.type == SOType.HRIGHT) {
|
||||
rv = -1;
|
||||
} else if (a.type == SOType.HLEFT && b.type == SOType.VERTICAL) {
|
||||
rv = -1;
|
||||
} else if (a.type == SOType.HRIGHT && b.type == SOType.VERTICAL) {
|
||||
rv = 1;
|
||||
} else {
|
||||
rv = java.lang.Double.compare(a.position, b.position);
|
||||
}
|
||||
} else {
|
||||
return java.lang.Double.compare(a.position, b.position);
|
||||
}
|
||||
return rv;
|
||||
}
|
||||
});
|
||||
|
||||
for (SortObject so : sos) {
|
||||
switch (so.type) {
|
||||
case VERTICAL:
|
||||
for (Map.Entry<Ruling, Boolean> h : tree.entrySet()) {
|
||||
try {
|
||||
Point2D i = h.getKey().intersectionPoint(so.ruling);
|
||||
if (i == null) {
|
||||
continue;
|
||||
}
|
||||
rv.put(i,
|
||||
new Ruling[]{h.getKey().expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT),
|
||||
so.ruling.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT)});
|
||||
} catch(UnsupportedOperationException e){
|
||||
log.info("Some line are oblique, ignoring...");
|
||||
continue;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case HRIGHT:
|
||||
tree.remove(so.ruling);
|
||||
break;
|
||||
case HLEFT:
|
||||
tree.put(so.ruling, true);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return rv;
|
||||
|
||||
}
|
||||
private enum SOType {VERTICAL, HRIGHT, HLEFT}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@ -1,22 +1,13 @@
|
||||
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
@Slf4j
|
||||
public class Table extends AbstractTextContainer {
|
||||
@ -24,21 +15,14 @@ public class Table extends AbstractTextContainer {
|
||||
private final TreeMap<CellPosition, Cell> cells = new TreeMap<>();
|
||||
|
||||
private final RectangleSpatialIndex<Cell> si = new RectangleSpatialIndex<>();
|
||||
|
||||
private final int rotation;
|
||||
@Getter
|
||||
@Setter
|
||||
private String headline;
|
||||
|
||||
private int unrotatedRowCount;
|
||||
|
||||
private int unrotatedColCount;
|
||||
|
||||
private int rowCount = -1;
|
||||
|
||||
private int colCount = -1;
|
||||
|
||||
private final int rotation;
|
||||
|
||||
private List<List<Cell>> rows;
|
||||
|
||||
|
||||
@ -62,8 +46,8 @@ public class Table extends AbstractTextContainer {
|
||||
|
||||
// Ignore rows that do not contain any cells and values.
|
||||
List<List<Cell>> rowsToRemove = new ArrayList<>();
|
||||
for (List<Cell> row: rows){
|
||||
if (row.size() == 1 && row.get(0).getTextBlocks().isEmpty()){
|
||||
for (List<Cell> row : rows) {
|
||||
if (row.size() == 1 && row.get(0).getTextBlocks().isEmpty()) {
|
||||
rowsToRemove.add(row);
|
||||
}
|
||||
}
|
||||
@ -110,7 +94,7 @@ public class Table extends AbstractTextContainer {
|
||||
// we move from left to right and top to bottom
|
||||
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
|
||||
List<Cell> rowCells = rows.get(rowIndex);
|
||||
if(rowCells.size() == 1){
|
||||
if (rowCells.size() == 1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -275,7 +259,7 @@ public class Table extends AbstractTextContainer {
|
||||
|
||||
cells.sort(Collections.reverseOrder((arg0, arg1) -> Float.compare(Utils.round(arg0.getBottom(), 2),
|
||||
Utils.round(arg1
|
||||
.getBottom(), 2))));
|
||||
.getBottom(), 2))));
|
||||
|
||||
Iterator<Cell> iter = cells.iterator();
|
||||
Cell c = iter.next();
|
||||
@ -367,4 +351,4 @@ public class Table extends AbstractTextContainer {
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,19 +1,13 @@
|
||||
package com.iqser.red.service.redaction.v1.server.tableextraction.service;
|
||||
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.*;
|
||||
|
||||
@Service
|
||||
public class RulingCleaningService {
|
||||
|
||||
@ -1,31 +1,57 @@
|
||||
package com.iqser.red.service.redaction.v1.server.tableextraction.service;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.*;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Service
|
||||
public class TableExtractionService {
|
||||
|
||||
private static final Comparator<Point2D> X_FIRST_POINT_COMPARATOR = (arg0, arg1) -> {
|
||||
|
||||
int rv = 0;
|
||||
float arg0X = Utils.round(arg0.getX(), 2);
|
||||
float arg0Y = Utils.round(arg0.getY(), 2);
|
||||
float arg1X = Utils.round(arg1.getX(), 2);
|
||||
float arg1Y = Utils.round(arg1.getY(), 2);
|
||||
|
||||
if (arg0X > arg1X) {
|
||||
rv = 1;
|
||||
} else if (arg0X < arg1X) {
|
||||
rv = -1;
|
||||
} else if (arg0Y > arg1Y) {
|
||||
rv = 1;
|
||||
} else if (arg0Y < arg1Y) {
|
||||
rv = -1;
|
||||
}
|
||||
return rv;
|
||||
};
|
||||
private static final Comparator<Point2D> POINT_COMPARATOR = (arg0, arg1) -> {
|
||||
|
||||
int rv = 0;
|
||||
float arg0X = Utils.round(arg0.getX(), 2);
|
||||
float arg0Y = Utils.round(arg0.getY(), 2);
|
||||
float arg1X = Utils.round(arg1.getX(), 2);
|
||||
float arg1Y = Utils.round(arg1.getY(), 2);
|
||||
|
||||
if (arg0Y > arg1Y) {
|
||||
rv = 1;
|
||||
} else if (arg0Y < arg1Y) {
|
||||
rv = -1;
|
||||
} else if (arg0X > arg1X) {
|
||||
rv = 1;
|
||||
} else if (arg0X < arg1X) {
|
||||
rv = -1;
|
||||
}
|
||||
return rv;
|
||||
};
|
||||
|
||||
public void extractTables(CleanRulings cleanRulings, Page page) {
|
||||
|
||||
List<Cell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
@ -80,7 +106,6 @@ public class TableExtractionService {
|
||||
page.getTextBlocks().removeAll(toBeRemoved);
|
||||
}
|
||||
|
||||
|
||||
public List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||
|
||||
List<Cell> cellsFound = new ArrayList<>();
|
||||
@ -133,7 +158,6 @@ public class TableExtractionService {
|
||||
return cellsFound;
|
||||
}
|
||||
|
||||
|
||||
private List<Rectangle> findSpreadsheetsFromCells(List<? extends Rectangle> cells) {
|
||||
// via: http://stackoverflow.com/questions/13746284/merging-multiple-adjacent-rectangles-into-one-polygon
|
||||
List<Rectangle> rectangles = new ArrayList<>();
|
||||
@ -233,47 +257,6 @@ public class TableExtractionService {
|
||||
return rectangles;
|
||||
}
|
||||
|
||||
|
||||
private static final Comparator<Point2D> X_FIRST_POINT_COMPARATOR = (arg0, arg1) -> {
|
||||
|
||||
int rv = 0;
|
||||
float arg0X = Utils.round(arg0.getX(), 2);
|
||||
float arg0Y = Utils.round(arg0.getY(), 2);
|
||||
float arg1X = Utils.round(arg1.getX(), 2);
|
||||
float arg1Y = Utils.round(arg1.getY(), 2);
|
||||
|
||||
if (arg0X > arg1X) {
|
||||
rv = 1;
|
||||
} else if (arg0X < arg1X) {
|
||||
rv = -1;
|
||||
} else if (arg0Y > arg1Y) {
|
||||
rv = 1;
|
||||
} else if (arg0Y < arg1Y) {
|
||||
rv = -1;
|
||||
}
|
||||
return rv;
|
||||
};
|
||||
|
||||
private static final Comparator<Point2D> POINT_COMPARATOR = (arg0, arg1) -> {
|
||||
|
||||
int rv = 0;
|
||||
float arg0X = Utils.round(arg0.getX(), 2);
|
||||
float arg0Y = Utils.round(arg0.getY(), 2);
|
||||
float arg1X = Utils.round(arg1.getX(), 2);
|
||||
float arg1Y = Utils.round(arg1.getY(), 2);
|
||||
|
||||
if (arg0Y > arg1Y) {
|
||||
rv = 1;
|
||||
} else if (arg0Y < arg1Y) {
|
||||
rv = -1;
|
||||
} else if (arg0X > arg1X) {
|
||||
rv = 1;
|
||||
} else if (arg0X < arg1X) {
|
||||
rv = -1;
|
||||
}
|
||||
return rv;
|
||||
};
|
||||
|
||||
private enum Direction {
|
||||
HORIZONTAL, VERTICAL
|
||||
}
|
||||
|
||||
@ -19,21 +19,24 @@ import java.awt.geom.Rectangle2D;
|
||||
* clipping algorithm (line against clip rectangle).
|
||||
*/
|
||||
@SuppressWarnings("all")
|
||||
public final class CohenSutherlandClipping
|
||||
{
|
||||
public final class CohenSutherlandClipping {
|
||||
private static final int INSIDE = 0;
|
||||
private static final int LEFT = 1;
|
||||
private static final int RIGHT = 2;
|
||||
private static final int BOTTOM = 4;
|
||||
private static final int TOP = 8;
|
||||
private double xMin;
|
||||
private double yMin;
|
||||
private double xMax;
|
||||
private double yMax;
|
||||
|
||||
/**
|
||||
* Creates a Cohen Sutherland clipper with clip rect (0, 0, 0, 0).
|
||||
*/
|
||||
public CohenSutherlandClipping() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a Cohen Sutherland clipper with the given clip rectangle.
|
||||
*
|
||||
* @param clip the clip rectangle to use
|
||||
*/
|
||||
public CohenSutherlandClipping(Rectangle2D clip) {
|
||||
@ -42,6 +45,7 @@ public final class CohenSutherlandClipping
|
||||
|
||||
/**
|
||||
* Sets the clip rectangle.
|
||||
*
|
||||
* @param clip the clip rectangle
|
||||
*/
|
||||
public void setClip(Rectangle2D clip) {
|
||||
@ -51,19 +55,13 @@ public final class CohenSutherlandClipping
|
||||
yMax = yMin + clip.getHeight();
|
||||
}
|
||||
|
||||
private static final int INSIDE = 0;
|
||||
private static final int LEFT = 1;
|
||||
private static final int RIGHT = 2;
|
||||
private static final int BOTTOM = 4;
|
||||
private static final int TOP = 8;
|
||||
|
||||
private final int regionCode(double x, double y) {
|
||||
int code = x < xMin
|
||||
? LEFT
|
||||
: x > xMax
|
||||
int code = x < xMin
|
||||
? LEFT
|
||||
: x > xMax
|
||||
? RIGHT
|
||||
: INSIDE;
|
||||
if (y < yMin) code |= BOTTOM;
|
||||
if (y < yMin) code |= BOTTOM;
|
||||
else if (y > yMax) code |= TOP;
|
||||
return code;
|
||||
}
|
||||
@ -71,6 +69,7 @@ public final class CohenSutherlandClipping
|
||||
/**
|
||||
* Clips a given line against the clip rectangle.
|
||||
* The modification (if needed) is done in place.
|
||||
*
|
||||
* @param line the line to clip
|
||||
* @return true if line is clipped, false if line is
|
||||
* totally outside the clip rect.
|
||||
@ -87,9 +86,9 @@ public final class CohenSutherlandClipping
|
||||
|
||||
boolean vertical = p1x == p2x;
|
||||
|
||||
double slope = vertical
|
||||
? 0d
|
||||
: (p2y-p1y)/(p2x-p1x);
|
||||
double slope = vertical
|
||||
? 0d
|
||||
: (p2y - p1y) / (p2x - p1x);
|
||||
|
||||
int c1 = regionCode(p1x, p1y);
|
||||
int c2 = regionCode(p2x, p2y);
|
||||
@ -103,31 +102,27 @@ public final class CohenSutherlandClipping
|
||||
|
||||
if ((c & LEFT) != INSIDE) {
|
||||
qx = xMin;
|
||||
qy = (Utils.feq(qx, p1x) ? 0 : qx-p1x)*slope + p1y;
|
||||
}
|
||||
else if ((c & RIGHT) != INSIDE) {
|
||||
qy = (Utils.feq(qx, p1x) ? 0 : qx - p1x) * slope + p1y;
|
||||
} else if ((c & RIGHT) != INSIDE) {
|
||||
qx = xMax;
|
||||
qy = (Utils.feq(qx, p1x) ? 0 : qx-p1x)*slope + p1y;
|
||||
}
|
||||
else if ((c & BOTTOM) != INSIDE) {
|
||||
qy = (Utils.feq(qx, p1x) ? 0 : qx - p1x) * slope + p1y;
|
||||
} else if ((c & BOTTOM) != INSIDE) {
|
||||
qy = yMin;
|
||||
qx = vertical
|
||||
? p1x
|
||||
: (Utils.feq(qy, p1y) ? 0 : qy-p1y)/slope + p1x;
|
||||
}
|
||||
else if ((c & TOP) != INSIDE) {
|
||||
? p1x
|
||||
: (Utils.feq(qy, p1y) ? 0 : qy - p1y) / slope + p1x;
|
||||
} else if ((c & TOP) != INSIDE) {
|
||||
qy = yMax;
|
||||
qx = vertical
|
||||
? p1x
|
||||
: (Utils.feq(qy, p1y) ? 0 : qy-p1y)/slope + p1x;
|
||||
? p1x
|
||||
: (Utils.feq(qy, p1y) ? 0 : qy - p1y) / slope + p1x;
|
||||
}
|
||||
|
||||
if (c == c1) {
|
||||
p1x = qx;
|
||||
p1y = qy;
|
||||
c1 = regionCode(p1x, p1y);
|
||||
}
|
||||
else {
|
||||
c1 = regionCode(p1x, p1y);
|
||||
} else {
|
||||
p2x = qx;
|
||||
p2y = qy;
|
||||
c2 = regionCode(p2x, p2y);
|
||||
@ -137,4 +132,4 @@ public final class CohenSutherlandClipping
|
||||
return true;
|
||||
}
|
||||
}
|
||||
// end of file
|
||||
// end of file
|
||||
|
||||
@ -10,11 +10,6 @@ import java.util.List;
|
||||
*/
|
||||
public final class QuickSort {
|
||||
|
||||
private QuickSort() {
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static final Comparator<? extends Comparable> OBJCOMP = new Comparator<Comparable>() {
|
||||
@Override
|
||||
public int compare(Comparable object1, Comparable object2) {
|
||||
@ -24,6 +19,10 @@ public final class QuickSort {
|
||||
};
|
||||
|
||||
|
||||
private QuickSort() {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Sorts the given list using the given comparator.
|
||||
*
|
||||
|
||||
@ -1,11 +1,11 @@
|
||||
package com.iqser.red.service.redaction.v1.server.tableextraction.utils;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import java.math.BigDecimal;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@SuppressWarnings("all")
|
||||
public class Utils {
|
||||
|
||||
@ -1,15 +1,5 @@
|
||||
package com.iqser.red.service.redaction.v1.server.visualization.service;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDPageContentStream;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType1Font;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
|
||||
@ -17,9 +7,17 @@ import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDPageContentStream;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType1Font;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@ -34,7 +32,7 @@ public class PdfVisualisationService {
|
||||
PDPage pdPage = document.getPage(page - 1);
|
||||
PDPageContentStream contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true);
|
||||
|
||||
for(Paragraph paragraph : classifiedDoc.getParagraphs()) {
|
||||
for (Paragraph paragraph : classifiedDoc.getParagraphs()) {
|
||||
|
||||
for (int i = 0; i <= paragraph.getPageBlocks().size() - 1; i++) {
|
||||
|
||||
@ -44,10 +42,10 @@ public class PdfVisualisationService {
|
||||
continue;
|
||||
}
|
||||
if (textBlock instanceof TextBlock) {
|
||||
textBlock.setClassification((i+1) + "/" + paragraph.getPageBlocks().size());
|
||||
textBlock.setClassification((i + 1) + "/" + paragraph.getPageBlocks().size());
|
||||
visualizeTextBlock((TextBlock) textBlock, contentStream);
|
||||
} else if (textBlock instanceof Table) {
|
||||
textBlock.setClassification((i+1) + "/" + paragraph.getPageBlocks().size());
|
||||
textBlock.setClassification((i + 1) + "/" + paragraph.getPageBlocks().size());
|
||||
visualizeTable((Table) textBlock, contentStream);
|
||||
}
|
||||
|
||||
@ -59,7 +57,6 @@ public class PdfVisualisationService {
|
||||
}
|
||||
|
||||
|
||||
|
||||
public void visualizeClassifications(Document classifiedDoc, PDDocument document) throws IOException {
|
||||
|
||||
for (int page = 1; page <= document.getNumberOfPages(); page++) {
|
||||
|
||||
@ -1,4 +1,11 @@
|
||||
server:
|
||||
port: 8083
|
||||
|
||||
configuration-service.url: "http://localhost:8081"
|
||||
configuration-service.url: "http://localhost:8081"
|
||||
file-management-service.url: "http://localhost:8085"
|
||||
|
||||
storage:
|
||||
bucket-name: 'redaction'
|
||||
endpoint: 'http://localhost:9000'
|
||||
key: minioadmin
|
||||
secret: minioadmin
|
||||
|
||||
@ -2,6 +2,7 @@ info:
|
||||
description: Redaction Service Server V1
|
||||
|
||||
configuration-service.url: "http://configuration-service-v1:8080"
|
||||
file-management-service.url: "http://file-management-service-v1:8080"
|
||||
image-service.url: "http://image-service-v1:8080"
|
||||
|
||||
server:
|
||||
@ -10,6 +11,20 @@ server:
|
||||
spring:
|
||||
profiles:
|
||||
active: kubernetes
|
||||
rabbitmq:
|
||||
host: ${RABBITMQ_HOST:localhost}
|
||||
port: ${RABBITMQ_PORT:5672}
|
||||
username: ${RABBITMQ_USERNAME:user}
|
||||
password: ${RABBITMQ_PASSWORD:rabbitmq}
|
||||
listener:
|
||||
simple:
|
||||
acknowledge-mode: AUTO
|
||||
concurrency: 2
|
||||
retry:
|
||||
enabled: true
|
||||
max-attempts: 3
|
||||
max-interval: 15000
|
||||
prefetch: 1
|
||||
|
||||
management:
|
||||
endpoint:
|
||||
@ -17,4 +32,11 @@ management:
|
||||
prometheus.enabled: ${monitoring.enabled:false}
|
||||
health.enabled: true
|
||||
endpoints.web.exposure.include: prometheus, health
|
||||
metrics.export.prometheus.enabled: ${monitoring.enabled:false}
|
||||
metrics.export.prometheus.enabled: ${monitoring.enabled:false}
|
||||
|
||||
|
||||
storage:
|
||||
signer-type: 'AWSS3V4SignerType'
|
||||
bucket-name: 'redaction'
|
||||
region: 'us-east-1'
|
||||
endpoint: 'https://s3.amazonaws.com'
|
||||
|
||||
@ -0,0 +1,51 @@
|
||||
package com.iqser.red.service.redaction.v1.server;
|
||||
|
||||
import com.iqser.red.storage.commons.exception.StorageObjectDoesNotExist;
|
||||
import com.iqser.red.storage.commons.service.StorageService;
|
||||
import lombok.SneakyThrows;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.springframework.core.io.InputStreamResource;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
public class FileSystemBackedStorageService extends StorageService {
|
||||
|
||||
private final Map<String, File> dataMap = new HashMap<>();
|
||||
|
||||
public FileSystemBackedStorageService() {
|
||||
super(null, null);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public InputStreamResource getObject(String objectId) {
|
||||
|
||||
var res = dataMap.get(objectId);
|
||||
if (res == null) {
|
||||
throw new StorageObjectDoesNotExist(new RuntimeException());
|
||||
}
|
||||
return new InputStreamResource(new FileInputStream(res));
|
||||
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public void storeObject(String objectId, byte[] data) {
|
||||
File tempFile = File.createTempFile("test", ".tmp");
|
||||
|
||||
IOUtils.write(data, new FileOutputStream(tempFile));
|
||||
|
||||
dataMap.put(objectId, tempFile);
|
||||
}
|
||||
|
||||
public void clearStorage() {
|
||||
this.dataMap.forEach((k, v) -> {
|
||||
v.delete();
|
||||
});
|
||||
this.dataMap.clear();
|
||||
}
|
||||
}
|
||||
@ -1,30 +1,27 @@
|
||||
package com.iqser.red.service.redaction.v1.server;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
import static org.mockito.Mockito.when;
|
||||
import static org.springframework.boot.test.context.SpringBootTest.WebEnvironment.RANDOM_PORT;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.net.URL;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.UUID;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.amazonaws.services.s3.AmazonS3;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.iqser.red.service.configuration.v1.api.model.*;
|
||||
import com.iqser.red.service.configuration.v1.api.resource.DictionaryResource;
|
||||
import com.iqser.red.service.file.management.v1.api.model.FileType;
|
||||
import com.iqser.red.service.redaction.v1.model.*;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.SectionText;
|
||||
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
|
||||
import com.iqser.red.service.redaction.v1.server.client.ImageClassificationClient;
|
||||
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
|
||||
import com.iqser.red.service.redaction.v1.server.controller.RedactionController;
|
||||
import com.iqser.red.service.redaction.v1.server.memory.MemoryStats;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.service.ReanalyzeService;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
||||
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
|
||||
import com.iqser.red.storage.commons.service.StorageService;
|
||||
import lombok.SneakyThrows;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.After;
|
||||
import org.junit.Before;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.kie.api.KieServices;
|
||||
@ -32,48 +29,32 @@ import org.kie.api.builder.KieBuilder;
|
||||
import org.kie.api.builder.KieFileSystem;
|
||||
import org.kie.api.builder.KieModule;
|
||||
import org.kie.api.runtime.KieContainer;
|
||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
|
||||
import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration;
|
||||
import org.springframework.boot.test.context.SpringBootTest;
|
||||
import org.springframework.boot.test.context.TestConfiguration;
|
||||
import org.springframework.boot.test.mock.mockito.MockBean;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
import org.springframework.context.annotation.Import;
|
||||
import org.springframework.context.annotation.Primary;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
import org.springframework.test.context.junit4.SpringRunner;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.iqser.red.service.configuration.v1.api.model.Colors;
|
||||
import com.iqser.red.service.configuration.v1.api.model.DictionaryEntry;
|
||||
import com.iqser.red.service.configuration.v1.api.model.DictionaryResponse;
|
||||
import com.iqser.red.service.configuration.v1.api.model.RulesResponse;
|
||||
import com.iqser.red.service.configuration.v1.api.model.TypeResponse;
|
||||
import com.iqser.red.service.configuration.v1.api.model.TypeResult;
|
||||
import com.iqser.red.service.redaction.v1.model.AnalyzeRequest;
|
||||
import com.iqser.red.service.redaction.v1.model.AnalyzeResult;
|
||||
import com.iqser.red.service.redaction.v1.model.AnnotateRequest;
|
||||
import com.iqser.red.service.redaction.v1.model.AnnotateResponse;
|
||||
import com.iqser.red.service.redaction.v1.model.Comment;
|
||||
import com.iqser.red.service.redaction.v1.model.IdRemoval;
|
||||
import com.iqser.red.service.redaction.v1.model.ManualForceRedact;
|
||||
import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry;
|
||||
import com.iqser.red.service.redaction.v1.model.ManualRedactions;
|
||||
import com.iqser.red.service.redaction.v1.model.Point;
|
||||
import com.iqser.red.service.redaction.v1.model.ReanalyzeResult;
|
||||
import com.iqser.red.service.redaction.v1.model.Rectangle;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionRequest;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionResult;
|
||||
import com.iqser.red.service.redaction.v1.model.RenalyzeRequest;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionText;
|
||||
import com.iqser.red.service.redaction.v1.model.Status;
|
||||
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
|
||||
import com.iqser.red.service.redaction.v1.server.client.ImageClassificationClient;
|
||||
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
|
||||
import com.iqser.red.service.redaction.v1.server.controller.RedactionController;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
||||
import java.io.*;
|
||||
import java.net.URL;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
@RunWith(SpringRunner.class)
|
||||
@SpringBootTest(webEnvironment = RANDOM_PORT)
|
||||
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
|
||||
@Import(RedactionIntegrationTest.RedactionIntegrationTestConfiguration.class)
|
||||
public class RedactionIntegrationTest {
|
||||
|
||||
private static final String RULES = loadFromClassPath("drools/rules.drl");
|
||||
@ -93,6 +74,7 @@ public class RedactionIntegrationTest {
|
||||
private static final String SIGNATURE = "signature";
|
||||
private static final String FORMULA = "formula";
|
||||
private static final String OCR = "ocr";
|
||||
private static final String DOSSIER_REDACTIONS = "dossier_redactions";
|
||||
|
||||
private static final String RECOMMENDATION_AUTHOR = "recommendation_CBI_author";
|
||||
private static final String RECOMMENDATION_ADDRESS = "recommendation_CBI_address";
|
||||
@ -101,9 +83,13 @@ public class RedactionIntegrationTest {
|
||||
|
||||
private static final String PII = "PII";
|
||||
|
||||
|
||||
@Autowired
|
||||
private RedactionController redactionController;
|
||||
|
||||
@Autowired
|
||||
private ReanalyzeService reanalyzeService;
|
||||
|
||||
@Autowired
|
||||
private ObjectMapper objectMapper;
|
||||
|
||||
@ -116,7 +102,20 @@ public class RedactionIntegrationTest {
|
||||
@MockBean
|
||||
private ImageClassificationClient imageClassificationClient;
|
||||
|
||||
@Autowired
|
||||
private RedactionStorageService redactionStorageService;
|
||||
|
||||
@Autowired
|
||||
private StorageService storageService;
|
||||
|
||||
@MockBean
|
||||
private AmazonS3 amazonS3;
|
||||
|
||||
@MockBean
|
||||
private RabbitTemplate rabbitTemplate;
|
||||
|
||||
private final Map<String, List<String>> dictionary = new HashMap<>();
|
||||
private final Map<String, List<String>> dossierDictionary = new HashMap<>();
|
||||
private final Map<String, String> typeColorMap = new HashMap<>();
|
||||
private final Map<String, Boolean> hintTypeMap = new HashMap<>();
|
||||
private final Map<String, Boolean> caseInSensitiveMap = new HashMap<>();
|
||||
@ -126,8 +125,11 @@ public class RedactionIntegrationTest {
|
||||
private final Map<String, Long> reanlysisVersions = new HashMap<>();
|
||||
|
||||
private final static String TEST_RULESET_ID = "123";
|
||||
private final static String TEST_PROJECT_ID = "123";
|
||||
private final static String TEST_FILE_ID = "123";
|
||||
|
||||
@TestConfiguration
|
||||
@Configuration
|
||||
@EnableAutoConfiguration(exclude = {RabbitAutoConfiguration.class})
|
||||
public static class RedactionIntegrationTestConfiguration {
|
||||
|
||||
@Bean
|
||||
@ -146,6 +148,21 @@ public class RedactionIntegrationTest {
|
||||
return kieServices.newKieContainer(kieModule.getReleaseId());
|
||||
}
|
||||
|
||||
@Bean
|
||||
@Primary
|
||||
public StorageService inmemoryStorage() {
|
||||
return new FileSystemBackedStorageService();
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
@After
|
||||
public void cleanupStorage() {
|
||||
if (this.storageService instanceof FileSystemBackedStorageService) {
|
||||
((FileSystemBackedStorageService) this.storageService).clearStorage();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -158,30 +175,45 @@ public class RedactionIntegrationTest {
|
||||
|
||||
loadDictionaryForTest();
|
||||
loadTypeForTest();
|
||||
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(0L);
|
||||
when(dictionaryClient.getAllTypes(TEST_RULESET_ID)).thenReturn(TypeResponse.builder()
|
||||
when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(0L);
|
||||
when(dictionaryClient.getAllTypes(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(TypeResponse.builder()
|
||||
.types(getTypeResponse())
|
||||
.build());
|
||||
when(dictionaryClient.getDictionaryForType(VERTEBRATE, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(VERTEBRATE));
|
||||
when(dictionaryClient.getDictionaryForType(ADDRESS, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(ADDRESS));
|
||||
when(dictionaryClient.getDictionaryForType(AUTHOR, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(AUTHOR));
|
||||
when(dictionaryClient.getDictionaryForType(SPONSOR, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(SPONSOR));
|
||||
when(dictionaryClient.getDictionaryForType(NO_REDACTION_INDICATOR, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(NO_REDACTION_INDICATOR));
|
||||
when(dictionaryClient.getDictionaryForType(REDACTION_INDICATOR, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(REDACTION_INDICATOR));
|
||||
when(dictionaryClient.getDictionaryForType(HINT_ONLY, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(HINT_ONLY));
|
||||
when(dictionaryClient.getDictionaryForType(MUST_REDACT, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(MUST_REDACT));
|
||||
when(dictionaryClient.getDictionaryForType(PUBLISHED_INFORMATION, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(PUBLISHED_INFORMATION));
|
||||
when(dictionaryClient.getDictionaryForType(TEST_METHOD, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(TEST_METHOD));
|
||||
when(dictionaryClient.getDictionaryForType(PII, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(PII));
|
||||
when(dictionaryClient.getDictionaryForType(RECOMMENDATION_AUTHOR, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(RECOMMENDATION_AUTHOR));
|
||||
when(dictionaryClient.getDictionaryForType(RECOMMENDATION_ADDRESS, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(RECOMMENDATION_ADDRESS));
|
||||
when(dictionaryClient.getDictionaryForType(FALSE_POSITIVE, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(FALSE_POSITIVE));
|
||||
when(dictionaryClient.getDictionaryForType(PURITY, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(PURITY));
|
||||
when(dictionaryClient.getDictionaryForType(IMAGE, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(IMAGE));
|
||||
when(dictionaryClient.getDictionaryForType(OCR, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(OCR));
|
||||
when(dictionaryClient.getDictionaryForType(LOGO, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(LOGO));
|
||||
when(dictionaryClient.getDictionaryForType(SIGNATURE, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(SIGNATURE));
|
||||
when(dictionaryClient.getDictionaryForType(FORMULA, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(FORMULA));
|
||||
|
||||
when(dictionaryClient.getVersion(TEST_RULESET_ID, TEST_PROJECT_ID)).thenReturn(0L);
|
||||
when(dictionaryClient.getAllTypes(TEST_RULESET_ID, TEST_PROJECT_ID)).thenReturn(TypeResponse.builder()
|
||||
.types(List.of(TypeResult.builder()
|
||||
.type(DOSSIER_REDACTIONS)
|
||||
.ruleSetId(TEST_RULESET_ID)
|
||||
.hexColor( "#ffe187")
|
||||
.isHint(hintTypeMap.get(DOSSIER_REDACTIONS))
|
||||
.isCaseInsensitive(caseInSensitiveMap.get(DOSSIER_REDACTIONS))
|
||||
.isRecommendation(recommendationTypeMap.get(DOSSIER_REDACTIONS))
|
||||
.rank(rankTypeMap.get(DOSSIER_REDACTIONS))
|
||||
.build()))
|
||||
.build());
|
||||
|
||||
when(dictionaryClient.getDictionaryForType(VERTEBRATE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(VERTEBRATE, false));
|
||||
when(dictionaryClient.getDictionaryForType(ADDRESS, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(ADDRESS, false));
|
||||
when(dictionaryClient.getDictionaryForType(AUTHOR, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(AUTHOR, false));
|
||||
when(dictionaryClient.getDictionaryForType(SPONSOR, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(SPONSOR, false));
|
||||
when(dictionaryClient.getDictionaryForType(NO_REDACTION_INDICATOR, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(NO_REDACTION_INDICATOR, false));
|
||||
when(dictionaryClient.getDictionaryForType(REDACTION_INDICATOR, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(REDACTION_INDICATOR, false));
|
||||
when(dictionaryClient.getDictionaryForType(HINT_ONLY, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(HINT_ONLY, false));
|
||||
when(dictionaryClient.getDictionaryForType(MUST_REDACT, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(MUST_REDACT, false));
|
||||
when(dictionaryClient.getDictionaryForType(PUBLISHED_INFORMATION, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(PUBLISHED_INFORMATION, false));
|
||||
when(dictionaryClient.getDictionaryForType(TEST_METHOD, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(TEST_METHOD, false));
|
||||
when(dictionaryClient.getDictionaryForType(PII, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(PII, false));
|
||||
when(dictionaryClient.getDictionaryForType(RECOMMENDATION_AUTHOR, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(RECOMMENDATION_AUTHOR, false));
|
||||
when(dictionaryClient.getDictionaryForType(RECOMMENDATION_ADDRESS, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(RECOMMENDATION_ADDRESS, false));
|
||||
when(dictionaryClient.getDictionaryForType(FALSE_POSITIVE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(FALSE_POSITIVE, false));
|
||||
when(dictionaryClient.getDictionaryForType(PURITY, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(PURITY, false));
|
||||
when(dictionaryClient.getDictionaryForType(IMAGE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(IMAGE, false));
|
||||
when(dictionaryClient.getDictionaryForType(OCR, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(OCR, false));
|
||||
when(dictionaryClient.getDictionaryForType(LOGO, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(LOGO, false));
|
||||
when(dictionaryClient.getDictionaryForType(SIGNATURE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(SIGNATURE, false));
|
||||
when(dictionaryClient.getDictionaryForType(FORMULA, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(FORMULA, false));
|
||||
when(dictionaryClient.getDictionaryForType(DOSSIER_REDACTIONS, TEST_RULESET_ID, TEST_PROJECT_ID)).thenReturn(getDictionaryResponse(DOSSIER_REDACTIONS, true));
|
||||
when(dictionaryClient.getColors(TEST_RULESET_ID)).thenReturn(colors);
|
||||
}
|
||||
|
||||
@ -288,6 +320,11 @@ public class RedactionIntegrationTest {
|
||||
.stream()
|
||||
.map(this::cleanDictionaryEntry)
|
||||
.collect(Collectors.toSet()));
|
||||
dossierDictionary.computeIfAbsent(DOSSIER_REDACTIONS, v -> new ArrayList<>())
|
||||
.addAll(ResourceLoader.load("dictionaries/dossier_redactions.txt")
|
||||
.stream()
|
||||
.map(this::cleanDictionaryEntry)
|
||||
.collect(Collectors.toSet()));
|
||||
}
|
||||
|
||||
|
||||
@ -340,6 +377,7 @@ public class RedactionIntegrationTest {
|
||||
hintTypeMap.put(FORMULA, false);
|
||||
hintTypeMap.put(LOGO, false);
|
||||
hintTypeMap.put(SIGNATURE, false);
|
||||
hintTypeMap.put(DOSSIER_REDACTIONS, false);
|
||||
|
||||
caseInSensitiveMap.put(VERTEBRATE, true);
|
||||
caseInSensitiveMap.put(ADDRESS, false);
|
||||
@ -361,6 +399,7 @@ public class RedactionIntegrationTest {
|
||||
caseInSensitiveMap.put(SIGNATURE, true);
|
||||
caseInSensitiveMap.put(LOGO, true);
|
||||
caseInSensitiveMap.put(FORMULA, true);
|
||||
caseInSensitiveMap.put(DOSSIER_REDACTIONS, false);
|
||||
|
||||
recommendationTypeMap.put(VERTEBRATE, false);
|
||||
recommendationTypeMap.put(ADDRESS, false);
|
||||
@ -382,6 +421,7 @@ public class RedactionIntegrationTest {
|
||||
recommendationTypeMap.put(FORMULA, false);
|
||||
recommendationTypeMap.put(SIGNATURE, false);
|
||||
recommendationTypeMap.put(LOGO, false);
|
||||
recommendationTypeMap.put(DOSSIER_REDACTIONS, false);
|
||||
|
||||
rankTypeMap.put(FALSE_POSITIVE, 160);
|
||||
rankTypeMap.put(PURITY, 155);
|
||||
@ -403,6 +443,7 @@ public class RedactionIntegrationTest {
|
||||
rankTypeMap.put(LOGO, 28);
|
||||
rankTypeMap.put(SIGNATURE, 27);
|
||||
rankTypeMap.put(FORMULA, 26);
|
||||
rankTypeMap.put(DOSSIER_REDACTIONS, 200);
|
||||
|
||||
colors.setDefaultColor("#acfc00");
|
||||
colors.setNotRedacted("#cccccc");
|
||||
@ -429,11 +470,11 @@ public class RedactionIntegrationTest {
|
||||
}
|
||||
|
||||
|
||||
private DictionaryResponse getDictionaryResponse(String type) {
|
||||
private DictionaryResponse getDictionaryResponse(String type, boolean isDossierDictionary) {
|
||||
|
||||
return DictionaryResponse.builder()
|
||||
.hexColor(typeColorMap.get(type))
|
||||
.entries(toDictionaryEntry(dictionary.get(type)))
|
||||
.entries(isDossierDictionary ? toDictionaryEntry(dossierDictionary.get(type)) : toDictionaryEntry(dictionary.get(type)))
|
||||
.isHint(hintTypeMap.get(type))
|
||||
.isCaseInsensitive(caseInSensitiveMap.get(type))
|
||||
.isRecommendation(recommendationTypeMap.get(type))
|
||||
@ -453,6 +494,71 @@ public class RedactionIntegrationTest {
|
||||
|
||||
|
||||
@Test
|
||||
public void test270Rotated() {
|
||||
AnalyzeRequest request = prepareStorage("files/Minimal Examples/270Rotated.pdf");
|
||||
MemoryStats.printMemoryStats();
|
||||
AnalyzeResult result = reanalyzeService.analyze(request);
|
||||
assertThat(result).isNotNull();
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@Ignore
|
||||
public void testLargeScannedFileOOM() {
|
||||
AnalyzeRequest request = prepareStorage("scanned/VV-377031.pdf");
|
||||
MemoryStats.printMemoryStats();
|
||||
AnalyzeResult result = reanalyzeService.analyze(request);
|
||||
assertThat(result).isNotNull();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMergedImages() throws IOException {
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/merge_images.pdf");
|
||||
|
||||
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
|
||||
AnalyzeResult result = reanalyzeService.analyze(request);
|
||||
|
||||
Map<String, List<RedactionLogEntry>> duplicates = new HashMap<>();
|
||||
|
||||
var redactionLog = redactionStorageService.getRedactionLog(TEST_PROJECT_ID, TEST_FILE_ID);
|
||||
|
||||
redactionLog.getRedactionLogEntry().forEach(entry -> {
|
||||
duplicates.computeIfAbsent(entry.getId(), v -> new ArrayList<>()).add(entry);
|
||||
});
|
||||
|
||||
duplicates.entrySet().forEach(entry -> {
|
||||
assertThat(entry.getValue().size()).isEqualTo(1);
|
||||
});
|
||||
|
||||
dictionary.get(AUTHOR).add("Drinking water");
|
||||
when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(1L);
|
||||
|
||||
AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder()
|
||||
.projectId(TEST_PROJECT_ID)
|
||||
.fileId(TEST_FILE_ID)
|
||||
.build());
|
||||
|
||||
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Annotated3.pdf")) {
|
||||
fileOutputStream.write(annotateResponse.getDocument());
|
||||
}
|
||||
long rstart = System.currentTimeMillis();
|
||||
reanalyzeService.reanalyze(request);
|
||||
|
||||
long rend = System.currentTimeMillis();
|
||||
System.out.println("reanalysis analysis duration: " + (rend - rstart));
|
||||
|
||||
|
||||
long end = System.currentTimeMillis();
|
||||
|
||||
System.out.println("duration: " + (end - start));
|
||||
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
@Ignore
|
||||
public void noExceptionShouldBeThrownForAnyFiles() throws IOException {
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
@ -465,15 +571,16 @@ public class RedactionIntegrationTest {
|
||||
input.addAll(getPathsRecursively(file));
|
||||
}
|
||||
for (File path : input) {
|
||||
AnalyzeRequest request = AnalyzeRequest.builder()
|
||||
.ruleSetId(TEST_RULESET_ID)
|
||||
.document(IOUtils.toByteArray(new FileInputStream(path)))
|
||||
.build();
|
||||
|
||||
AnalyzeRequest request = prepareStorage(new FileInputStream((path)));
|
||||
System.out.println("Redacting file : " + path.getName());
|
||||
AnalyzeResult result = redactionController.analyze(request);
|
||||
AnalyzeResult result = reanalyzeService.analyze(request);
|
||||
|
||||
Map<String, List<RedactionLogEntry>> duplicates = new HashMap<>();
|
||||
result.getRedactionLog().getRedactionLogEntry().forEach(entry -> {
|
||||
|
||||
var redactionLog = redactionStorageService.getRedactionLog(TEST_PROJECT_ID, TEST_FILE_ID);
|
||||
|
||||
redactionLog.getRedactionLogEntry().forEach(entry -> {
|
||||
duplicates.computeIfAbsent(entry.getId(), v -> new ArrayList<>()).add(entry);
|
||||
});
|
||||
|
||||
@ -482,16 +589,10 @@ public class RedactionIntegrationTest {
|
||||
});
|
||||
|
||||
dictionary.get(AUTHOR).add("Drinking water");
|
||||
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(1L);
|
||||
when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(1L);
|
||||
|
||||
long rstart = System.currentTimeMillis();
|
||||
ReanalyzeResult reanalyzeResult = redactionController.reanalyze(RenalyzeRequest.builder()
|
||||
.redactionLog(result.getRedactionLog())
|
||||
.document(IOUtils.toByteArray(new FileInputStream(path)))
|
||||
.manualRedactions(null)
|
||||
.text(result.getText())
|
||||
.ruleSetId(TEST_RULESET_ID)
|
||||
.build());
|
||||
reanalyzeService.reanalyze(request);
|
||||
|
||||
long rend = System.currentTimeMillis();
|
||||
System.out.println("reanalysis analysis duration: " + (rend - rstart));
|
||||
@ -526,18 +627,16 @@ public class RedactionIntegrationTest {
|
||||
@Test
|
||||
public void redactionTest() throws IOException {
|
||||
|
||||
System.out.println("redactionTest");
|
||||
long start = System.currentTimeMillis();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/new/Single Study - Oral (Gavage) Mouse.pdf");
|
||||
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
|
||||
|
||||
AnalyzeRequest request = AnalyzeRequest.builder()
|
||||
.ruleSetId(TEST_RULESET_ID)
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
.build();
|
||||
AnalyzeResult result = reanalyzeService.analyze(request);
|
||||
|
||||
AnalyzeResult result = redactionController.analyze(request);
|
||||
var redactionLog = redactionStorageService.getRedactionLog(TEST_PROJECT_ID, TEST_FILE_ID);
|
||||
var text = redactionStorageService.getText(TEST_PROJECT_ID, TEST_FILE_ID);
|
||||
|
||||
result.getRedactionLog().getRedactionLogEntry().forEach(entry -> {
|
||||
redactionLog.getRedactionLogEntry().forEach(entry -> {
|
||||
if (entry.isImage()) {
|
||||
System.out.println("---->" + entry.getType());
|
||||
}
|
||||
@ -548,13 +647,13 @@ public class RedactionIntegrationTest {
|
||||
System.out.println("first analysis duration: " + (end - start));
|
||||
|
||||
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Test.json")) {
|
||||
fileOutputStream.write(objectMapper.writeValueAsBytes(result.getText()));
|
||||
fileOutputStream.write(objectMapper.writeValueAsBytes(redactionStorageService.getText(TEST_PROJECT_ID, TEST_FILE_ID)));
|
||||
}
|
||||
|
||||
int correctFound = 0;
|
||||
loop:
|
||||
for (RedactionLogEntry redactionLogEntry : result.getRedactionLog().getRedactionLogEntry()) {
|
||||
for (SectionText sectionText : result.getText().getSectionTexts()) {
|
||||
for (RedactionLogEntry redactionLogEntry : redactionLog.getRedactionLogEntry()) {
|
||||
for (SectionText sectionText : text.getSectionTexts()) {
|
||||
if (redactionLogEntry.isImage()) {
|
||||
correctFound++;
|
||||
continue loop;
|
||||
@ -570,7 +669,7 @@ public class RedactionIntegrationTest {
|
||||
}
|
||||
}
|
||||
}
|
||||
assertThat(correctFound).isEqualTo(result.getRedactionLog().getRedactionLogEntry().size());
|
||||
assertThat(correctFound).isEqualTo(redactionLog.getRedactionLogEntry().size());
|
||||
|
||||
dictionary.get(AUTHOR).add("properties");
|
||||
reanlysisVersions.put("properties", 1L);
|
||||
@ -581,25 +680,19 @@ public class RedactionIntegrationTest {
|
||||
dictionary.get(VERTEBRATE).add("s-metolachlor");
|
||||
reanlysisVersions.put("s-metolachlor", 3L);
|
||||
|
||||
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(3L);
|
||||
when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(3L);
|
||||
|
||||
when(dictionaryClient.getDictionaryForType(VERTEBRATE, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(VERTEBRATE));
|
||||
when(dictionaryClient.getDictionaryForType(VERTEBRATE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(VERTEBRATE, false));
|
||||
|
||||
start = System.currentTimeMillis();
|
||||
ReanalyzeResult reanalyzeResult = redactionController.reanalyze(RenalyzeRequest.builder()
|
||||
.redactionLog(result.getRedactionLog())
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
.text(result.getText())
|
||||
.ruleSetId(TEST_RULESET_ID)
|
||||
.build());
|
||||
AnalyzeResult reanalyzeResult = reanalyzeService.reanalyze(request);
|
||||
|
||||
end = System.currentTimeMillis();
|
||||
System.out.println("reanalysis analysis duration: " + (end - start));
|
||||
|
||||
AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder()
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
.redactionLog(reanalyzeResult.getRedactionLog())
|
||||
.sectionGrid(result.getSectionGrid())
|
||||
.projectId(TEST_PROJECT_ID)
|
||||
.fileId(TEST_FILE_ID)
|
||||
.build());
|
||||
|
||||
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Annotated.pdf")) {
|
||||
@ -614,19 +707,13 @@ public class RedactionIntegrationTest {
|
||||
|
||||
System.out.println("testTableRedaction");
|
||||
long start = System.currentTimeMillis();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");
|
||||
|
||||
AnalyzeRequest request = AnalyzeRequest.builder()
|
||||
.ruleSetId(TEST_RULESET_ID)
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
.build();
|
||||
|
||||
AnalyzeResult result = redactionController.analyze(request);
|
||||
AnalyzeRequest request = prepareStorage("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");
|
||||
AnalyzeResult result = reanalyzeService.analyze(request);
|
||||
|
||||
AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder()
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
.redactionLog(result.getRedactionLog())
|
||||
.sectionGrid(result.getSectionGrid())
|
||||
.projectId(TEST_PROJECT_ID)
|
||||
.fileId(TEST_FILE_ID)
|
||||
.build());
|
||||
|
||||
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Annotated.pdf")) {
|
||||
@ -681,13 +768,10 @@ public class RedactionIntegrationTest {
|
||||
|
||||
// manualRedactions.getEntriesToAdd().add(manualRedactionEntry);
|
||||
|
||||
AnalyzeRequest request = AnalyzeRequest.builder()
|
||||
.ruleSetId(TEST_RULESET_ID)
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
.manualRedactions(manualRedactions)
|
||||
.build();
|
||||
|
||||
AnalyzeResult result = redactionController.analyze(request);
|
||||
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
|
||||
request.setManualRedactions(manualRedactions);
|
||||
AnalyzeResult result = reanalyzeService.analyze(request);
|
||||
|
||||
manualRedactions.getEntriesToAdd().add(manualRedactionEntry);
|
||||
manualRedactions.setIdsToRemove(Set.of(IdRemoval.builder()
|
||||
@ -695,20 +779,15 @@ public class RedactionIntegrationTest {
|
||||
.status(Status.APPROVED)
|
||||
.build()));
|
||||
|
||||
ReanalyzeResult reanalyzeResult = redactionController.reanalyze(RenalyzeRequest.builder()
|
||||
.redactionLog(result.getRedactionLog())
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
.manualRedactions(manualRedactions)
|
||||
.text(result.getText())
|
||||
.ruleSetId(TEST_RULESET_ID)
|
||||
.build());
|
||||
reanalyzeService.reanalyze(request);
|
||||
|
||||
|
||||
AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder()
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
.redactionLog(reanalyzeResult.getRedactionLog())
|
||||
.sectionGrid(result.getSectionGrid())
|
||||
.projectId(TEST_PROJECT_ID)
|
||||
.fileId(TEST_FILE_ID)
|
||||
.build());
|
||||
|
||||
|
||||
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Annotated.pdf")) {
|
||||
fileOutputStream.write(annotateResponse.getDocument());
|
||||
}
|
||||
@ -725,11 +804,16 @@ public class RedactionIntegrationTest {
|
||||
System.out.println("classificationTest");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Trinexapac/93 Trinexapac-ethyl_RAR_03_Volume_3CA_B-1_2017-03-31.pdf");
|
||||
|
||||
RedactionRequest request = RedactionRequest.builder()
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
|
||||
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
|
||||
|
||||
RedactionRequest redactionRequest = RedactionRequest.builder()
|
||||
.projectId(request.getProjectId())
|
||||
.fileId(request.getFileId())
|
||||
.ruleSetId(request.getRuleSetId())
|
||||
.build();
|
||||
|
||||
RedactionResult result = redactionController.classify(request);
|
||||
RedactionResult result = redactionController.classify(redactionRequest);
|
||||
|
||||
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Classified.pdf")) {
|
||||
fileOutputStream.write(result.getDocument());
|
||||
@ -743,11 +827,15 @@ public class RedactionIntegrationTest {
|
||||
System.out.println("sectionsTest");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Fludioxonil/51 " + "Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf");
|
||||
|
||||
RedactionRequest request = RedactionRequest.builder()
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
|
||||
|
||||
RedactionRequest redactionRequest = RedactionRequest.builder()
|
||||
.projectId(request.getProjectId())
|
||||
.fileId(request.getFileId())
|
||||
.ruleSetId(request.getRuleSetId())
|
||||
.build();
|
||||
|
||||
RedactionResult result = redactionController.sections(request);
|
||||
RedactionResult result = redactionController.sections(redactionRequest);
|
||||
|
||||
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Sections.pdf")) {
|
||||
fileOutputStream.write(result.getDocument());
|
||||
@ -761,11 +849,15 @@ public class RedactionIntegrationTest {
|
||||
System.out.println("htmlTablesTest");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");
|
||||
|
||||
RedactionRequest request = RedactionRequest.builder()
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
|
||||
|
||||
RedactionRequest redactionRequest = RedactionRequest.builder()
|
||||
.projectId(request.getProjectId())
|
||||
.fileId(request.getFileId())
|
||||
.ruleSetId(request.getRuleSetId())
|
||||
.build();
|
||||
|
||||
RedactionResult result = redactionController.htmlTables(request);
|
||||
RedactionResult result = redactionController.htmlTables(redactionRequest);
|
||||
|
||||
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Tables.html")) {
|
||||
fileOutputStream.write(result.getDocument());
|
||||
@ -779,11 +871,15 @@ public class RedactionIntegrationTest {
|
||||
System.out.println("htmlTableRotationTest");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");
|
||||
|
||||
RedactionRequest request = RedactionRequest.builder()
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
|
||||
|
||||
RedactionRequest redactionRequest = RedactionRequest.builder()
|
||||
.projectId(request.getProjectId())
|
||||
.fileId(request.getFileId())
|
||||
.ruleSetId(request.getRuleSetId())
|
||||
.build();
|
||||
|
||||
RedactionResult result = redactionController.htmlTables(request);
|
||||
RedactionResult result = redactionController.htmlTables(redactionRequest);
|
||||
|
||||
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Tables.html")) {
|
||||
fileOutputStream.write(result.getDocument());
|
||||
@ -796,20 +892,45 @@ public class RedactionIntegrationTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Phantom Cells.pdf");
|
||||
|
||||
AnalyzeRequest request = AnalyzeRequest.builder()
|
||||
.ruleSetId(TEST_RULESET_ID)
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
.build();
|
||||
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
|
||||
|
||||
AnalyzeResult result = redactionController.analyze(request);
|
||||
AnalyzeResult result = reanalyzeService.analyze(request);
|
||||
|
||||
result.getRedactionLog().getRedactionLogEntry().forEach(entry -> {
|
||||
var redactionLog = redactionStorageService.getRedactionLog(TEST_PROJECT_ID, TEST_FILE_ID);
|
||||
|
||||
redactionLog.getRedactionLogEntry().forEach(entry -> {
|
||||
if (!entry.isHint()) {
|
||||
assertThat(entry.getReason()).isEqualTo("Not redacted because row is not a vertebrate study");
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private AnalyzeRequest prepareStorage(String file) {
|
||||
ClassPathResource pdfFileResource = new ClassPathResource(file);
|
||||
|
||||
return prepareStorage(pdfFileResource.getInputStream());
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private AnalyzeRequest prepareStorage(InputStream stream) {
|
||||
|
||||
AnalyzeRequest request = AnalyzeRequest.builder()
|
||||
.ruleSetId(TEST_RULESET_ID)
|
||||
.projectId(TEST_PROJECT_ID)
|
||||
.fileId(TEST_FILE_ID)
|
||||
.lastProcessed(OffsetDateTime.now())
|
||||
.build();
|
||||
|
||||
var bytes = IOUtils.toByteArray(stream);
|
||||
|
||||
storageService.storeObject(RedactionStorageService.StorageIdUtils.getStorageId(TEST_PROJECT_ID, TEST_FILE_ID, FileType.ORIGIN), bytes);
|
||||
|
||||
return request;
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void sponsorCompanyTest() throws IOException {
|
||||
@ -817,17 +938,14 @@ public class RedactionIntegrationTest {
|
||||
long start = System.currentTimeMillis();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/sponsor_companies.pdf");
|
||||
|
||||
AnalyzeRequest request = AnalyzeRequest.builder()
|
||||
.ruleSetId(TEST_RULESET_ID)
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
.build();
|
||||
|
||||
AnalyzeResult result = redactionController.analyze(request);
|
||||
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
|
||||
|
||||
AnalyzeResult result = reanalyzeService.analyze(request);
|
||||
|
||||
AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder()
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
.redactionLog(result.getRedactionLog())
|
||||
.sectionGrid(result.getSectionGrid())
|
||||
.projectId(TEST_PROJECT_ID)
|
||||
.fileId(TEST_FILE_ID)
|
||||
.build());
|
||||
|
||||
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Annotated.pdf")) {
|
||||
@ -858,4 +976,4 @@ public class RedactionIntegrationTest {
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,12 +1,10 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.service;
|
||||
|
||||
import com.iqser.red.service.configuration.v1.api.model.Colors;
|
||||
import com.iqser.red.service.configuration.v1.api.model.DictionaryEntry;
|
||||
import com.iqser.red.service.configuration.v1.api.model.DictionaryResponse;
|
||||
import com.iqser.red.service.configuration.v1.api.model.RulesResponse;
|
||||
import com.iqser.red.service.configuration.v1.api.model.TypeResponse;
|
||||
import com.iqser.red.service.configuration.v1.api.model.TypeResult;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionRequest;
|
||||
import com.amazonaws.services.s3.AmazonS3;
|
||||
import com.iqser.red.service.configuration.v1.api.model.*;
|
||||
import com.iqser.red.service.configuration.v1.api.resource.DictionaryResource;
|
||||
import com.iqser.red.service.redaction.v1.server.Application;
|
||||
import com.iqser.red.service.redaction.v1.server.FileSystemBackedStorageService;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
|
||||
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
|
||||
@ -14,8 +12,7 @@ import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader;
|
||||
import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import com.iqser.red.storage.commons.service.StorageService;
|
||||
import org.junit.Before;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
@ -26,10 +23,14 @@ import org.kie.api.builder.KieFileSystem;
|
||||
import org.kie.api.builder.KieModule;
|
||||
import org.kie.api.runtime.KieContainer;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
|
||||
import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration;
|
||||
import org.springframework.boot.test.context.SpringBootTest;
|
||||
import org.springframework.boot.test.context.TestConfiguration;
|
||||
import org.springframework.boot.test.mock.mockito.MockBean;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
import org.springframework.context.annotation.Import;
|
||||
import org.springframework.context.annotation.Primary;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
import org.springframework.test.context.junit4.SpringRunner;
|
||||
|
||||
@ -40,21 +41,15 @@ import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.net.URL;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
@SpringBootTest
|
||||
@RunWith(SpringRunner.class)
|
||||
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
|
||||
@Import(EntityRedactionServiceTest.RedactionIntegrationTestConfiguration.class)
|
||||
public class EntityRedactionServiceTest {
|
||||
|
||||
private static final String DEFAULT_RULES = loadFromClassPath("drools/rules.drl");
|
||||
@ -80,9 +75,13 @@ public class EntityRedactionServiceTest {
|
||||
@Autowired
|
||||
private DroolsExecutionService droolsExecutionService;
|
||||
|
||||
@MockBean
|
||||
private AmazonS3 amazonS3;
|
||||
|
||||
private final static String TEST_RULESET_ID = "123";
|
||||
|
||||
@TestConfiguration
|
||||
@Configuration
|
||||
@EnableAutoConfiguration(exclude = {RabbitAutoConfiguration.class})
|
||||
public static class RedactionIntegrationTestConfiguration {
|
||||
|
||||
@Bean
|
||||
@ -101,6 +100,13 @@ public class EntityRedactionServiceTest {
|
||||
return kieServices.newKieContainer(kieModule.getReleaseId());
|
||||
}
|
||||
|
||||
|
||||
@Bean
|
||||
@Primary
|
||||
public StorageService inmemoryStorage() {
|
||||
return new FileSystemBackedStorageService();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -108,8 +114,8 @@ public class EntityRedactionServiceTest {
|
||||
public void testNestedEntitiesRemoval() {
|
||||
|
||||
Set<Entity> entities = new HashSet<>();
|
||||
Entity nested = new Entity("nested", "fake type", 10, 16, "fake headline", 0, false);
|
||||
Entity nesting = new Entity("nesting nested", "fake type", 2, 16, "fake headline", 0, false);
|
||||
Entity nested = new Entity("nested", "fake type", 10, 16, "fake headline", 0, false, false);
|
||||
Entity nesting = new Entity("nesting nested", "fake type", 2, 16, "fake headline", 0, false, false);
|
||||
entities.add(nested);
|
||||
entities.add(nesting);
|
||||
EntitySearchUtils.removeEntitiesContainedInLarger(entities);
|
||||
@ -125,31 +131,25 @@ public class EntityRedactionServiceTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Single Table.pdf");
|
||||
|
||||
RedactionRequest redactionRequest = RedactionRequest.builder()
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
.build();
|
||||
|
||||
DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
|
||||
.entries(toDictionaryEntry(Arrays.asList("Casey, H.W.", "O’Loughlin, C.K.", "Salamon, C.M.", "Smith, S.H.")))
|
||||
.build();
|
||||
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
|
||||
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse);
|
||||
when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
|
||||
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(dictionaryResponse);
|
||||
DictionaryResponse addressResponse = DictionaryResponse.builder()
|
||||
.entries(toDictionaryEntry(Collections.singletonList("Toxigenics, Inc., Decatur, IL 62526, USA")))
|
||||
.build();
|
||||
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse);
|
||||
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressResponse);
|
||||
|
||||
DictionaryResponse sponsorResponse = DictionaryResponse.builder()
|
||||
.entries(Collections.emptyList())
|
||||
.build();
|
||||
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);
|
||||
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsorResponse);
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(redactionRequest.getDocument()))) {
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
|
||||
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
|
||||
assertThat(classifiedDoc.getEntities().get(1)).hasSize(7);// 3 author cells, 1 address, 1 Y and 2 N entities
|
||||
}
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null, "dossierId");
|
||||
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
|
||||
assertThat(classifiedDoc.getEntities().get(1)).hasSize(7);// 3 author cells, 1 address, 1 Y and 2 N entities
|
||||
}
|
||||
|
||||
|
||||
@ -158,30 +158,24 @@ public class EntityRedactionServiceTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/nested_redaction.pdf");
|
||||
|
||||
RedactionRequest redactionRequest = RedactionRequest.builder()
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
.build();
|
||||
|
||||
DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
|
||||
.entries(toDictionaryEntry(Arrays.asList("Casey, H.W.", "O’Loughlin, C.K.", "Salamon, C.M.", "Smith, S.H.")))
|
||||
.build();
|
||||
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
|
||||
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse);
|
||||
when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
|
||||
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(dictionaryResponse);
|
||||
DictionaryResponse addressResponse = DictionaryResponse.builder()
|
||||
.entries(toDictionaryEntry(Collections.singletonList("Toxigenics, Inc., Decatur, IL 62526, USA")))
|
||||
.build();
|
||||
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse);
|
||||
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressResponse);
|
||||
DictionaryResponse sponsorResponse = DictionaryResponse.builder()
|
||||
.entries(Collections.emptyList())
|
||||
.build();
|
||||
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);
|
||||
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsorResponse);
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(redactionRequest.getDocument()))) {
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
|
||||
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
|
||||
assertThat(classifiedDoc.getEntities().get(1)).hasSize(7);// 3 author cells, 1 address, 1 Y and 2 N entities
|
||||
}
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null, "dossierId");
|
||||
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
|
||||
assertThat(classifiedDoc.getEntities().get(1)).hasSize(7);// 3 author cells, 1 address, 1 Y and 2 N entities
|
||||
}
|
||||
|
||||
|
||||
@ -190,64 +184,58 @@ public class EntityRedactionServiceTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Cyprodinil/40 Cyprodinil - EU AIR3 - LCA Section 1" +
|
||||
" Supplement - Identity of the active substance - Reference list.pdf");
|
||||
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
|
||||
when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
|
||||
DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
|
||||
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_author.txt"))))
|
||||
.build();
|
||||
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse);
|
||||
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(dictionaryResponse);
|
||||
DictionaryResponse addressResponse = DictionaryResponse.builder()
|
||||
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_address.txt"))))
|
||||
.build();
|
||||
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse);
|
||||
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressResponse);
|
||||
DictionaryResponse sponsorResponse = DictionaryResponse.builder()
|
||||
.entries(Collections.emptyList())
|
||||
.build();
|
||||
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);
|
||||
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
|
||||
assertThat(classifiedDoc.getEntities()
|
||||
.entrySet()
|
||||
.stream()
|
||||
.noneMatch(entry -> entry.getValue().stream().anyMatch(e -> e.getMatchedRule() == 9))).isTrue();
|
||||
}
|
||||
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsorResponse);
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null, "dossierId");
|
||||
assertThat(classifiedDoc.getEntities()
|
||||
.entrySet()
|
||||
.stream()
|
||||
.noneMatch(entry -> entry.getValue().stream().anyMatch(e -> e.getMatchedRule() == 9))).isTrue();
|
||||
pdfFileResource = new ClassPathResource("files/Compounds/27 A8637C - EU AIR3 - MCP Section 1 - Identity of " +
|
||||
"the plant protection product.pdf");
|
||||
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
|
||||
assertThat(classifiedDoc.getEntities()
|
||||
.entrySet()
|
||||
.stream()
|
||||
.noneMatch(entry -> entry.getValue().stream().anyMatch(e -> e.getMatchedRule() == 9))).isTrue();
|
||||
}
|
||||
classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null, "dossierId");
|
||||
assertThat(classifiedDoc.getEntities()
|
||||
.entrySet()
|
||||
.stream()
|
||||
.noneMatch(entry -> entry.getValue().stream().anyMatch(e -> e.getMatchedRule() == 9))).isTrue();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFalsePositiveInWrongCell() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Row With Ambiguous Redaction.pdf");
|
||||
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
|
||||
when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
|
||||
DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
|
||||
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_author.txt"))))
|
||||
.build();
|
||||
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse);
|
||||
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(dictionaryResponse);
|
||||
DictionaryResponse addressResponse = DictionaryResponse.builder()
|
||||
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_address.txt"))))
|
||||
.build();
|
||||
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse);
|
||||
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressResponse);
|
||||
DictionaryResponse sponsorResponse = DictionaryResponse.builder()
|
||||
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_sponsor.txt"))))
|
||||
.build();
|
||||
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);
|
||||
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
|
||||
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
|
||||
assertThat(classifiedDoc.getEntities().get(1).stream()
|
||||
.filter(entity -> entity.getMatchedRule() == 9)
|
||||
.count()).isEqualTo(10);
|
||||
}
|
||||
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsorResponse);
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null, "dossierId");
|
||||
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
|
||||
assertThat(classifiedDoc.getEntities().get(1).stream()
|
||||
.filter(entity -> entity.getMatchedRule() == 9)
|
||||
.count()).isEqualTo(10);
|
||||
|
||||
}
|
||||
|
||||
@ -296,27 +284,25 @@ public class EntityRedactionServiceTest {
|
||||
droolsExecutionService.updateRules(TEST_RULESET_ID);
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Applicant Producer Table.pdf");
|
||||
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
|
||||
when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
|
||||
DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
|
||||
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_author.txt"))))
|
||||
.build();
|
||||
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse);
|
||||
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(dictionaryResponse);
|
||||
DictionaryResponse addressResponse = DictionaryResponse.builder()
|
||||
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_address.txt"))))
|
||||
.build();
|
||||
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse);
|
||||
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressResponse);
|
||||
DictionaryResponse sponsorResponse = DictionaryResponse.builder()
|
||||
.entries(Collections.emptyList())
|
||||
.build();
|
||||
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);
|
||||
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
|
||||
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
|
||||
assertThat(classifiedDoc.getEntities().get(1).stream()
|
||||
.filter(entity -> entity.getMatchedRule() == 6)
|
||||
.count()).isEqualTo(13);
|
||||
}
|
||||
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsorResponse);
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null, "dossierId");
|
||||
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
|
||||
assertThat(classifiedDoc.getEntities().get(1).stream()
|
||||
.filter(entity -> entity.getMatchedRule() == 6)
|
||||
.count()).isEqualTo(13);
|
||||
|
||||
}
|
||||
|
||||
@ -337,27 +323,25 @@ public class EntityRedactionServiceTest {
|
||||
droolsExecutionService.updateRules(TEST_RULESET_ID);
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/batches_new_line.pdf");
|
||||
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
|
||||
when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
|
||||
DictionaryResponse addressResponse = DictionaryResponse.builder()
|
||||
.entries(Collections.emptyList())
|
||||
.build();
|
||||
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse);
|
||||
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressResponse);
|
||||
DictionaryResponse authorResponse = DictionaryResponse.builder()
|
||||
.entries(Collections.emptyList())
|
||||
.build();
|
||||
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(authorResponse);
|
||||
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(authorResponse);
|
||||
DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
|
||||
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_sponsor.txt"))))
|
||||
.build();
|
||||
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse);
|
||||
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
|
||||
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
|
||||
assertThat(classifiedDoc.getEntities().get(1).stream()
|
||||
.filter(entity -> entity.getMatchedRule() == 11)
|
||||
.count()).isEqualTo(1);
|
||||
}
|
||||
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(dictionaryResponse);
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null, "dossierId");
|
||||
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
|
||||
assertThat(classifiedDoc.getEntities().get(1).stream()
|
||||
.filter(entity -> entity.getMatchedRule() == 11)
|
||||
.count()).isEqualTo(1);
|
||||
|
||||
}
|
||||
|
||||
@ -371,24 +355,22 @@ public class EntityRedactionServiceTest {
|
||||
.entries(toDictionaryEntry(Arrays.asList("Bissig R.", "Thanei P.")))
|
||||
.build();
|
||||
|
||||
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
|
||||
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse);
|
||||
when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
|
||||
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(dictionaryResponse);
|
||||
DictionaryResponse addressResponse = DictionaryResponse.builder()
|
||||
.entries(toDictionaryEntry(Collections.singletonList("Novartis Crop Protection AG, Basel, Switzerland")))
|
||||
.build();
|
||||
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse);
|
||||
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressResponse);
|
||||
|
||||
DictionaryResponse sponsorResponse = DictionaryResponse.builder()
|
||||
.entries(Collections.emptyList())
|
||||
.build();
|
||||
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);
|
||||
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
|
||||
assertThat(classifiedDoc.getEntities()).hasSize(2); // two pages
|
||||
assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(8);
|
||||
assertThat(classifiedDoc.getEntities().get(2).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(5); // 2 names, 1 address, 2 Y
|
||||
}
|
||||
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsorResponse);
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null, "dossierId");
|
||||
assertThat(classifiedDoc.getEntities()).hasSize(2); // two pages
|
||||
assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(8);
|
||||
assertThat(classifiedDoc.getEntities().get(2).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(5); // 2 names, 1 address, 2 Y
|
||||
|
||||
pdfFileResource = new ClassPathResource("files/Minimal Examples/Header Propagation2.pdf");
|
||||
|
||||
@ -396,20 +378,18 @@ public class EntityRedactionServiceTest {
|
||||
.entries(toDictionaryEntry(Arrays.asList("Tribolet, R.", "Muir, G.", "Kühne-Thu, H.", "Close, C.")))
|
||||
.build();
|
||||
|
||||
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
|
||||
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse);
|
||||
when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
|
||||
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(dictionaryResponse);
|
||||
addressResponse = DictionaryResponse.builder()
|
||||
.entries(toDictionaryEntry(Collections.singletonList("Novartis Crop Protection AG, Basel, Switzerland")))
|
||||
.build();
|
||||
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse);
|
||||
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressResponse);
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
|
||||
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
|
||||
assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(3);
|
||||
assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 8).count()).isEqualTo(9);
|
||||
}
|
||||
classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null, "dossierId");
|
||||
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
|
||||
assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(3);
|
||||
assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 8).count()).isEqualTo(9);
|
||||
}
|
||||
|
||||
|
||||
@ -423,23 +403,21 @@ public class EntityRedactionServiceTest {
|
||||
.entries(toDictionaryEntry(Collections.singletonList("Aldershof S.")))
|
||||
.build();
|
||||
|
||||
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
|
||||
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse);
|
||||
when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
|
||||
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(dictionaryResponse);
|
||||
DictionaryResponse addressResponse = DictionaryResponse.builder()
|
||||
.entries(toDictionaryEntry(Collections.singletonList("Novartis Crop Protection AG, Basel, Switzerland")))
|
||||
.build();
|
||||
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse);
|
||||
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressResponse);
|
||||
|
||||
DictionaryResponse sponsorResponse = DictionaryResponse.builder()
|
||||
.entries(Collections.emptyList())
|
||||
.build();
|
||||
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);
|
||||
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
|
||||
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
|
||||
assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 8).count()).isEqualTo(6);
|
||||
}
|
||||
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsorResponse);
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null, "dossierId");
|
||||
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
|
||||
assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 8).count()).isEqualTo(6);
|
||||
}
|
||||
|
||||
|
||||
@ -476,19 +454,19 @@ public class EntityRedactionServiceTest {
|
||||
TypeResult.builder().ruleSetId(TEST_RULESET_ID).type(ADDRESS_CODE).hexColor("#ff00ff").build(),
|
||||
TypeResult.builder().ruleSetId(TEST_RULESET_ID).type(SPONSOR_CODE).hexColor("#00ffff").build()))
|
||||
.build();
|
||||
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
|
||||
when(dictionaryClient.getAllTypes(TEST_RULESET_ID)).thenReturn(typeResponse);
|
||||
when(dictionaryClient.getVersion(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
|
||||
when(dictionaryClient.getAllTypes(TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(typeResponse);
|
||||
|
||||
// Default empty return to prevent NPEs
|
||||
DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
|
||||
.build();
|
||||
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse);
|
||||
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(dictionaryResponse);
|
||||
DictionaryResponse addressResponse = DictionaryResponse.builder()
|
||||
.build();
|
||||
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse);
|
||||
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressResponse);
|
||||
DictionaryResponse sponsorResponse = DictionaryResponse.builder()
|
||||
.build();
|
||||
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);
|
||||
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsorResponse);
|
||||
|
||||
Colors colors = new Colors();
|
||||
colors.setDefaultColor("#acfc00");
|
||||
@ -518,7 +496,7 @@ public class EntityRedactionServiceTest {
|
||||
}
|
||||
}
|
||||
|
||||
private List<DictionaryEntry> toDictionaryEntry(List<String> entries){
|
||||
private List<DictionaryEntry> toDictionaryEntry(List<String> entries) {
|
||||
List<DictionaryEntry> dictionaryEntries = new ArrayList<>();
|
||||
entries.forEach(entry -> {
|
||||
dictionaryEntries.add(new DictionaryEntry(entry, 1L, false));
|
||||
@ -526,4 +504,4 @@ public class EntityRedactionServiceTest {
|
||||
return dictionaryEntries;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,7 +1,31 @@
|
||||
package com.iqser.red.service.redaction.v1.server.segmentation;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
import com.amazonaws.services.s3.AmazonS3;
|
||||
import com.iqser.red.service.redaction.v1.server.Application;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.kie.api.runtime.KieContainer;
|
||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
|
||||
import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration;
|
||||
import org.springframework.boot.test.context.SpringBootTest;
|
||||
import org.springframework.boot.test.mock.mockito.MockBean;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
import org.springframework.context.annotation.Import;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
import org.springframework.test.context.junit4.SpringRunner;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
@ -9,31 +33,12 @@ import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.kie.api.runtime.KieContainer;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.boot.test.context.SpringBootTest;
|
||||
import org.springframework.boot.test.mock.mockito.MockBean;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
import org.springframework.test.context.junit4.SpringRunner;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService;
|
||||
|
||||
@SpringBootTest
|
||||
@RunWith(SpringRunner.class)
|
||||
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
|
||||
@Import(PdfSegmentationServiceTest.TestConfiguration.class)
|
||||
public class PdfSegmentationServiceTest {
|
||||
|
||||
@Autowired
|
||||
@ -51,6 +56,28 @@ public class PdfSegmentationServiceTest {
|
||||
@MockBean
|
||||
private KieContainer kieContainer;
|
||||
|
||||
@MockBean
|
||||
private AmazonS3 amazonS3;
|
||||
|
||||
@MockBean
|
||||
private RabbitTemplate rabbitTemplate;
|
||||
|
||||
@Configuration
|
||||
@EnableAutoConfiguration(exclude = { RabbitAutoConfiguration.class})
|
||||
public static class TestConfiguration {
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMergeImages() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/270Rotated.pdf");
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
|
||||
assertThat(document.getPages().get(0).getImages().size()).isEqualTo(1);
|
||||
assertThat(document.getPages().get(1).getImages().size()).isEqualTo(0);
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
@Ignore
|
||||
@ -58,61 +85,78 @@ public class PdfSegmentationServiceTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/new/Single Study - Oral (Gavage) Mouse.pdf");
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
|
||||
Document document = pdfSegmentationService.parseDocument(pdDocument);
|
||||
int i = 0;
|
||||
for (Page page : document.getPages()) {
|
||||
for (PdfImage image : page.getImages()) {
|
||||
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
|
||||
ImageIO.write(image.getImage(), "png", baos);
|
||||
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Image " + i + ".png")) {
|
||||
fileOutputStream.write(baos.toByteArray());
|
||||
}
|
||||
Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
|
||||
int i = 0;
|
||||
for (Page page : document.getPages()) {
|
||||
for (PdfImage image : page.getImages()) {
|
||||
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
|
||||
ImageIO.write(image.getImage(), "png", baos);
|
||||
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Image " + i + ".png")) {
|
||||
fileOutputStream.write(baos.toByteArray());
|
||||
}
|
||||
i++;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testPDFSegmentationWithComplexTable() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Spanning Cells.pdf");
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
|
||||
assertThat(document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
Table table = document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())
|
||||
.get(0);
|
||||
assertThat(table.getColCount()).isEqualTo(6);
|
||||
assertThat(table.getRowCount()).isEqualTo(13);
|
||||
assertThat(table.getRows().stream().mapToInt(List::size).sum()).isEqualTo(6 * 13);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testTableExtraction() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Table.pdf");
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
|
||||
Document document = pdfSegmentationService.parseDocument(pdDocument);
|
||||
assertThat(document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
Table firstTable = document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())
|
||||
.get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(8);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(1);
|
||||
Table secondTable = document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())
|
||||
.get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(8);
|
||||
assertThat(secondTable.getRowCount()).isEqualTo(2);
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
|
||||
.get(0)
|
||||
.stream()
|
||||
.map(Collections::singletonList)
|
||||
.collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows()
|
||||
.stream()
|
||||
.allMatch(row -> row.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.collect(Collectors.toList())
|
||||
.equals(firstTableHeaderCells))).isTrue();
|
||||
}
|
||||
Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
|
||||
assertThat(document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
Table firstTable = document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())
|
||||
.get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(8);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(1);
|
||||
Table secondTable = document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())
|
||||
.get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(8);
|
||||
assertThat(secondTable.getRowCount()).isEqualTo(2);
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
|
||||
.get(0)
|
||||
.stream()
|
||||
.map(Collections::singletonList)
|
||||
.collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows()
|
||||
.stream()
|
||||
.allMatch(row -> row.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.collect(Collectors.toList())
|
||||
.equals(firstTableHeaderCells))).isTrue();
|
||||
}
|
||||
|
||||
|
||||
@ -121,38 +165,36 @@ public class PdfSegmentationServiceTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Multi Page Table.pdf");
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
|
||||
Document document = pdfSegmentationService.parseDocument(pdDocument);
|
||||
assertThat(document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
Table firstTable = document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())
|
||||
.get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(9);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(5);
|
||||
Table secondTable = document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())
|
||||
.get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(9);
|
||||
assertThat(secondTable.getRowCount()).isEqualTo(6);
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
|
||||
.get(firstTable.getRowCount() - 1)
|
||||
.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows()
|
||||
.stream()
|
||||
.allMatch(row -> row.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.collect(Collectors.toList())
|
||||
.equals(firstTableHeaderCells))).isTrue();
|
||||
}
|
||||
Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
|
||||
assertThat(document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
Table firstTable = document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())
|
||||
.get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(9);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(5);
|
||||
Table secondTable = document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())
|
||||
.get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(9);
|
||||
assertThat(secondTable.getRowCount()).isEqualTo(6);
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
|
||||
.get(firstTable.getRowCount() - 1)
|
||||
.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows()
|
||||
.stream()
|
||||
.allMatch(row -> row.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.collect(Collectors.toList())
|
||||
.equals(firstTableHeaderCells))).isTrue();
|
||||
}
|
||||
|
||||
|
||||
@ -161,38 +203,36 @@ public class PdfSegmentationServiceTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Rotated Table Headers.pdf");
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
|
||||
Document document = pdfSegmentationService.parseDocument(pdDocument);
|
||||
assertThat(document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
Table firstTable = document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())
|
||||
.get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(8);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(1);
|
||||
Table secondTable = document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())
|
||||
.get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(8);
|
||||
assertThat(secondTable.getRowCount()).isEqualTo(6);
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
|
||||
.get(0)
|
||||
.stream()
|
||||
.map(Collections::singletonList)
|
||||
.collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows()
|
||||
.stream()
|
||||
.allMatch(row -> row.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.collect(Collectors.toList())
|
||||
.equals(firstTableHeaderCells))).isTrue();
|
||||
}
|
||||
Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
|
||||
assertThat(document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
Table firstTable = document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())
|
||||
.get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(8);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(1);
|
||||
Table secondTable = document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())
|
||||
.get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(8);
|
||||
assertThat(secondTable.getRowCount()).isEqualTo(6);
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
|
||||
.get(0)
|
||||
.stream()
|
||||
.map(Collections::singletonList)
|
||||
.collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows()
|
||||
.stream()
|
||||
.allMatch(row -> row.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.collect(Collectors.toList())
|
||||
.equals(firstTableHeaderCells))).isTrue();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
configuration-service.url: "http://configuration-service-v1:8080"
|
||||
image-service.url: "http://image-service-v1:8080"
|
||||
file-management-service.url: "http://file-management-service-v1:8080"
|
||||
|
||||
ribbon:
|
||||
ConnectTimeout: 600000
|
||||
|
||||
@ -0,0 +1 @@
|
||||
Difenoconazole
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user