Reworked re-analysis and analysis to use memory model / directly read/store files, and dumped pd doc wherever possible
This commit is contained in:
parent
ed59f36220
commit
5cb4ea287c
@ -5,7 +5,7 @@
|
||||
<parent>
|
||||
<artifactId>platform-dependency</artifactId>
|
||||
<groupId>com.iqser.red</groupId>
|
||||
<version>1.0.8</version>
|
||||
<version>1.1.2</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
@ -32,7 +32,7 @@
|
||||
<dependency>
|
||||
<groupId>com.iqser.red</groupId>
|
||||
<artifactId>platform-commons-dependency</artifactId>
|
||||
<version>1.2.5</version>
|
||||
<version>1.2.9</version>
|
||||
<scope>import</scope>
|
||||
<type>pom</type>
|
||||
</dependency>
|
||||
@ -52,4 +52,4 @@
|
||||
|
||||
</dependencyManagement>
|
||||
|
||||
</project>
|
||||
</project>
|
||||
|
||||
@ -5,13 +5,19 @@ import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
import java.time.OffsetDateTime;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class AnalyzeRequest {
|
||||
|
||||
private byte[] document;
|
||||
private String projectId;
|
||||
private String fileId;
|
||||
private String ruleSetId;
|
||||
private ManualRedactions manualRedactions;
|
||||
private OffsetDateTime lastProcessed;
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -12,8 +12,11 @@ import lombok.NoArgsConstructor;
|
||||
public class AnalyzeResult {
|
||||
|
||||
private int numberOfPages;
|
||||
private RedactionLog redactionLog;
|
||||
private SectionGrid sectionGrid;
|
||||
private Text text;
|
||||
private boolean hasHints;
|
||||
private boolean hasRequests;
|
||||
private boolean hasRedactions;
|
||||
private boolean hasImages;
|
||||
private boolean hasUpdates;
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -11,7 +11,6 @@ import lombok.NoArgsConstructor;
|
||||
@AllArgsConstructor
|
||||
public class AnnotateRequest {
|
||||
|
||||
private byte[] document;
|
||||
private RedactionLog redactionLog;
|
||||
private SectionGrid sectionGrid;
|
||||
private String projectId;
|
||||
private String fileId;
|
||||
}
|
||||
|
||||
@ -1,12 +1,12 @@
|
||||
package com.iqser.red.service.redaction.v1.model;
|
||||
|
||||
import java.time.OffsetDateTime;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
import java.time.OffsetDateTime;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
package com.iqser.red.service.redaction.v1.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
|
||||
@ -1,16 +1,16 @@
|
||||
package com.iqser.red.service.redaction.v1.model;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
package com.iqser.red.service.redaction.v1.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
|
||||
@ -11,7 +11,8 @@ import lombok.NoArgsConstructor;
|
||||
@AllArgsConstructor
|
||||
public class RedactionRequest {
|
||||
|
||||
private byte[] document;
|
||||
private String projectId;
|
||||
private String fileId;
|
||||
private String ruleSetId;
|
||||
private ManualRedactions manualRedactions;
|
||||
}
|
||||
|
||||
@ -13,7 +13,5 @@ public class RedactionResult {
|
||||
|
||||
private byte[] document;
|
||||
private int numberOfPages;
|
||||
private RedactionLog redactionLog;
|
||||
private SectionGrid sectionGrid;
|
||||
|
||||
}
|
||||
|
||||
@ -1,22 +0,0 @@
|
||||
package com.iqser.red.service.redaction.v1.model;
|
||||
|
||||
import java.time.OffsetDateTime;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class RenalyzeRequest {
|
||||
|
||||
private byte[] document;
|
||||
private String ruleSetId;
|
||||
private ManualRedactions manualRedactions;
|
||||
private Text text;
|
||||
private RedactionLog redactionLog;
|
||||
private OffsetDateTime lastProcessed;
|
||||
}
|
||||
@ -27,7 +27,7 @@ public class SectionArea {
|
||||
private String header;
|
||||
|
||||
public boolean contains(Rectangle other) {
|
||||
return page == other.getPage() && this.topLeft.getX() <= other.getTopLeft().getX() && this.topLeft.getX() + this.getWidth() >= other.getTopLeft().getX() + other.getWidth() && this.getTopLeft().getY() <= other.getTopLeft().getY() && this.getTopLeft().getY() + this.getHeight() >= other.getTopLeft().getY() + other.getHeight();
|
||||
return page == other.getPage() && this.topLeft.getX() <= other.getTopLeft().getX() && this.topLeft.getX() + this.getWidth() >= other.getTopLeft().getX() + other.getWidth() && this.getTopLeft().getY() <= other.getTopLeft().getY() && this.getTopLeft().getY() + this.getHeight() >= other.getTopLeft().getY() + other.getHeight();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
package com.iqser.red.service.redaction.v1.model;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
package com.iqser.red.service.redaction.v1.model;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.NonNull;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
|
||||
@ -1,25 +0,0 @@
|
||||
package com.iqser.red.service.redaction.v1.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class SectionText {
|
||||
|
||||
private int sectionNumber;
|
||||
private String text;
|
||||
|
||||
private boolean isTable;
|
||||
private String headline;
|
||||
|
||||
private List<SectionArea> sectionAreas = new ArrayList<>();
|
||||
|
||||
}
|
||||
@ -1,14 +1,6 @@
|
||||
package com.iqser.red.service.redaction.v1.resources;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.AnalyzeRequest;
|
||||
import com.iqser.red.service.redaction.v1.model.AnalyzeResult;
|
||||
import com.iqser.red.service.redaction.v1.model.AnnotateRequest;
|
||||
import com.iqser.red.service.redaction.v1.model.AnnotateResponse;
|
||||
import com.iqser.red.service.redaction.v1.model.ReanalyzeResult;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionRequest;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionResult;
|
||||
import com.iqser.red.service.redaction.v1.model.RenalyzeRequest;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.*;
|
||||
import org.springframework.http.MediaType;
|
||||
import org.springframework.web.bind.annotation.PathVariable;
|
||||
import org.springframework.web.bind.annotation.PostMapping;
|
||||
@ -25,7 +17,7 @@ public interface RedactionResource {
|
||||
AnalyzeResult analyze(@RequestBody AnalyzeRequest analyzeRequest);
|
||||
|
||||
@PostMapping(value = "/reanalyze", produces = MediaType.APPLICATION_JSON_VALUE, consumes = MediaType.APPLICATION_JSON_VALUE)
|
||||
ReanalyzeResult reanalyze(@RequestBody RenalyzeRequest renalyzeRequest);
|
||||
AnalyzeResult reanalyze(@RequestBody AnalyzeRequest renalyzeRequest);
|
||||
|
||||
@PostMapping(value = "/annotate", produces = MediaType.APPLICATION_JSON_VALUE, consumes = MediaType.APPLICATION_JSON_VALUE)
|
||||
AnnotateResponse annotate(@RequestBody AnnotateRequest annotateRequest);
|
||||
@ -39,10 +31,10 @@ public interface RedactionResource {
|
||||
@PostMapping(value = "/debug/htmlTables", produces = MediaType.APPLICATION_JSON_VALUE, consumes = MediaType.APPLICATION_JSON_VALUE)
|
||||
RedactionResult htmlTables(@RequestBody RedactionRequest redactionRequest);
|
||||
|
||||
@PostMapping(value = "/rules/update"+RULE_SET_PATH_VARIABLE, consumes = MediaType.APPLICATION_JSON_VALUE)
|
||||
@PostMapping(value = "/rules/update" + RULE_SET_PATH_VARIABLE, consumes = MediaType.APPLICATION_JSON_VALUE)
|
||||
void updateRules(@PathVariable(RULE_SET_PARAMETER_NAME) String ruleSetId);
|
||||
|
||||
@PostMapping(value = "/rules/test", consumes = MediaType.APPLICATION_JSON_VALUE)
|
||||
void testRules(@RequestBody String rules);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -12,6 +12,10 @@
|
||||
<artifactId>redaction-service-server-v1</artifactId>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>com.iqser.red.commons</groupId>
|
||||
<artifactId>storage-commons</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.iqser.red.service</groupId>
|
||||
<artifactId>redaction-service-api-v1</artifactId>
|
||||
@ -20,7 +24,12 @@
|
||||
<dependency>
|
||||
<groupId>com.iqser.red.service</groupId>
|
||||
<artifactId>configuration-service-api-v1</artifactId>
|
||||
<version>2.2.9</version>
|
||||
<version>2.5.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.iqser.red.service</groupId>
|
||||
<artifactId>file-management-service-api-v1</artifactId>
|
||||
<version>2.6.7</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.drools</groupId>
|
||||
|
||||
@ -1,5 +1,8 @@
|
||||
package com.iqser.red.service.redaction.v1.server;
|
||||
|
||||
import com.iqser.red.commons.spring.DefaultWebMvcConfiguration;
|
||||
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
|
||||
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
|
||||
import org.springframework.boot.SpringApplication;
|
||||
import org.springframework.boot.actuate.autoconfigure.security.servlet.ManagementWebSecurityAutoConfiguration;
|
||||
import org.springframework.boot.autoconfigure.SpringBootApplication;
|
||||
@ -8,10 +11,6 @@ import org.springframework.boot.context.properties.EnableConfigurationProperties
|
||||
import org.springframework.cloud.openfeign.EnableFeignClients;
|
||||
import org.springframework.context.annotation.Import;
|
||||
|
||||
import com.iqser.red.commons.spring.DefaultWebMvcConfiguration;
|
||||
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
|
||||
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
|
||||
|
||||
@Import({DefaultWebMvcConfiguration.class})
|
||||
@EnableFeignClients(basePackageClasses = RulesClient.class)
|
||||
@EnableConfigurationProperties(RedactionServiceSettings.class)
|
||||
@ -23,4 +22,4 @@ public class Application {
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,20 +1,18 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionGrid;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionGrid;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionText;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
public class Document {
|
||||
|
||||
@ -1,5 +1,7 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import lombok.Getter;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
@ -7,38 +9,35 @@ import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import lombok.Getter;
|
||||
|
||||
public class FloatFrequencyCounter
|
||||
{
|
||||
public class FloatFrequencyCounter {
|
||||
|
||||
@Getter
|
||||
Map<Float, Integer> countPerValue = new HashMap<>();
|
||||
|
||||
public void add(float value){
|
||||
if(!countPerValue.containsKey(value)){
|
||||
public void add(float value) {
|
||||
if (!countPerValue.containsKey(value)) {
|
||||
countPerValue.put(value, 1);
|
||||
} else {
|
||||
countPerValue.put(value, countPerValue.get(value) + 1);
|
||||
}
|
||||
}
|
||||
|
||||
public void addAll(Map<Float, Integer> otherCounter){
|
||||
for(Map.Entry<Float, Integer> entry: otherCounter.entrySet()){
|
||||
if(countPerValue.containsKey(entry.getKey())){
|
||||
countPerValue.put(entry.getKey(), countPerValue.get(entry.getKey())+ entry.getValue());
|
||||
public void addAll(Map<Float, Integer> otherCounter) {
|
||||
for (Map.Entry<Float, Integer> entry : otherCounter.entrySet()) {
|
||||
if (countPerValue.containsKey(entry.getKey())) {
|
||||
countPerValue.put(entry.getKey(), countPerValue.get(entry.getKey()) + entry.getValue());
|
||||
} else {
|
||||
countPerValue.put(entry.getKey(), entry.getValue());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public Float getMostPopular(){
|
||||
public Float getMostPopular() {
|
||||
Map.Entry<Float, Integer> mostPopular = null;
|
||||
for(Map.Entry<Float, Integer> entry: countPerValue.entrySet()){
|
||||
if(mostPopular == null){
|
||||
for (Map.Entry<Float, Integer> entry : countPerValue.entrySet()) {
|
||||
if (mostPopular == null) {
|
||||
mostPopular = entry;
|
||||
} else if(entry.getValue() >= mostPopular.getValue()){
|
||||
} else if (entry.getValue() >= mostPopular.getValue()) {
|
||||
mostPopular = entry;
|
||||
}
|
||||
}
|
||||
@ -46,12 +45,11 @@ public class FloatFrequencyCounter
|
||||
}
|
||||
|
||||
|
||||
|
||||
public List<Float> getHighterThanMostPopular(){
|
||||
public List<Float> getHighterThanMostPopular() {
|
||||
Float mostPopular = getMostPopular();
|
||||
List<Float> higher = new ArrayList<>();
|
||||
for(Float value: countPerValue.keySet()){
|
||||
if(value > mostPopular){
|
||||
for (Float value : countPerValue.keySet()) {
|
||||
if (value > mostPopular) {
|
||||
higher.add(value);
|
||||
}
|
||||
}
|
||||
@ -60,12 +58,12 @@ public class FloatFrequencyCounter
|
||||
}
|
||||
|
||||
|
||||
public Float getHighest(){
|
||||
public Float getHighest() {
|
||||
Float highest = null;
|
||||
for(Float value: countPerValue.keySet()){
|
||||
if (highest == null){
|
||||
for (Float value : countPerValue.keySet()) {
|
||||
if (highest == null) {
|
||||
highest = value;
|
||||
} else if(value > highest){
|
||||
} else if (value > highest) {
|
||||
highest = value;
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,12 +1,11 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
public class Footer {
|
||||
@ -21,4 +20,4 @@ public class Footer {
|
||||
return searchableText;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,12 +1,11 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
public class Header {
|
||||
@ -21,4 +20,4 @@ public class Header {
|
||||
return searchableText;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,15 +1,14 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NonNull;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@Data
|
||||
@RequiredArgsConstructor
|
||||
public class Page {
|
||||
@ -37,4 +36,4 @@ public class Page {
|
||||
return rotation != 0;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,19 +1,18 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
public class Paragraph implements Comparable{
|
||||
public class Paragraph implements Comparable {
|
||||
|
||||
private List<AbstractTextContainer> pageBlocks = new ArrayList<>();
|
||||
private List<PdfImage> images = new ArrayList<>();
|
||||
@ -62,4 +61,4 @@ public class Paragraph implements Comparable{
|
||||
return 0;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -0,0 +1,45 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.SectionArea;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class SectionText {
|
||||
|
||||
private int sectionNumber;
|
||||
private String text;
|
||||
|
||||
private boolean isTable;
|
||||
private String headline;
|
||||
|
||||
private List<SectionArea> sectionAreas = new ArrayList<>();
|
||||
private Set<Image> images = new HashSet<>();
|
||||
|
||||
private List<TextBlock> textBlocks = new ArrayList<>();
|
||||
private Map<String, CellValue> tabularData = new HashMap<>();
|
||||
private List<Integer> cellStarts = new ArrayList<>();
|
||||
|
||||
|
||||
public SearchableText getSearchableText() {
|
||||
|
||||
SearchableText searchableText = new SearchableText();
|
||||
textBlocks.forEach(block -> {
|
||||
if (block != null) {
|
||||
searchableText.addAll(block.getSequences());
|
||||
}
|
||||
});
|
||||
return searchableText;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,10 +1,10 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import lombok.Getter;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import lombok.Getter;
|
||||
|
||||
public class StringFrequencyCounter {
|
||||
|
||||
@Getter
|
||||
@ -46,4 +46,4 @@ public class StringFrequencyCounter {
|
||||
return mostPopular != null ? mostPopular.getKey() : null;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,17 +1,18 @@
|
||||
package com.iqser.red.service.redaction.v1.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class Text {
|
||||
|
||||
private int numberOfPages;
|
||||
private List<SectionText> sectionTexts = new ArrayList<>();
|
||||
|
||||
}
|
||||
@ -1,16 +1,15 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
@AllArgsConstructor
|
||||
@Builder
|
||||
@Data
|
||||
@ -98,7 +97,6 @@ public class TextBlock extends AbstractTextContainer {
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
@ -139,4 +137,4 @@ public class TextBlock extends AbstractTextContainer {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,12 +1,11 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
public class UnclassifiedText {
|
||||
@ -21,4 +20,4 @@ public class UnclassifiedText {
|
||||
return searchableText;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,21 +1,20 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.utils.PositionUtils;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.FloatFrequencyCounter;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.StringFrequencyCounter;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.utils.PositionUtils;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
@Service
|
||||
@SuppressWarnings("all")
|
||||
|
||||
@ -1,19 +1,17 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.service;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.utils.PositionUtils;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
|
||||
@ -2,7 +2,6 @@ package com.iqser.red.service.redaction.v1.server.classification.utils;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
|
||||
@ -1,16 +1,16 @@
|
||||
package com.iqser.red.service.redaction.v1.server.client;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
import org.springframework.lang.NonNull;
|
||||
import org.springframework.lang.Nullable;
|
||||
import org.springframework.util.Assert;
|
||||
import org.springframework.util.FileCopyUtils;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
public class MockMultipartFile implements MultipartFile {
|
||||
|
||||
private final String name;
|
||||
@ -22,13 +22,13 @@ public class MockMultipartFile implements MultipartFile {
|
||||
|
||||
public MockMultipartFile(String name, @Nullable byte[] content) {
|
||||
|
||||
this(name, "", (String) null, (byte[]) content);
|
||||
this(name, "", null, content);
|
||||
}
|
||||
|
||||
|
||||
public MockMultipartFile(String name, InputStream contentStream) throws IOException {
|
||||
|
||||
this(name, "", (String) null, (byte[]) FileCopyUtils.copyToByteArray(contentStream));
|
||||
this(name, "", null, FileCopyUtils.copyToByteArray(contentStream));
|
||||
}
|
||||
|
||||
|
||||
@ -78,7 +78,7 @@ public class MockMultipartFile implements MultipartFile {
|
||||
|
||||
public long getSize() {
|
||||
|
||||
return (long) this.content.length;
|
||||
return this.content.length;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -1,17 +1,15 @@
|
||||
package com.iqser.red.service.redaction.v1.server.controller;
|
||||
|
||||
import java.time.OffsetDateTime;
|
||||
|
||||
import com.iqser.red.commons.spring.ErrorMessage;
|
||||
import com.iqser.red.service.redaction.v1.server.exception.RulesValidationException;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.http.HttpStatus;
|
||||
import org.springframework.web.bind.annotation.ExceptionHandler;
|
||||
import org.springframework.web.bind.annotation.ResponseBody;
|
||||
import org.springframework.web.bind.annotation.ResponseStatus;
|
||||
import org.springframework.web.bind.annotation.RestControllerAdvice;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.exception.RulesValidationException;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import java.time.OffsetDateTime;
|
||||
|
||||
@Slf4j
|
||||
@RestControllerAdvice
|
||||
@ -38,4 +36,4 @@ public class ControllerAdvice {
|
||||
return new ErrorMessage(OffsetDateTime.now(), e.getMessage());
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,17 +1,7 @@
|
||||
package com.iqser.red.service.redaction.v1.server.controller;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.AnalyzeRequest;
|
||||
import com.iqser.red.service.redaction.v1.model.AnalyzeResult;
|
||||
import com.iqser.red.service.redaction.v1.model.AnnotateRequest;
|
||||
import com.iqser.red.service.redaction.v1.model.AnnotateResponse;
|
||||
import com.iqser.red.service.redaction.v1.model.ReanalyzeResult;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionLog;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionRequest;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionResult;
|
||||
import com.iqser.red.service.redaction.v1.model.RenalyzeRequest;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionGrid;
|
||||
import com.iqser.red.service.redaction.v1.model.Text;
|
||||
import com.iqser.red.service.file.management.v1.api.model.FileType;
|
||||
import com.iqser.red.service.redaction.v1.model.*;
|
||||
import com.iqser.red.service.redaction.v1.resources.RedactionResource;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
||||
@ -19,27 +9,21 @@ import com.iqser.red.service.redaction.v1.server.exception.RedactionException;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.service.AnnotationService;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.service.DictionaryService;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.service.DroolsExecutionService;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.service.EntityRedactionService;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.service.ImageClassificationService;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.service.ReanalyzeService;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.service.RedactionLogCreatorService;
|
||||
import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService;
|
||||
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
||||
import com.iqser.red.service.redaction.v1.server.visualization.service.PdfVisualisationService;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.springframework.web.bind.annotation.PathVariable;
|
||||
import org.springframework.web.bind.annotation.RequestBody;
|
||||
import org.springframework.web.bind.annotation.RestController;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
@Slf4j
|
||||
@RestController
|
||||
@ -47,61 +31,36 @@ import java.util.List;
|
||||
public class RedactionController implements RedactionResource {
|
||||
|
||||
private final PdfVisualisationService pdfVisualisationService;
|
||||
private final PdfSegmentationService pdfSegmentationService;
|
||||
private final RedactionLogCreatorService redactionLogCreatorService;
|
||||
private final EntityRedactionService entityRedactionService;
|
||||
private final DroolsExecutionService droolsExecutionService;
|
||||
private final DictionaryService dictionaryService;
|
||||
private final AnnotationService annotationService;
|
||||
private final ReanalyzeService reanalyzeService;
|
||||
private final ImageClassificationService imageClassificationService;
|
||||
|
||||
private final PdfSegmentationService pdfSegmentationService;
|
||||
private final RedactionStorageService redactionStorageService;
|
||||
|
||||
@Override
|
||||
public AnalyzeResult analyze(@RequestBody AnalyzeRequest analyzeRequest) {
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(analyzeRequest.getDocument()))) {
|
||||
pdDocument.setAllSecurityToBeRemoved(true);
|
||||
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
|
||||
log.info("Document structure analysis successful, starting redaction analysis...");
|
||||
|
||||
imageClassificationService.classifyImages(classifiedDoc);
|
||||
entityRedactionService.processDocument(classifiedDoc, analyzeRequest.getRuleSetId(), analyzeRequest.getManualRedactions());
|
||||
redactionLogCreatorService.createRedactionLog(classifiedDoc, pdDocument.getNumberOfPages(), analyzeRequest.getManualRedactions(), analyzeRequest
|
||||
.getRuleSetId());
|
||||
|
||||
log.info("Redaction analysis successful...");
|
||||
|
||||
return AnalyzeResult.builder()
|
||||
.sectionGrid(classifiedDoc.getSectionGrid())
|
||||
.redactionLog(new RedactionLog(classifiedDoc.getRedactionLogEntities(), classifiedDoc.getDictionaryVersion(), classifiedDoc
|
||||
.getRulesVersion(), analyzeRequest.getRuleSetId()))
|
||||
.numberOfPages(classifiedDoc.getPages().size())
|
||||
.text(new Text(classifiedDoc.getSectionText()))
|
||||
.build();
|
||||
|
||||
} catch (Exception e) {
|
||||
throw new RedactionException(e);
|
||||
}
|
||||
return reanalyzeService.analyze(analyzeRequest);
|
||||
|
||||
}
|
||||
|
||||
|
||||
public ReanalyzeResult reanalyze(@RequestBody RenalyzeRequest renalyzeRequest) {
|
||||
|
||||
return reanalyzeService.reanalyze(renalyzeRequest);
|
||||
@Override
|
||||
public AnalyzeResult reanalyze(@RequestBody AnalyzeRequest analyzeRequest) {
|
||||
return reanalyzeService.reanalyze(analyzeRequest);
|
||||
}
|
||||
|
||||
|
||||
public AnnotateResponse annotate(@RequestBody AnnotateRequest annotateRequest) {
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(annotateRequest.getDocument()))) {
|
||||
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(annotateRequest.getProjectId(), annotateRequest.getFileId(), FileType.ORIGIN));
|
||||
var redactionLog = redactionStorageService.getRedactionLog(annotateRequest.getProjectId(), annotateRequest.getFileId());
|
||||
var sectionsGrid = redactionStorageService.getSectionGrid(annotateRequest.getProjectId(), annotateRequest.getFileId());
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(storedObjectStream)) {
|
||||
|
||||
pdDocument.setAllSecurityToBeRemoved(true);
|
||||
dictionaryService.updateDictionary(annotateRequest.getRedactionLog().getRuleSetId());
|
||||
annotationService.annotate(pdDocument, annotateRequest.getRedactionLog(), annotateRequest.getSectionGrid());
|
||||
dictionaryService.updateDictionary(redactionLog.getRuleSetId());
|
||||
annotationService.annotate(pdDocument, redactionLog, sectionsGrid);
|
||||
|
||||
try (ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream()) {
|
||||
pdDocument.save(byteArrayOutputStream);
|
||||
@ -115,15 +74,16 @@ public class RedactionController implements RedactionResource {
|
||||
|
||||
|
||||
@Override
|
||||
public RedactionResult classify(@RequestBody RedactionRequest pdfSegmentationRequest) {
|
||||
public RedactionResult classify(@RequestBody RedactionRequest redactionRequest) {
|
||||
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN));
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(pdfSegmentationRequest.getDocument()))) {
|
||||
try (PDDocument pdDocument = PDDocument.load(storedObjectStream)) {
|
||||
pdDocument.setAllSecurityToBeRemoved(true);
|
||||
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
pdfVisualisationService.visualizeClassifications(classifiedDoc, pdDocument);
|
||||
|
||||
return convert(pdDocument, classifiedDoc.getPages().size(), pdfSegmentationRequest.getRuleSetId());
|
||||
return convert(pdDocument, classifiedDoc.getPages().size());
|
||||
|
||||
} catch (IOException e) {
|
||||
throw new RedactionException(e);
|
||||
@ -134,14 +94,15 @@ public class RedactionController implements RedactionResource {
|
||||
|
||||
@Override
|
||||
public RedactionResult sections(@RequestBody RedactionRequest redactionRequest) {
|
||||
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN));
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(redactionRequest.getDocument()))) {
|
||||
try (PDDocument pdDocument = PDDocument.load(storedObjectStream)) {
|
||||
pdDocument.setAllSecurityToBeRemoved(true);
|
||||
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
pdfVisualisationService.visualizeParagraphs(classifiedDoc, pdDocument);
|
||||
|
||||
return convert(pdDocument, classifiedDoc.getPages().size(), redactionRequest.getRuleSetId());
|
||||
return convert(pdDocument, classifiedDoc.getPages().size());
|
||||
|
||||
} catch (IOException e) {
|
||||
throw new RedactionException(e);
|
||||
@ -153,27 +114,29 @@ public class RedactionController implements RedactionResource {
|
||||
@Override
|
||||
public RedactionResult htmlTables(@RequestBody RedactionRequest redactionRequest) {
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(redactionRequest.getDocument()))) {
|
||||
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN));
|
||||
|
||||
Document classifiedDoc;
|
||||
try (PDDocument pdDocument = PDDocument.load(storedObjectStream)) {
|
||||
pdDocument.setAllSecurityToBeRemoved(true);
|
||||
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (Page page : classifiedDoc.getPages()) {
|
||||
for (AbstractTextContainer textContainer : page.getTextBlocks()) {
|
||||
if (textContainer instanceof Table) {
|
||||
Table table = (Table) textContainer;
|
||||
sb.append(table.getTextAsHtml()).append("<br />").append("<br />");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return RedactionResult.builder().document(sb.toString().getBytes()).build();
|
||||
|
||||
} catch (IOException e) {
|
||||
classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
} catch (Exception e) {
|
||||
throw new RedactionException(e);
|
||||
}
|
||||
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (Page page : classifiedDoc.getPages()) {
|
||||
for (AbstractTextContainer textContainer : page.getTextBlocks()) {
|
||||
if (textContainer instanceof Table) {
|
||||
Table table = (Table) textContainer;
|
||||
sb.append(table.getTextAsHtml()).append("<br />").append("<br />");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return RedactionResult.builder().document(sb.toString().getBytes()).build();
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -191,23 +154,13 @@ public class RedactionController implements RedactionResource {
|
||||
}
|
||||
|
||||
|
||||
private RedactionResult convert(PDDocument document, int numberOfPages, String ruleSetId) throws IOException {
|
||||
|
||||
return convert(document, numberOfPages, null, null, 0, 0, ruleSetId);
|
||||
}
|
||||
|
||||
|
||||
private RedactionResult convert(PDDocument document, int numberOfPages,
|
||||
List<RedactionLogEntry> redactionLogEntities, SectionGrid sectionGrid,
|
||||
long dictionaryVersion, long rulesVersion, String ruleSetId) throws IOException {
|
||||
private RedactionResult convert(PDDocument document, int numberOfPages) throws IOException {
|
||||
|
||||
try (ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream()) {
|
||||
document.save(byteArrayOutputStream);
|
||||
return RedactionResult.builder()
|
||||
.document(byteArrayOutputStream.toByteArray())
|
||||
.numberOfPages(numberOfPages)
|
||||
.redactionLog(new RedactionLog(redactionLogEntities, dictionaryVersion, rulesVersion, ruleSetId))
|
||||
.sectionGrid(sectionGrid)
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
@ -1,17 +1,15 @@
|
||||
package com.iqser.red.service.redaction.v1.server.parsing;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import org.apache.pdfbox.text.PDFTextStripperByArea;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.pdfbox.text.PDFTextStripperByArea;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
|
||||
public class PDFAreaTextStripper extends PDFTextStripperByArea {
|
||||
|
||||
@Getter
|
||||
@ -76,7 +74,7 @@ public class PDFAreaTextStripper extends PDFTextStripperByArea {
|
||||
}
|
||||
|
||||
|
||||
public void clearPositions(){
|
||||
public void clearPositions() {
|
||||
textPositionSequences = new ArrayList<>();
|
||||
}
|
||||
|
||||
|
||||
@ -1,33 +1,15 @@
|
||||
package com.iqser.red.service.redaction.v1.server.parsing;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.pdfbox.contentstream.operator.Operator;
|
||||
import org.apache.pdfbox.contentstream.operator.OperatorName;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColor;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorN;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorSpace;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceCMYKColor;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceGrayColor;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceRGBColor;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetStrokingColor;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorN;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorSpace;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceCMYKColor;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceGrayColor;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceRGBColor;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetFlatness;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetLineCapStyle;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetLineDashPattern;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetLineJoinStyle;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetLineMiterLimit;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetLineWidth;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetRenderingIntent;
|
||||
import org.apache.pdfbox.contentstream.operator.color.*;
|
||||
import org.apache.pdfbox.contentstream.operator.state.*;
|
||||
import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize;
|
||||
import org.apache.pdfbox.cos.COSBase;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
@ -40,40 +22,31 @@ import org.apache.pdfbox.text.PDFTextStripper;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
@Slf4j
|
||||
public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
|
||||
@Setter
|
||||
protected PDPage pdpage;
|
||||
|
||||
@Getter
|
||||
private int minCharWidth;
|
||||
|
||||
@Getter
|
||||
private int maxCharWidth;
|
||||
|
||||
@Getter
|
||||
private int minCharHeight;
|
||||
|
||||
@Getter
|
||||
private int maxCharHeight;
|
||||
|
||||
@Getter
|
||||
private final List<TextPositionSequence> textPositionSequences = new ArrayList<>();
|
||||
|
||||
@Getter
|
||||
private final List<Ruling> rulings = new ArrayList<>();
|
||||
|
||||
private final List<Ruling> graphicsPath = new ArrayList<>();
|
||||
|
||||
@Setter
|
||||
protected PDPage pdpage;
|
||||
@Getter
|
||||
private int minCharWidth;
|
||||
@Getter
|
||||
private int maxCharWidth;
|
||||
@Getter
|
||||
private int minCharHeight;
|
||||
@Getter
|
||||
private int maxCharHeight;
|
||||
@Getter
|
||||
private List<PdfImage> images = new ArrayList<>();
|
||||
|
||||
@ -369,4 +342,4 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,23 +1,20 @@
|
||||
package com.iqser.red.service.redaction.v1.server.parsing.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.Point;
|
||||
import com.iqser.red.service.redaction.v1.model.Rectangle;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
@Data
|
||||
@RequiredArgsConstructor
|
||||
public class TextPositionSequence implements CharSequence {
|
||||
|
||||
private List<TextPosition> textPositions = new ArrayList<>();
|
||||
|
||||
private final int page;
|
||||
private List<TextPosition> textPositions = new ArrayList<>();
|
||||
|
||||
|
||||
public TextPositionSequence(List<TextPosition> textPositions, int page) {
|
||||
@ -223,4 +220,4 @@ public class TextPositionSequence implements CharSequence {
|
||||
return new Rectangle(new Point(posXInit, posYInit), posXEnd - posXInit, posYEnd - posYInit + height, page);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,14 +1,13 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
||||
|
||||
import lombok.Value;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
@Value
|
||||
public class CellValue {
|
||||
|
||||
@ -47,4 +46,4 @@ public class CellValue {
|
||||
.replaceAll(" {2}", " ");
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.Getter;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.Getter;
|
||||
|
||||
@Data
|
||||
public class Dictionary {
|
||||
|
||||
@ -21,15 +21,15 @@ public class Dictionary {
|
||||
private long version;
|
||||
|
||||
|
||||
public Dictionary(List<DictionaryModel> dictionaryModels, long dictionaryVersion){
|
||||
public Dictionary(List<DictionaryModel> dictionaryModels, long dictionaryVersion) {
|
||||
this.dictionaryModels = dictionaryModels;
|
||||
this.dictionaryModels.forEach(dm -> localAccessMap.put(dm.getType(), dm));
|
||||
this.version = dictionaryVersion;
|
||||
}
|
||||
|
||||
|
||||
public int getDictionaryRank(String type){
|
||||
if(!localAccessMap.containsKey(type)){
|
||||
public int getDictionaryRank(String type) {
|
||||
if (!localAccessMap.containsKey(type)) {
|
||||
return 0;
|
||||
}
|
||||
return localAccessMap.get(type).getRank();
|
||||
@ -60,7 +60,7 @@ public class Dictionary {
|
||||
|
||||
public boolean containsValue(String type, String value) {
|
||||
|
||||
if (localAccessMap.containsKey(type) && localAccessMap.get(type)
|
||||
return localAccessMap.containsKey(type) && localAccessMap.get(type)
|
||||
.getEntries()
|
||||
.contains(value) || localAccessMap.containsKey(type) && localAccessMap.get(type)
|
||||
.getLocalEntries()
|
||||
@ -68,10 +68,7 @@ public class Dictionary {
|
||||
.getEntries()
|
||||
.contains(value) || localAccessMap.containsKey(RECOMMENDATION_PREFIX + type) && localAccessMap.get(RECOMMENDATION_PREFIX + type)
|
||||
.getLocalEntries()
|
||||
.contains(value)) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
.contains(value);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -1,10 +1,10 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
import java.util.Set;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
|
||||
import java.util.Set;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
public class DictionaryIncrement {
|
||||
|
||||
@ -1,15 +1,14 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
|
||||
import com.iqser.red.service.configuration.v1.api.model.DictionaryEntry;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.iqser.red.service.configuration.v1.api.model.DictionaryEntry;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
public class DictionaryModel implements Serializable {
|
||||
@ -23,8 +22,8 @@ public class DictionaryModel implements Serializable {
|
||||
private Set<DictionaryEntry> entries;
|
||||
private Set<String> localEntries;
|
||||
|
||||
public Set<String> getValues(boolean local){
|
||||
return local ? localEntries : entries.stream().filter(e -> !e.isDeleted()).map(e-> e.getValue()).collect(Collectors
|
||||
public Set<String> getValues(boolean local) {
|
||||
return local ? localEntries : entries.stream().filter(e -> !e.isDeleted()).map(e -> e.getValue()).collect(Collectors
|
||||
.toSet());
|
||||
}
|
||||
|
||||
|
||||
@ -20,5 +20,4 @@ public class DictionaryRepresentation {
|
||||
private Map<String, DictionaryModel> localAccessMap = new HashMap<>();
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
@ -1,13 +1,12 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
|
||||
@Data
|
||||
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
|
||||
public class Entity {
|
||||
|
||||
@ -1,24 +1,23 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
@Data
|
||||
@RequiredArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@EqualsAndHashCode
|
||||
public class EntityPositionSequence {
|
||||
|
||||
private final String id;
|
||||
@EqualsAndHashCode.Exclude
|
||||
private List<TextPositionSequence> sequences = new ArrayList<>();
|
||||
private int pageNumber;
|
||||
private final String id;
|
||||
|
||||
}
|
||||
|
||||
@ -1,12 +1,12 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
|
||||
@ -1,14 +1,14 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.awt.image.BufferedImage;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.NonNull;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.awt.image.BufferedImage;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@ -25,4 +25,4 @@ public class PdfImage {
|
||||
@NonNull
|
||||
private int page;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,37 +0,0 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
public class ReanalysisSection {
|
||||
|
||||
private int sectionNumber;
|
||||
private String headline;
|
||||
private List<TextBlock> textBlocks;
|
||||
private Map<String, CellValue> tabularData = new HashMap<>();
|
||||
private List<Integer> cellStarts;
|
||||
private Set<Image> images = new HashSet<>();
|
||||
|
||||
|
||||
public SearchableText getSearchableText() {
|
||||
|
||||
SearchableText searchableText = new SearchableText();
|
||||
textBlocks.forEach(block -> {
|
||||
if (block instanceof TextBlock) {
|
||||
searchableText.addAll(block.getSequences());
|
||||
}
|
||||
});
|
||||
return searchableText;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,14 +1,14 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
||||
|
||||
public class SearchableText {
|
||||
|
||||
private final List<TextPositionSequence> sequences = new ArrayList<>();
|
||||
@ -232,4 +232,4 @@ public class SearchableText {
|
||||
return sb.append("\n").toString();
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,6 +1,12 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
import static com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary.RECOMMENDATION_PREFIX;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.Patterns;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
@ -11,15 +17,7 @@ import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.Patterns;
|
||||
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import static com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary.RECOMMENDATION_PREFIX;
|
||||
|
||||
@Data
|
||||
@Slf4j
|
||||
|
||||
@ -0,0 +1,41 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.service;
|
||||
|
||||
import com.iqser.red.service.file.management.v1.api.model.RedactionChangeLog;
|
||||
import com.iqser.red.service.redaction.v1.model.AnalyzeResult;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionLog;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
@Service
|
||||
public class AnalyzeResponseService {
|
||||
|
||||
public AnalyzeResult createAnalyzeResponse(int pageCount, RedactionLog redactionLog, RedactionChangeLog redactionChangeLog) {
|
||||
boolean hasHints = redactionLog.getRedactionLogEntry().stream().anyMatch(RedactionLogEntry::isHint);
|
||||
|
||||
boolean hasRequests = redactionLog.getRedactionLogEntry()
|
||||
.stream()
|
||||
.anyMatch(entry -> entry.isManual() && entry.getStatus()
|
||||
.equals(com.iqser.red.service.redaction.v1.model.Status.REQUESTED));
|
||||
|
||||
boolean hasRedactions = redactionLog.getRedactionLogEntry()
|
||||
.stream()
|
||||
.anyMatch(entry -> entry.isRedacted() && !entry.isManual() || entry.isManual() && entry.getStatus()
|
||||
.equals(com.iqser.red.service.redaction.v1.model.Status.APPROVED));
|
||||
|
||||
boolean hasImages = redactionLog.getRedactionLogEntry()
|
||||
.stream()
|
||||
.anyMatch(entry -> entry.isHint() && entry.getType().equals("image"));
|
||||
|
||||
boolean hasUpdates = redactionChangeLog != null && redactionChangeLog.getRedactionLogEntry() != null && !redactionChangeLog
|
||||
.getRedactionLogEntry()
|
||||
.isEmpty() && redactionChangeLog.getRedactionLogEntry().stream().anyMatch(entry -> !entry.getType().equals("false_positive"));
|
||||
|
||||
return AnalyzeResult.builder()
|
||||
.numberOfPages(pageCount)
|
||||
.hasHints(hasHints)
|
||||
.hasRedactions(hasRedactions)
|
||||
.hasRequests(hasRequests)
|
||||
.hasImages(hasImages)
|
||||
.hasUpdates(hasUpdates).build();
|
||||
}
|
||||
}
|
||||
@ -1,14 +1,7 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.service;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.GregorianCalendar;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.*;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDPageContentStream;
|
||||
@ -21,15 +14,14 @@ import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationText;
|
||||
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationTextMarkup;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.CellRectangle;
|
||||
import com.iqser.red.service.redaction.v1.model.Comment;
|
||||
import com.iqser.red.service.redaction.v1.model.Rectangle;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionLog;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionGrid;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionRectangle;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import java.awt.Color;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.GregorianCalendar;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
|
||||
@ -1,20 +1,5 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.service;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
import org.apache.commons.lang3.SerializationUtils;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.configuration.v1.api.model.Colors;
|
||||
import com.iqser.red.service.configuration.v1.api.model.DictionaryEntry;
|
||||
import com.iqser.red.service.configuration.v1.api.model.TypeResponse;
|
||||
@ -25,10 +10,16 @@ import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncre
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncrementValue;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryModel;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryRepresentation;
|
||||
|
||||
import feign.FeignException;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
import org.apache.commons.lang3.SerializationUtils;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@ -37,7 +28,7 @@ public class DictionaryService {
|
||||
|
||||
private final DictionaryClient dictionaryClient;
|
||||
|
||||
private Map<String, DictionaryRepresentation> dictionariesByRuleSets = new HashMap<>();
|
||||
private final Map<String, DictionaryRepresentation> dictionariesByRuleSets = new HashMap<>();
|
||||
|
||||
|
||||
public long updateDictionary(String ruleSetId) {
|
||||
@ -212,4 +203,4 @@ public class DictionaryService {
|
||||
return dictionariesByRuleSets.get(ruleSetId).getRequestAddColor();
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,11 +1,10 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.service;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.InputStream;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import com.iqser.red.service.configuration.v1.api.model.RulesResponse;
|
||||
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
|
||||
import com.iqser.red.service.redaction.v1.server.exception.RulesValidationException;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Section;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.kie.api.KieServices;
|
||||
import org.kie.api.builder.KieBuilder;
|
||||
@ -15,12 +14,11 @@ import org.kie.api.runtime.KieContainer;
|
||||
import org.kie.api.runtime.KieSession;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.configuration.v1.api.model.RulesResponse;
|
||||
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
|
||||
import com.iqser.red.service.redaction.v1.server.exception.RulesValidationException;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Section;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.InputStream;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@ -28,9 +26,9 @@ public class DroolsExecutionService {
|
||||
|
||||
private final RulesClient rulesClient;
|
||||
|
||||
private Map<String, KieContainer> kieContainers = new HashMap<>();
|
||||
private final Map<String, KieContainer> kieContainers = new HashMap<>();
|
||||
|
||||
private Map<String, Long> rulesVersionPerRuleSetId = new HashMap<>();
|
||||
private final Map<String, Long> rulesVersionPerRuleSetId = new HashMap<>();
|
||||
|
||||
|
||||
public KieContainer getKieContainer(String ruleSetId) {
|
||||
@ -133,4 +131,4 @@ public class DroolsExecutionService {
|
||||
return rulesVersion.longValue();
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,50 +1,27 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.kie.api.runtime.KieContainer;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry;
|
||||
import com.iqser.red.service.redaction.v1.model.ManualRedactions;
|
||||
import com.iqser.red.service.redaction.v1.model.Point;
|
||||
import com.iqser.red.service.redaction.v1.model.Rectangle;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionArea;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionText;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Footer;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Header;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.UnclassifiedText;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.*;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryModel;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.ImageType;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Section;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SectionSearchableTextPair;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.*;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.kie.api.runtime.KieContainer;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
|
||||
@ -1,21 +1,18 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.service;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.client.ImageClassificationClient;
|
||||
import com.iqser.red.service.redaction.v1.server.client.ImageClassificationResponse;
|
||||
import com.iqser.red.service.redaction.v1.server.client.MockMultipartFile;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.ImageType;
|
||||
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
|
||||
@ -1,53 +1,29 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.service;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.iqser.red.service.file.management.v1.api.model.FileType;
|
||||
import com.iqser.red.service.redaction.v1.model.*;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.SectionText;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Text;
|
||||
import com.iqser.red.service.redaction.v1.server.exception.RedactionException;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.*;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
|
||||
import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService;
|
||||
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.kie.api.runtime.KieContainer;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.web.bind.annotation.RequestBody;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.Comment;
|
||||
import com.iqser.red.service.redaction.v1.model.IdRemoval;
|
||||
import com.iqser.red.service.redaction.v1.model.ManualForceRedact;
|
||||
import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry;
|
||||
import com.iqser.red.service.redaction.v1.model.ManualRedactions;
|
||||
import com.iqser.red.service.redaction.v1.model.ReanalyzeResult;
|
||||
import com.iqser.red.service.redaction.v1.model.Rectangle;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
|
||||
import com.iqser.red.service.redaction.v1.model.RenalyzeRequest;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionArea;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionText;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.exception.RedactionException;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.PDFAreaTextStripper;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncrement;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.ReanalysisSection;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Section;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SectionSearchableTextPair;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class ReanalyzeService {
|
||||
@ -57,13 +33,63 @@ public class ReanalyzeService {
|
||||
private final SurroundingWordsService surroundingWordsService;
|
||||
private final EntityRedactionService entityRedactionService;
|
||||
private final RedactionLogCreatorService redactionLogCreatorService;
|
||||
private final RedactionStorageService redactionStorageService;
|
||||
private final PdfSegmentationService pdfSegmentationService;
|
||||
private final ImageClassificationService imageClassificationService;
|
||||
private final RedactionChangeLogService redactionChangeLogService;
|
||||
private final AnalyzeResponseService analyzeResponseService;
|
||||
|
||||
public AnalyzeResult analyze(AnalyzeRequest analyzeRequest) {
|
||||
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.ORIGIN));
|
||||
|
||||
var pageCount = 0;
|
||||
Document classifiedDoc;
|
||||
try (PDDocument pdDocument = PDDocument.load(storedObjectStream)) {
|
||||
pdDocument.setAllSecurityToBeRemoved(true);
|
||||
pageCount = pdDocument.getNumberOfPages();
|
||||
classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
} catch (Exception e) {
|
||||
throw new RedactionException(e);
|
||||
}
|
||||
log.info("Document structure analysis successful, starting redaction analysis...");
|
||||
|
||||
imageClassificationService.classifyImages(classifiedDoc);
|
||||
entityRedactionService.processDocument(classifiedDoc, analyzeRequest.getRuleSetId(), analyzeRequest.getManualRedactions());
|
||||
imageClassificationService.classifyImages(classifiedDoc);
|
||||
redactionLogCreatorService.createRedactionLog(classifiedDoc, pageCount, analyzeRequest.getManualRedactions(), analyzeRequest
|
||||
.getRuleSetId());
|
||||
|
||||
log.info("Redaction analysis successful...");
|
||||
|
||||
|
||||
public ReanalyzeResult reanalyze(@RequestBody RenalyzeRequest renalyzeRequest) {
|
||||
var redactionLog = new RedactionLog(classifiedDoc.getRedactionLogEntities(), classifiedDoc.getDictionaryVersion(), classifiedDoc
|
||||
.getRulesVersion(), analyzeRequest.getRuleSetId());
|
||||
|
||||
DictionaryIncrement dictionaryIncrement = dictionaryService.getDictionaryIncrements(renalyzeRequest.getRuleSetId(), renalyzeRequest
|
||||
.getRedactionLog()
|
||||
.getDictionaryVersion());
|
||||
// first create changelog - this only happens when we migrate files analyzed via the old process and we don't want to loose changeLog data
|
||||
var changeLog = redactionChangeLogService.createAndStoreChangeLog(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), redactionLog);
|
||||
// store redactionLog
|
||||
redactionStorageService.storeObject(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog);
|
||||
redactionStorageService.storeObject(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.TEXT, new Text(pageCount, classifiedDoc.getSectionText()));
|
||||
redactionStorageService.storeObject(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.SECTION_GRID, classifiedDoc.getSectionGrid());
|
||||
|
||||
return analyzeResponseService.createAnalyzeResponse(pageCount, redactionLog, changeLog);
|
||||
}
|
||||
|
||||
public AnalyzeResult reanalyze(@RequestBody AnalyzeRequest renalyzeRequest) {
|
||||
var text = redactionStorageService.getText(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId());
|
||||
// new procedure was not applied, we need a complete analysis
|
||||
if (text.getNumberOfPages() == 0) {
|
||||
return analyze(AnalyzeRequest.builder()
|
||||
.ruleSetId(renalyzeRequest.getRuleSetId())
|
||||
.manualRedactions(renalyzeRequest.getManualRedactions())
|
||||
.projectId(renalyzeRequest.getProjectId())
|
||||
.fileId(renalyzeRequest.getFileId())
|
||||
.build());
|
||||
}
|
||||
var redactionLog = redactionStorageService.getRedactionLog(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId());
|
||||
|
||||
|
||||
DictionaryIncrement dictionaryIncrement = dictionaryService.getDictionaryIncrements(renalyzeRequest.getRuleSetId(), redactionLog.getDictionaryVersion());
|
||||
|
||||
Set<String> manualForceAndRemoveIds = getForceAndRemoveIds(renalyzeRequest.getManualRedactions());
|
||||
Map<String, List<Comment>> comments = null;
|
||||
@ -75,21 +101,21 @@ public class ReanalyzeService {
|
||||
manualAdds = renalyzeRequest.getManualRedactions().getEntriesToAdd();
|
||||
}
|
||||
|
||||
Set<Integer> sectionsToReanaylse = new HashSet<>();
|
||||
Set<Integer> sectionsToReanalyse = new HashSet<>();
|
||||
Map<Integer, Set<Image>> imageEntries = new HashMap<>();
|
||||
for (RedactionLogEntry entry : renalyzeRequest.getRedactionLog().getRedactionLogEntry()) {
|
||||
for (RedactionLogEntry entry : redactionLog.getRedactionLogEntry()) {
|
||||
if (entry.isManual() || manualForceAndRemoveIds.contains(entry.getId())) {
|
||||
sectionsToReanaylse.add(entry.getSectionNumber());
|
||||
sectionsToReanalyse.add(entry.getSectionNumber());
|
||||
}
|
||||
if (entry.isImage() || entry.getType().equals("image")) {
|
||||
imageEntries.computeIfAbsent(entry.getSectionNumber(), x -> new HashSet<>()).add(convert(entry));
|
||||
}
|
||||
}
|
||||
|
||||
for (SectionText sectionText : renalyzeRequest.getText().getSectionTexts()) {
|
||||
for (SectionText sectionText : text.getSectionTexts()) {
|
||||
|
||||
if (EntitySearchUtils.sectionContainsAny(sectionText.getText(), dictionaryIncrement.getValues())) {
|
||||
sectionsToReanaylse.add(sectionText.getSectionNumber());
|
||||
sectionsToReanalyse.add(sectionText.getSectionNumber());
|
||||
}
|
||||
|
||||
if (manualAdds != null) {
|
||||
@ -106,97 +132,30 @@ public class ReanalyzeService {
|
||||
}
|
||||
}
|
||||
|
||||
if (sectionsToReanaylse.isEmpty() && (manualAdds == null || manualAdds.isEmpty())) {
|
||||
renalyzeRequest.getRedactionLog().setDictionaryVersion(dictionaryIncrement.getDictionaryVersion());
|
||||
return ReanalyzeResult.builder().redactionLog(renalyzeRequest.getRedactionLog()).build();
|
||||
if (sectionsToReanalyse.isEmpty() && (manualAdds == null || manualAdds.isEmpty())) {
|
||||
redactionLog.setDictionaryVersion(dictionaryIncrement.getDictionaryVersion());
|
||||
var changeLog = redactionChangeLogService.createAndStoreChangeLog(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), redactionLog);
|
||||
redactionStorageService.storeObject(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog);
|
||||
return analyzeResponseService.createAnalyzeResponse(text.getNumberOfPages(), redactionLog, changeLog);
|
||||
}
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(renalyzeRequest.getDocument()))) {
|
||||
try {
|
||||
|
||||
List<ReanalysisSection> reanalysisSections = new ArrayList<>();
|
||||
for (SectionText sectionText : renalyzeRequest.getText().getSectionTexts()) {
|
||||
List<SectionText> reanalysisSections = new ArrayList<>();
|
||||
for (SectionText sectionText : text.getSectionTexts()) {
|
||||
|
||||
if (!sectionsToReanaylse.contains(sectionText.getSectionNumber())) {
|
||||
continue;
|
||||
if (sectionsToReanalyse.contains(sectionText.getSectionNumber())) {
|
||||
reanalysisSections.add(sectionText);
|
||||
}
|
||||
|
||||
ReanalysisSection reanalysisSection = new ReanalysisSection();
|
||||
reanalysisSection.setHeadline(sectionText.getHeadline());
|
||||
reanalysisSection.setSectionNumber(sectionText.getSectionNumber());
|
||||
List<TextBlock> textBlocks = new ArrayList<>();
|
||||
|
||||
Map<Integer, List<SectionArea>> sectionAreasPerPage = new HashMap<>();
|
||||
for (SectionArea sectionArea : sectionText.getSectionAreas()) {
|
||||
sectionAreasPerPage.computeIfAbsent(sectionArea.getPage(), (x) -> new ArrayList<>())
|
||||
.add(sectionArea);
|
||||
}
|
||||
|
||||
Map<String, CellValue> tabularData = new HashMap<>();
|
||||
List<Integer> cellStarts = new ArrayList<>();
|
||||
for (Integer page : sectionAreasPerPage.keySet()) {
|
||||
List<SectionArea> areasOnPage = sectionAreasPerPage.get(page);
|
||||
|
||||
PDPage pdPage = pdDocument.getPage(page - 1);
|
||||
PDRectangle cropBox = pdPage.getCropBox();
|
||||
PDFAreaTextStripper textStripper = new PDFAreaTextStripper();
|
||||
textStripper.setPageNumber(page);
|
||||
|
||||
int cellStart = 0;
|
||||
for (SectionArea sectionArea : areasOnPage) {
|
||||
|
||||
Rectangle2D rect = null;
|
||||
if (pdPage.getRotation() == 90) {
|
||||
rect = new Rectangle2D.Float(sectionArea.getTopLeft().getY(), sectionArea.getTopLeft()
|
||||
.getX(), sectionArea.getHeight(), sectionArea.getWidth() + 0.001f);
|
||||
} else {
|
||||
rect = new Rectangle2D.Float(sectionArea.getTopLeft().getX(), -sectionArea.getTopLeft()
|
||||
.getY() + cropBox.getUpperRightY() - sectionArea.getHeight(), sectionArea.getWidth(), sectionArea
|
||||
.getHeight() + 0.001f);
|
||||
}
|
||||
|
||||
textStripper.addRegion(String.valueOf(1), rect);
|
||||
textStripper.extractRegions(pdPage);
|
||||
textStripper.getTextForRegion(String.valueOf(1));
|
||||
List<TextPositionSequence> positions = textStripper.getTextPositionSequences();
|
||||
|
||||
TextBlock textBlock = new TextBlock(sectionArea.getTopLeft().getX(), sectionArea.getTopLeft()
|
||||
.getX() + sectionArea.getWidth(), sectionArea.getTopLeft()
|
||||
.getY(), sectionArea.getTopLeft().getY() + sectionArea.getHeight(), positions, 0);
|
||||
|
||||
if (sectionText.isTable()) {
|
||||
Cell cell = new Cell();
|
||||
cell.addTextBlock(textBlock);
|
||||
tabularData.put(sectionArea.getHeader(), new CellValue(cell.getTextBlocks(), cellStart));
|
||||
cellStarts.add(cellStart);
|
||||
cellStart = cellStart + cell.toString().trim().length() + 1;
|
||||
}
|
||||
|
||||
textBlocks.add(textBlock);
|
||||
textStripper.clearPositions();
|
||||
}
|
||||
|
||||
}
|
||||
reanalysisSection.setTextBlocks(textBlocks);
|
||||
reanalysisSection.setTabularData(tabularData);
|
||||
|
||||
if (sectionText.isTable()) {
|
||||
reanalysisSection.setCellStarts(cellStarts);
|
||||
}
|
||||
if (imageEntries.containsKey(sectionText.getSectionNumber())) {
|
||||
reanalysisSection.getImages().addAll(imageEntries.get(sectionText.getSectionNumber()));
|
||||
}
|
||||
|
||||
reanalysisSections.add(reanalysisSection);
|
||||
}
|
||||
|
||||
//--
|
||||
|
||||
KieContainer kieContainer = droolsExecutionService.updateRules(renalyzeRequest.getRuleSetId());
|
||||
|
||||
Dictionary dictionary = dictionaryService.getDeepCopyDictionary(renalyzeRequest.getRuleSetId());
|
||||
|
||||
List<SectionSearchableTextPair> sectionSearchableTextPairs = new ArrayList<>();
|
||||
for (ReanalysisSection reanalysisSection : reanalysisSections) {
|
||||
for (SectionText reanalysisSection : reanalysisSections) {
|
||||
|
||||
Set<Entity> entities = entityRedactionService.findEntities(reanalysisSection.getSearchableText(), reanalysisSection
|
||||
.getHeadline(), reanalysisSection.getSectionNumber(), dictionary, false);
|
||||
@ -254,7 +213,7 @@ public class ReanalyzeService {
|
||||
}
|
||||
|
||||
List<RedactionLogEntry> newRedactionLogEntries = new ArrayList<>();
|
||||
for (int page = 1; page <= pdDocument.getNumberOfPages(); page++) {
|
||||
for (int page = 1; page <= text.getNumberOfPages(); page++) {
|
||||
if (entitiesPerPage.get(page) != null) {
|
||||
newRedactionLogEntries.addAll(redactionLogCreatorService.addEntries(entitiesPerPage, renalyzeRequest
|
||||
.getManualRedactions(), page, renalyzeRequest.getRuleSetId()));
|
||||
@ -269,19 +228,14 @@ public class ReanalyzeService {
|
||||
.getRuleSetId()));
|
||||
}
|
||||
|
||||
Iterator<RedactionLogEntry> itty = renalyzeRequest.getRedactionLog().getRedactionLogEntry().iterator();
|
||||
while (itty.hasNext()) {
|
||||
RedactionLogEntry entry = itty.next();
|
||||
if (sectionsToReanaylse.contains(entry.getSectionNumber())) {
|
||||
itty.remove();
|
||||
}
|
||||
}
|
||||
redactionLog.getRedactionLogEntry().removeIf(entry -> sectionsToReanalyse.contains(entry.getSectionNumber()) && !entry.isImage() || entry.getSectionNumber() == 0 && !entry.isImage());
|
||||
redactionLog.getRedactionLogEntry().addAll(newRedactionLogEntries);
|
||||
redactionLog.setDictionaryVersion(dictionaryIncrement.getDictionaryVersion());
|
||||
|
||||
renalyzeRequest.getRedactionLog().getRedactionLogEntry().addAll(newRedactionLogEntries);
|
||||
var changeLog = redactionChangeLogService.createAndStoreChangeLog(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), redactionLog);
|
||||
redactionStorageService.storeObject(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog);
|
||||
return analyzeResponseService.createAnalyzeResponse(text.getNumberOfPages(), redactionLog, changeLog);
|
||||
|
||||
renalyzeRequest.getRedactionLog().setDictionaryVersion(dictionaryIncrement.getDictionaryVersion());
|
||||
|
||||
return ReanalyzeResult.builder().redactionLog(renalyzeRequest.getRedactionLog()).build();
|
||||
|
||||
} catch (Exception e) {
|
||||
throw new RedactionException(e);
|
||||
|
||||
@ -0,0 +1,94 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.service;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.iqser.red.service.file.management.v1.api.model.ChangeType;
|
||||
import com.iqser.red.service.file.management.v1.api.model.FileType;
|
||||
import com.iqser.red.service.file.management.v1.api.model.RedactionChangeLog;
|
||||
import com.iqser.red.service.file.management.v1.api.model.RedactionChangeLogEntry;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionLog;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
|
||||
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class RedactionChangeLogService {
|
||||
|
||||
private final RedactionStorageService storageStorageService;
|
||||
private final ObjectMapper objectMapper;
|
||||
|
||||
public RedactionChangeLog createAndStoreChangeLog(String projectId, String fileId, RedactionLog currentRedactionLog) {
|
||||
|
||||
try {
|
||||
RedactionLog previousRedactionLog = storageStorageService.getRedactionLog(projectId, fileId);
|
||||
var changeLog = createChangeLog(currentRedactionLog, previousRedactionLog);
|
||||
storageStorageService.storeObject(projectId, fileId, FileType.REDACTION_CHANGELOG, objectMapper.writeValueAsBytes(changeLog));
|
||||
return changeLog;
|
||||
} catch (Exception e) {
|
||||
log.debug("Previous redaction log not available");
|
||||
return null;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private RedactionChangeLog createChangeLog(RedactionLog currentRedactionLog, RedactionLog previousRedactionLog) {
|
||||
|
||||
|
||||
if (previousRedactionLog == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
List<RedactionLogEntry> added = new ArrayList<>(currentRedactionLog.getRedactionLogEntry());
|
||||
added.removeAll(previousRedactionLog.getRedactionLogEntry());
|
||||
|
||||
List<RedactionLogEntry> removed = new ArrayList<>(previousRedactionLog.getRedactionLogEntry());
|
||||
removed.removeAll(currentRedactionLog.getRedactionLogEntry());
|
||||
|
||||
List<RedactionChangeLogEntry> changeLogEntries = added.stream()
|
||||
.map(entry -> convert(entry, ChangeType.ADDED))
|
||||
.collect(Collectors.toList());
|
||||
changeLogEntries.addAll(removed.stream()
|
||||
.map(entry -> convert(entry, ChangeType.REMOVED))
|
||||
.collect(Collectors.toList()));
|
||||
|
||||
return new RedactionChangeLog(changeLogEntries, currentRedactionLog.getDictionaryVersion(), currentRedactionLog.getRulesVersion(), currentRedactionLog
|
||||
.getRuleSetId(), currentRedactionLog.getFilename());
|
||||
}
|
||||
|
||||
|
||||
private RedactionChangeLogEntry convert(RedactionLogEntry entry, ChangeType changeType) {
|
||||
|
||||
return RedactionChangeLogEntry.builder()
|
||||
.id(entry.getId())
|
||||
.type(entry.getType())
|
||||
.value(entry.getValue())
|
||||
.reason(entry.getReason())
|
||||
.matchedRule(entry.getMatchedRule())
|
||||
.legalBasis(entry.getLegalBasis())
|
||||
.redacted(entry.isRedacted())
|
||||
.isHint(entry.isHint())
|
||||
.isRecommendation(entry.isRecommendation())
|
||||
.section(entry.getSection())
|
||||
.color(entry.getColor())
|
||||
.positions(entry.getPositions())
|
||||
.sectionNumber(entry.getSectionNumber())
|
||||
.manual(entry.isManual())
|
||||
.status(entry.getStatus())
|
||||
.manualRedactionType(entry.getManualRedactionType())
|
||||
.isDictionaryEntry(entry.isDictionaryEntry())
|
||||
.textBefore(entry.getTextBefore())
|
||||
.textAfter(entry.getTextAfter())
|
||||
.comments(entry.getComments())
|
||||
.changeType(changeType)
|
||||
.build();
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,28 +1,6 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.CellRectangle;
|
||||
import com.iqser.red.service.redaction.v1.model.Comment;
|
||||
import com.iqser.red.service.redaction.v1.model.IdRemoval;
|
||||
import com.iqser.red.service.redaction.v1.model.ManualForceRedact;
|
||||
import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry;
|
||||
import com.iqser.red.service.redaction.v1.model.ManualRedactionType;
|
||||
import com.iqser.red.service.redaction.v1.model.ManualRedactions;
|
||||
import com.iqser.red.service.redaction.v1.model.Point;
|
||||
import com.iqser.red.service.redaction.v1.model.Rectangle;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionRectangle;
|
||||
import com.iqser.red.service.redaction.v1.model.Status;
|
||||
import com.iqser.red.service.redaction.v1.model.*;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
@ -34,8 +12,17 @@ import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
|
||||
@ -1,25 +1,17 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.utils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncrementValue;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Slf4j
|
||||
@UtilityClass
|
||||
@SuppressWarnings("PMD")
|
||||
@ -46,7 +38,7 @@ public class EntitySearchUtils {
|
||||
|
||||
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString
|
||||
.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
|
||||
if(value.isCaseinsensitive() || !value.isCaseinsensitive() && sectionText.substring(startIndex, stopIndex).equals(value.getValue())){
|
||||
if (value.isCaseinsensitive() || !value.isCaseinsensitive() && sectionText.substring(startIndex, stopIndex).equals(value.getValue())) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@ -147,16 +139,16 @@ public class EntitySearchUtils {
|
||||
|
||||
public void addEntitiesWithHigherRank(Set<Entity> entities, Entity found, Dictionary dictionary) {
|
||||
|
||||
if(entities.contains(found)){
|
||||
if (entities.contains(found)) {
|
||||
Entity existing = entities.stream().filter(entity -> entity.equals(found)).findFirst().get();
|
||||
if (dictionary.getDictionaryRank(existing.getType()) <= dictionary.getDictionaryRank(found.getType())){
|
||||
if (dictionary.getDictionaryRank(existing.getType()) <= dictionary.getDictionaryRank(found.getType())) {
|
||||
entities.remove(found);
|
||||
}
|
||||
}
|
||||
entities.add(found);
|
||||
}
|
||||
|
||||
public void addEntitiesIgnoreRank(Set<Entity> entities, Set<Entity> found){
|
||||
public void addEntitiesIgnoreRank(Set<Entity> entities, Set<Entity> found) {
|
||||
// HashSet keeps old value but we want the new.
|
||||
entities.removeAll(found);
|
||||
entities.addAll(found);
|
||||
|
||||
@ -1,15 +1,14 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.utils;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.List;
|
||||
|
||||
import com.google.common.hash.HashFunction;
|
||||
import com.google.common.hash.Hashing;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.List;
|
||||
|
||||
@UtilityClass
|
||||
public class IdBuilder {
|
||||
|
||||
@ -26,7 +25,7 @@ public class IdBuilder {
|
||||
}
|
||||
|
||||
|
||||
public String buildId(Rectangle2D rectangle2D, int page){
|
||||
public String buildId(Rectangle2D rectangle2D, int page) {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append("x").append(rectangle2D.getX()).append("y").append(rectangle2D.getY()).append("h").append(rectangle2D.getHeight()).append("w").append(rectangle2D.getWidth()).append("p").append(page);
|
||||
@ -35,5 +34,4 @@ public class IdBuilder {
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
@ -1,5 +1,7 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.utils;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
@ -8,8 +10,6 @@ import java.nio.charset.StandardCharsets;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class ResourceLoader {
|
||||
|
||||
@ -27,4 +27,4 @@ public class ResourceLoader {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -7,6 +7,7 @@ public class TextNormalizationUtilities {
|
||||
|
||||
/**
|
||||
* Revert hyphenation due to line breaks.
|
||||
*
|
||||
* @param text Text to be processed.
|
||||
* @return Text without line-break hyphenation.
|
||||
*/
|
||||
@ -14,4 +15,4 @@ public class TextNormalizationUtilities {
|
||||
return text.replaceAll("([^\\s\\d\\-]{2,})[\\-\\u00AD]\\R|\n\r(.+ )", "$1$2");
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,28 +1,36 @@
|
||||
package com.iqser.red.service.redaction.v1.server.segmentation;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.SectionArea;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.SectionText;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.service.ClassificationService;
|
||||
import com.iqser.red.service.redaction.v1.server.exception.RedactionException;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.PDFAreaTextStripper;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.PDFLinesTextStripper;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
|
||||
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.service.ClassificationService;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.PDFLinesTextStripper;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@ -36,6 +44,82 @@ public class PdfSegmentationService {
|
||||
private final SectionsBuilderService sectionsBuilderService;
|
||||
|
||||
|
||||
private final RedactionStorageService redactionStorageService;
|
||||
|
||||
|
||||
private void postProcessSections(PDDocument pdDocument, List<SectionText> texts) {
|
||||
|
||||
try {
|
||||
for (SectionText sectionText : texts) {
|
||||
|
||||
List<TextBlock> textBlocks = new ArrayList<>();
|
||||
|
||||
Map<Integer, List<SectionArea>> sectionAreasPerPage = new HashMap<>();
|
||||
for (SectionArea sectionArea : sectionText.getSectionAreas()) {
|
||||
sectionAreasPerPage.computeIfAbsent(sectionArea.getPage(), (x) -> new ArrayList<>())
|
||||
.add(sectionArea);
|
||||
}
|
||||
|
||||
Map<String, CellValue> tabularData = new HashMap<>();
|
||||
List<Integer> cellStarts = new ArrayList<>();
|
||||
for (Integer page : sectionAreasPerPage.keySet()) {
|
||||
List<SectionArea> areasOnPage = sectionAreasPerPage.get(page);
|
||||
|
||||
PDPage pdPage = pdDocument.getPage(page - 1);
|
||||
PDRectangle cropBox = pdPage.getCropBox();
|
||||
PDFAreaTextStripper textStripper = new PDFAreaTextStripper();
|
||||
textStripper.setPageNumber(page);
|
||||
|
||||
int cellStart = 0;
|
||||
for (SectionArea sectionArea : areasOnPage) {
|
||||
|
||||
Rectangle2D rect = null;
|
||||
if (pdPage.getRotation() == 90) {
|
||||
rect = new Rectangle2D.Float(sectionArea.getTopLeft().getY(), sectionArea.getTopLeft()
|
||||
.getX(), sectionArea.getHeight(), sectionArea.getWidth() + 0.001f);
|
||||
} else {
|
||||
rect = new Rectangle2D.Float(sectionArea.getTopLeft().getX(), -sectionArea.getTopLeft()
|
||||
.getY() + cropBox.getUpperRightY() - sectionArea.getHeight(), sectionArea.getWidth(), sectionArea
|
||||
.getHeight() + 0.001f);
|
||||
}
|
||||
|
||||
textStripper.addRegion(String.valueOf(1), rect);
|
||||
textStripper.extractRegions(pdPage);
|
||||
textStripper.getTextForRegion(String.valueOf(1));
|
||||
List<TextPositionSequence> positions = textStripper.getTextPositionSequences();
|
||||
|
||||
TextBlock textBlock = new TextBlock(sectionArea.getTopLeft().getX(), sectionArea.getTopLeft()
|
||||
.getX() + sectionArea.getWidth(), sectionArea.getTopLeft()
|
||||
.getY(), sectionArea.getTopLeft().getY() + sectionArea.getHeight(), positions, 0);
|
||||
|
||||
if (sectionText.isTable()) {
|
||||
Cell cell = new Cell();
|
||||
cell.addTextBlock(textBlock);
|
||||
tabularData.put(sectionArea.getHeader(), new CellValue(cell.getTextBlocks(), cellStart));
|
||||
cellStarts.add(cellStart);
|
||||
cellStart = cellStart + cell.toString().trim().length() + 1;
|
||||
}
|
||||
|
||||
textBlocks.add(textBlock);
|
||||
textStripper.clearPositions();
|
||||
}
|
||||
|
||||
}
|
||||
sectionText.setTextBlocks(textBlocks);
|
||||
sectionText.setTabularData(tabularData);
|
||||
if (sectionText.isTable()) {
|
||||
sectionText.setCellStarts(cellStarts);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} catch (Exception e) {
|
||||
throw new RedactionException(e);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
public Document parseDocument(PDDocument pdDocument) throws IOException {
|
||||
|
||||
Document document = new Document();
|
||||
@ -82,6 +166,9 @@ public class PdfSegmentationService {
|
||||
sectionsBuilderService.buildSections(document);
|
||||
sectionsBuilderService.addImagesToSections(document);
|
||||
|
||||
// This can be improved an done in one pass, but it's complicated to do right away
|
||||
postProcessSections(pdDocument, document.getSectionText());
|
||||
|
||||
return document;
|
||||
}
|
||||
|
||||
@ -116,4 +203,4 @@ public class PdfSegmentationService {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,29 +1,15 @@
|
||||
package com.iqser.red.service.redaction.v1.server.segmentation;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.SortedSet;
|
||||
import java.util.TreeSet;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Footer;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Header;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.UnclassifiedText;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.*;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Service
|
||||
public class SectionsBuilderService {
|
||||
@ -302,4 +288,4 @@ public class SectionsBuilderService {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,17 +1,16 @@
|
||||
package com.iqser.red.service.redaction.v1.server.settings;
|
||||
|
||||
import org.springframework.boot.context.properties.ConfigurationProperties;
|
||||
|
||||
import lombok.Data;
|
||||
import org.springframework.boot.context.properties.ConfigurationProperties;
|
||||
|
||||
@Data
|
||||
@ConfigurationProperties("redaction-service")
|
||||
public class RedactionServiceSettings {
|
||||
|
||||
|
||||
private int numberOfSurroundingWords = 3;
|
||||
|
||||
private int surroundingWordsOffsetWindow = 100;
|
||||
|
||||
private boolean enableImageClassification = true;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -0,0 +1,109 @@
|
||||
package com.iqser.red.service.redaction.v1.server.storage;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.iqser.red.service.file.management.v1.api.model.FileType;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionLog;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionGrid;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Text;
|
||||
import com.iqser.red.storage.commons.exception.StorageObjectDoesNotExist;
|
||||
import com.iqser.red.storage.commons.service.StorageService;
|
||||
import lombok.Getter;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.core.io.InputStreamResource;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class RedactionStorageService {
|
||||
|
||||
private final ObjectMapper objectMapper;
|
||||
private final StorageService storageService;
|
||||
|
||||
@SneakyThrows
|
||||
public InputStream getStoredObject(String storageId) {
|
||||
return storageService.getObject(storageId).getInputStream();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void storeObject(String projectId, String fileId, FileType fileType, Object any) {
|
||||
storageService.storeObject(StorageIdUtils.getStorageId(projectId, fileId, fileType), objectMapper.writeValueAsBytes(any));
|
||||
}
|
||||
|
||||
|
||||
public RedactionLog getRedactionLog(String projectId, String fileId) {
|
||||
|
||||
InputStreamResource inputStreamResource;
|
||||
try {
|
||||
inputStreamResource = storageService.getObject(StorageIdUtils.getStorageId(projectId, fileId, FileType.REDACTION_LOG));
|
||||
} catch (StorageObjectDoesNotExist e) {
|
||||
log.debug("Text not available.");
|
||||
return null;
|
||||
}
|
||||
|
||||
try {
|
||||
return objectMapper.readValue(inputStreamResource.getInputStream(), RedactionLog.class);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("Could not convert Text", e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public Text getText(String projectId, String fileId) {
|
||||
|
||||
InputStreamResource inputStreamResource;
|
||||
try {
|
||||
inputStreamResource = storageService.getObject(StorageIdUtils.getStorageId(projectId, fileId, FileType.TEXT));
|
||||
} catch (StorageObjectDoesNotExist e) {
|
||||
log.debug("Text not available.");
|
||||
return null;
|
||||
}
|
||||
|
||||
try {
|
||||
return objectMapper.readValue(inputStreamResource.getInputStream(), Text.class);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("Could not convert Text", e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public SectionGrid getSectionGrid(String projectId, String fileId) {
|
||||
|
||||
var sectionGrid = storageService.getObject(StorageIdUtils.getStorageId(projectId, fileId, FileType.SECTION_GRID));
|
||||
try {
|
||||
return objectMapper.readValue(sectionGrid.getInputStream(), SectionGrid.class);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("Could not convert RedactionLog", e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@RequiredArgsConstructor
|
||||
public enum StorageType {
|
||||
PARSED_DOCUMENT(".json");
|
||||
|
||||
@Getter
|
||||
private final String extension;
|
||||
|
||||
}
|
||||
|
||||
public static class StorageIdUtils {
|
||||
|
||||
public static String getStorageId(String projectId, String fileId, FileType fileType) {
|
||||
return projectId + "/" + fileId + "." + fileType.name() + fileType.getExtension();
|
||||
}
|
||||
|
||||
|
||||
public static String getStorageId(String userId, String projectId, String filename) {
|
||||
|
||||
return userId + "/" + projectId + "/" + filename;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
@ -1,7 +1,6 @@
|
||||
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.Rectangle;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
@ -25,7 +24,7 @@ public abstract class AbstractTextContainer {
|
||||
}
|
||||
|
||||
public boolean contains(Rectangle other) {
|
||||
return page == other.getPage() && this.minX <= other.getTopLeft().getX() && this.maxX >= other.getTopLeft().getX() + other.getWidth() && this.minY <= other.getTopLeft().getY() && this.maxY >= other.getTopLeft().getY() + other.getHeight();
|
||||
return page == other.getPage() && this.minX <= other.getTopLeft().getX() && this.maxX >= other.getTopLeft().getX() + other.getWidth() && this.minY <= other.getTopLeft().getY() && this.maxY >= other.getTopLeft().getY() + other.getHeight();
|
||||
}
|
||||
|
||||
public float getHeight() {
|
||||
@ -36,4 +35,4 @@ public abstract class AbstractTextContainer {
|
||||
return maxX - minX;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,18 +1,17 @@
|
||||
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@SuppressWarnings("serial")
|
||||
@Data
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
@ -71,7 +70,4 @@ public class Cell extends Rectangle {
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,10 +1,10 @@
|
||||
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
public class CleanRulings {
|
||||
|
||||
@ -8,170 +8,171 @@ import java.util.List;
|
||||
@SuppressWarnings("all")
|
||||
public class Rectangle extends Rectangle2D.Float {
|
||||
|
||||
/**
|
||||
* Ill-defined comparator, from when Rectangle was Comparable.
|
||||
*
|
||||
* see https://github.com/tabulapdf/tabula-java/issues/116
|
||||
* @deprecated with no replacement
|
||||
*/
|
||||
@Deprecated
|
||||
public static final Comparator<Rectangle> ILL_DEFINED_ORDER = new Comparator<Rectangle>() {
|
||||
@Override public int compare(Rectangle o1, Rectangle o2) {
|
||||
if (o1.equals(o2)) return 0;
|
||||
if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD) {
|
||||
return o1.isLtrDominant() == -1 && o2.isLtrDominant() == -1
|
||||
? - java.lang.Double.compare(o1.getX(), o2.getX())
|
||||
: java.lang.Double.compare(o1.getX(), o2.getX());
|
||||
} else {
|
||||
return java.lang.Float.compare(o1.getBottom(), o2.getBottom());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;
|
||||
protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;
|
||||
/**
|
||||
* Ill-defined comparator, from when Rectangle was Comparable.
|
||||
* <p>
|
||||
* see https://github.com/tabulapdf/tabula-java/issues/116
|
||||
*
|
||||
* @deprecated with no replacement
|
||||
*/
|
||||
@Deprecated
|
||||
public static final Comparator<Rectangle> ILL_DEFINED_ORDER = new Comparator<Rectangle>() {
|
||||
@Override
|
||||
public int compare(Rectangle o1, Rectangle o2) {
|
||||
if (o1.equals(o2)) return 0;
|
||||
if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD) {
|
||||
return o1.isLtrDominant() == -1 && o2.isLtrDominant() == -1
|
||||
? -java.lang.Double.compare(o1.getX(), o2.getX())
|
||||
: java.lang.Double.compare(o1.getX(), o2.getX());
|
||||
} else {
|
||||
return java.lang.Float.compare(o1.getBottom(), o2.getBottom());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
public Rectangle() {
|
||||
super();
|
||||
}
|
||||
public Rectangle() {
|
||||
super();
|
||||
}
|
||||
|
||||
public Rectangle(float top, float left, float width, float height) {
|
||||
super();
|
||||
this.setRect(left, top, width, height);
|
||||
}
|
||||
public Rectangle(float top, float left, float width, float height) {
|
||||
super();
|
||||
this.setRect(left, top, width, height);
|
||||
}
|
||||
|
||||
public int compareTo(Rectangle other) {
|
||||
return ILL_DEFINED_ORDER.compare(this, other);
|
||||
}
|
||||
/**
|
||||
* @param rectangles
|
||||
* @return minimum bounding box that contains all the rectangles
|
||||
*/
|
||||
public static Rectangle boundingBoxOf(List<? extends Rectangle> rectangles) {
|
||||
float minx = java.lang.Float.MAX_VALUE;
|
||||
float miny = java.lang.Float.MAX_VALUE;
|
||||
float maxx = java.lang.Float.MIN_VALUE;
|
||||
float maxy = java.lang.Float.MIN_VALUE;
|
||||
|
||||
// I'm bad at Java and need this for fancy sorting in
|
||||
// technology.tabula.TextChunk.
|
||||
public int isLtrDominant() {
|
||||
return 0;
|
||||
}
|
||||
for (Rectangle r : rectangles) {
|
||||
minx = (float) Math.min(r.getMinX(), minx);
|
||||
miny = (float) Math.min(r.getMinY(), miny);
|
||||
maxx = (float) Math.max(r.getMaxX(), maxx);
|
||||
maxy = (float) Math.max(r.getMaxY(), maxy);
|
||||
}
|
||||
return new Rectangle(miny, minx, maxx - minx, maxy - miny);
|
||||
}
|
||||
|
||||
public float getArea() {
|
||||
return this.width * this.height;
|
||||
}
|
||||
public int compareTo(Rectangle other) {
|
||||
return ILL_DEFINED_ORDER.compare(this, other);
|
||||
}
|
||||
|
||||
public float verticalOverlap(Rectangle other) {
|
||||
return Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
|
||||
}
|
||||
// I'm bad at Java and need this for fancy sorting in
|
||||
// technology.tabula.TextChunk.
|
||||
public int isLtrDominant() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
public boolean verticallyOverlaps(Rectangle other) {
|
||||
return verticalOverlap(other) > 0;
|
||||
}
|
||||
public float getArea() {
|
||||
return this.width * this.height;
|
||||
}
|
||||
|
||||
public float horizontalOverlap(Rectangle other) {
|
||||
return Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
|
||||
}
|
||||
public float verticalOverlap(Rectangle other) {
|
||||
return Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
|
||||
}
|
||||
|
||||
public boolean horizontallyOverlaps(Rectangle other) {
|
||||
return horizontalOverlap(other) > 0;
|
||||
}
|
||||
public boolean verticallyOverlaps(Rectangle other) {
|
||||
return verticalOverlap(other) > 0;
|
||||
}
|
||||
|
||||
public float verticalOverlapRatio(Rectangle other) {
|
||||
float rv = 0, delta = Math.min(this.getBottom() - this.getTop(), other.getBottom() - other.getTop());
|
||||
public float horizontalOverlap(Rectangle other) {
|
||||
return Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
|
||||
}
|
||||
|
||||
if (other.getTop() <= this.getTop() && this.getTop() <= other.getBottom()
|
||||
&& other.getBottom() <= this.getBottom()) {
|
||||
rv = (other.getBottom() - this.getTop()) / delta;
|
||||
} else if (this.getTop() <= other.getTop() && other.getTop() <= this.getBottom()
|
||||
&& this.getBottom() <= other.getBottom()) {
|
||||
rv = (this.getBottom() - other.getTop()) / delta;
|
||||
} else if (this.getTop() <= other.getTop() && other.getTop() <= other.getBottom()
|
||||
&& other.getBottom() <= this.getBottom()) {
|
||||
rv = (other.getBottom() - other.getTop()) / delta;
|
||||
} else if (other.getTop() <= this.getTop() && this.getTop() <= this.getBottom()
|
||||
&& this.getBottom() <= other.getBottom()) {
|
||||
rv = (this.getBottom() - this.getTop()) / delta;
|
||||
}
|
||||
public boolean horizontallyOverlaps(Rectangle other) {
|
||||
return horizontalOverlap(other) > 0;
|
||||
}
|
||||
|
||||
return rv;
|
||||
public float verticalOverlapRatio(Rectangle other) {
|
||||
float rv = 0, delta = Math.min(this.getBottom() - this.getTop(), other.getBottom() - other.getTop());
|
||||
|
||||
}
|
||||
if (other.getTop() <= this.getTop() && this.getTop() <= other.getBottom()
|
||||
&& other.getBottom() <= this.getBottom()) {
|
||||
rv = (other.getBottom() - this.getTop()) / delta;
|
||||
} else if (this.getTop() <= other.getTop() && other.getTop() <= this.getBottom()
|
||||
&& this.getBottom() <= other.getBottom()) {
|
||||
rv = (this.getBottom() - other.getTop()) / delta;
|
||||
} else if (this.getTop() <= other.getTop() && other.getTop() <= other.getBottom()
|
||||
&& other.getBottom() <= this.getBottom()) {
|
||||
rv = (other.getBottom() - other.getTop()) / delta;
|
||||
} else if (other.getTop() <= this.getTop() && this.getTop() <= this.getBottom()
|
||||
&& this.getBottom() <= other.getBottom()) {
|
||||
rv = (this.getBottom() - this.getTop()) / delta;
|
||||
}
|
||||
|
||||
public float overlapRatio(Rectangle other) {
|
||||
double intersectionWidth = Math.max(0,
|
||||
Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
|
||||
double intersectionHeight = Math.max(0,
|
||||
Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
|
||||
double intersectionArea = Math.max(0, intersectionWidth * intersectionHeight);
|
||||
double unionArea = this.getArea() + other.getArea() - intersectionArea;
|
||||
return rv;
|
||||
|
||||
return (float) (intersectionArea / unionArea);
|
||||
}
|
||||
}
|
||||
|
||||
public Rectangle merge(Rectangle other) {
|
||||
this.setRect(this.createUnion(other));
|
||||
return this;
|
||||
}
|
||||
public float overlapRatio(Rectangle other) {
|
||||
double intersectionWidth = Math.max(0,
|
||||
Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
|
||||
double intersectionHeight = Math.max(0,
|
||||
Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
|
||||
double intersectionArea = Math.max(0, intersectionWidth * intersectionHeight);
|
||||
double unionArea = this.getArea() + other.getArea() - intersectionArea;
|
||||
|
||||
public float getTop() {
|
||||
return (float) this.getMinY();
|
||||
}
|
||||
return (float) (intersectionArea / unionArea);
|
||||
}
|
||||
|
||||
public void setTop(float top) {
|
||||
float deltaHeight = top - this.y;
|
||||
this.setRect(this.x, top, this.width, this.height - deltaHeight);
|
||||
}
|
||||
public Rectangle merge(Rectangle other) {
|
||||
this.setRect(this.createUnion(other));
|
||||
return this;
|
||||
}
|
||||
|
||||
public float getRight() {
|
||||
return (float) this.getMaxX();
|
||||
}
|
||||
public float getTop() {
|
||||
return (float) this.getMinY();
|
||||
}
|
||||
|
||||
public void setRight(float right) {
|
||||
this.setRect(this.x, this.y, right - this.x, this.height);
|
||||
}
|
||||
public void setTop(float top) {
|
||||
float deltaHeight = top - this.y;
|
||||
this.setRect(this.x, top, this.width, this.height - deltaHeight);
|
||||
}
|
||||
|
||||
public float getLeft() {
|
||||
return (float) this.getMinX();
|
||||
}
|
||||
public float getRight() {
|
||||
return (float) this.getMaxX();
|
||||
}
|
||||
|
||||
public void setLeft(float left) {
|
||||
float deltaWidth = left - this.x;
|
||||
this.setRect(left, this.y, this.width - deltaWidth, this.height);
|
||||
}
|
||||
public void setRight(float right) {
|
||||
this.setRect(this.x, this.y, right - this.x, this.height);
|
||||
}
|
||||
|
||||
public float getBottom() {
|
||||
return (float) this.getMaxY();
|
||||
}
|
||||
public float getLeft() {
|
||||
return (float) this.getMinX();
|
||||
}
|
||||
|
||||
public void setBottom(float bottom) {
|
||||
this.setRect(this.x, this.y, this.width, bottom - this.y);
|
||||
}
|
||||
public void setLeft(float left) {
|
||||
float deltaWidth = left - this.x;
|
||||
this.setRect(left, this.y, this.width - deltaWidth, this.height);
|
||||
}
|
||||
|
||||
public Point2D[] getPoints() {
|
||||
return new Point2D[] { new Point2D.Float(this.getLeft(), this.getTop()),
|
||||
new Point2D.Float(this.getRight(), this.getTop()), new Point2D.Float(this.getRight(), this.getBottom()),
|
||||
new Point2D.Float(this.getLeft(), this.getBottom()) };
|
||||
}
|
||||
public float getBottom() {
|
||||
return (float) this.getMaxY();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
String s = super.toString();
|
||||
sb.append(s.substring(0, s.length() - 1));
|
||||
sb.append(String.format(",bottom=%f,right=%f]", this.getBottom(), this.getRight()));
|
||||
return sb.toString();
|
||||
}
|
||||
public void setBottom(float bottom) {
|
||||
this.setRect(this.x, this.y, this.width, bottom - this.y);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param rectangles
|
||||
* @return minimum bounding box that contains all the rectangles
|
||||
*/
|
||||
public static Rectangle boundingBoxOf(List<? extends Rectangle> rectangles) {
|
||||
float minx = java.lang.Float.MAX_VALUE;
|
||||
float miny = java.lang.Float.MAX_VALUE;
|
||||
float maxx = java.lang.Float.MIN_VALUE;
|
||||
float maxy = java.lang.Float.MIN_VALUE;
|
||||
public Point2D[] getPoints() {
|
||||
return new Point2D[]{new Point2D.Float(this.getLeft(), this.getTop()),
|
||||
new Point2D.Float(this.getRight(), this.getTop()), new Point2D.Float(this.getRight(), this.getBottom()),
|
||||
new Point2D.Float(this.getLeft(), this.getBottom())};
|
||||
}
|
||||
|
||||
for (Rectangle r : rectangles) {
|
||||
minx = (float) Math.min(r.getMinX(), minx);
|
||||
miny = (float) Math.min(r.getMinY(), miny);
|
||||
maxx = (float) Math.max(r.getMaxX(), maxx);
|
||||
maxy = (float) Math.max(r.getMaxY(), maxy);
|
||||
}
|
||||
return new Rectangle(miny, minx, maxx - minx, maxy - miny);
|
||||
}
|
||||
@Override
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
String s = super.toString();
|
||||
sb.append(s.substring(0, s.length() - 1));
|
||||
sb.append(String.format(",bottom=%f,right=%f]", this.getBottom(), this.getRight()));
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,12 +1,11 @@
|
||||
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
|
||||
import org.locationtech.jts.geom.Envelope;
|
||||
import org.locationtech.jts.index.strtree.STRtree;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
@SuppressWarnings("all")
|
||||
public class RectangleSpatialIndex<T extends Rectangle> {
|
||||
|
||||
@ -1,20 +1,13 @@
|
||||
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.CohenSutherlandClipping;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.Formatter;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.CohenSutherlandClipping;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
@Slf4j
|
||||
@ -23,13 +16,127 @@ public class Ruling extends Line2D.Float {
|
||||
|
||||
private static int PERPENDICULAR_PIXEL_EXPAND_AMOUNT = 2;
|
||||
|
||||
private enum SOType {VERTICAL, HRIGHT, HLEFT}
|
||||
|
||||
|
||||
public Ruling(Point2D p1, Point2D p2) {
|
||||
super(p1, p2);
|
||||
}
|
||||
|
||||
public static List<Ruling> cropRulingsToArea(List<Ruling> rulings, Rectangle2D area) {
|
||||
ArrayList<Ruling> rv = new ArrayList<>();
|
||||
for (Ruling r : rulings) {
|
||||
if (r.intersects(area)) {
|
||||
rv.add(r.intersect(area));
|
||||
}
|
||||
}
|
||||
return rv;
|
||||
}
|
||||
|
||||
// log(n) implementation of find_intersections
|
||||
// based on http://people.csail.mit.edu/indyk/6.838-old/handouts/lec2.pdf
|
||||
public static Map<Point2D, Ruling[]> findIntersections(List<Ruling> horizontals, List<Ruling> verticals) {
|
||||
|
||||
class SortObject {
|
||||
protected SOType type;
|
||||
protected float position;
|
||||
protected Ruling ruling;
|
||||
|
||||
public SortObject(SOType type, float position, Ruling ruling) {
|
||||
this.type = type;
|
||||
this.position = position;
|
||||
this.ruling = ruling;
|
||||
}
|
||||
}
|
||||
|
||||
List<SortObject> sos = new ArrayList<>();
|
||||
|
||||
TreeMap<Ruling, Boolean> tree = new TreeMap<>(new Comparator<Ruling>() {
|
||||
@Override
|
||||
public int compare(Ruling o1, Ruling o2) {
|
||||
return java.lang.Double.compare(o1.getTop(), o2.getTop());
|
||||
}
|
||||
});
|
||||
|
||||
TreeMap<Point2D, Ruling[]> rv = new TreeMap<>(new Comparator<Point2D>() {
|
||||
@Override
|
||||
public int compare(Point2D o1, Point2D o2) {
|
||||
if (o1.getY() > o2.getY()) {
|
||||
return 1;
|
||||
}
|
||||
if (o1.getY() < o2.getY()) {
|
||||
return -1;
|
||||
}
|
||||
if (o1.getX() > o2.getX()) {
|
||||
return 1;
|
||||
}
|
||||
if (o1.getX() < o2.getX()) {
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
});
|
||||
|
||||
for (Ruling h : horizontals) {
|
||||
sos.add(new SortObject(SOType.HLEFT, h.getLeft() - PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h));
|
||||
sos.add(new SortObject(SOType.HRIGHT, h.getRight() + PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h));
|
||||
}
|
||||
|
||||
for (Ruling v : verticals) {
|
||||
sos.add(new SortObject(SOType.VERTICAL, v.getLeft(), v));
|
||||
}
|
||||
|
||||
Collections.sort(sos, new Comparator<SortObject>() {
|
||||
@Override
|
||||
public int compare(SortObject a, SortObject b) {
|
||||
int rv;
|
||||
if (Utils.feq(a.position, b.position)) {
|
||||
if (a.type == SOType.VERTICAL && b.type == SOType.HLEFT) {
|
||||
rv = 1;
|
||||
} else if (a.type == SOType.VERTICAL && b.type == SOType.HRIGHT) {
|
||||
rv = -1;
|
||||
} else if (a.type == SOType.HLEFT && b.type == SOType.VERTICAL) {
|
||||
rv = -1;
|
||||
} else if (a.type == SOType.HRIGHT && b.type == SOType.VERTICAL) {
|
||||
rv = 1;
|
||||
} else {
|
||||
rv = java.lang.Double.compare(a.position, b.position);
|
||||
}
|
||||
} else {
|
||||
return java.lang.Double.compare(a.position, b.position);
|
||||
}
|
||||
return rv;
|
||||
}
|
||||
});
|
||||
|
||||
for (SortObject so : sos) {
|
||||
switch (so.type) {
|
||||
case VERTICAL:
|
||||
for (Map.Entry<Ruling, Boolean> h : tree.entrySet()) {
|
||||
try {
|
||||
Point2D i = h.getKey().intersectionPoint(so.ruling);
|
||||
if (i == null) {
|
||||
continue;
|
||||
}
|
||||
rv.put(i,
|
||||
new Ruling[]{h.getKey().expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT),
|
||||
so.ruling.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT)});
|
||||
} catch (UnsupportedOperationException e) {
|
||||
log.info("Some line are oblique, ignoring...");
|
||||
continue;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case HRIGHT:
|
||||
tree.remove(so.ruling);
|
||||
break;
|
||||
case HLEFT:
|
||||
tree.put(so.ruling, true);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return rv;
|
||||
|
||||
}
|
||||
|
||||
public boolean vertical() {
|
||||
return this.length() > 0 && Utils.feq(this.x1, this.x2); //diff < ORIENTATION_CHECK_THRESHOLD;
|
||||
}
|
||||
@ -38,13 +145,13 @@ public class Ruling extends Line2D.Float {
|
||||
return this.length() > 0 && Utils.feq(this.y1, this.y2); //diff < ORIENTATION_CHECK_THRESHOLD;
|
||||
}
|
||||
|
||||
// attributes that make sense only for non-oblique lines
|
||||
// these are used to have a single collapse method (in page, currently)
|
||||
|
||||
public boolean oblique() {
|
||||
return !(this.vertical() || this.horizontal());
|
||||
}
|
||||
|
||||
// attributes that make sense only for non-oblique lines
|
||||
// these are used to have a single collapse method (in page, currently)
|
||||
|
||||
public float getPosition() {
|
||||
if (this.oblique()) {
|
||||
throw new UnsupportedOperationException();
|
||||
@ -52,7 +159,6 @@ public class Ruling extends Line2D.Float {
|
||||
return this.vertical() ? this.getLeft() : this.getTop();
|
||||
}
|
||||
|
||||
|
||||
public float getStart() {
|
||||
if (this.oblique()) {
|
||||
throw new UnsupportedOperationException();
|
||||
@ -102,12 +208,10 @@ public class Ruling extends Line2D.Float {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public boolean perpendicularTo(Ruling other) {
|
||||
return this.vertical() == other.horizontal();
|
||||
}
|
||||
|
||||
|
||||
public boolean nearlyIntersects(Ruling another, int colinearOrParallelExpandAmount) {
|
||||
if (this.intersectsLine(another)) {
|
||||
return true;
|
||||
@ -238,7 +342,6 @@ public class Ruling extends Line2D.Float {
|
||||
return angle;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
@ -248,122 +351,7 @@ public class Ruling extends Line2D.Float {
|
||||
return rv;
|
||||
}
|
||||
|
||||
public static List<Ruling> cropRulingsToArea(List<Ruling> rulings, Rectangle2D area) {
|
||||
ArrayList<Ruling> rv = new ArrayList<>();
|
||||
for (Ruling r : rulings) {
|
||||
if (r.intersects(area)) {
|
||||
rv.add(r.intersect(area));
|
||||
}
|
||||
}
|
||||
return rv;
|
||||
}
|
||||
|
||||
// log(n) implementation of find_intersections
|
||||
// based on http://people.csail.mit.edu/indyk/6.838-old/handouts/lec2.pdf
|
||||
public static Map<Point2D, Ruling[]> findIntersections(List<Ruling> horizontals, List<Ruling> verticals) {
|
||||
|
||||
class SortObject {
|
||||
protected SOType type;
|
||||
protected float position;
|
||||
protected Ruling ruling;
|
||||
|
||||
public SortObject(SOType type, float position, Ruling ruling) {
|
||||
this.type = type;
|
||||
this.position = position;
|
||||
this.ruling = ruling;
|
||||
}
|
||||
}
|
||||
|
||||
List<SortObject> sos = new ArrayList<>();
|
||||
|
||||
TreeMap<Ruling, Boolean> tree = new TreeMap<>(new Comparator<Ruling>() {
|
||||
@Override
|
||||
public int compare(Ruling o1, Ruling o2) {
|
||||
return java.lang.Double.compare(o1.getTop(), o2.getTop());
|
||||
}
|
||||
});
|
||||
|
||||
TreeMap<Point2D, Ruling[]> rv = new TreeMap<>(new Comparator<Point2D>() {
|
||||
@Override
|
||||
public int compare(Point2D o1, Point2D o2) {
|
||||
if (o1.getY() > o2.getY()) {
|
||||
return 1;
|
||||
}
|
||||
if (o1.getY() < o2.getY()) {
|
||||
return -1;
|
||||
}
|
||||
if (o1.getX() > o2.getX()) {
|
||||
return 1;
|
||||
}
|
||||
if (o1.getX() < o2.getX()) {
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
});
|
||||
|
||||
for (Ruling h : horizontals) {
|
||||
sos.add(new SortObject(SOType.HLEFT, h.getLeft() - PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h));
|
||||
sos.add(new SortObject(SOType.HRIGHT, h.getRight() + PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h));
|
||||
}
|
||||
|
||||
for (Ruling v : verticals) {
|
||||
sos.add(new SortObject(SOType.VERTICAL, v.getLeft(), v));
|
||||
}
|
||||
|
||||
Collections.sort(sos, new Comparator<SortObject>() {
|
||||
@Override
|
||||
public int compare(SortObject a, SortObject b) {
|
||||
int rv;
|
||||
if (Utils.feq(a.position, b.position)) {
|
||||
if (a.type == SOType.VERTICAL && b.type == SOType.HLEFT) {
|
||||
rv = 1;
|
||||
} else if (a.type == SOType.VERTICAL && b.type == SOType.HRIGHT) {
|
||||
rv = -1;
|
||||
} else if (a.type == SOType.HLEFT && b.type == SOType.VERTICAL) {
|
||||
rv = -1;
|
||||
} else if (a.type == SOType.HRIGHT && b.type == SOType.VERTICAL) {
|
||||
rv = 1;
|
||||
} else {
|
||||
rv = java.lang.Double.compare(a.position, b.position);
|
||||
}
|
||||
} else {
|
||||
return java.lang.Double.compare(a.position, b.position);
|
||||
}
|
||||
return rv;
|
||||
}
|
||||
});
|
||||
|
||||
for (SortObject so : sos) {
|
||||
switch (so.type) {
|
||||
case VERTICAL:
|
||||
for (Map.Entry<Ruling, Boolean> h : tree.entrySet()) {
|
||||
try {
|
||||
Point2D i = h.getKey().intersectionPoint(so.ruling);
|
||||
if (i == null) {
|
||||
continue;
|
||||
}
|
||||
rv.put(i,
|
||||
new Ruling[]{h.getKey().expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT),
|
||||
so.ruling.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT)});
|
||||
} catch(UnsupportedOperationException e){
|
||||
log.info("Some line are oblique, ignoring...");
|
||||
continue;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case HRIGHT:
|
||||
tree.remove(so.ruling);
|
||||
break;
|
||||
case HLEFT:
|
||||
tree.put(so.ruling, true);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return rv;
|
||||
|
||||
}
|
||||
private enum SOType {VERTICAL, HRIGHT, HLEFT}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@ -1,22 +1,13 @@
|
||||
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
@Slf4j
|
||||
public class Table extends AbstractTextContainer {
|
||||
@ -24,21 +15,14 @@ public class Table extends AbstractTextContainer {
|
||||
private final TreeMap<CellPosition, Cell> cells = new TreeMap<>();
|
||||
|
||||
private final RectangleSpatialIndex<Cell> si = new RectangleSpatialIndex<>();
|
||||
|
||||
private final int rotation;
|
||||
@Getter
|
||||
@Setter
|
||||
private String headline;
|
||||
|
||||
private int unrotatedRowCount;
|
||||
|
||||
private int unrotatedColCount;
|
||||
|
||||
private int rowCount = -1;
|
||||
|
||||
private int colCount = -1;
|
||||
|
||||
private final int rotation;
|
||||
|
||||
private List<List<Cell>> rows;
|
||||
|
||||
|
||||
@ -62,8 +46,8 @@ public class Table extends AbstractTextContainer {
|
||||
|
||||
// Ignore rows that does not contain any cells and values.
|
||||
List<List<Cell>> rowsToRemove = new ArrayList<>();
|
||||
for (List<Cell> row: rows){
|
||||
if (row.size() == 1 && row.get(0).getTextBlocks().isEmpty()){
|
||||
for (List<Cell> row : rows) {
|
||||
if (row.size() == 1 && row.get(0).getTextBlocks().isEmpty()) {
|
||||
rowsToRemove.add(row);
|
||||
}
|
||||
}
|
||||
@ -110,7 +94,7 @@ public class Table extends AbstractTextContainer {
|
||||
// we move from left to right and top to bottom
|
||||
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
|
||||
List<Cell> rowCells = rows.get(rowIndex);
|
||||
if(rowCells.size() == 1){
|
||||
if (rowCells.size() == 1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -275,7 +259,7 @@ public class Table extends AbstractTextContainer {
|
||||
|
||||
cells.sort(Collections.reverseOrder((arg0, arg1) -> Float.compare(Utils.round(arg0.getBottom(), 2),
|
||||
Utils.round(arg1
|
||||
.getBottom(), 2))));
|
||||
.getBottom(), 2))));
|
||||
|
||||
Iterator<Cell> iter = cells.iterator();
|
||||
Cell c = iter.next();
|
||||
@ -367,4 +351,4 @@ public class Table extends AbstractTextContainer {
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,19 +1,13 @@
|
||||
package com.iqser.red.service.redaction.v1.server.tableextraction.service;
|
||||
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.*;
|
||||
|
||||
@Service
|
||||
public class RulingCleaningService {
|
||||
|
||||
@ -1,31 +1,57 @@
|
||||
package com.iqser.red.service.redaction.v1.server.tableextraction.service;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.*;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Service
|
||||
public class TableExtractionService {
|
||||
|
||||
private static final Comparator<Point2D> X_FIRST_POINT_COMPARATOR = (arg0, arg1) -> {
|
||||
|
||||
int rv = 0;
|
||||
float arg0X = Utils.round(arg0.getX(), 2);
|
||||
float arg0Y = Utils.round(arg0.getY(), 2);
|
||||
float arg1X = Utils.round(arg1.getX(), 2);
|
||||
float arg1Y = Utils.round(arg1.getY(), 2);
|
||||
|
||||
if (arg0X > arg1X) {
|
||||
rv = 1;
|
||||
} else if (arg0X < arg1X) {
|
||||
rv = -1;
|
||||
} else if (arg0Y > arg1Y) {
|
||||
rv = 1;
|
||||
} else if (arg0Y < arg1Y) {
|
||||
rv = -1;
|
||||
}
|
||||
return rv;
|
||||
};
|
||||
private static final Comparator<Point2D> POINT_COMPARATOR = (arg0, arg1) -> {
|
||||
|
||||
int rv = 0;
|
||||
float arg0X = Utils.round(arg0.getX(), 2);
|
||||
float arg0Y = Utils.round(arg0.getY(), 2);
|
||||
float arg1X = Utils.round(arg1.getX(), 2);
|
||||
float arg1Y = Utils.round(arg1.getY(), 2);
|
||||
|
||||
if (arg0Y > arg1Y) {
|
||||
rv = 1;
|
||||
} else if (arg0Y < arg1Y) {
|
||||
rv = -1;
|
||||
} else if (arg0X > arg1X) {
|
||||
rv = 1;
|
||||
} else if (arg0X < arg1X) {
|
||||
rv = -1;
|
||||
}
|
||||
return rv;
|
||||
};
|
||||
|
||||
public void extractTables(CleanRulings cleanRulings, Page page) {
|
||||
|
||||
List<Cell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
@ -80,7 +106,6 @@ public class TableExtractionService {
|
||||
page.getTextBlocks().removeAll(toBeRemoved);
|
||||
}
|
||||
|
||||
|
||||
public List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||
|
||||
List<Cell> cellsFound = new ArrayList<>();
|
||||
@ -133,7 +158,6 @@ public class TableExtractionService {
|
||||
return cellsFound;
|
||||
}
|
||||
|
||||
|
||||
private List<Rectangle> findSpreadsheetsFromCells(List<? extends Rectangle> cells) {
|
||||
// via: http://stackoverflow.com/questions/13746284/merging-multiple-adjacent-rectangles-into-one-polygon
|
||||
List<Rectangle> rectangles = new ArrayList<>();
|
||||
@ -233,47 +257,6 @@ public class TableExtractionService {
|
||||
return rectangles;
|
||||
}
|
||||
|
||||
|
||||
private static final Comparator<Point2D> X_FIRST_POINT_COMPARATOR = (arg0, arg1) -> {
|
||||
|
||||
int rv = 0;
|
||||
float arg0X = Utils.round(arg0.getX(), 2);
|
||||
float arg0Y = Utils.round(arg0.getY(), 2);
|
||||
float arg1X = Utils.round(arg1.getX(), 2);
|
||||
float arg1Y = Utils.round(arg1.getY(), 2);
|
||||
|
||||
if (arg0X > arg1X) {
|
||||
rv = 1;
|
||||
} else if (arg0X < arg1X) {
|
||||
rv = -1;
|
||||
} else if (arg0Y > arg1Y) {
|
||||
rv = 1;
|
||||
} else if (arg0Y < arg1Y) {
|
||||
rv = -1;
|
||||
}
|
||||
return rv;
|
||||
};
|
||||
|
||||
private static final Comparator<Point2D> POINT_COMPARATOR = (arg0, arg1) -> {
|
||||
|
||||
int rv = 0;
|
||||
float arg0X = Utils.round(arg0.getX(), 2);
|
||||
float arg0Y = Utils.round(arg0.getY(), 2);
|
||||
float arg1X = Utils.round(arg1.getX(), 2);
|
||||
float arg1Y = Utils.round(arg1.getY(), 2);
|
||||
|
||||
if (arg0Y > arg1Y) {
|
||||
rv = 1;
|
||||
} else if (arg0Y < arg1Y) {
|
||||
rv = -1;
|
||||
} else if (arg0X > arg1X) {
|
||||
rv = 1;
|
||||
} else if (arg0X < arg1X) {
|
||||
rv = -1;
|
||||
}
|
||||
return rv;
|
||||
};
|
||||
|
||||
private enum Direction {
|
||||
HORIZONTAL, VERTICAL
|
||||
}
|
||||
|
||||
@ -19,21 +19,24 @@ import java.awt.geom.Rectangle2D;
|
||||
* clipping algorithm (line against clip rectangle).
|
||||
*/
|
||||
@SuppressWarnings("all")
|
||||
public final class CohenSutherlandClipping
|
||||
{
|
||||
public final class CohenSutherlandClipping {
|
||||
private static final int INSIDE = 0;
|
||||
private static final int LEFT = 1;
|
||||
private static final int RIGHT = 2;
|
||||
private static final int BOTTOM = 4;
|
||||
private static final int TOP = 8;
|
||||
private double xMin;
|
||||
private double yMin;
|
||||
private double xMax;
|
||||
private double yMax;
|
||||
|
||||
/**
|
||||
* Creates a Cohen Sutherland clipper with clip rect (0, 0, 0, 0).
|
||||
*/
|
||||
public CohenSutherlandClipping() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a Cohen Sutherland clipper with the given clip rectangle.
|
||||
*
|
||||
* @param clip the clip rectangle to use
|
||||
*/
|
||||
public CohenSutherlandClipping(Rectangle2D clip) {
|
||||
@ -42,6 +45,7 @@ public final class CohenSutherlandClipping
|
||||
|
||||
/**
|
||||
* Sets the clip rectangle.
|
||||
*
|
||||
* @param clip the clip rectangle
|
||||
*/
|
||||
public void setClip(Rectangle2D clip) {
|
||||
@ -51,19 +55,13 @@ public final class CohenSutherlandClipping
|
||||
yMax = yMin + clip.getHeight();
|
||||
}
|
||||
|
||||
private static final int INSIDE = 0;
|
||||
private static final int LEFT = 1;
|
||||
private static final int RIGHT = 2;
|
||||
private static final int BOTTOM = 4;
|
||||
private static final int TOP = 8;
|
||||
|
||||
private final int regionCode(double x, double y) {
|
||||
int code = x < xMin
|
||||
? LEFT
|
||||
: x > xMax
|
||||
int code = x < xMin
|
||||
? LEFT
|
||||
: x > xMax
|
||||
? RIGHT
|
||||
: INSIDE;
|
||||
if (y < yMin) code |= BOTTOM;
|
||||
if (y < yMin) code |= BOTTOM;
|
||||
else if (y > yMax) code |= TOP;
|
||||
return code;
|
||||
}
|
||||
@ -71,6 +69,7 @@ public final class CohenSutherlandClipping
|
||||
/**
|
||||
* Clips a given line against the clip rectangle.
|
||||
* The modification (if needed) is done in place.
|
||||
*
|
||||
* @param line the line to clip
|
||||
* @return true if line is clipped, false if line is
|
||||
* totally outside the clip rect.
|
||||
@ -87,9 +86,9 @@ public final class CohenSutherlandClipping
|
||||
|
||||
boolean vertical = p1x == p2x;
|
||||
|
||||
double slope = vertical
|
||||
? 0d
|
||||
: (p2y-p1y)/(p2x-p1x);
|
||||
double slope = vertical
|
||||
? 0d
|
||||
: (p2y - p1y) / (p2x - p1x);
|
||||
|
||||
int c1 = regionCode(p1x, p1y);
|
||||
int c2 = regionCode(p2x, p2y);
|
||||
@ -103,31 +102,27 @@ public final class CohenSutherlandClipping
|
||||
|
||||
if ((c & LEFT) != INSIDE) {
|
||||
qx = xMin;
|
||||
qy = (Utils.feq(qx, p1x) ? 0 : qx-p1x)*slope + p1y;
|
||||
}
|
||||
else if ((c & RIGHT) != INSIDE) {
|
||||
qy = (Utils.feq(qx, p1x) ? 0 : qx - p1x) * slope + p1y;
|
||||
} else if ((c & RIGHT) != INSIDE) {
|
||||
qx = xMax;
|
||||
qy = (Utils.feq(qx, p1x) ? 0 : qx-p1x)*slope + p1y;
|
||||
}
|
||||
else if ((c & BOTTOM) != INSIDE) {
|
||||
qy = (Utils.feq(qx, p1x) ? 0 : qx - p1x) * slope + p1y;
|
||||
} else if ((c & BOTTOM) != INSIDE) {
|
||||
qy = yMin;
|
||||
qx = vertical
|
||||
? p1x
|
||||
: (Utils.feq(qy, p1y) ? 0 : qy-p1y)/slope + p1x;
|
||||
}
|
||||
else if ((c & TOP) != INSIDE) {
|
||||
? p1x
|
||||
: (Utils.feq(qy, p1y) ? 0 : qy - p1y) / slope + p1x;
|
||||
} else if ((c & TOP) != INSIDE) {
|
||||
qy = yMax;
|
||||
qx = vertical
|
||||
? p1x
|
||||
: (Utils.feq(qy, p1y) ? 0 : qy-p1y)/slope + p1x;
|
||||
? p1x
|
||||
: (Utils.feq(qy, p1y) ? 0 : qy - p1y) / slope + p1x;
|
||||
}
|
||||
|
||||
if (c == c1) {
|
||||
p1x = qx;
|
||||
p1y = qy;
|
||||
c1 = regionCode(p1x, p1y);
|
||||
}
|
||||
else {
|
||||
c1 = regionCode(p1x, p1y);
|
||||
} else {
|
||||
p2x = qx;
|
||||
p2y = qy;
|
||||
c2 = regionCode(p2x, p2y);
|
||||
@ -137,4 +132,4 @@ public final class CohenSutherlandClipping
|
||||
return true;
|
||||
}
|
||||
}
|
||||
// end of file
|
||||
// end of file
|
||||
|
||||
@ -10,11 +10,6 @@ import java.util.List;
|
||||
*/
|
||||
public final class QuickSort {
|
||||
|
||||
private QuickSort() {
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static final Comparator<? extends Comparable> OBJCOMP = new Comparator<Comparable>() {
|
||||
@Override
|
||||
public int compare(Comparable object1, Comparable object2) {
|
||||
@ -24,6 +19,10 @@ public final class QuickSort {
|
||||
};
|
||||
|
||||
|
||||
private QuickSort() {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Sorts the given list using the given comparator.
|
||||
*
|
||||
|
||||
@ -1,11 +1,11 @@
|
||||
package com.iqser.red.service.redaction.v1.server.tableextraction.utils;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import java.math.BigDecimal;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@SuppressWarnings("all")
|
||||
public class Utils {
|
||||
|
||||
@ -1,15 +1,5 @@
|
||||
package com.iqser.red.service.redaction.v1.server.visualization.service;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDPageContentStream;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType1Font;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
|
||||
@ -17,9 +7,17 @@ import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDPageContentStream;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType1Font;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@ -34,7 +32,7 @@ public class PdfVisualisationService {
|
||||
PDPage pdPage = document.getPage(page - 1);
|
||||
PDPageContentStream contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true);
|
||||
|
||||
for(Paragraph paragraph : classifiedDoc.getParagraphs()) {
|
||||
for (Paragraph paragraph : classifiedDoc.getParagraphs()) {
|
||||
|
||||
for (int i = 0; i <= paragraph.getPageBlocks().size() - 1; i++) {
|
||||
|
||||
@ -44,10 +42,10 @@ public class PdfVisualisationService {
|
||||
continue;
|
||||
}
|
||||
if (textBlock instanceof TextBlock) {
|
||||
textBlock.setClassification((i+1) + "/" + paragraph.getPageBlocks().size());
|
||||
textBlock.setClassification((i + 1) + "/" + paragraph.getPageBlocks().size());
|
||||
visualizeTextBlock((TextBlock) textBlock, contentStream);
|
||||
} else if (textBlock instanceof Table) {
|
||||
textBlock.setClassification((i+1) + "/" + paragraph.getPageBlocks().size());
|
||||
textBlock.setClassification((i + 1) + "/" + paragraph.getPageBlocks().size());
|
||||
visualizeTable((Table) textBlock, contentStream);
|
||||
}
|
||||
|
||||
@ -59,7 +57,6 @@ public class PdfVisualisationService {
|
||||
}
|
||||
|
||||
|
||||
|
||||
public void visualizeClassifications(Document classifiedDoc, PDDocument document) throws IOException {
|
||||
|
||||
for (int page = 1; page <= document.getNumberOfPages(); page++) {
|
||||
|
||||
@ -1,4 +1,11 @@
|
||||
server:
|
||||
port: 8083
|
||||
|
||||
configuration-service.url: "http://localhost:8081"
|
||||
configuration-service.url: "http://localhost:8081"
|
||||
|
||||
|
||||
storage:
|
||||
bucket-name: 'redaction'
|
||||
endpoint: 'http://localhost:9000'
|
||||
key: minioadmin
|
||||
secret: minioadmin
|
||||
|
||||
@ -17,4 +17,11 @@ management:
|
||||
prometheus.enabled: ${monitoring.enabled:false}
|
||||
health.enabled: true
|
||||
endpoints.web.exposure.include: prometheus, health
|
||||
metrics.export.prometheus.enabled: ${monitoring.enabled:false}
|
||||
metrics.export.prometheus.enabled: ${monitoring.enabled:false}
|
||||
|
||||
|
||||
storage:
|
||||
signer-type: 'AWSS3V4SignerType'
|
||||
bucket-name: 'redaction'
|
||||
region: 'us-east-1'
|
||||
endpoint: 'https://s3.amazonaws.com'
|
||||
|
||||
@ -0,0 +1,34 @@
|
||||
package com.iqser.red.service.redaction.v1.server;
|
||||
|
||||
import com.iqser.red.storage.commons.exception.StorageObjectDoesNotExist;
|
||||
import com.iqser.red.storage.commons.service.StorageService;
|
||||
import org.springframework.core.io.InputStreamResource;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
public class InMemoryStorageService extends StorageService {
|
||||
|
||||
private Map<String, byte[]> dataMap = new HashMap<>();
|
||||
|
||||
public InMemoryStorageService() {
|
||||
super(null, null);
|
||||
}
|
||||
|
||||
@Override
|
||||
public InputStreamResource getObject(String objectId) {
|
||||
|
||||
var res = dataMap.get(objectId);
|
||||
if (res == null) {
|
||||
throw new StorageObjectDoesNotExist(new RuntimeException());
|
||||
}
|
||||
return new InputStreamResource(new ByteArrayInputStream(res));
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void storeObject(String objectId, byte[] data) {
|
||||
dataMap.put(objectId, data);
|
||||
}
|
||||
}
|
||||
@ -1,28 +1,20 @@
|
||||
package com.iqser.red.service.redaction.v1.server;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
import static org.mockito.Mockito.when;
|
||||
import static org.springframework.boot.test.context.SpringBootTest.WebEnvironment.RANDOM_PORT;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.net.URL;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.UUID;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.amazonaws.services.s3.AmazonS3;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.iqser.red.service.configuration.v1.api.model.*;
|
||||
import com.iqser.red.service.file.management.v1.api.model.FileType;
|
||||
import com.iqser.red.service.redaction.v1.model.*;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.SectionText;
|
||||
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
|
||||
import com.iqser.red.service.redaction.v1.server.client.ImageClassificationClient;
|
||||
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
|
||||
import com.iqser.red.service.redaction.v1.server.controller.RedactionController;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
||||
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
|
||||
import com.iqser.red.storage.commons.service.StorageService;
|
||||
import lombok.SneakyThrows;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
@ -37,40 +29,20 @@ import org.springframework.boot.test.context.SpringBootTest;
|
||||
import org.springframework.boot.test.context.TestConfiguration;
|
||||
import org.springframework.boot.test.mock.mockito.MockBean;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.annotation.Primary;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
import org.springframework.test.context.junit4.SpringRunner;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.iqser.red.service.configuration.v1.api.model.Colors;
|
||||
import com.iqser.red.service.configuration.v1.api.model.DictionaryEntry;
|
||||
import com.iqser.red.service.configuration.v1.api.model.DictionaryResponse;
|
||||
import com.iqser.red.service.configuration.v1.api.model.RulesResponse;
|
||||
import com.iqser.red.service.configuration.v1.api.model.TypeResponse;
|
||||
import com.iqser.red.service.configuration.v1.api.model.TypeResult;
|
||||
import com.iqser.red.service.redaction.v1.model.AnalyzeRequest;
|
||||
import com.iqser.red.service.redaction.v1.model.AnalyzeResult;
|
||||
import com.iqser.red.service.redaction.v1.model.AnnotateRequest;
|
||||
import com.iqser.red.service.redaction.v1.model.AnnotateResponse;
|
||||
import com.iqser.red.service.redaction.v1.model.Comment;
|
||||
import com.iqser.red.service.redaction.v1.model.IdRemoval;
|
||||
import com.iqser.red.service.redaction.v1.model.ManualForceRedact;
|
||||
import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry;
|
||||
import com.iqser.red.service.redaction.v1.model.ManualRedactions;
|
||||
import com.iqser.red.service.redaction.v1.model.Point;
|
||||
import com.iqser.red.service.redaction.v1.model.ReanalyzeResult;
|
||||
import com.iqser.red.service.redaction.v1.model.Rectangle;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionRequest;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionResult;
|
||||
import com.iqser.red.service.redaction.v1.model.RenalyzeRequest;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionText;
|
||||
import com.iqser.red.service.redaction.v1.model.Status;
|
||||
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
|
||||
import com.iqser.red.service.redaction.v1.server.client.ImageClassificationClient;
|
||||
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
|
||||
import com.iqser.red.service.redaction.v1.server.controller.RedactionController;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
||||
import java.io.*;
|
||||
import java.net.URL;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
import static org.mockito.Mockito.when;
|
||||
import static org.springframework.boot.test.context.SpringBootTest.WebEnvironment.RANDOM_PORT;
|
||||
|
||||
@RunWith(SpringRunner.class)
|
||||
@SpringBootTest(webEnvironment = RANDOM_PORT)
|
||||
@ -116,6 +88,15 @@ public class RedactionIntegrationTest {
|
||||
@MockBean
|
||||
private ImageClassificationClient imageClassificationClient;
|
||||
|
||||
@Autowired
|
||||
private RedactionStorageService redactionStorageService;
|
||||
|
||||
@Autowired
|
||||
private StorageService storageService;
|
||||
|
||||
@MockBean
|
||||
private AmazonS3 amazonS3;
|
||||
|
||||
private final Map<String, List<String>> dictionary = new HashMap<>();
|
||||
private final Map<String, String> typeColorMap = new HashMap<>();
|
||||
private final Map<String, Boolean> hintTypeMap = new HashMap<>();
|
||||
@ -126,6 +107,8 @@ public class RedactionIntegrationTest {
|
||||
private final Map<String, Long> reanlysisVersions = new HashMap<>();
|
||||
|
||||
private final static String TEST_RULESET_ID = "123";
|
||||
private final static String TEST_PROJECT_ID = "123";
|
||||
private final static String TEST_FILE_ID = "123";
|
||||
|
||||
@TestConfiguration
|
||||
public static class RedactionIntegrationTestConfiguration {
|
||||
@ -146,6 +129,12 @@ public class RedactionIntegrationTest {
|
||||
return kieServices.newKieContainer(kieModule.getReleaseId());
|
||||
}
|
||||
|
||||
@Bean
|
||||
@Primary
|
||||
public StorageService inmemoryStorage() {
|
||||
return new InMemoryStorageService();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -464,15 +453,16 @@ public class RedactionIntegrationTest {
|
||||
input.addAll(getPathsRecursively(file));
|
||||
}
|
||||
for (File path : input) {
|
||||
AnalyzeRequest request = AnalyzeRequest.builder()
|
||||
.ruleSetId(TEST_RULESET_ID)
|
||||
.document(IOUtils.toByteArray(new FileInputStream(path)))
|
||||
.build();
|
||||
|
||||
AnalyzeRequest request = prepareStorage(new FileInputStream((path)));
|
||||
System.out.println("Redacting file : " + path.getName());
|
||||
AnalyzeResult result = redactionController.analyze(request);
|
||||
|
||||
Map<String, List<RedactionLogEntry>> duplicates = new HashMap<>();
|
||||
result.getRedactionLog().getRedactionLogEntry().forEach(entry -> {
|
||||
|
||||
var redactionLog = redactionStorageService.getRedactionLog(TEST_PROJECT_ID, TEST_FILE_ID);
|
||||
|
||||
redactionLog.getRedactionLogEntry().forEach(entry -> {
|
||||
duplicates.computeIfAbsent(entry.getId(), v -> new ArrayList<>()).add(entry);
|
||||
});
|
||||
|
||||
@ -484,13 +474,7 @@ public class RedactionIntegrationTest {
|
||||
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(1L);
|
||||
|
||||
long rstart = System.currentTimeMillis();
|
||||
ReanalyzeResult reanalyzeResult = redactionController.reanalyze(RenalyzeRequest.builder()
|
||||
.redactionLog(result.getRedactionLog())
|
||||
.document(IOUtils.toByteArray(new FileInputStream(path)))
|
||||
.manualRedactions(null)
|
||||
.text(result.getText())
|
||||
.ruleSetId(TEST_RULESET_ID)
|
||||
.build());
|
||||
redactionController.reanalyze(request);
|
||||
|
||||
long rend = System.currentTimeMillis();
|
||||
System.out.println("reanalysis analysis duration: " + (rend - rstart));
|
||||
@ -528,15 +512,14 @@ public class RedactionIntegrationTest {
|
||||
System.out.println("redactionTest");
|
||||
long start = System.currentTimeMillis();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/new/Single Study - Oral (Gavage) Mouse.pdf");
|
||||
|
||||
AnalyzeRequest request = AnalyzeRequest.builder()
|
||||
.ruleSetId(TEST_RULESET_ID)
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
.build();
|
||||
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
|
||||
|
||||
AnalyzeResult result = redactionController.analyze(request);
|
||||
|
||||
result.getRedactionLog().getRedactionLogEntry().forEach(entry -> {
|
||||
var redactionLog = redactionStorageService.getRedactionLog(TEST_PROJECT_ID, TEST_FILE_ID);
|
||||
var text = redactionStorageService.getText(TEST_PROJECT_ID, TEST_FILE_ID);
|
||||
|
||||
redactionLog.getRedactionLogEntry().forEach(entry -> {
|
||||
if (entry.isImage()) {
|
||||
System.out.println("---->" + entry.getType());
|
||||
}
|
||||
@ -547,13 +530,13 @@ public class RedactionIntegrationTest {
|
||||
System.out.println("first analysis duration: " + (end - start));
|
||||
|
||||
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Test.json")) {
|
||||
fileOutputStream.write(objectMapper.writeValueAsBytes(result.getText()));
|
||||
fileOutputStream.write(objectMapper.writeValueAsBytes(redactionStorageService.getText(TEST_PROJECT_ID, TEST_FILE_ID)));
|
||||
}
|
||||
|
||||
int correctFound = 0;
|
||||
loop:
|
||||
for (RedactionLogEntry redactionLogEntry : result.getRedactionLog().getRedactionLogEntry()) {
|
||||
for (SectionText sectionText : result.getText().getSectionTexts()) {
|
||||
for (RedactionLogEntry redactionLogEntry : redactionLog.getRedactionLogEntry()) {
|
||||
for (SectionText sectionText : text.getSectionTexts()) {
|
||||
if (redactionLogEntry.isImage()) {
|
||||
correctFound++;
|
||||
continue loop;
|
||||
@ -569,7 +552,7 @@ public class RedactionIntegrationTest {
|
||||
}
|
||||
}
|
||||
}
|
||||
assertThat(correctFound).isEqualTo(result.getRedactionLog().getRedactionLogEntry().size());
|
||||
assertThat(correctFound).isEqualTo(redactionLog.getRedactionLogEntry().size());
|
||||
|
||||
dictionary.get(AUTHOR).add("properties");
|
||||
reanlysisVersions.put("properties", 1L);
|
||||
@ -585,20 +568,14 @@ public class RedactionIntegrationTest {
|
||||
when(dictionaryClient.getDictionaryForType(VERTEBRATE, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(VERTEBRATE));
|
||||
|
||||
start = System.currentTimeMillis();
|
||||
ReanalyzeResult reanalyzeResult = redactionController.reanalyze(RenalyzeRequest.builder()
|
||||
.redactionLog(result.getRedactionLog())
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
.text(result.getText())
|
||||
.ruleSetId(TEST_RULESET_ID)
|
||||
.build());
|
||||
AnalyzeResult reanalyzeResult = redactionController.reanalyze(request);
|
||||
|
||||
end = System.currentTimeMillis();
|
||||
System.out.println("reanalysis analysis duration: " + (end - start));
|
||||
|
||||
AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder()
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
.redactionLog(reanalyzeResult.getRedactionLog())
|
||||
.sectionGrid(result.getSectionGrid())
|
||||
.projectId(TEST_PROJECT_ID)
|
||||
.fileId(TEST_FILE_ID)
|
||||
.build());
|
||||
|
||||
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Annotated.pdf")) {
|
||||
@ -613,19 +590,13 @@ public class RedactionIntegrationTest {
|
||||
|
||||
System.out.println("testTableRedaction");
|
||||
long start = System.currentTimeMillis();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");
|
||||
|
||||
AnalyzeRequest request = AnalyzeRequest.builder()
|
||||
.ruleSetId(TEST_RULESET_ID)
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
.build();
|
||||
|
||||
AnalyzeRequest request = prepareStorage("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");
|
||||
AnalyzeResult result = redactionController.analyze(request);
|
||||
|
||||
AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder()
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
.redactionLog(result.getRedactionLog())
|
||||
.sectionGrid(result.getSectionGrid())
|
||||
.projectId(TEST_PROJECT_ID)
|
||||
.fileId(TEST_FILE_ID)
|
||||
.build());
|
||||
|
||||
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Annotated.pdf")) {
|
||||
@ -680,12 +651,9 @@ public class RedactionIntegrationTest {
|
||||
|
||||
// manualRedactions.getEntriesToAdd().add(manualRedactionEntry);
|
||||
|
||||
AnalyzeRequest request = AnalyzeRequest.builder()
|
||||
.ruleSetId(TEST_RULESET_ID)
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
.manualRedactions(manualRedactions)
|
||||
.build();
|
||||
|
||||
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
|
||||
request.setManualRedactions(manualRedactions);
|
||||
AnalyzeResult result = redactionController.analyze(request);
|
||||
|
||||
manualRedactions.getEntriesToAdd().add(manualRedactionEntry);
|
||||
@ -694,20 +662,15 @@ public class RedactionIntegrationTest {
|
||||
.status(Status.APPROVED)
|
||||
.build()));
|
||||
|
||||
ReanalyzeResult reanalyzeResult = redactionController.reanalyze(RenalyzeRequest.builder()
|
||||
.redactionLog(result.getRedactionLog())
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
.manualRedactions(manualRedactions)
|
||||
.text(result.getText())
|
||||
.ruleSetId(TEST_RULESET_ID)
|
||||
.build());
|
||||
redactionController.reanalyze(request);
|
||||
|
||||
|
||||
AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder()
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
.redactionLog(reanalyzeResult.getRedactionLog())
|
||||
.sectionGrid(result.getSectionGrid())
|
||||
.projectId(TEST_PROJECT_ID)
|
||||
.fileId(TEST_FILE_ID)
|
||||
.build());
|
||||
|
||||
|
||||
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Annotated.pdf")) {
|
||||
fileOutputStream.write(annotateResponse.getDocument());
|
||||
}
|
||||
@ -724,11 +687,16 @@ public class RedactionIntegrationTest {
|
||||
System.out.println("classificationTest");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Trinexapac/93 Trinexapac-ethyl_RAR_03_Volume_3CA_B-1_2017-03-31.pdf");
|
||||
|
||||
RedactionRequest request = RedactionRequest.builder()
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
|
||||
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
|
||||
|
||||
RedactionRequest redactionRequest = RedactionRequest.builder()
|
||||
.projectId(request.getProjectId())
|
||||
.fileId(request.getFileId())
|
||||
.ruleSetId(request.getRuleSetId())
|
||||
.build();
|
||||
|
||||
RedactionResult result = redactionController.classify(request);
|
||||
RedactionResult result = redactionController.classify(redactionRequest);
|
||||
|
||||
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Classified.pdf")) {
|
||||
fileOutputStream.write(result.getDocument());
|
||||
@ -742,11 +710,15 @@ public class RedactionIntegrationTest {
|
||||
System.out.println("sectionsTest");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Fludioxonil/51 " + "Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf");
|
||||
|
||||
RedactionRequest request = RedactionRequest.builder()
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
|
||||
|
||||
RedactionRequest redactionRequest = RedactionRequest.builder()
|
||||
.projectId(request.getProjectId())
|
||||
.fileId(request.getFileId())
|
||||
.ruleSetId(request.getRuleSetId())
|
||||
.build();
|
||||
|
||||
RedactionResult result = redactionController.sections(request);
|
||||
RedactionResult result = redactionController.sections(redactionRequest);
|
||||
|
||||
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Sections.pdf")) {
|
||||
fileOutputStream.write(result.getDocument());
|
||||
@ -760,11 +732,15 @@ public class RedactionIntegrationTest {
|
||||
System.out.println("htmlTablesTest");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");
|
||||
|
||||
RedactionRequest request = RedactionRequest.builder()
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
|
||||
|
||||
RedactionRequest redactionRequest = RedactionRequest.builder()
|
||||
.projectId(request.getProjectId())
|
||||
.fileId(request.getFileId())
|
||||
.ruleSetId(request.getRuleSetId())
|
||||
.build();
|
||||
|
||||
RedactionResult result = redactionController.htmlTables(request);
|
||||
RedactionResult result = redactionController.htmlTables(redactionRequest);
|
||||
|
||||
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Tables.html")) {
|
||||
fileOutputStream.write(result.getDocument());
|
||||
@ -778,11 +754,15 @@ public class RedactionIntegrationTest {
|
||||
System.out.println("htmlTableRotationTest");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");
|
||||
|
||||
RedactionRequest request = RedactionRequest.builder()
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
|
||||
|
||||
RedactionRequest redactionRequest = RedactionRequest.builder()
|
||||
.projectId(request.getProjectId())
|
||||
.fileId(request.getFileId())
|
||||
.ruleSetId(request.getRuleSetId())
|
||||
.build();
|
||||
|
||||
RedactionResult result = redactionController.htmlTables(request);
|
||||
RedactionResult result = redactionController.htmlTables(redactionRequest);
|
||||
|
||||
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Tables.html")) {
|
||||
fileOutputStream.write(result.getDocument());
|
||||
@ -795,20 +775,45 @@ public class RedactionIntegrationTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Phantom Cells.pdf");
|
||||
|
||||
AnalyzeRequest request = AnalyzeRequest.builder()
|
||||
.ruleSetId(TEST_RULESET_ID)
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
.build();
|
||||
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
|
||||
|
||||
AnalyzeResult result = redactionController.analyze(request);
|
||||
|
||||
result.getRedactionLog().getRedactionLogEntry().forEach(entry -> {
|
||||
var redactionLog = redactionStorageService.getRedactionLog(TEST_PROJECT_ID, TEST_FILE_ID);
|
||||
|
||||
redactionLog.getRedactionLogEntry().forEach(entry -> {
|
||||
if (!entry.isHint()) {
|
||||
assertThat(entry.getReason()).isEqualTo("Not redacted because row is not a vertebrate study");
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private AnalyzeRequest prepareStorage(String file) {
|
||||
ClassPathResource pdfFileResource = new ClassPathResource(file);
|
||||
|
||||
return prepareStorage(pdfFileResource.getInputStream());
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private AnalyzeRequest prepareStorage(InputStream stream) {
|
||||
|
||||
AnalyzeRequest request = AnalyzeRequest.builder()
|
||||
.ruleSetId(TEST_RULESET_ID)
|
||||
.projectId(TEST_PROJECT_ID)
|
||||
.fileId(TEST_FILE_ID)
|
||||
.lastProcessed(OffsetDateTime.now())
|
||||
.build();
|
||||
|
||||
var bytes = IOUtils.toByteArray(stream);
|
||||
|
||||
storageService.storeObject(RedactionStorageService.StorageIdUtils.getStorageId(TEST_PROJECT_ID, TEST_FILE_ID, FileType.ORIGIN), bytes);
|
||||
|
||||
return request;
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void sponsorCompanyTest() throws IOException {
|
||||
@ -816,17 +821,14 @@ public class RedactionIntegrationTest {
|
||||
long start = System.currentTimeMillis();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/sponsor_companies.pdf");
|
||||
|
||||
AnalyzeRequest request = AnalyzeRequest.builder()
|
||||
.ruleSetId(TEST_RULESET_ID)
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
.build();
|
||||
|
||||
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
|
||||
|
||||
AnalyzeResult result = redactionController.analyze(request);
|
||||
|
||||
AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder()
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
.redactionLog(result.getRedactionLog())
|
||||
.sectionGrid(result.getSectionGrid())
|
||||
.projectId(TEST_PROJECT_ID)
|
||||
.fileId(TEST_FILE_ID)
|
||||
.build());
|
||||
|
||||
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Annotated.pdf")) {
|
||||
@ -857,4 +859,4 @@ public class RedactionIntegrationTest {
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,12 +1,8 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.service;
|
||||
|
||||
import com.iqser.red.service.configuration.v1.api.model.Colors;
|
||||
import com.iqser.red.service.configuration.v1.api.model.DictionaryEntry;
|
||||
import com.iqser.red.service.configuration.v1.api.model.DictionaryResponse;
|
||||
import com.iqser.red.service.configuration.v1.api.model.RulesResponse;
|
||||
import com.iqser.red.service.configuration.v1.api.model.TypeResponse;
|
||||
import com.iqser.red.service.configuration.v1.api.model.TypeResult;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionRequest;
|
||||
import com.amazonaws.services.s3.AmazonS3;
|
||||
import com.iqser.red.service.configuration.v1.api.model.*;
|
||||
import com.iqser.red.service.redaction.v1.server.InMemoryStorageService;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
|
||||
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
|
||||
@ -14,7 +10,7 @@ import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader;
|
||||
import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import com.iqser.red.storage.commons.service.StorageService;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.junit.Before;
|
||||
import org.junit.Ignore;
|
||||
@ -30,6 +26,7 @@ import org.springframework.boot.test.context.SpringBootTest;
|
||||
import org.springframework.boot.test.context.TestConfiguration;
|
||||
import org.springframework.boot.test.mock.mockito.MockBean;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.annotation.Primary;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
import org.springframework.test.context.junit4.SpringRunner;
|
||||
|
||||
@ -40,15 +37,8 @@ import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.net.URL;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
import static org.mockito.Mockito.when;
|
||||
@ -80,6 +70,9 @@ public class EntityRedactionServiceTest {
|
||||
@Autowired
|
||||
private DroolsExecutionService droolsExecutionService;
|
||||
|
||||
@MockBean
|
||||
private AmazonS3 amazonS3;
|
||||
|
||||
private final static String TEST_RULESET_ID = "123";
|
||||
|
||||
@TestConfiguration
|
||||
@ -101,6 +94,13 @@ public class EntityRedactionServiceTest {
|
||||
return kieServices.newKieContainer(kieModule.getReleaseId());
|
||||
}
|
||||
|
||||
|
||||
@Bean
|
||||
@Primary
|
||||
public StorageService inmemoryStorage() {
|
||||
return new InMemoryStorageService();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -125,10 +125,6 @@ public class EntityRedactionServiceTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Single Table.pdf");
|
||||
|
||||
RedactionRequest redactionRequest = RedactionRequest.builder()
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
.build();
|
||||
|
||||
DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
|
||||
.entries(toDictionaryEntry(Arrays.asList("Casey, H.W.", "O’Loughlin, C.K.", "Salamon, C.M.", "Smith, S.H.")))
|
||||
.build();
|
||||
@ -144,7 +140,7 @@ public class EntityRedactionServiceTest {
|
||||
.build();
|
||||
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(redactionRequest.getDocument()))) {
|
||||
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
|
||||
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
|
||||
@ -158,10 +154,6 @@ public class EntityRedactionServiceTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/nested_redaction.pdf");
|
||||
|
||||
RedactionRequest redactionRequest = RedactionRequest.builder()
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
.build();
|
||||
|
||||
DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
|
||||
.entries(toDictionaryEntry(Arrays.asList("Casey, H.W.", "O’Loughlin, C.K.", "Salamon, C.M.", "Smith, S.H.")))
|
||||
.build();
|
||||
@ -176,7 +168,7 @@ public class EntityRedactionServiceTest {
|
||||
.build();
|
||||
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(redactionRequest.getDocument()))) {
|
||||
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
|
||||
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
|
||||
@ -526,4 +518,4 @@ public class EntityRedactionServiceTest {
|
||||
return dictionaryEntries;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,16 +1,14 @@
|
||||
package com.iqser.red.service.redaction.v1.server.segmentation;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
|
||||
import com.amazonaws.services.s3.AmazonS3;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
@ -22,15 +20,15 @@ import org.springframework.boot.test.mock.mockito.MockBean;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
import org.springframework.test.context.junit4.SpringRunner;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService;
|
||||
import javax.imageio.ImageIO;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
@SpringBootTest
|
||||
@RunWith(SpringRunner.class)
|
||||
@ -51,6 +49,8 @@ public class PdfSegmentationServiceTest {
|
||||
@MockBean
|
||||
private KieContainer kieContainer;
|
||||
|
||||
@MockBean
|
||||
private AmazonS3 amazonS3;
|
||||
|
||||
@Test
|
||||
@Ignore
|
||||
@ -76,6 +76,29 @@ public class PdfSegmentationServiceTest {
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testPDFSegmentationWithComplexTable() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Spanning Cells.pdf");
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
|
||||
Document document = pdfSegmentationService.parseDocument(pdDocument);
|
||||
assertThat(document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
Table table = document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())
|
||||
.get(0);
|
||||
assertThat(table.getColCount()).isEqualTo(6);
|
||||
assertThat(table.getRowCount()).isEqualTo(13);
|
||||
assertThat(table.getRows().stream().mapToInt(List::size).sum()).isEqualTo(6 * 13);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testTableExtraction() throws IOException {
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user