Reworked analysis and re-analysis to use the memory model and to read/store files directly; dropped the PDDocument (pd doc) wherever possible

This commit is contained in:
Timo 2021-04-16 14:39:43 +03:00
parent ed59f36220
commit 5cb4ea287c
86 changed files with 1473 additions and 1407 deletions

View File

@ -5,7 +5,7 @@
<parent>
<artifactId>platform-dependency</artifactId>
<groupId>com.iqser.red</groupId>
<version>1.0.8</version>
<version>1.1.2</version>
</parent>
<modelVersion>4.0.0</modelVersion>
@ -32,7 +32,7 @@
<dependency>
<groupId>com.iqser.red</groupId>
<artifactId>platform-commons-dependency</artifactId>
<version>1.2.5</version>
<version>1.2.9</version>
<scope>import</scope>
<type>pom</type>
</dependency>
@ -52,4 +52,4 @@
</dependencyManagement>
</project>
</project>

View File

@ -5,13 +5,19 @@ import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.time.OffsetDateTime;
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class AnalyzeRequest {
private byte[] document;
private String projectId;
private String fileId;
private String ruleSetId;
private ManualRedactions manualRedactions;
private OffsetDateTime lastProcessed;
}

View File

@ -12,8 +12,11 @@ import lombok.NoArgsConstructor;
public class AnalyzeResult {
private int numberOfPages;
private RedactionLog redactionLog;
private SectionGrid sectionGrid;
private Text text;
private boolean hasHints;
private boolean hasRequests;
private boolean hasRedactions;
private boolean hasImages;
private boolean hasUpdates;
}

View File

@ -11,7 +11,6 @@ import lombok.NoArgsConstructor;
@AllArgsConstructor
public class AnnotateRequest {
private byte[] document;
private RedactionLog redactionLog;
private SectionGrid sectionGrid;
private String projectId;
private String fileId;
}

View File

@ -1,12 +1,12 @@
package com.iqser.red.service.redaction.v1.model;
import java.time.OffsetDateTime;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.time.OffsetDateTime;
@Data
@Builder
@AllArgsConstructor

View File

@ -1,13 +1,13 @@
package com.iqser.red.service.redaction.v1.model;
import java.util.ArrayList;
import java.util.List;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.util.ArrayList;
import java.util.List;
@Data
@Builder
@AllArgsConstructor

View File

@ -1,16 +1,16 @@
package com.iqser.red.service.redaction.v1.model;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
@Data
@Builder
@AllArgsConstructor

View File

@ -1,13 +1,13 @@
package com.iqser.red.service.redaction.v1.model;
import java.util.ArrayList;
import java.util.List;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.util.ArrayList;
import java.util.List;
@Data
@Builder
@NoArgsConstructor

View File

@ -11,7 +11,8 @@ import lombok.NoArgsConstructor;
@AllArgsConstructor
public class RedactionRequest {
private byte[] document;
private String projectId;
private String fileId;
private String ruleSetId;
private ManualRedactions manualRedactions;
}

View File

@ -13,7 +13,5 @@ public class RedactionResult {
private byte[] document;
private int numberOfPages;
private RedactionLog redactionLog;
private SectionGrid sectionGrid;
}

View File

@ -1,22 +0,0 @@
package com.iqser.red.service.redaction.v1.model;
import java.time.OffsetDateTime;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Request body of the {@code /reanalyze} endpoint (see {@code RedactionResource#reanalyze}).
 *
 * <p>Plain Lombok data holder: {@code @Data} generates accessors, {@code equals}/{@code hashCode}
 * and {@code toString}; {@code @Builder}, {@code @NoArgsConstructor} and {@code @AllArgsConstructor}
 * generate the construction paths. The all-args constructor takes the fields in declaration order,
 * so the field order below is part of the class's interface.
 *
 * <p>NOTE(review): the class name is spelled "Renalyze" (not "Reanalyze"); callers reference it
 * under that spelling.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class RenalyzeRequest {
// Raw document bytes — presumably the PDF to re-analyze; TODO confirm against callers.
private byte[] document;
// Identifier of the rule set to apply.
private String ruleSetId;
// Manual redactions supplied with the request.
private ManualRedactions manualRedactions;
// Text carried with the request — presumably extracted by an earlier analysis; verify against caller.
private Text text;
// Redaction log carried with the request — presumably the result of an earlier analysis; verify against caller.
private RedactionLog redactionLog;
// Timestamp of the last processing run — presumably used to detect changes since then; TODO confirm.
private OffsetDateTime lastProcessed;
}

View File

@ -27,7 +27,7 @@ public class SectionArea {
private String header;
public boolean contains(Rectangle other) {
return page == other.getPage() && this.topLeft.getX() <= other.getTopLeft().getX() && this.topLeft.getX() + this.getWidth() >= other.getTopLeft().getX() + other.getWidth() && this.getTopLeft().getY() <= other.getTopLeft().getY() && this.getTopLeft().getY() + this.getHeight() >= other.getTopLeft().getY() + other.getHeight();
return page == other.getPage() && this.topLeft.getX() <= other.getTopLeft().getX() && this.topLeft.getX() + this.getWidth() >= other.getTopLeft().getX() + other.getWidth() && this.getTopLeft().getY() <= other.getTopLeft().getY() && this.getTopLeft().getY() + this.getHeight() >= other.getTopLeft().getY() + other.getHeight();
}
}

View File

@ -1,13 +1,13 @@
package com.iqser.red.service.redaction.v1.model;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@Data
@AllArgsConstructor
@NoArgsConstructor

View File

@ -1,13 +1,13 @@
package com.iqser.red.service.redaction.v1.model;
import java.util.List;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.NonNull;
import lombok.RequiredArgsConstructor;
import java.util.List;
@Data
@AllArgsConstructor
@NoArgsConstructor

View File

@ -1,25 +0,0 @@
package com.iqser.red.service.redaction.v1.model;
import java.util.ArrayList;
import java.util.List;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class SectionText {
private int sectionNumber;
private String text;
private boolean isTable;
private String headline;
private List<SectionArea> sectionAreas = new ArrayList<>();
}

View File

@ -1,14 +1,6 @@
package com.iqser.red.service.redaction.v1.resources;
import com.iqser.red.service.redaction.v1.model.AnalyzeRequest;
import com.iqser.red.service.redaction.v1.model.AnalyzeResult;
import com.iqser.red.service.redaction.v1.model.AnnotateRequest;
import com.iqser.red.service.redaction.v1.model.AnnotateResponse;
import com.iqser.red.service.redaction.v1.model.ReanalyzeResult;
import com.iqser.red.service.redaction.v1.model.RedactionRequest;
import com.iqser.red.service.redaction.v1.model.RedactionResult;
import com.iqser.red.service.redaction.v1.model.RenalyzeRequest;
import com.iqser.red.service.redaction.v1.model.*;
import org.springframework.http.MediaType;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.PostMapping;
@ -25,7 +17,7 @@ public interface RedactionResource {
AnalyzeResult analyze(@RequestBody AnalyzeRequest analyzeRequest);
@PostMapping(value = "/reanalyze", produces = MediaType.APPLICATION_JSON_VALUE, consumes = MediaType.APPLICATION_JSON_VALUE)
ReanalyzeResult reanalyze(@RequestBody RenalyzeRequest renalyzeRequest);
AnalyzeResult reanalyze(@RequestBody AnalyzeRequest renalyzeRequest);
@PostMapping(value = "/annotate", produces = MediaType.APPLICATION_JSON_VALUE, consumes = MediaType.APPLICATION_JSON_VALUE)
AnnotateResponse annotate(@RequestBody AnnotateRequest annotateRequest);
@ -39,10 +31,10 @@ public interface RedactionResource {
@PostMapping(value = "/debug/htmlTables", produces = MediaType.APPLICATION_JSON_VALUE, consumes = MediaType.APPLICATION_JSON_VALUE)
RedactionResult htmlTables(@RequestBody RedactionRequest redactionRequest);
@PostMapping(value = "/rules/update"+RULE_SET_PATH_VARIABLE, consumes = MediaType.APPLICATION_JSON_VALUE)
@PostMapping(value = "/rules/update" + RULE_SET_PATH_VARIABLE, consumes = MediaType.APPLICATION_JSON_VALUE)
void updateRules(@PathVariable(RULE_SET_PARAMETER_NAME) String ruleSetId);
@PostMapping(value = "/rules/test", consumes = MediaType.APPLICATION_JSON_VALUE)
void testRules(@RequestBody String rules);
}
}

View File

@ -12,6 +12,10 @@
<artifactId>redaction-service-server-v1</artifactId>
<dependencies>
<dependency>
<groupId>com.iqser.red.commons</groupId>
<artifactId>storage-commons</artifactId>
</dependency>
<dependency>
<groupId>com.iqser.red.service</groupId>
<artifactId>redaction-service-api-v1</artifactId>
@ -20,7 +24,12 @@
<dependency>
<groupId>com.iqser.red.service</groupId>
<artifactId>configuration-service-api-v1</artifactId>
<version>2.2.9</version>
<version>2.5.0</version>
</dependency>
<dependency>
<groupId>com.iqser.red.service</groupId>
<artifactId>file-management-service-api-v1</artifactId>
<version>2.6.7</version>
</dependency>
<dependency>
<groupId>org.drools</groupId>

View File

@ -1,5 +1,8 @@
package com.iqser.red.service.redaction.v1.server;
import com.iqser.red.commons.spring.DefaultWebMvcConfiguration;
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.actuate.autoconfigure.security.servlet.ManagementWebSecurityAutoConfiguration;
import org.springframework.boot.autoconfigure.SpringBootApplication;
@ -8,10 +11,6 @@ import org.springframework.boot.context.properties.EnableConfigurationProperties
import org.springframework.cloud.openfeign.EnableFeignClients;
import org.springframework.context.annotation.Import;
import com.iqser.red.commons.spring.DefaultWebMvcConfiguration;
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
@Import({DefaultWebMvcConfiguration.class})
@EnableFeignClients(basePackageClasses = RulesClient.class)
@EnableConfigurationProperties(RedactionServiceSettings.class)
@ -23,4 +22,4 @@ public class Application {
}
}
}

View File

@ -1,20 +1,18 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
import com.iqser.red.service.redaction.v1.model.SectionGrid;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
import com.iqser.red.service.redaction.v1.model.SectionGrid;
import com.iqser.red.service.redaction.v1.model.SectionText;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
import lombok.Data;
import lombok.NoArgsConstructor;
@Data
@NoArgsConstructor
public class Document {

View File

@ -1,5 +1,7 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import lombok.Getter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
@ -7,38 +9,35 @@ import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import lombok.Getter;
public class FloatFrequencyCounter
{
public class FloatFrequencyCounter {
@Getter
Map<Float, Integer> countPerValue = new HashMap<>();
public void add(float value){
if(!countPerValue.containsKey(value)){
public void add(float value) {
if (!countPerValue.containsKey(value)) {
countPerValue.put(value, 1);
} else {
countPerValue.put(value, countPerValue.get(value) + 1);
}
}
public void addAll(Map<Float, Integer> otherCounter){
for(Map.Entry<Float, Integer> entry: otherCounter.entrySet()){
if(countPerValue.containsKey(entry.getKey())){
countPerValue.put(entry.getKey(), countPerValue.get(entry.getKey())+ entry.getValue());
public void addAll(Map<Float, Integer> otherCounter) {
for (Map.Entry<Float, Integer> entry : otherCounter.entrySet()) {
if (countPerValue.containsKey(entry.getKey())) {
countPerValue.put(entry.getKey(), countPerValue.get(entry.getKey()) + entry.getValue());
} else {
countPerValue.put(entry.getKey(), entry.getValue());
}
}
}
public Float getMostPopular(){
public Float getMostPopular() {
Map.Entry<Float, Integer> mostPopular = null;
for(Map.Entry<Float, Integer> entry: countPerValue.entrySet()){
if(mostPopular == null){
for (Map.Entry<Float, Integer> entry : countPerValue.entrySet()) {
if (mostPopular == null) {
mostPopular = entry;
} else if(entry.getValue() >= mostPopular.getValue()){
} else if (entry.getValue() >= mostPopular.getValue()) {
mostPopular = entry;
}
}
@ -46,12 +45,11 @@ public class FloatFrequencyCounter
}
public List<Float> getHighterThanMostPopular(){
public List<Float> getHighterThanMostPopular() {
Float mostPopular = getMostPopular();
List<Float> higher = new ArrayList<>();
for(Float value: countPerValue.keySet()){
if(value > mostPopular){
for (Float value : countPerValue.keySet()) {
if (value > mostPopular) {
higher.add(value);
}
}
@ -60,12 +58,12 @@ public class FloatFrequencyCounter
}
public Float getHighest(){
public Float getHighest() {
Float highest = null;
for(Float value: countPerValue.keySet()){
if (highest == null){
for (Float value : countPerValue.keySet()) {
if (highest == null) {
highest = value;
} else if(value > highest){
} else if (value > highest) {
highest = value;
}
}

View File

@ -1,12 +1,11 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import java.util.List;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import lombok.AllArgsConstructor;
import lombok.Data;
import java.util.List;
@Data
@AllArgsConstructor
public class Footer {
@ -21,4 +20,4 @@ public class Footer {
return searchableText;
}
}
}

View File

@ -1,12 +1,11 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import java.util.List;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import lombok.AllArgsConstructor;
import lombok.Data;
import java.util.List;
@Data
@AllArgsConstructor
public class Header {
@ -21,4 +20,4 @@ public class Header {
return searchableText;
}
}
}

View File

@ -1,15 +1,14 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import java.util.List;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle;
import lombok.Data;
import lombok.NonNull;
import lombok.RequiredArgsConstructor;
import java.util.List;
@Data
@RequiredArgsConstructor
public class Page {
@ -37,4 +36,4 @@ public class Page {
return rotation != 0;
}
}
}

View File

@ -1,19 +1,18 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import java.util.ArrayList;
import java.util.List;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.util.ArrayList;
import java.util.List;
@Data
@NoArgsConstructor
public class Paragraph implements Comparable{
public class Paragraph implements Comparable {
private List<AbstractTextContainer> pageBlocks = new ArrayList<>();
private List<PdfImage> images = new ArrayList<>();
@ -62,4 +61,4 @@ public class Paragraph implements Comparable{
return 0;
}
}
}

View File

@ -0,0 +1,45 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import com.iqser.red.service.redaction.v1.model.SectionArea;
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.util.*;
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class SectionText {
private int sectionNumber;
private String text;
private boolean isTable;
private String headline;
private List<SectionArea> sectionAreas = new ArrayList<>();
private Set<Image> images = new HashSet<>();
private List<TextBlock> textBlocks = new ArrayList<>();
private Map<String, CellValue> tabularData = new HashMap<>();
private List<Integer> cellStarts = new ArrayList<>();
public SearchableText getSearchableText() {
SearchableText searchableText = new SearchableText();
textBlocks.forEach(block -> {
if (block != null) {
searchableText.addAll(block.getSequences());
}
});
return searchableText;
}
}

View File

@ -1,10 +1,10 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import lombok.Getter;
import java.util.HashMap;
import java.util.Map;
import lombok.Getter;
public class StringFrequencyCounter {
@Getter
@ -46,4 +46,4 @@ public class StringFrequencyCounter {
return mostPopular != null ? mostPopular.getKey() : null;
}
}
}

View File

@ -1,17 +1,18 @@
package com.iqser.red.service.redaction.v1.model;
import java.util.ArrayList;
import java.util.List;
package com.iqser.red.service.redaction.v1.server.classification.model;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.util.ArrayList;
import java.util.List;
/**
 * Container for the per-section texts of a document together with its page count.
 *
 * <p>Lombok data holder: {@code @Data} generates accessors, {@code equals}/{@code hashCode} and
 * {@code toString}. The all-args constructor takes {@code (numberOfPages, sectionTexts)} in that
 * order, so the field order below is part of the class's interface.
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
public class Text {
// Number of pages of the document this text belongs to.
private int numberOfPages;
// Section texts; starts as an empty list when the no-args constructor is used.
private List<SectionText> sectionTexts = new ArrayList<>();
}

View File

@ -1,16 +1,15 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import java.util.ArrayList;
import java.util.List;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import java.util.ArrayList;
import java.util.List;
@AllArgsConstructor
@Builder
@Data
@ -98,7 +97,6 @@ public class TextBlock extends AbstractTextContainer {
}
@Override
public String toString() {
@ -139,4 +137,4 @@ public class TextBlock extends AbstractTextContainer {
}
}
}

View File

@ -1,12 +1,11 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import java.util.List;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import lombok.AllArgsConstructor;
import lombok.Data;
import java.util.List;
@Data
@AllArgsConstructor
public class UnclassifiedText {
@ -21,4 +20,4 @@ public class UnclassifiedText {
return searchableText;
}
}
}

View File

@ -1,21 +1,20 @@
package com.iqser.red.service.redaction.v1.server.classification.service;
import java.util.ArrayList;
import java.util.List;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.classification.utils.PositionUtils;
import com.iqser.red.service.redaction.v1.server.classification.model.FloatFrequencyCounter;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.model.StringFrequencyCounter;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.classification.utils.PositionUtils;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import org.springframework.stereotype.Service;
import java.util.ArrayList;
import java.util.List;
@Service
@SuppressWarnings("all")

View File

@ -1,19 +1,17 @@
package com.iqser.red.service.redaction.v1.server.classification.service;
import java.util.List;
import java.util.regex.Pattern;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.classification.utils.PositionUtils;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
import java.util.List;
import java.util.regex.Pattern;
@Slf4j
@Service

View File

@ -2,7 +2,6 @@ package com.iqser.red.service.redaction.v1.server.classification.utils;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle;
import lombok.experimental.UtilityClass;
@UtilityClass

View File

@ -1,16 +1,16 @@
package com.iqser.red.service.redaction.v1.server.client;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import org.springframework.lang.NonNull;
import org.springframework.lang.Nullable;
import org.springframework.util.Assert;
import org.springframework.util.FileCopyUtils;
import org.springframework.web.multipart.MultipartFile;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
public class MockMultipartFile implements MultipartFile {
private final String name;
@ -22,13 +22,13 @@ public class MockMultipartFile implements MultipartFile {
public MockMultipartFile(String name, @Nullable byte[] content) {
this(name, "", (String) null, (byte[]) content);
this(name, "", null, content);
}
public MockMultipartFile(String name, InputStream contentStream) throws IOException {
this(name, "", (String) null, (byte[]) FileCopyUtils.copyToByteArray(contentStream));
this(name, "", null, FileCopyUtils.copyToByteArray(contentStream));
}
@ -78,7 +78,7 @@ public class MockMultipartFile implements MultipartFile {
public long getSize() {
return (long) this.content.length;
return this.content.length;
}

View File

@ -1,17 +1,15 @@
package com.iqser.red.service.redaction.v1.server.controller;
import java.time.OffsetDateTime;
import com.iqser.red.commons.spring.ErrorMessage;
import com.iqser.red.service.redaction.v1.server.exception.RulesValidationException;
import lombok.extern.slf4j.Slf4j;
import org.springframework.http.HttpStatus;
import org.springframework.web.bind.annotation.ExceptionHandler;
import org.springframework.web.bind.annotation.ResponseBody;
import org.springframework.web.bind.annotation.ResponseStatus;
import org.springframework.web.bind.annotation.RestControllerAdvice;
import com.iqser.red.service.redaction.v1.server.exception.RulesValidationException;
import lombok.extern.slf4j.Slf4j;
import java.time.OffsetDateTime;
@Slf4j
@RestControllerAdvice
@ -38,4 +36,4 @@ public class ControllerAdvice {
return new ErrorMessage(OffsetDateTime.now(), e.getMessage());
}
}
}

View File

@ -1,17 +1,7 @@
package com.iqser.red.service.redaction.v1.server.controller;
import com.iqser.red.service.redaction.v1.model.AnalyzeRequest;
import com.iqser.red.service.redaction.v1.model.AnalyzeResult;
import com.iqser.red.service.redaction.v1.model.AnnotateRequest;
import com.iqser.red.service.redaction.v1.model.AnnotateResponse;
import com.iqser.red.service.redaction.v1.model.ReanalyzeResult;
import com.iqser.red.service.redaction.v1.model.RedactionLog;
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
import com.iqser.red.service.redaction.v1.model.RedactionRequest;
import com.iqser.red.service.redaction.v1.model.RedactionResult;
import com.iqser.red.service.redaction.v1.model.RenalyzeRequest;
import com.iqser.red.service.redaction.v1.model.SectionGrid;
import com.iqser.red.service.redaction.v1.model.Text;
import com.iqser.red.service.file.management.v1.api.model.FileType;
import com.iqser.red.service.redaction.v1.model.*;
import com.iqser.red.service.redaction.v1.resources.RedactionResource;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
@ -19,27 +9,21 @@ import com.iqser.red.service.redaction.v1.server.exception.RedactionException;
import com.iqser.red.service.redaction.v1.server.redaction.service.AnnotationService;
import com.iqser.red.service.redaction.v1.server.redaction.service.DictionaryService;
import com.iqser.red.service.redaction.v1.server.redaction.service.DroolsExecutionService;
import com.iqser.red.service.redaction.v1.server.redaction.service.EntityRedactionService;
import com.iqser.red.service.redaction.v1.server.redaction.service.ImageClassificationService;
import com.iqser.red.service.redaction.v1.server.redaction.service.ReanalyzeService;
import com.iqser.red.service.redaction.v1.server.redaction.service.RedactionLogCreatorService;
import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import com.iqser.red.service.redaction.v1.server.visualization.service.PdfVisualisationService;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RestController;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.List;
@Slf4j
@RestController
@ -47,61 +31,36 @@ import java.util.List;
public class RedactionController implements RedactionResource {
private final PdfVisualisationService pdfVisualisationService;
private final PdfSegmentationService pdfSegmentationService;
private final RedactionLogCreatorService redactionLogCreatorService;
private final EntityRedactionService entityRedactionService;
private final DroolsExecutionService droolsExecutionService;
private final DictionaryService dictionaryService;
private final AnnotationService annotationService;
private final ReanalyzeService reanalyzeService;
private final ImageClassificationService imageClassificationService;
private final PdfSegmentationService pdfSegmentationService;
private final RedactionStorageService redactionStorageService;
@Override
public AnalyzeResult analyze(@RequestBody AnalyzeRequest analyzeRequest) {
try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(analyzeRequest.getDocument()))) {
pdDocument.setAllSecurityToBeRemoved(true);
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
log.info("Document structure analysis successful, starting redaction analysis...");
imageClassificationService.classifyImages(classifiedDoc);
entityRedactionService.processDocument(classifiedDoc, analyzeRequest.getRuleSetId(), analyzeRequest.getManualRedactions());
redactionLogCreatorService.createRedactionLog(classifiedDoc, pdDocument.getNumberOfPages(), analyzeRequest.getManualRedactions(), analyzeRequest
.getRuleSetId());
log.info("Redaction analysis successful...");
return AnalyzeResult.builder()
.sectionGrid(classifiedDoc.getSectionGrid())
.redactionLog(new RedactionLog(classifiedDoc.getRedactionLogEntities(), classifiedDoc.getDictionaryVersion(), classifiedDoc
.getRulesVersion(), analyzeRequest.getRuleSetId()))
.numberOfPages(classifiedDoc.getPages().size())
.text(new Text(classifiedDoc.getSectionText()))
.build();
} catch (Exception e) {
throw new RedactionException(e);
}
return reanalyzeService.analyze(analyzeRequest);
}
public ReanalyzeResult reanalyze(@RequestBody RenalyzeRequest renalyzeRequest) {
return reanalyzeService.reanalyze(renalyzeRequest);
@Override
public AnalyzeResult reanalyze(@RequestBody AnalyzeRequest analyzeRequest) {
return reanalyzeService.reanalyze(analyzeRequest);
}
public AnnotateResponse annotate(@RequestBody AnnotateRequest annotateRequest) {
try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(annotateRequest.getDocument()))) {
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(annotateRequest.getProjectId(), annotateRequest.getFileId(), FileType.ORIGIN));
var redactionLog = redactionStorageService.getRedactionLog(annotateRequest.getProjectId(), annotateRequest.getFileId());
var sectionsGrid = redactionStorageService.getSectionGrid(annotateRequest.getProjectId(), annotateRequest.getFileId());
try (PDDocument pdDocument = PDDocument.load(storedObjectStream)) {
pdDocument.setAllSecurityToBeRemoved(true);
dictionaryService.updateDictionary(annotateRequest.getRedactionLog().getRuleSetId());
annotationService.annotate(pdDocument, annotateRequest.getRedactionLog(), annotateRequest.getSectionGrid());
dictionaryService.updateDictionary(redactionLog.getRuleSetId());
annotationService.annotate(pdDocument, redactionLog, sectionsGrid);
try (ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream()) {
pdDocument.save(byteArrayOutputStream);
@ -115,15 +74,16 @@ public class RedactionController implements RedactionResource {
@Override
public RedactionResult classify(@RequestBody RedactionRequest pdfSegmentationRequest) {
public RedactionResult classify(@RequestBody RedactionRequest redactionRequest) {
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN));
try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(pdfSegmentationRequest.getDocument()))) {
try (PDDocument pdDocument = PDDocument.load(storedObjectStream)) {
pdDocument.setAllSecurityToBeRemoved(true);
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
pdfVisualisationService.visualizeClassifications(classifiedDoc, pdDocument);
return convert(pdDocument, classifiedDoc.getPages().size(), pdfSegmentationRequest.getRuleSetId());
return convert(pdDocument, classifiedDoc.getPages().size());
} catch (IOException e) {
throw new RedactionException(e);
@ -134,14 +94,15 @@ public class RedactionController implements RedactionResource {
@Override
public RedactionResult sections(@RequestBody RedactionRequest redactionRequest) {
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN));
try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(redactionRequest.getDocument()))) {
try (PDDocument pdDocument = PDDocument.load(storedObjectStream)) {
pdDocument.setAllSecurityToBeRemoved(true);
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
pdfVisualisationService.visualizeParagraphs(classifiedDoc, pdDocument);
return convert(pdDocument, classifiedDoc.getPages().size(), redactionRequest.getRuleSetId());
return convert(pdDocument, classifiedDoc.getPages().size());
} catch (IOException e) {
throw new RedactionException(e);
@ -153,27 +114,29 @@ public class RedactionController implements RedactionResource {
@Override
public RedactionResult htmlTables(@RequestBody RedactionRequest redactionRequest) {
try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(redactionRequest.getDocument()))) {
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN));
Document classifiedDoc;
try (PDDocument pdDocument = PDDocument.load(storedObjectStream)) {
pdDocument.setAllSecurityToBeRemoved(true);
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
StringBuilder sb = new StringBuilder();
for (Page page : classifiedDoc.getPages()) {
for (AbstractTextContainer textContainer : page.getTextBlocks()) {
if (textContainer instanceof Table) {
Table table = (Table) textContainer;
sb.append(table.getTextAsHtml()).append("<br />").append("<br />");
}
}
}
return RedactionResult.builder().document(sb.toString().getBytes()).build();
} catch (IOException e) {
classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
} catch (Exception e) {
throw new RedactionException(e);
}
StringBuilder sb = new StringBuilder();
for (Page page : classifiedDoc.getPages()) {
for (AbstractTextContainer textContainer : page.getTextBlocks()) {
if (textContainer instanceof Table) {
Table table = (Table) textContainer;
sb.append(table.getTextAsHtml()).append("<br />").append("<br />");
}
}
}
return RedactionResult.builder().document(sb.toString().getBytes()).build();
}
@ -191,23 +154,13 @@ public class RedactionController implements RedactionResource {
}
private RedactionResult convert(PDDocument document, int numberOfPages, String ruleSetId) throws IOException {
return convert(document, numberOfPages, null, null, 0, 0, ruleSetId);
}
private RedactionResult convert(PDDocument document, int numberOfPages,
List<RedactionLogEntry> redactionLogEntities, SectionGrid sectionGrid,
long dictionaryVersion, long rulesVersion, String ruleSetId) throws IOException {
private RedactionResult convert(PDDocument document, int numberOfPages) throws IOException {
try (ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream()) {
document.save(byteArrayOutputStream);
return RedactionResult.builder()
.document(byteArrayOutputStream.toByteArray())
.numberOfPages(numberOfPages)
.redactionLog(new RedactionLog(redactionLogEntities, dictionaryVersion, rulesVersion, ruleSetId))
.sectionGrid(sectionGrid)
.build();
}

View File

@ -1,17 +1,15 @@
package com.iqser.red.service.redaction.v1.server.parsing;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import lombok.Getter;
import lombok.Setter;
import org.apache.pdfbox.text.PDFTextStripperByArea;
import org.apache.pdfbox.text.TextPosition;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.pdfbox.text.PDFTextStripperByArea;
import org.apache.pdfbox.text.TextPosition;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import lombok.Getter;
import lombok.Setter;
public class PDFAreaTextStripper extends PDFTextStripperByArea {
@Getter
@ -76,7 +74,7 @@ public class PDFAreaTextStripper extends PDFTextStripperByArea {
}
public void clearPositions(){
public void clearPositions() {
textPositionSequences = new ArrayList<>();
}

View File

@ -1,33 +1,15 @@
package com.iqser.red.service.redaction.v1.server.parsing;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
import lombok.Getter;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.contentstream.operator.OperatorName;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColor;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorN;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorSpace;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceCMYKColor;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceGrayColor;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceRGBColor;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingColor;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorN;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorSpace;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceCMYKColor;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceGrayColor;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceRGBColor;
import org.apache.pdfbox.contentstream.operator.state.SetFlatness;
import org.apache.pdfbox.contentstream.operator.state.SetLineCapStyle;
import org.apache.pdfbox.contentstream.operator.state.SetLineDashPattern;
import org.apache.pdfbox.contentstream.operator.state.SetLineJoinStyle;
import org.apache.pdfbox.contentstream.operator.state.SetLineMiterLimit;
import org.apache.pdfbox.contentstream.operator.state.SetLineWidth;
import org.apache.pdfbox.contentstream.operator.state.SetRenderingIntent;
import org.apache.pdfbox.contentstream.operator.color.*;
import org.apache.pdfbox.contentstream.operator.state.*;
import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
@ -40,40 +22,31 @@ import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.util.Matrix;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
import lombok.Getter;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
@Slf4j
public class PDFLinesTextStripper extends PDFTextStripper {
@Setter
protected PDPage pdpage;
@Getter
private int minCharWidth;
@Getter
private int maxCharWidth;
@Getter
private int minCharHeight;
@Getter
private int maxCharHeight;
@Getter
private final List<TextPositionSequence> textPositionSequences = new ArrayList<>();
@Getter
private final List<Ruling> rulings = new ArrayList<>();
private final List<Ruling> graphicsPath = new ArrayList<>();
@Setter
protected PDPage pdpage;
@Getter
private int minCharWidth;
@Getter
private int maxCharWidth;
@Getter
private int minCharHeight;
@Getter
private int maxCharHeight;
@Getter
private List<PdfImage> images = new ArrayList<>();
@ -369,4 +342,4 @@ public class PDFLinesTextStripper extends PDFTextStripper {
}
}
}

View File

@ -1,23 +1,20 @@
package com.iqser.red.service.redaction.v1.server.parsing.model;
import java.util.ArrayList;
import java.util.List;
import org.apache.pdfbox.text.TextPosition;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import lombok.Data;
import lombok.RequiredArgsConstructor;
import org.apache.pdfbox.text.TextPosition;
import java.util.ArrayList;
import java.util.List;
@Data
@RequiredArgsConstructor
public class TextPositionSequence implements CharSequence {
private List<TextPosition> textPositions = new ArrayList<>();
private final int page;
private List<TextPosition> textPositions = new ArrayList<>();
public TextPositionSequence(List<TextPosition> textPositions, int page) {
@ -223,4 +220,4 @@ public class TextPositionSequence implements CharSequence {
return new Rectangle(new Point(posXInit, posYInit), posXEnd - posXInit, posYEnd - posYInit + height, page);
}
}
}

View File

@ -1,14 +1,13 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import java.util.Iterator;
import java.util.List;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
import lombok.Value;
import java.util.Iterator;
import java.util.List;
@Value
public class CellValue {
@ -47,4 +46,4 @@ public class CellValue {
.replaceAll(" {2}", " ");
}
}
}

View File

@ -1,13 +1,13 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import lombok.Data;
import lombok.Getter;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import lombok.Data;
import lombok.Getter;
@Data
public class Dictionary {
@ -21,15 +21,15 @@ public class Dictionary {
private long version;
public Dictionary(List<DictionaryModel> dictionaryModels, long dictionaryVersion){
public Dictionary(List<DictionaryModel> dictionaryModels, long dictionaryVersion) {
this.dictionaryModels = dictionaryModels;
this.dictionaryModels.forEach(dm -> localAccessMap.put(dm.getType(), dm));
this.version = dictionaryVersion;
}
public int getDictionaryRank(String type){
if(!localAccessMap.containsKey(type)){
public int getDictionaryRank(String type) {
if (!localAccessMap.containsKey(type)) {
return 0;
}
return localAccessMap.get(type).getRank();
@ -60,7 +60,7 @@ public class Dictionary {
public boolean containsValue(String type, String value) {
if (localAccessMap.containsKey(type) && localAccessMap.get(type)
return localAccessMap.containsKey(type) && localAccessMap.get(type)
.getEntries()
.contains(value) || localAccessMap.containsKey(type) && localAccessMap.get(type)
.getLocalEntries()
@ -68,10 +68,7 @@ public class Dictionary {
.getEntries()
.contains(value) || localAccessMap.containsKey(RECOMMENDATION_PREFIX + type) && localAccessMap.get(RECOMMENDATION_PREFIX + type)
.getLocalEntries()
.contains(value)) {
return true;
}
return false;
.contains(value);
}

View File

@ -1,10 +1,10 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import java.util.Set;
import lombok.AllArgsConstructor;
import lombok.Data;
import java.util.Set;
@Data
@AllArgsConstructor
public class DictionaryIncrement {

View File

@ -1,15 +1,14 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import com.iqser.red.service.configuration.v1.api.model.DictionaryEntry;
import lombok.AllArgsConstructor;
import lombok.Data;
import java.io.Serializable;
import java.util.Set;
import java.util.stream.Collectors;
import com.iqser.red.service.configuration.v1.api.model.DictionaryEntry;
import lombok.AllArgsConstructor;
import lombok.Data;
@Data
@AllArgsConstructor
public class DictionaryModel implements Serializable {
@ -23,8 +22,8 @@ public class DictionaryModel implements Serializable {
private Set<DictionaryEntry> entries;
private Set<String> localEntries;
public Set<String> getValues(boolean local){
return local ? localEntries : entries.stream().filter(e -> !e.isDeleted()).map(e-> e.getValue()).collect(Collectors
public Set<String> getValues(boolean local) {
return local ? localEntries : entries.stream().filter(e -> !e.isDeleted()).map(e -> e.getValue()).collect(Collectors
.toSet());
}

View File

@ -20,5 +20,4 @@ public class DictionaryRepresentation {
private Map<String, DictionaryModel> localAccessMap = new HashMap<>();
}

View File

@ -1,13 +1,12 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import lombok.Data;
import lombok.EqualsAndHashCode;
import java.util.ArrayList;
import java.util.List;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import lombok.Data;
import lombok.EqualsAndHashCode;
@Data
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class Entity {

View File

@ -1,24 +1,23 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import java.util.ArrayList;
import java.util.List;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.RequiredArgsConstructor;
import java.util.ArrayList;
import java.util.List;
@Data
@RequiredArgsConstructor
@AllArgsConstructor
@EqualsAndHashCode
public class EntityPositionSequence {
private final String id;
@EqualsAndHashCode.Exclude
private List<TextPositionSequence> sequences = new ArrayList<>();
private int pageNumber;
private final String id;
}

View File

@ -1,12 +1,12 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import java.awt.geom.Rectangle2D;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.awt.geom.Rectangle2D;
@Data
@Builder
@NoArgsConstructor

View File

@ -1,14 +1,14 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.NonNull;
import lombok.RequiredArgsConstructor;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
@Data
@NoArgsConstructor
@AllArgsConstructor
@ -25,4 +25,4 @@ public class PdfImage {
@NonNull
private int page;
}
}

View File

@ -1,37 +0,0 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Mutable holder for the data of one section that is processed again during re-analysis:
 * its number and headline, its text blocks, tabular cell values, cell start offsets and images.
 * Accessors are generated by Lombok.
 */
@Data
@NoArgsConstructor
public class ReanalysisSection {

    private int sectionNumber;
    private String headline;
    private List<TextBlock> textBlocks;
    private Map<String, CellValue> tabularData = new HashMap<>();
    private List<Integer> cellStarts;
    private Set<Image> images = new HashSet<>();

    /**
     * Collects the text position sequences of all text blocks of this section.
     *
     * @return a new {@link SearchableText} containing the sequences of every non-null text block,
     *         in list order
     */
    public SearchableText getSearchableText() {

        SearchableText collected = new SearchableText();
        for (TextBlock textBlock : textBlocks) {
            // Null elements are skipped; every other element contributes its sequences.
            if (textBlock == null) {
                continue;
            }
            collected.addAll(textBlock.getSequences());
        }
        return collected;
    }

}

View File

@ -1,14 +1,14 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.regex.Pattern;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
public class SearchableText {
private final List<TextPositionSequence> sequences = new ArrayList<>();
@ -232,4 +232,4 @@ public class SearchableText {
return sb.append("\n").toString();
}
}
}

View File

@ -1,6 +1,12 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import static com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary.RECOMMENDATION_PREFIX;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
import com.iqser.red.service.redaction.v1.server.redaction.utils.Patterns;
import lombok.Builder;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import java.util.Collection;
import java.util.HashMap;
@ -11,15 +17,7 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
import com.iqser.red.service.redaction.v1.server.redaction.utils.Patterns;
import lombok.Builder;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;
import static com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary.RECOMMENDATION_PREFIX;
@Data
@Slf4j

View File

@ -0,0 +1,41 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import com.iqser.red.service.file.management.v1.api.model.RedactionChangeLog;
import com.iqser.red.service.redaction.v1.model.AnalyzeResult;
import com.iqser.red.service.redaction.v1.model.RedactionLog;
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
import org.springframework.stereotype.Service;
@Service
public class AnalyzeResponseService {
public AnalyzeResult createAnalyzeResponse(int pageCount, RedactionLog redactionLog, RedactionChangeLog redactionChangeLog) {
boolean hasHints = redactionLog.getRedactionLogEntry().stream().anyMatch(RedactionLogEntry::isHint);
boolean hasRequests = redactionLog.getRedactionLogEntry()
.stream()
.anyMatch(entry -> entry.isManual() && entry.getStatus()
.equals(com.iqser.red.service.redaction.v1.model.Status.REQUESTED));
boolean hasRedactions = redactionLog.getRedactionLogEntry()
.stream()
.anyMatch(entry -> entry.isRedacted() && !entry.isManual() || entry.isManual() && entry.getStatus()
.equals(com.iqser.red.service.redaction.v1.model.Status.APPROVED));
boolean hasImages = redactionLog.getRedactionLogEntry()
.stream()
.anyMatch(entry -> entry.isHint() && entry.getType().equals("image"));
boolean hasUpdates = redactionChangeLog != null && redactionChangeLog.getRedactionLogEntry() != null && !redactionChangeLog
.getRedactionLogEntry()
.isEmpty() && redactionChangeLog.getRedactionLogEntry().stream().anyMatch(entry -> !entry.getType().equals("false_positive"));
return AnalyzeResult.builder()
.numberOfPages(pageCount)
.hasHints(hasHints)
.hasRedactions(hasRedactions)
.hasRequests(hasRequests)
.hasImages(hasImages)
.hasUpdates(hasUpdates).build();
}
}

View File

@ -1,14 +1,7 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import java.awt.Color;
import java.io.IOException;
import java.util.ArrayList;
import java.util.GregorianCalendar;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import com.iqser.red.service.redaction.v1.model.*;
import lombok.RequiredArgsConstructor;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
@ -21,15 +14,14 @@ import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationText;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationTextMarkup;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.model.CellRectangle;
import com.iqser.red.service.redaction.v1.model.Comment;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.iqser.red.service.redaction.v1.model.RedactionLog;
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
import com.iqser.red.service.redaction.v1.model.SectionGrid;
import com.iqser.red.service.redaction.v1.model.SectionRectangle;
import lombok.RequiredArgsConstructor;
import java.awt.Color;
import java.io.IOException;
import java.util.ArrayList;
import java.util.GregorianCalendar;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
@Service
@RequiredArgsConstructor

View File

@ -1,20 +1,5 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import java.awt.Color;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.SerializationUtils;
import org.springframework.stereotype.Service;
import com.iqser.red.service.configuration.v1.api.model.Colors;
import com.iqser.red.service.configuration.v1.api.model.DictionaryEntry;
import com.iqser.red.service.configuration.v1.api.model.TypeResponse;
@ -25,10 +10,16 @@ import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncre
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncrementValue;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryModel;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryRepresentation;
import feign.FeignException;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.SerializationUtils;
import org.springframework.stereotype.Service;
import java.awt.Color;
import java.util.*;
import java.util.stream.Collectors;
@Slf4j
@Service
@ -37,7 +28,7 @@ public class DictionaryService {
private final DictionaryClient dictionaryClient;
private Map<String, DictionaryRepresentation> dictionariesByRuleSets = new HashMap<>();
private final Map<String, DictionaryRepresentation> dictionariesByRuleSets = new HashMap<>();
public long updateDictionary(String ruleSetId) {
@ -212,4 +203,4 @@ public class DictionaryService {
return dictionariesByRuleSets.get(ruleSetId).getRequestAddColor();
}
}
}

View File

@ -1,11 +1,10 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import com.iqser.red.service.configuration.v1.api.model.RulesResponse;
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
import com.iqser.red.service.redaction.v1.server.exception.RulesValidationException;
import com.iqser.red.service.redaction.v1.server.redaction.model.Section;
import lombok.RequiredArgsConstructor;
import org.apache.commons.lang3.StringUtils;
import org.kie.api.KieServices;
import org.kie.api.builder.KieBuilder;
@ -15,12 +14,11 @@ import org.kie.api.runtime.KieContainer;
import org.kie.api.runtime.KieSession;
import org.springframework.stereotype.Service;
import com.iqser.red.service.configuration.v1.api.model.RulesResponse;
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
import com.iqser.red.service.redaction.v1.server.exception.RulesValidationException;
import com.iqser.red.service.redaction.v1.server.redaction.model.Section;
import lombok.RequiredArgsConstructor;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
@Service
@RequiredArgsConstructor
@ -28,9 +26,9 @@ public class DroolsExecutionService {
private final RulesClient rulesClient;
private Map<String, KieContainer> kieContainers = new HashMap<>();
private final Map<String, KieContainer> kieContainers = new HashMap<>();
private Map<String, Long> rulesVersionPerRuleSetId = new HashMap<>();
private final Map<String, Long> rulesVersionPerRuleSetId = new HashMap<>();
public KieContainer getKieContainer(String ruleSetId) {
@ -133,4 +131,4 @@ public class DroolsExecutionService {
return rulesVersion.longValue();
}
}
}

View File

@ -1,50 +1,27 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.kie.api.runtime.KieContainer;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry;
import com.iqser.red.service.redaction.v1.model.ManualRedactions;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.iqser.red.service.redaction.v1.model.SectionArea;
import com.iqser.red.service.redaction.v1.model.SectionText;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Footer;
import com.iqser.red.service.redaction.v1.server.classification.model.Header;
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.classification.model.UnclassifiedText;
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
import com.iqser.red.service.redaction.v1.server.classification.model.*;
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryModel;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
import com.iqser.red.service.redaction.v1.server.redaction.model.ImageType;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import com.iqser.red.service.redaction.v1.server.redaction.model.Section;
import com.iqser.red.service.redaction.v1.server.redaction.model.SectionSearchableTextPair;
import com.iqser.red.service.redaction.v1.server.redaction.model.*;
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.kie.api.runtime.KieContainer;
import org.springframework.stereotype.Service;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import java.util.stream.Stream;
@Slf4j
@Service

View File

@ -1,21 +1,18 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import javax.imageio.ImageIO;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.client.ImageClassificationClient;
import com.iqser.red.service.redaction.v1.server.client.ImageClassificationResponse;
import com.iqser.red.service.redaction.v1.server.client.MockMultipartFile;
import com.iqser.red.service.redaction.v1.server.redaction.model.ImageType;
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
import javax.imageio.ImageIO;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
@Slf4j
@Service

View File

@ -1,53 +1,29 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import java.awt.geom.Rectangle2D;
import java.io.ByteArrayInputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.iqser.red.service.file.management.v1.api.model.FileType;
import com.iqser.red.service.redaction.v1.model.*;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.SectionText;
import com.iqser.red.service.redaction.v1.server.classification.model.Text;
import com.iqser.red.service.redaction.v1.server.exception.RedactionException;
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
import com.iqser.red.service.redaction.v1.server.redaction.model.*;
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.kie.api.runtime.KieContainer;
import org.springframework.stereotype.Service;
import org.springframework.web.bind.annotation.RequestBody;
import com.iqser.red.service.redaction.v1.model.Comment;
import com.iqser.red.service.redaction.v1.model.IdRemoval;
import com.iqser.red.service.redaction.v1.model.ManualForceRedact;
import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry;
import com.iqser.red.service.redaction.v1.model.ManualRedactions;
import com.iqser.red.service.redaction.v1.model.ReanalyzeResult;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
import com.iqser.red.service.redaction.v1.model.RenalyzeRequest;
import com.iqser.red.service.redaction.v1.model.SectionArea;
import com.iqser.red.service.redaction.v1.model.SectionText;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.exception.RedactionException;
import com.iqser.red.service.redaction.v1.server.parsing.PDFAreaTextStripper;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncrement;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
import com.iqser.red.service.redaction.v1.server.redaction.model.ReanalysisSection;
import com.iqser.red.service.redaction.v1.server.redaction.model.Section;
import com.iqser.red.service.redaction.v1.server.redaction.model.SectionSearchableTextPair;
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import lombok.RequiredArgsConstructor;
import java.awt.geom.Rectangle2D;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
@Slf4j
@Service
@RequiredArgsConstructor
public class ReanalyzeService {
@ -57,13 +33,63 @@ public class ReanalyzeService {
private final SurroundingWordsService surroundingWordsService;
private final EntityRedactionService entityRedactionService;
private final RedactionLogCreatorService redactionLogCreatorService;
private final RedactionStorageService redactionStorageService;
private final PdfSegmentationService pdfSegmentationService;
private final ImageClassificationService imageClassificationService;
private final RedactionChangeLogService redactionChangeLogService;
private final AnalyzeResponseService analyzeResponseService;
/**
 * Runs a complete analysis of the stored ORIGIN file of a project file.
 * <p>
 * The document is loaded from storage, segmented by {@code pdfSegmentationService}, passed through image
 * classification and entity redaction, and turned into a {@link RedactionLog}. The redaction log, the section
 * text and the section grid are written back to storage before the response is built.
 *
 * @param analyzeRequest identifies the file (project id, file id) and carries the rule set id and the manual
 *                       redactions used for this run
 * @return the response created by {@code analyzeResponseService} from the page count, the new redaction log and
 * the change log
 * @throws RedactionException if loading or parsing the document fails
 */
public AnalyzeResult analyze(AnalyzeRequest analyzeRequest) {
// NOTE(review): this stream is never closed explicitly here - confirm whether PDDocument.load takes ownership.
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.ORIGIN));
var pageCount = 0;
Document classifiedDoc;
// The PDDocument only lives for the duration of the parsing step; everything afterwards works on classifiedDoc.
try (PDDocument pdDocument = PDDocument.load(storedObjectStream)) {
pdDocument.setAllSecurityToBeRemoved(true);
pageCount = pdDocument.getNumberOfPages();
classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
} catch (Exception e) {
throw new RedactionException(e);
}
log.info("Document structure analysis successful, starting redaction analysis...");
// NOTE(review): classifyImages is invoked both before and after processDocument - confirm that the second call is intended.
imageClassificationService.classifyImages(classifiedDoc);
entityRedactionService.processDocument(classifiedDoc, analyzeRequest.getRuleSetId(), analyzeRequest.getManualRedactions());
imageClassificationService.classifyImages(classifiedDoc);
redactionLogCreatorService.createRedactionLog(classifiedDoc, pageCount, analyzeRequest.getManualRedactions(), analyzeRequest
.getRuleSetId());
log.info("Redaction analysis successful...");
// NOTE(review): the method declaration below and the dictionaryIncrement statement further down reference
// renalyzeRequest, which is not in scope in this method - they look like removed-side diff residue; confirm
// against the real file.
public ReanalyzeResult reanalyze(@RequestBody RenalyzeRequest renalyzeRequest) {
var redactionLog = new RedactionLog(classifiedDoc.getRedactionLogEntities(), classifiedDoc.getDictionaryVersion(), classifiedDoc
.getRulesVersion(), analyzeRequest.getRuleSetId());
DictionaryIncrement dictionaryIncrement = dictionaryService.getDictionaryIncrements(renalyzeRequest.getRuleSetId(), renalyzeRequest
.getRedactionLog()
.getDictionaryVersion());
// Create the change log first, before the new redaction log is stored. A change log is only produced here when
// a redaction log from the old analysis process is already present (migration case), and we do not want to
// lose that change log data.
var changeLog = redactionChangeLogService.createAndStoreChangeLog(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), redactionLog);
// Store the redaction log, the extracted text and the section grid for later re-analysis.
redactionStorageService.storeObject(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog);
redactionStorageService.storeObject(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.TEXT, new Text(pageCount, classifiedDoc.getSectionText()));
redactionStorageService.storeObject(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.SECTION_GRID, classifiedDoc.getSectionGrid());
return analyzeResponseService.createAnalyzeResponse(pageCount, redactionLog, changeLog);
}
public AnalyzeResult reanalyze(@RequestBody AnalyzeRequest renalyzeRequest) {
var text = redactionStorageService.getText(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId());
// new procedure was not applied, we need a complete analysis
if (text.getNumberOfPages() == 0) {
return analyze(AnalyzeRequest.builder()
.ruleSetId(renalyzeRequest.getRuleSetId())
.manualRedactions(renalyzeRequest.getManualRedactions())
.projectId(renalyzeRequest.getProjectId())
.fileId(renalyzeRequest.getFileId())
.build());
}
var redactionLog = redactionStorageService.getRedactionLog(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId());
DictionaryIncrement dictionaryIncrement = dictionaryService.getDictionaryIncrements(renalyzeRequest.getRuleSetId(), redactionLog.getDictionaryVersion());
Set<String> manualForceAndRemoveIds = getForceAndRemoveIds(renalyzeRequest.getManualRedactions());
Map<String, List<Comment>> comments = null;
@ -75,21 +101,21 @@ public class ReanalyzeService {
manualAdds = renalyzeRequest.getManualRedactions().getEntriesToAdd();
}
Set<Integer> sectionsToReanaylse = new HashSet<>();
Set<Integer> sectionsToReanalyse = new HashSet<>();
Map<Integer, Set<Image>> imageEntries = new HashMap<>();
for (RedactionLogEntry entry : renalyzeRequest.getRedactionLog().getRedactionLogEntry()) {
for (RedactionLogEntry entry : redactionLog.getRedactionLogEntry()) {
if (entry.isManual() || manualForceAndRemoveIds.contains(entry.getId())) {
sectionsToReanaylse.add(entry.getSectionNumber());
sectionsToReanalyse.add(entry.getSectionNumber());
}
if (entry.isImage() || entry.getType().equals("image")) {
imageEntries.computeIfAbsent(entry.getSectionNumber(), x -> new HashSet<>()).add(convert(entry));
}
}
for (SectionText sectionText : renalyzeRequest.getText().getSectionTexts()) {
for (SectionText sectionText : text.getSectionTexts()) {
if (EntitySearchUtils.sectionContainsAny(sectionText.getText(), dictionaryIncrement.getValues())) {
sectionsToReanaylse.add(sectionText.getSectionNumber());
sectionsToReanalyse.add(sectionText.getSectionNumber());
}
if (manualAdds != null) {
@ -106,97 +132,30 @@ public class ReanalyzeService {
}
}
if (sectionsToReanaylse.isEmpty() && (manualAdds == null || manualAdds.isEmpty())) {
renalyzeRequest.getRedactionLog().setDictionaryVersion(dictionaryIncrement.getDictionaryVersion());
return ReanalyzeResult.builder().redactionLog(renalyzeRequest.getRedactionLog()).build();
if (sectionsToReanalyse.isEmpty() && (manualAdds == null || manualAdds.isEmpty())) {
redactionLog.setDictionaryVersion(dictionaryIncrement.getDictionaryVersion());
var changeLog = redactionChangeLogService.createAndStoreChangeLog(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), redactionLog);
redactionStorageService.storeObject(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog);
return analyzeResponseService.createAnalyzeResponse(text.getNumberOfPages(), redactionLog, changeLog);
}
try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(renalyzeRequest.getDocument()))) {
try {
List<ReanalysisSection> reanalysisSections = new ArrayList<>();
for (SectionText sectionText : renalyzeRequest.getText().getSectionTexts()) {
List<SectionText> reanalysisSections = new ArrayList<>();
for (SectionText sectionText : text.getSectionTexts()) {
if (!sectionsToReanaylse.contains(sectionText.getSectionNumber())) {
continue;
if (sectionsToReanalyse.contains(sectionText.getSectionNumber())) {
reanalysisSections.add(sectionText);
}
ReanalysisSection reanalysisSection = new ReanalysisSection();
reanalysisSection.setHeadline(sectionText.getHeadline());
reanalysisSection.setSectionNumber(sectionText.getSectionNumber());
List<TextBlock> textBlocks = new ArrayList<>();
Map<Integer, List<SectionArea>> sectionAreasPerPage = new HashMap<>();
for (SectionArea sectionArea : sectionText.getSectionAreas()) {
sectionAreasPerPage.computeIfAbsent(sectionArea.getPage(), (x) -> new ArrayList<>())
.add(sectionArea);
}
Map<String, CellValue> tabularData = new HashMap<>();
List<Integer> cellStarts = new ArrayList<>();
for (Integer page : sectionAreasPerPage.keySet()) {
List<SectionArea> areasOnPage = sectionAreasPerPage.get(page);
PDPage pdPage = pdDocument.getPage(page - 1);
PDRectangle cropBox = pdPage.getCropBox();
PDFAreaTextStripper textStripper = new PDFAreaTextStripper();
textStripper.setPageNumber(page);
int cellStart = 0;
for (SectionArea sectionArea : areasOnPage) {
Rectangle2D rect = null;
if (pdPage.getRotation() == 90) {
rect = new Rectangle2D.Float(sectionArea.getTopLeft().getY(), sectionArea.getTopLeft()
.getX(), sectionArea.getHeight(), sectionArea.getWidth() + 0.001f);
} else {
rect = new Rectangle2D.Float(sectionArea.getTopLeft().getX(), -sectionArea.getTopLeft()
.getY() + cropBox.getUpperRightY() - sectionArea.getHeight(), sectionArea.getWidth(), sectionArea
.getHeight() + 0.001f);
}
textStripper.addRegion(String.valueOf(1), rect);
textStripper.extractRegions(pdPage);
textStripper.getTextForRegion(String.valueOf(1));
List<TextPositionSequence> positions = textStripper.getTextPositionSequences();
TextBlock textBlock = new TextBlock(sectionArea.getTopLeft().getX(), sectionArea.getTopLeft()
.getX() + sectionArea.getWidth(), sectionArea.getTopLeft()
.getY(), sectionArea.getTopLeft().getY() + sectionArea.getHeight(), positions, 0);
if (sectionText.isTable()) {
Cell cell = new Cell();
cell.addTextBlock(textBlock);
tabularData.put(sectionArea.getHeader(), new CellValue(cell.getTextBlocks(), cellStart));
cellStarts.add(cellStart);
cellStart = cellStart + cell.toString().trim().length() + 1;
}
textBlocks.add(textBlock);
textStripper.clearPositions();
}
}
reanalysisSection.setTextBlocks(textBlocks);
reanalysisSection.setTabularData(tabularData);
if (sectionText.isTable()) {
reanalysisSection.setCellStarts(cellStarts);
}
if (imageEntries.containsKey(sectionText.getSectionNumber())) {
reanalysisSection.getImages().addAll(imageEntries.get(sectionText.getSectionNumber()));
}
reanalysisSections.add(reanalysisSection);
}
//--
KieContainer kieContainer = droolsExecutionService.updateRules(renalyzeRequest.getRuleSetId());
Dictionary dictionary = dictionaryService.getDeepCopyDictionary(renalyzeRequest.getRuleSetId());
List<SectionSearchableTextPair> sectionSearchableTextPairs = new ArrayList<>();
for (ReanalysisSection reanalysisSection : reanalysisSections) {
for (SectionText reanalysisSection : reanalysisSections) {
Set<Entity> entities = entityRedactionService.findEntities(reanalysisSection.getSearchableText(), reanalysisSection
.getHeadline(), reanalysisSection.getSectionNumber(), dictionary, false);
@ -254,7 +213,7 @@ public class ReanalyzeService {
}
List<RedactionLogEntry> newRedactionLogEntries = new ArrayList<>();
for (int page = 1; page <= pdDocument.getNumberOfPages(); page++) {
for (int page = 1; page <= text.getNumberOfPages(); page++) {
if (entitiesPerPage.get(page) != null) {
newRedactionLogEntries.addAll(redactionLogCreatorService.addEntries(entitiesPerPage, renalyzeRequest
.getManualRedactions(), page, renalyzeRequest.getRuleSetId()));
@ -269,19 +228,14 @@ public class ReanalyzeService {
.getRuleSetId()));
}
Iterator<RedactionLogEntry> itty = renalyzeRequest.getRedactionLog().getRedactionLogEntry().iterator();
while (itty.hasNext()) {
RedactionLogEntry entry = itty.next();
if (sectionsToReanaylse.contains(entry.getSectionNumber())) {
itty.remove();
}
}
redactionLog.getRedactionLogEntry().removeIf(entry -> sectionsToReanalyse.contains(entry.getSectionNumber()) && !entry.isImage() || entry.getSectionNumber() == 0 && !entry.isImage());
redactionLog.getRedactionLogEntry().addAll(newRedactionLogEntries);
redactionLog.setDictionaryVersion(dictionaryIncrement.getDictionaryVersion());
renalyzeRequest.getRedactionLog().getRedactionLogEntry().addAll(newRedactionLogEntries);
var changeLog = redactionChangeLogService.createAndStoreChangeLog(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), redactionLog);
redactionStorageService.storeObject(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog);
return analyzeResponseService.createAnalyzeResponse(text.getNumberOfPages(), redactionLog, changeLog);
renalyzeRequest.getRedactionLog().setDictionaryVersion(dictionaryIncrement.getDictionaryVersion());
return ReanalyzeResult.builder().redactionLog(renalyzeRequest.getRedactionLog()).build();
} catch (Exception e) {
throw new RedactionException(e);

View File

@ -0,0 +1,94 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.file.management.v1.api.model.ChangeType;
import com.iqser.red.service.file.management.v1.api.model.FileType;
import com.iqser.red.service.file.management.v1.api.model.RedactionChangeLog;
import com.iqser.red.service.file.management.v1.api.model.RedactionChangeLogEntry;
import com.iqser.red.service.redaction.v1.model.RedactionLog;
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
/**
 * Builds the change log between the redaction log currently held in storage and a freshly computed one, and
 * persists that change log next to the file.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class RedactionChangeLogService {

    private final RedactionStorageService redactionStorageService;

    // No longer needed for serialization (the storage service serializes the payload itself); retained so the
    // Lombok-generated constructor keeps its two-argument shape for existing callers.
    private final ObjectMapper objectMapper;


    /**
     * Compares {@code currentRedactionLog} with the redaction log currently stored for the file and stores the
     * resulting change log under {@link FileType#REDACTION_CHANGELOG}.
     * <p>
     * This is a best-effort operation: when no previous redaction log exists, or when reading/storing fails,
     * nothing is stored and {@code null} is returned. Must be called before the new redaction log overwrites the
     * stored one, because the stored log is used as the comparison baseline.
     *
     * @param projectId           project the file belongs to
     * @param fileId              id of the file
     * @param currentRedactionLog the newly computed redaction log
     * @return the stored change log, or {@code null} if none could be created
     */
    public RedactionChangeLog createAndStoreChangeLog(String projectId, String fileId, RedactionLog currentRedactionLog) {

        try {
            RedactionLog previousRedactionLog = redactionStorageService.getRedactionLog(projectId, fileId);
            if (previousRedactionLog == null) {
                // Nothing to diff against; do not persist an empty/"null" change log.
                log.debug("Previous redaction log not available");
                return null;
            }

            RedactionChangeLog changeLog = createChangeLog(currentRedactionLog, previousRedactionLog);
            // Pass the object itself: storeObject serializes its payload, so handing over pre-serialized bytes
            // would store them a second time as a Base64-encoded JSON string.
            redactionStorageService.storeObject(projectId, fileId, FileType.REDACTION_CHANGELOG, changeLog);
            return changeLog;
        } catch (Exception e) {
            // Still best-effort, but make the real cause visible instead of reporting a missing log.
            log.warn("Could not create or store redaction change log for project {} and file {}", projectId, fileId, e);
            return null;
        }
    }


    /**
     * Computes added and removed entries between two redaction logs based on entry equality.
     *
     * @return the change log, or {@code null} when there is no previous redaction log
     */
    private RedactionChangeLog createChangeLog(RedactionLog currentRedactionLog, RedactionLog previousRedactionLog) {

        if (previousRedactionLog == null) {
            return null;
        }

        // Entries present now but not before.
        List<RedactionLogEntry> added = new ArrayList<>(currentRedactionLog.getRedactionLogEntry());
        added.removeAll(previousRedactionLog.getRedactionLogEntry());

        // Entries present before but not anymore.
        List<RedactionLogEntry> removed = new ArrayList<>(previousRedactionLog.getRedactionLogEntry());
        removed.removeAll(currentRedactionLog.getRedactionLogEntry());

        List<RedactionChangeLogEntry> changeLogEntries = added.stream()
                .map(entry -> convert(entry, ChangeType.ADDED))
                .collect(Collectors.toList());
        changeLogEntries.addAll(removed.stream()
                .map(entry -> convert(entry, ChangeType.REMOVED))
                .collect(Collectors.toList()));

        return new RedactionChangeLog(changeLogEntries, currentRedactionLog.getDictionaryVersion(), currentRedactionLog.getRulesVersion(), currentRedactionLog
                .getRuleSetId(), currentRedactionLog.getFilename());
    }


    /**
     * Copies all fields of a redaction log entry into a change log entry and tags it with the given change type.
     */
    private RedactionChangeLogEntry convert(RedactionLogEntry entry, ChangeType changeType) {

        return RedactionChangeLogEntry.builder()
                .id(entry.getId())
                .type(entry.getType())
                .value(entry.getValue())
                .reason(entry.getReason())
                .matchedRule(entry.getMatchedRule())
                .legalBasis(entry.getLegalBasis())
                .redacted(entry.isRedacted())
                .isHint(entry.isHint())
                .isRecommendation(entry.isRecommendation())
                .section(entry.getSection())
                .color(entry.getColor())
                .positions(entry.getPositions())
                .sectionNumber(entry.getSectionNumber())
                .manual(entry.isManual())
                .status(entry.getStatus())
                .manualRedactionType(entry.getManualRedactionType())
                .isDictionaryEntry(entry.isDictionaryEntry())
                .textBefore(entry.getTextBefore())
                .textAfter(entry.getTextAfter())
                .comments(entry.getComments())
                .changeType(changeType)
                .build();
    }

}

View File

@ -1,28 +1,6 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.pdfbox.text.TextPosition;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.model.CellRectangle;
import com.iqser.red.service.redaction.v1.model.Comment;
import com.iqser.red.service.redaction.v1.model.IdRemoval;
import com.iqser.red.service.redaction.v1.model.ManualForceRedact;
import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry;
import com.iqser.red.service.redaction.v1.model.ManualRedactionType;
import com.iqser.red.service.redaction.v1.model.ManualRedactions;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
import com.iqser.red.service.redaction.v1.model.SectionRectangle;
import com.iqser.red.service.redaction.v1.model.Status;
import com.iqser.red.service.redaction.v1.model.*;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
@ -34,8 +12,17 @@ import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import lombok.RequiredArgsConstructor;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.pdfbox.text.TextPosition;
import org.springframework.stereotype.Service;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
@Service
@RequiredArgsConstructor

View File

@ -1,25 +1,17 @@
package com.iqser.red.service.redaction.v1.server.redaction.utils;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncrementValue;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j;
import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
@Slf4j
@UtilityClass
@SuppressWarnings("PMD")
@ -46,7 +38,7 @@ public class EntitySearchUtils {
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString
.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
if(value.isCaseinsensitive() || !value.isCaseinsensitive() && sectionText.substring(startIndex, stopIndex).equals(value.getValue())){
if (value.isCaseinsensitive() || !value.isCaseinsensitive() && sectionText.substring(startIndex, stopIndex).equals(value.getValue())) {
return true;
}
}
@ -147,16 +139,16 @@ public class EntitySearchUtils {
public void addEntitiesWithHigherRank(Set<Entity> entities, Entity found, Dictionary dictionary) {
if(entities.contains(found)){
if (entities.contains(found)) {
Entity existing = entities.stream().filter(entity -> entity.equals(found)).findFirst().get();
if (dictionary.getDictionaryRank(existing.getType()) <= dictionary.getDictionaryRank(found.getType())){
if (dictionary.getDictionaryRank(existing.getType()) <= dictionary.getDictionaryRank(found.getType())) {
entities.remove(found);
}
}
entities.add(found);
}
public void addEntitiesIgnoreRank(Set<Entity> entities, Set<Entity> found){
public void addEntitiesIgnoreRank(Set<Entity> entities, Set<Entity> found) {
// HashSet keeps old value but we want the new.
entities.removeAll(found);
entities.addAll(found);

View File

@ -1,15 +1,14 @@
package com.iqser.red.service.redaction.v1.server.redaction.utils;
import java.awt.geom.Rectangle2D;
import java.nio.charset.StandardCharsets;
import java.util.List;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import lombok.experimental.UtilityClass;
import java.awt.geom.Rectangle2D;
import java.nio.charset.StandardCharsets;
import java.util.List;
@UtilityClass
public class IdBuilder {
@ -26,7 +25,7 @@ public class IdBuilder {
}
public String buildId(Rectangle2D rectangle2D, int page){
public String buildId(Rectangle2D rectangle2D, int page) {
StringBuilder sb = new StringBuilder();
sb.append("x").append(rectangle2D.getX()).append("y").append(rectangle2D.getY()).append("h").append(rectangle2D.getHeight()).append("w").append(rectangle2D.getWidth()).append("p").append(page);
@ -35,5 +34,4 @@ public class IdBuilder {
}
}

View File

@ -1,5 +1,7 @@
package com.iqser.red.service.redaction.v1.server.redaction.utils;
import lombok.experimental.UtilityClass;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
@ -8,8 +10,6 @@ import java.nio.charset.StandardCharsets;
import java.util.Set;
import java.util.stream.Collectors;
import lombok.experimental.UtilityClass;
@UtilityClass
public class ResourceLoader {
@ -27,4 +27,4 @@ public class ResourceLoader {
}
}
}

View File

@ -7,6 +7,7 @@ public class TextNormalizationUtilities {
/**
* Revert hyphenation due to line breaks.
*
* @param text Text to be processed.
* @return Text without line-break hyphenation.
*/
@ -14,4 +15,4 @@ public class TextNormalizationUtilities {
return text.replaceAll("([^\\s\\d\\-]{2,})[\\-\\u00AD]\\R|\n\r(.+ )", "$1$2");
}
}
}

View File

@ -1,28 +1,36 @@
package com.iqser.red.service.redaction.v1.server.segmentation;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import com.iqser.red.service.redaction.v1.model.SectionArea;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.model.SectionText;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService;
import com.iqser.red.service.redaction.v1.server.classification.service.ClassificationService;
import com.iqser.red.service.redaction.v1.server.exception.RedactionException;
import com.iqser.red.service.redaction.v1.server.parsing.PDFAreaTextStripper;
import com.iqser.red.service.redaction.v1.server.parsing.PDFLinesTextStripper;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService;
import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService;
import com.iqser.red.service.redaction.v1.server.classification.service.ClassificationService;
import com.iqser.red.service.redaction.v1.server.parsing.PDFLinesTextStripper;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService;
import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import java.awt.geom.Rectangle2D;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@Slf4j
@Service
@ -36,6 +44,82 @@ public class PdfSegmentationService {
private final SectionsBuilderService sectionsBuilderService;
private final RedactionStorageService redactionStorageService;
/**
 * Re-extracts the text positions for every section area of the given section texts and attaches the resulting
 * text blocks (and, for table sections, the tabular data and cell start offsets) to each {@link SectionText}.
 * <p>
 * Areas are grouped by page and the pages are processed in ascending order so that text blocks end up in page
 * order regardless of how the grouping map happens to iterate.
 *
 * @param pdDocument the open document the section areas refer to
 * @param texts      the section texts to enrich; modified in place
 * @throws RedactionException if extraction fails for any section
 */
private void postProcessSections(PDDocument pdDocument, List<SectionText> texts) {

    try {
        for (SectionText sectionText : texts) {

            List<TextBlock> textBlocks = new ArrayList<>();
            Map<String, CellValue> tabularData = new HashMap<>();
            List<Integer> cellStarts = new ArrayList<>();

            Map<Integer, List<SectionArea>> sectionAreasPerPage = new HashMap<>();
            for (SectionArea sectionArea : sectionText.getSectionAreas()) {
                sectionAreasPerPage.computeIfAbsent(sectionArea.getPage(), (x) -> new ArrayList<>())
                        .add(sectionArea);
            }

            // HashMap iterates by hash bucket, not by key: e.g. pages 15 and 16 would be visited as 16, 15.
            // Sort explicitly so multi-page sections keep their reading order.
            List<Integer> pages = new ArrayList<>(sectionAreasPerPage.keySet());
            pages.sort(Integer::compareTo);

            for (Integer page : pages) {
                List<SectionArea> areasOnPage = sectionAreasPerPage.get(page);

                // Section areas use 1-based page numbers, PDFBox pages are 0-based.
                PDPage pdPage = pdDocument.getPage(page - 1);
                PDRectangle cropBox = pdPage.getCropBox();

                PDFAreaTextStripper textStripper = new PDFAreaTextStripper();
                textStripper.setPageNumber(page);

                int cellStart = 0;
                for (SectionArea sectionArea : areasOnPage) {

                    Rectangle2D rect;
                    if (pdPage.getRotation() == 90) {
                        // Rotated page: x/y and width/height are swapped.
                        rect = new Rectangle2D.Float(sectionArea.getTopLeft().getY(), sectionArea.getTopLeft()
                                .getX(), sectionArea.getHeight(), sectionArea.getWidth() + 0.001f);
                    } else {
                        // Flip the y coordinate relative to the upper edge of the crop box.
                        rect = new Rectangle2D.Float(sectionArea.getTopLeft().getX(), -sectionArea.getTopLeft()
                                .getY() + cropBox.getUpperRightY() - sectionArea.getHeight(), sectionArea.getWidth(), sectionArea
                                .getHeight() + 0.001f);
                    }

                    textStripper.addRegion(String.valueOf(1), rect);
                    textStripper.extractRegions(pdPage);
                    textStripper.getTextForRegion(String.valueOf(1));
                    List<TextPositionSequence> positions = textStripper.getTextPositionSequences();

                    TextBlock textBlock = new TextBlock(sectionArea.getTopLeft().getX(), sectionArea.getTopLeft()
                            .getX() + sectionArea.getWidth(), sectionArea.getTopLeft()
                            .getY(), sectionArea.getTopLeft().getY() + sectionArea.getHeight(), positions, 0);

                    if (sectionText.isTable()) {
                        // Each area of a table section is one cell, keyed by its header.
                        Cell cell = new Cell();
                        cell.addTextBlock(textBlock);
                        tabularData.put(sectionArea.getHeader(), new CellValue(cell.getTextBlocks(), cellStart));
                        cellStarts.add(cellStart);
                        // Advance by the trimmed cell text length plus one separator character.
                        cellStart = cellStart + cell.toString().trim().length() + 1;
                    }

                    textBlocks.add(textBlock);
                    // Reset collected positions so the next area starts from a clean state.
                    textStripper.clearPositions();
                }
            }

            sectionText.setTextBlocks(textBlocks);
            sectionText.setTabularData(tabularData);
            if (sectionText.isTable()) {
                sectionText.setCellStarts(cellStarts);
            }
        }
    } catch (Exception e) {
        throw new RedactionException(e);
    }
}
public Document parseDocument(PDDocument pdDocument) throws IOException {
Document document = new Document();
@ -82,6 +166,9 @@ public class PdfSegmentationService {
sectionsBuilderService.buildSections(document);
sectionsBuilderService.addImagesToSections(document);
// This could be improved and done in a single pass, but that is complicated to do right away
postProcessSections(pdDocument, document.getSectionText());
return document;
}
@ -116,4 +203,4 @@ public class PdfSegmentationService {
}
}
}

View File

@ -1,29 +1,15 @@
package com.iqser.red.service.redaction.v1.server.segmentation;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.stream.Collectors;
import org.apache.commons.collections4.CollectionUtils;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Footer;
import com.iqser.red.service.redaction.v1.server.classification.model.Header;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.classification.model.UnclassifiedText;
import com.iqser.red.service.redaction.v1.server.classification.model.*;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import org.apache.commons.collections4.CollectionUtils;
import org.springframework.stereotype.Service;
import java.util.*;
import java.util.stream.Collectors;
@Service
public class SectionsBuilderService {
@ -302,4 +288,4 @@ public class SectionsBuilderService {
}
}
}

View File

@ -1,17 +1,16 @@
package com.iqser.red.service.redaction.v1.server.settings;
import org.springframework.boot.context.properties.ConfigurationProperties;
import lombok.Data;
import org.springframework.boot.context.properties.ConfigurationProperties;
/**
 * Configuration properties of the redaction service, bound from the {@code redaction-service} prefix.
 * Each field's initializer is the value used when the property is not set.
 */
@Data
@ConfigurationProperties("redaction-service")
public class RedactionServiceSettings {
// Defaults to 3. Presumably the number of words captured before/after an entity - TODO confirm against the consumer.
private int numberOfSurroundingWords = 3;
// Defaults to 100. Presumably the character window searched for surrounding words - TODO confirm against the consumer.
private int surroundingWordsOffsetWindow = 100;
// Defaults to true. Presumably toggles the image classification step - TODO confirm against the consumer.
private boolean enableImageClassification = true;
}
}

View File

@ -0,0 +1,109 @@
package com.iqser.red.service.redaction.v1.server.storage;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.file.management.v1.api.model.FileType;
import com.iqser.red.service.redaction.v1.model.RedactionLog;
import com.iqser.red.service.redaction.v1.model.SectionGrid;
import com.iqser.red.service.redaction.v1.server.classification.model.Text;
import com.iqser.red.storage.commons.exception.StorageObjectDoesNotExist;
import com.iqser.red.storage.commons.service.StorageService;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.springframework.core.io.InputStreamResource;
import org.springframework.stereotype.Service;
import java.io.IOException;
import java.io.InputStream;
/**
 * Thin facade over {@link StorageService} that reads and writes the per-file artifacts of the redaction service
 * (redaction log, text, section grid) as JSON, using storage ids built by {@link StorageIdUtils}.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class RedactionStorageService {

    private final ObjectMapper objectMapper;
    private final StorageService storageService;


    /**
     * Returns the raw content stream of the object stored under {@code storageId}.
     * The caller is responsible for closing the returned stream.
     */
    @SneakyThrows
    public InputStream getStoredObject(String storageId) {

        return storageService.getObject(storageId).getInputStream();
    }


    /**
     * Serializes {@code any} to JSON and stores it for the given file and file type.
     * Pass the object itself, not pre-serialized bytes: the payload is serialized here.
     */
    @SneakyThrows
    public void storeObject(String projectId, String fileId, FileType fileType, Object any) {

        storageService.storeObject(StorageIdUtils.getStorageId(projectId, fileId, fileType), objectMapper.writeValueAsBytes(any));
    }


    /**
     * Loads the stored redaction log of a file.
     *
     * @return the redaction log, or {@code null} if none is stored
     * @throws RuntimeException if the stored content cannot be deserialized
     */
    public RedactionLog getRedactionLog(String projectId, String fileId) {

        InputStreamResource inputStreamResource;
        try {
            inputStreamResource = storageService.getObject(StorageIdUtils.getStorageId(projectId, fileId, FileType.REDACTION_LOG));
        } catch (StorageObjectDoesNotExist e) {
            log.debug("RedactionLog not available.");
            return null;
        }

        return readStoredObject(inputStreamResource, RedactionLog.class, "RedactionLog");
    }


    /**
     * Loads the stored text of a file.
     *
     * @return the text, or {@code null} if none is stored
     * @throws RuntimeException if the stored content cannot be deserialized
     */
    public Text getText(String projectId, String fileId) {

        InputStreamResource inputStreamResource;
        try {
            inputStreamResource = storageService.getObject(StorageIdUtils.getStorageId(projectId, fileId, FileType.TEXT));
        } catch (StorageObjectDoesNotExist e) {
            log.debug("Text not available.");
            return null;
        }

        return readStoredObject(inputStreamResource, Text.class, "Text");
    }


    /**
     * Loads the stored section grid of a file. Unlike the other getters, a missing object is not mapped to
     * {@code null}; whatever the storage service throws in that case propagates to the caller.
     *
     * @throws RuntimeException if the stored content cannot be deserialized
     */
    public SectionGrid getSectionGrid(String projectId, String fileId) {

        var sectionGrid = storageService.getObject(StorageIdUtils.getStorageId(projectId, fileId, FileType.SECTION_GRID));
        return readStoredObject(sectionGrid, SectionGrid.class, "SectionGrid");
    }


    /**
     * Deserializes the JSON content of a stored object, always closing its stream.
     *
     * @param resource    the stored object
     * @param type        target type
     * @param description name of the artifact, used in the error message
     */
    private <T> T readStoredObject(InputStreamResource resource, Class<T> type, String description) {

        try (InputStream inputStream = resource.getInputStream()) {
            return objectMapper.readValue(inputStream, type);
        } catch (IOException e) {
            throw new RuntimeException("Could not convert " + description, e);
        }
    }


    @RequiredArgsConstructor
    public enum StorageType {
        PARSED_DOCUMENT(".json");

        @Getter
        private final String extension;
    }

    /**
     * Builds the ids under which objects are kept in storage.
     */
    public static class StorageIdUtils {

        /**
         * Id of a typed artifact of a file: {@code <projectId>/<fileId>.<TYPE><extension>}.
         */
        public static String getStorageId(String projectId, String fileId, FileType fileType) {

            return projectId + "/" + fileId + "." + fileType.name() + fileType.getExtension();
        }


        /**
         * Id of a user-scoped file: {@code <userId>/<projectId>/<filename>}.
         */
        public static String getStorageId(String userId, String projectId, String filename) {

            return userId + "/" + projectId + "/" + filename;
        }

    }

}

View File

@ -1,7 +1,6 @@
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
@ -25,7 +24,7 @@ public abstract class AbstractTextContainer {
}
public boolean contains(Rectangle other) {
return page == other.getPage() && this.minX <= other.getTopLeft().getX() && this.maxX >= other.getTopLeft().getX() + other.getWidth() && this.minY <= other.getTopLeft().getY() && this.maxY >= other.getTopLeft().getY() + other.getHeight();
return page == other.getPage() && this.minX <= other.getTopLeft().getX() && this.maxX >= other.getTopLeft().getX() + other.getWidth() && this.minY <= other.getTopLeft().getY() && this.maxY >= other.getTopLeft().getY() + other.getHeight();
}
public float getHeight() {
@ -36,4 +35,4 @@ public abstract class AbstractTextContainer {
return maxX - minX;
}
}
}

View File

@ -1,18 +1,17 @@
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
@SuppressWarnings("serial")
@Data
@EqualsAndHashCode(callSuper = true)
@ -71,7 +70,4 @@ public class Cell extends Rectangle {
}
}
}

View File

@ -1,10 +1,10 @@
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
import java.util.List;
import lombok.Builder;
import lombok.Data;
import java.util.List;
@Data
@Builder
public class CleanRulings {

View File

@ -8,170 +8,171 @@ import java.util.List;
@SuppressWarnings("all")
public class Rectangle extends Rectangle2D.Float {
/**
* Ill-defined comparator, from when Rectangle was Comparable.
*
* see https://github.com/tabulapdf/tabula-java/issues/116
* @deprecated with no replacement
*/
@Deprecated
public static final Comparator<Rectangle> ILL_DEFINED_ORDER = new Comparator<Rectangle>() {
@Override public int compare(Rectangle o1, Rectangle o2) {
if (o1.equals(o2)) return 0;
if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD) {
return o1.isLtrDominant() == -1 && o2.isLtrDominant() == -1
? - java.lang.Double.compare(o1.getX(), o2.getX())
: java.lang.Double.compare(o1.getX(), o2.getX());
} else {
return java.lang.Float.compare(o1.getBottom(), o2.getBottom());
}
}
};
protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;
protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;
/**
* Ill-defined comparator, from when Rectangle was Comparable.
* <p>
* see https://github.com/tabulapdf/tabula-java/issues/116
*
* @deprecated with no replacement
*/
@Deprecated
public static final Comparator<Rectangle> ILL_DEFINED_ORDER = new Comparator<Rectangle>() {
@Override
public int compare(Rectangle o1, Rectangle o2) {
if (o1.equals(o2)) return 0;
if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD) {
return o1.isLtrDominant() == -1 && o2.isLtrDominant() == -1
? -java.lang.Double.compare(o1.getX(), o2.getX())
: java.lang.Double.compare(o1.getX(), o2.getX());
} else {
return java.lang.Float.compare(o1.getBottom(), o2.getBottom());
}
}
};
public Rectangle() {
super();
}
public Rectangle() {
super();
}
public Rectangle(float top, float left, float width, float height) {
super();
this.setRect(left, top, width, height);
}
public Rectangle(float top, float left, float width, float height) {
super();
this.setRect(left, top, width, height);
}
public int compareTo(Rectangle other) {
return ILL_DEFINED_ORDER.compare(this, other);
}
/**
* @param rectangles
* @return minimum bounding box that contains all the rectangles
*/
public static Rectangle boundingBoxOf(List<? extends Rectangle> rectangles) {
float minx = java.lang.Float.MAX_VALUE;
float miny = java.lang.Float.MAX_VALUE;
float maxx = java.lang.Float.MIN_VALUE;
float maxy = java.lang.Float.MIN_VALUE;
// I'm bad at Java and need this for fancy sorting in
// technology.tabula.TextChunk.
public int isLtrDominant() {
return 0;
}
for (Rectangle r : rectangles) {
minx = (float) Math.min(r.getMinX(), minx);
miny = (float) Math.min(r.getMinY(), miny);
maxx = (float) Math.max(r.getMaxX(), maxx);
maxy = (float) Math.max(r.getMaxY(), maxy);
}
return new Rectangle(miny, minx, maxx - minx, maxy - miny);
}
public float getArea() {
return this.width * this.height;
}
public int compareTo(Rectangle other) {
return ILL_DEFINED_ORDER.compare(this, other);
}
public float verticalOverlap(Rectangle other) {
return Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
}
// I'm bad at Java and need this for fancy sorting in
// technology.tabula.TextChunk.
public int isLtrDominant() {
return 0;
}
public boolean verticallyOverlaps(Rectangle other) {
return verticalOverlap(other) > 0;
}
public float getArea() {
return this.width * this.height;
}
public float horizontalOverlap(Rectangle other) {
return Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
}
public float verticalOverlap(Rectangle other) {
return Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
}
public boolean horizontallyOverlaps(Rectangle other) {
return horizontalOverlap(other) > 0;
}
public boolean verticallyOverlaps(Rectangle other) {
return verticalOverlap(other) > 0;
}
public float verticalOverlapRatio(Rectangle other) {
float rv = 0, delta = Math.min(this.getBottom() - this.getTop(), other.getBottom() - other.getTop());
public float horizontalOverlap(Rectangle other) {
return Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
}
if (other.getTop() <= this.getTop() && this.getTop() <= other.getBottom()
&& other.getBottom() <= this.getBottom()) {
rv = (other.getBottom() - this.getTop()) / delta;
} else if (this.getTop() <= other.getTop() && other.getTop() <= this.getBottom()
&& this.getBottom() <= other.getBottom()) {
rv = (this.getBottom() - other.getTop()) / delta;
} else if (this.getTop() <= other.getTop() && other.getTop() <= other.getBottom()
&& other.getBottom() <= this.getBottom()) {
rv = (other.getBottom() - other.getTop()) / delta;
} else if (other.getTop() <= this.getTop() && this.getTop() <= this.getBottom()
&& this.getBottom() <= other.getBottom()) {
rv = (this.getBottom() - this.getTop()) / delta;
}
public boolean horizontallyOverlaps(Rectangle other) {
return horizontalOverlap(other) > 0;
}
return rv;
public float verticalOverlapRatio(Rectangle other) {
float rv = 0, delta = Math.min(this.getBottom() - this.getTop(), other.getBottom() - other.getTop());
}
if (other.getTop() <= this.getTop() && this.getTop() <= other.getBottom()
&& other.getBottom() <= this.getBottom()) {
rv = (other.getBottom() - this.getTop()) / delta;
} else if (this.getTop() <= other.getTop() && other.getTop() <= this.getBottom()
&& this.getBottom() <= other.getBottom()) {
rv = (this.getBottom() - other.getTop()) / delta;
} else if (this.getTop() <= other.getTop() && other.getTop() <= other.getBottom()
&& other.getBottom() <= this.getBottom()) {
rv = (other.getBottom() - other.getTop()) / delta;
} else if (other.getTop() <= this.getTop() && this.getTop() <= this.getBottom()
&& this.getBottom() <= other.getBottom()) {
rv = (this.getBottom() - this.getTop()) / delta;
}
public float overlapRatio(Rectangle other) {
double intersectionWidth = Math.max(0,
Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
double intersectionHeight = Math.max(0,
Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
double intersectionArea = Math.max(0, intersectionWidth * intersectionHeight);
double unionArea = this.getArea() + other.getArea() - intersectionArea;
return rv;
return (float) (intersectionArea / unionArea);
}
}
public Rectangle merge(Rectangle other) {
this.setRect(this.createUnion(other));
return this;
}
public float overlapRatio(Rectangle other) {
double intersectionWidth = Math.max(0,
Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
double intersectionHeight = Math.max(0,
Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
double intersectionArea = Math.max(0, intersectionWidth * intersectionHeight);
double unionArea = this.getArea() + other.getArea() - intersectionArea;
public float getTop() {
return (float) this.getMinY();
}
return (float) (intersectionArea / unionArea);
}
public void setTop(float top) {
float deltaHeight = top - this.y;
this.setRect(this.x, top, this.width, this.height - deltaHeight);
}
public Rectangle merge(Rectangle other) {
this.setRect(this.createUnion(other));
return this;
}
public float getRight() {
return (float) this.getMaxX();
}
public float getTop() {
return (float) this.getMinY();
}
public void setRight(float right) {
this.setRect(this.x, this.y, right - this.x, this.height);
}
public void setTop(float top) {
float deltaHeight = top - this.y;
this.setRect(this.x, top, this.width, this.height - deltaHeight);
}
public float getLeft() {
return (float) this.getMinX();
}
public float getRight() {
return (float) this.getMaxX();
}
public void setLeft(float left) {
float deltaWidth = left - this.x;
this.setRect(left, this.y, this.width - deltaWidth, this.height);
}
public void setRight(float right) {
this.setRect(this.x, this.y, right - this.x, this.height);
}
public float getBottom() {
return (float) this.getMaxY();
}
public float getLeft() {
return (float) this.getMinX();
}
public void setBottom(float bottom) {
this.setRect(this.x, this.y, this.width, bottom - this.y);
}
public void setLeft(float left) {
float deltaWidth = left - this.x;
this.setRect(left, this.y, this.width - deltaWidth, this.height);
}
public Point2D[] getPoints() {
return new Point2D[] { new Point2D.Float(this.getLeft(), this.getTop()),
new Point2D.Float(this.getRight(), this.getTop()), new Point2D.Float(this.getRight(), this.getBottom()),
new Point2D.Float(this.getLeft(), this.getBottom()) };
}
public float getBottom() {
return (float) this.getMaxY();
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
String s = super.toString();
sb.append(s.substring(0, s.length() - 1));
sb.append(String.format(",bottom=%f,right=%f]", this.getBottom(), this.getRight()));
return sb.toString();
}
public void setBottom(float bottom) {
this.setRect(this.x, this.y, this.width, bottom - this.y);
}
/**
* @param rectangles
* @return minimum bounding box that contains all the rectangles
*/
public static Rectangle boundingBoxOf(List<? extends Rectangle> rectangles) {
float minx = java.lang.Float.MAX_VALUE;
float miny = java.lang.Float.MAX_VALUE;
float maxx = java.lang.Float.MIN_VALUE;
float maxy = java.lang.Float.MIN_VALUE;
public Point2D[] getPoints() {
return new Point2D[]{new Point2D.Float(this.getLeft(), this.getTop()),
new Point2D.Float(this.getRight(), this.getTop()), new Point2D.Float(this.getRight(), this.getBottom()),
new Point2D.Float(this.getLeft(), this.getBottom())};
}
for (Rectangle r : rectangles) {
minx = (float) Math.min(r.getMinX(), minx);
miny = (float) Math.min(r.getMinY(), miny);
maxx = (float) Math.max(r.getMaxX(), maxx);
maxy = (float) Math.max(r.getMaxY(), maxy);
}
return new Rectangle(miny, minx, maxx - minx, maxy - miny);
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
String s = super.toString();
sb.append(s.substring(0, s.length() - 1));
sb.append(String.format(",bottom=%f,right=%f]", this.getBottom(), this.getRight()));
return sb.toString();
}
}

View File

@ -1,12 +1,11 @@
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
import java.util.ArrayList;
import java.util.List;
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
import org.locationtech.jts.geom.Envelope;
import org.locationtech.jts.index.strtree.STRtree;
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
import java.util.ArrayList;
import java.util.List;
@SuppressWarnings("all")
public class RectangleSpatialIndex<T extends Rectangle> {

View File

@ -1,20 +1,13 @@
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.CohenSutherlandClipping;
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
import lombok.extern.slf4j.Slf4j;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Formatter;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.CohenSutherlandClipping;
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
import lombok.extern.slf4j.Slf4j;
import java.util.*;
@Slf4j
@ -23,13 +16,127 @@ public class Ruling extends Line2D.Float {
private static int PERPENDICULAR_PIXEL_EXPAND_AMOUNT = 2;
// Event kinds used by findIntersections: HLEFT and HRIGHT are created for the left and right
// end of each horizontal ruling, VERTICAL is created for each vertical ruling.
private enum SOType {VERTICAL, HRIGHT, HLEFT}
/**
 * Creates a ruling from two end points by delegating to the {@code Line2D.Float} constructor.
 *
 * @param p1 first end point
 * @param p2 second end point
 */
public Ruling(Point2D p1, Point2D p2) {
super(p1, p2);
}
/**
 * Collects, for every ruling that intersects {@code area}, the result of
 * {@code intersect(area)}; rulings that do not intersect the area are dropped.
 *
 * @param rulings rulings to examine, processed in list order
 * @param area    area the rulings are tested against
 * @return a new list with one entry per intersecting ruling, in input order
 */
public static List<Ruling> cropRulingsToArea(List<Ruling> rulings, Rectangle2D area) {
    List<Ruling> cropped = new ArrayList<>();
    for (Ruling candidate : rulings) {
        if (!candidate.intersects(area)) {
            continue;
        }
        cropped.add(candidate.intersect(area));
    }
    return cropped;
}
/**
 * Finds the intersection points between horizontal and vertical rulings with a left-to-right
 * sweep over position-sorted events.
 * <p>
 * Based on http://people.csail.mit.edu/indyk/6.838-old/handouts/lec2.pdf
 * <p>
 * Each horizontal ruling contributes an HLEFT event at its left end minus
 * PERPENDICULAR_PIXEL_EXPAND_AMOUNT and an HRIGHT event at its right end plus that amount;
 * each vertical ruling contributes a VERTICAL event at its left coordinate. While sweeping,
 * HLEFT adds the ruling to the set of active horizontals, HRIGHT removes it, and VERTICAL
 * tests the vertical ruling against every active horizontal.
 *
 * @param horizontals rulings treated as horizontal
 * @param verticals   rulings treated as vertical
 * @return map ordered by y, then x, from each intersection point to a two-element array
 *         holding the expanded horizontal ruling followed by the expanded vertical ruling
 */
public static Map<Point2D, Ruling[]> findIntersections(List<Ruling> horizontals, List<Ruling> verticals) {
// One sweep event: its kind, the x position at which it fires, and the ruling it refers to.
class SortObject {
protected SOType type;
protected float position;
protected Ruling ruling;
public SortObject(SOType type, float position, Ruling ruling) {
this.type = type;
this.position = position;
this.ruling = ruling;
}
}
List<SortObject> sos = new ArrayList<>();
// Active horizontals, keyed only by their top coordinate.
// NOTE(review): two horizontals with the same getTop() compare as equal here, so a TreeMap
// would hold only one entry for them while both are active; confirm this is intended.
TreeMap<Ruling, Boolean> tree = new TreeMap<>(new Comparator<Ruling>() {
@Override
public int compare(Ruling o1, Ruling o2) {
return java.lang.Double.compare(o1.getTop(), o2.getTop());
}
});
// Result map; points are ordered by y first and by x second.
TreeMap<Point2D, Ruling[]> rv = new TreeMap<>(new Comparator<Point2D>() {
@Override
public int compare(Point2D o1, Point2D o2) {
if (o1.getY() > o2.getY()) {
return 1;
}
if (o1.getY() < o2.getY()) {
return -1;
}
if (o1.getX() > o2.getX()) {
return 1;
}
if (o1.getX() < o2.getX()) {
return -1;
}
return 0;
}
});
// Horizontal rulings are widened by the expand amount on both ends before the sweep.
for (Ruling h : horizontals) {
sos.add(new SortObject(SOType.HLEFT, h.getLeft() - PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h));
sos.add(new SortObject(SOType.HRIGHT, h.getRight() + PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h));
}
for (Ruling v : verticals) {
sos.add(new SortObject(SOType.VERTICAL, v.getLeft(), v));
}
// Order events by position. When two positions are equal according to Utils.feq, an HLEFT
// sorts before a VERTICAL and a VERTICAL sorts before an HRIGHT, so a vertical ruling at
// the very end of a horizontal one is still tested against it.
Collections.sort(sos, new Comparator<SortObject>() {
@Override
public int compare(SortObject a, SortObject b) {
int rv;
if (Utils.feq(a.position, b.position)) {
if (a.type == SOType.VERTICAL && b.type == SOType.HLEFT) {
rv = 1;
} else if (a.type == SOType.VERTICAL && b.type == SOType.HRIGHT) {
rv = -1;
} else if (a.type == SOType.HLEFT && b.type == SOType.VERTICAL) {
rv = -1;
} else if (a.type == SOType.HRIGHT && b.type == SOType.VERTICAL) {
rv = 1;
} else {
rv = java.lang.Double.compare(a.position, b.position);
}
} else {
return java.lang.Double.compare(a.position, b.position);
}
return rv;
}
});
for (SortObject so : sos) {
switch (so.type) {
case VERTICAL:
// Test the vertical ruling against every horizontal that is currently active.
for (Map.Entry<Ruling, Boolean> h : tree.entrySet()) {
try {
Point2D i = h.getKey().intersectionPoint(so.ruling);
if (i == null) {
continue;
}
rv.put(i,
new Ruling[]{h.getKey().expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT),
so.ruling.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT)});
} catch (UnsupportedOperationException e) {
// The pair is skipped; the exception is only logged at info level.
log.info("Some line are oblique, ignoring...");
continue;
}
}
break;
case HRIGHT:
// Right end reached: the horizontal ruling is no longer active.
tree.remove(so.ruling);
break;
case HLEFT:
// Left end reached: the horizontal ruling becomes active.
tree.put(so.ruling, true);
break;
}
}
return rv;
}
/**
 * @return true when {@code length()} is greater than zero and the two x coordinates are equal
 *         according to {@code Utils.feq}
 */
public boolean vertical() {
return this.length() > 0 && Utils.feq(this.x1, this.x2); //diff < ORIENTATION_CHECK_THRESHOLD;
}
@ -38,13 +145,13 @@ public class Ruling extends Line2D.Float {
return this.length() > 0 && Utils.feq(this.y1, this.y2); //diff < ORIENTATION_CHECK_THRESHOLD;
}
// attributes that make sense only for non-oblique lines
// these are used to have a single collapse method (in page, currently)
/**
 * @return true when this ruling is neither vertical nor horizontal
 */
public boolean oblique() {
    return !this.vertical() && !this.horizontal();
}
// attributes that make sense only for non-oblique lines
// these are used to have a single collapse method (in page, currently)
public float getPosition() {
if (this.oblique()) {
throw new UnsupportedOperationException();
@ -52,7 +159,6 @@ public class Ruling extends Line2D.Float {
return this.vertical() ? this.getLeft() : this.getTop();
}
public float getStart() {
if (this.oblique()) {
throw new UnsupportedOperationException();
@ -102,12 +208,10 @@ public class Ruling extends Line2D.Float {
}
}
/**
 * Compares this ruling's {@code vertical()} flag with the other ruling's
 * {@code horizontal()} flag.
 * <p>
 * NOTE(review): the equality also holds when both flags are false, e.g. when this ruling is
 * not vertical and the other is not horizontal; confirm callers only pass non-oblique rulings.
 *
 * @param other ruling to compare against
 * @return true when {@code this.vertical()} equals {@code other.horizontal()}
 */
public boolean perpendicularTo(Ruling other) {
return this.vertical() == other.horizontal();
}
public boolean nearlyIntersects(Ruling another, int colinearOrParallelExpandAmount) {
if (this.intersectsLine(another)) {
return true;
@ -238,7 +342,6 @@ public class Ruling extends Line2D.Float {
return angle;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
@ -248,122 +351,7 @@ public class Ruling extends Line2D.Float {
return rv;
}
public static List<Ruling> cropRulingsToArea(List<Ruling> rulings, Rectangle2D area) {
ArrayList<Ruling> rv = new ArrayList<>();
for (Ruling r : rulings) {
if (r.intersects(area)) {
rv.add(r.intersect(area));
}
}
return rv;
}
// log(n) implementation of find_intersections
// based on http://people.csail.mit.edu/indyk/6.838-old/handouts/lec2.pdf
public static Map<Point2D, Ruling[]> findIntersections(List<Ruling> horizontals, List<Ruling> verticals) {
class SortObject {
protected SOType type;
protected float position;
protected Ruling ruling;
public SortObject(SOType type, float position, Ruling ruling) {
this.type = type;
this.position = position;
this.ruling = ruling;
}
}
List<SortObject> sos = new ArrayList<>();
TreeMap<Ruling, Boolean> tree = new TreeMap<>(new Comparator<Ruling>() {
@Override
public int compare(Ruling o1, Ruling o2) {
return java.lang.Double.compare(o1.getTop(), o2.getTop());
}
});
TreeMap<Point2D, Ruling[]> rv = new TreeMap<>(new Comparator<Point2D>() {
@Override
public int compare(Point2D o1, Point2D o2) {
if (o1.getY() > o2.getY()) {
return 1;
}
if (o1.getY() < o2.getY()) {
return -1;
}
if (o1.getX() > o2.getX()) {
return 1;
}
if (o1.getX() < o2.getX()) {
return -1;
}
return 0;
}
});
for (Ruling h : horizontals) {
sos.add(new SortObject(SOType.HLEFT, h.getLeft() - PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h));
sos.add(new SortObject(SOType.HRIGHT, h.getRight() + PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h));
}
for (Ruling v : verticals) {
sos.add(new SortObject(SOType.VERTICAL, v.getLeft(), v));
}
Collections.sort(sos, new Comparator<SortObject>() {
@Override
public int compare(SortObject a, SortObject b) {
int rv;
if (Utils.feq(a.position, b.position)) {
if (a.type == SOType.VERTICAL && b.type == SOType.HLEFT) {
rv = 1;
} else if (a.type == SOType.VERTICAL && b.type == SOType.HRIGHT) {
rv = -1;
} else if (a.type == SOType.HLEFT && b.type == SOType.VERTICAL) {
rv = -1;
} else if (a.type == SOType.HRIGHT && b.type == SOType.VERTICAL) {
rv = 1;
} else {
rv = java.lang.Double.compare(a.position, b.position);
}
} else {
return java.lang.Double.compare(a.position, b.position);
}
return rv;
}
});
for (SortObject so : sos) {
switch (so.type) {
case VERTICAL:
for (Map.Entry<Ruling, Boolean> h : tree.entrySet()) {
try {
Point2D i = h.getKey().intersectionPoint(so.ruling);
if (i == null) {
continue;
}
rv.put(i,
new Ruling[]{h.getKey().expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT),
so.ruling.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT)});
} catch(UnsupportedOperationException e){
log.info("Some line are oblique, ignoring...");
continue;
}
}
break;
case HRIGHT:
tree.remove(so.ruling);
break;
case HLEFT:
tree.put(so.ruling, true);
break;
}
}
return rv;
}
private enum SOType {VERTICAL, HRIGHT, HLEFT}
}

View File

@ -1,22 +1,13 @@
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import org.apache.commons.collections4.CollectionUtils;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
import lombok.Getter;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections4.CollectionUtils;
import java.util.*;
@Slf4j
public class Table extends AbstractTextContainer {
@ -24,21 +15,14 @@ public class Table extends AbstractTextContainer {
private final TreeMap<CellPosition, Cell> cells = new TreeMap<>();
private final RectangleSpatialIndex<Cell> si = new RectangleSpatialIndex<>();
private final int rotation;
@Getter
@Setter
private String headline;
private int unrotatedRowCount;
private int unrotatedColCount;
private int rowCount = -1;
private int colCount = -1;
private final int rotation;
private List<List<Cell>> rows;
@ -62,8 +46,8 @@ public class Table extends AbstractTextContainer {
// Ignore rows that do not contain any cells and values.
List<List<Cell>> rowsToRemove = new ArrayList<>();
for (List<Cell> row: rows){
if (row.size() == 1 && row.get(0).getTextBlocks().isEmpty()){
for (List<Cell> row : rows) {
if (row.size() == 1 && row.get(0).getTextBlocks().isEmpty()) {
rowsToRemove.add(row);
}
}
@ -110,7 +94,7 @@ public class Table extends AbstractTextContainer {
// we move from left to right and top to bottom
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
List<Cell> rowCells = rows.get(rowIndex);
if(rowCells.size() == 1){
if (rowCells.size() == 1) {
continue;
}
@ -275,7 +259,7 @@ public class Table extends AbstractTextContainer {
cells.sort(Collections.reverseOrder((arg0, arg1) -> Float.compare(Utils.round(arg0.getBottom(), 2),
Utils.round(arg1
.getBottom(), 2))));
.getBottom(), 2))));
Iterator<Cell> iter = cells.iterator();
Cell c = iter.next();
@ -367,4 +351,4 @@ public class Table extends AbstractTextContainer {
return sb.toString();
}
}
}

View File

@ -1,19 +1,13 @@
package com.iqser.red.service.redaction.v1.server.tableextraction.service;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
import org.springframework.stereotype.Service;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.util.*;
@Service
public class RulingCleaningService {

View File

@ -1,31 +1,57 @@
package com.iqser.red.service.redaction.v1.server.tableextraction.service;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.*;
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
import org.springframework.stereotype.Service;
import java.awt.geom.Point2D;
import java.util.*;
import java.util.stream.Collectors;
@Service
public class TableExtractionService {
/**
 * Orders points by x and, for equal x, by y. Both coordinates are passed through
 * {@code Utils.round(value, 2)} before they are compared.
 */
private static final Comparator<Point2D> X_FIRST_POINT_COMPARATOR = (first, second) -> {
    float firstX = Utils.round(first.getX(), 2);
    float firstY = Utils.round(first.getY(), 2);
    float secondX = Utils.round(second.getX(), 2);
    float secondY = Utils.round(second.getY(), 2);

    // Primary key: x.
    if (firstX > secondX) {
        return 1;
    }
    if (firstX < secondX) {
        return -1;
    }
    // Secondary key: y.
    if (firstY > secondY) {
        return 1;
    }
    if (firstY < secondY) {
        return -1;
    }
    return 0;
};
/**
 * Orders points by y and, for equal y, by x. Both coordinates are passed through
 * {@code Utils.round(value, 2)} before they are compared.
 */
private static final Comparator<Point2D> POINT_COMPARATOR = (first, second) -> {
    float firstX = Utils.round(first.getX(), 2);
    float firstY = Utils.round(first.getY(), 2);
    float secondX = Utils.round(second.getX(), 2);
    float secondY = Utils.round(second.getY(), 2);

    // Primary key: y.
    if (firstY > secondY) {
        return 1;
    }
    if (firstY < secondY) {
        return -1;
    }
    // Secondary key: x.
    if (firstX > secondX) {
        return 1;
    }
    if (firstX < secondX) {
        return -1;
    }
    return 0;
};
public void extractTables(CleanRulings cleanRulings, Page page) {
List<Cell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
@ -80,7 +106,6 @@ public class TableExtractionService {
page.getTextBlocks().removeAll(toBeRemoved);
}
public List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
List<Cell> cellsFound = new ArrayList<>();
@ -133,7 +158,6 @@ public class TableExtractionService {
return cellsFound;
}
private List<Rectangle> findSpreadsheetsFromCells(List<? extends Rectangle> cells) {
// via: http://stackoverflow.com/questions/13746284/merging-multiple-adjacent-rectangles-into-one-polygon
List<Rectangle> rectangles = new ArrayList<>();
@ -233,47 +257,6 @@ public class TableExtractionService {
return rectangles;
}
private static final Comparator<Point2D> X_FIRST_POINT_COMPARATOR = (arg0, arg1) -> {
int rv = 0;
float arg0X = Utils.round(arg0.getX(), 2);
float arg0Y = Utils.round(arg0.getY(), 2);
float arg1X = Utils.round(arg1.getX(), 2);
float arg1Y = Utils.round(arg1.getY(), 2);
if (arg0X > arg1X) {
rv = 1;
} else if (arg0X < arg1X) {
rv = -1;
} else if (arg0Y > arg1Y) {
rv = 1;
} else if (arg0Y < arg1Y) {
rv = -1;
}
return rv;
};
private static final Comparator<Point2D> POINT_COMPARATOR = (arg0, arg1) -> {
int rv = 0;
float arg0X = Utils.round(arg0.getX(), 2);
float arg0Y = Utils.round(arg0.getY(), 2);
float arg1X = Utils.round(arg1.getX(), 2);
float arg1Y = Utils.round(arg1.getY(), 2);
if (arg0Y > arg1Y) {
rv = 1;
} else if (arg0Y < arg1Y) {
rv = -1;
} else if (arg0X > arg1X) {
rv = 1;
} else if (arg0X < arg1X) {
rv = -1;
}
return rv;
};
private enum Direction {
HORIZONTAL, VERTICAL
}

View File

@ -19,21 +19,24 @@ import java.awt.geom.Rectangle2D;
* clipping algorithm (line against clip rectangle).
*/
@SuppressWarnings("all")
public final class CohenSutherlandClipping
{
public final class CohenSutherlandClipping {
private static final int INSIDE = 0;
private static final int LEFT = 1;
private static final int RIGHT = 2;
private static final int BOTTOM = 4;
private static final int TOP = 8;
private double xMin;
private double yMin;
private double xMax;
private double yMax;
/**
* Creates a Cohen Sutherland clipper with clip rect (0, 0, 0, 0).
*/
public CohenSutherlandClipping() {
}
/**
* Creates a Cohen Sutherland clipper with the given clip rectangle.
*
* @param clip the clip rectangle to use
*/
public CohenSutherlandClipping(Rectangle2D clip) {
@ -42,6 +45,7 @@ public final class CohenSutherlandClipping
/**
* Sets the clip rectangle.
*
* @param clip the clip rectangle
*/
public void setClip(Rectangle2D clip) {
@ -51,19 +55,13 @@ public final class CohenSutherlandClipping
yMax = yMin + clip.getHeight();
}
private static final int INSIDE = 0;
private static final int LEFT = 1;
private static final int RIGHT = 2;
private static final int BOTTOM = 4;
private static final int TOP = 8;
private final int regionCode(double x, double y) {
int code = x < xMin
? LEFT
: x > xMax
int code = x < xMin
? LEFT
: x > xMax
? RIGHT
: INSIDE;
if (y < yMin) code |= BOTTOM;
if (y < yMin) code |= BOTTOM;
else if (y > yMax) code |= TOP;
return code;
}
@ -71,6 +69,7 @@ public final class CohenSutherlandClipping
/**
* Clips a given line against the clip rectangle.
* The modification (if needed) is done in place.
*
* @param line the line to clip
* @return true if line is clipped, false if line is
* totally outside the clip rect.
@ -87,9 +86,9 @@ public final class CohenSutherlandClipping
boolean vertical = p1x == p2x;
double slope = vertical
? 0d
: (p2y-p1y)/(p2x-p1x);
double slope = vertical
? 0d
: (p2y - p1y) / (p2x - p1x);
int c1 = regionCode(p1x, p1y);
int c2 = regionCode(p2x, p2y);
@ -103,31 +102,27 @@ public final class CohenSutherlandClipping
if ((c & LEFT) != INSIDE) {
qx = xMin;
qy = (Utils.feq(qx, p1x) ? 0 : qx-p1x)*slope + p1y;
}
else if ((c & RIGHT) != INSIDE) {
qy = (Utils.feq(qx, p1x) ? 0 : qx - p1x) * slope + p1y;
} else if ((c & RIGHT) != INSIDE) {
qx = xMax;
qy = (Utils.feq(qx, p1x) ? 0 : qx-p1x)*slope + p1y;
}
else if ((c & BOTTOM) != INSIDE) {
qy = (Utils.feq(qx, p1x) ? 0 : qx - p1x) * slope + p1y;
} else if ((c & BOTTOM) != INSIDE) {
qy = yMin;
qx = vertical
? p1x
: (Utils.feq(qy, p1y) ? 0 : qy-p1y)/slope + p1x;
}
else if ((c & TOP) != INSIDE) {
? p1x
: (Utils.feq(qy, p1y) ? 0 : qy - p1y) / slope + p1x;
} else if ((c & TOP) != INSIDE) {
qy = yMax;
qx = vertical
? p1x
: (Utils.feq(qy, p1y) ? 0 : qy-p1y)/slope + p1x;
? p1x
: (Utils.feq(qy, p1y) ? 0 : qy - p1y) / slope + p1x;
}
if (c == c1) {
p1x = qx;
p1y = qy;
c1 = regionCode(p1x, p1y);
}
else {
c1 = regionCode(p1x, p1y);
} else {
p2x = qx;
p2y = qy;
c2 = regionCode(p2x, p2y);
@ -137,4 +132,4 @@ public final class CohenSutherlandClipping
return true;
}
}
// end of file
// end of file

View File

@ -10,11 +10,6 @@ import java.util.List;
*/
public final class QuickSort {
private QuickSort() {
}
private static final Comparator<? extends Comparable> OBJCOMP = new Comparator<Comparable>() {
@Override
public int compare(Comparable object1, Comparable object2) {
@ -24,6 +19,10 @@ public final class QuickSort {
};
private QuickSort() {
}
/**
* Sorts the given list using the given comparator.
*

View File

@ -1,11 +1,11 @@
package com.iqser.red.service.redaction.v1.server.tableextraction.utils;
import lombok.extern.slf4j.Slf4j;
import java.math.BigDecimal;
import java.util.Comparator;
import java.util.List;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@SuppressWarnings("all")
public class Utils {

View File

@ -1,15 +1,5 @@
package com.iqser.red.service.redaction.v1.server.visualization.service;
import java.awt.Color;
import java.io.IOException;
import java.util.List;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
@ -17,9 +7,17 @@ import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.springframework.stereotype.Service;
import java.awt.Color;
import java.io.IOException;
import java.util.List;
@Slf4j
@Service
@ -34,7 +32,7 @@ public class PdfVisualisationService {
PDPage pdPage = document.getPage(page - 1);
PDPageContentStream contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true);
for(Paragraph paragraph : classifiedDoc.getParagraphs()) {
for (Paragraph paragraph : classifiedDoc.getParagraphs()) {
for (int i = 0; i <= paragraph.getPageBlocks().size() - 1; i++) {
@ -44,10 +42,10 @@ public class PdfVisualisationService {
continue;
}
if (textBlock instanceof TextBlock) {
textBlock.setClassification((i+1) + "/" + paragraph.getPageBlocks().size());
textBlock.setClassification((i + 1) + "/" + paragraph.getPageBlocks().size());
visualizeTextBlock((TextBlock) textBlock, contentStream);
} else if (textBlock instanceof Table) {
textBlock.setClassification((i+1) + "/" + paragraph.getPageBlocks().size());
textBlock.setClassification((i + 1) + "/" + paragraph.getPageBlocks().size());
visualizeTable((Table) textBlock, contentStream);
}
@ -59,7 +57,6 @@ public class PdfVisualisationService {
}
public void visualizeClassifications(Document classifiedDoc, PDDocument document) throws IOException {
for (int page = 1; page <= document.getNumberOfPages(); page++) {

View File

@ -1,4 +1,11 @@
server:
port: 8083
configuration-service.url: "http://localhost:8081"
configuration-service.url: "http://localhost:8081"
storage:
bucket-name: 'redaction'
endpoint: 'http://localhost:9000'
key: minioadmin
secret: minioadmin

View File

@ -17,4 +17,11 @@ management:
prometheus.enabled: ${monitoring.enabled:false}
health.enabled: true
endpoints.web.exposure.include: prometheus, health
metrics.export.prometheus.enabled: ${monitoring.enabled:false}
metrics.export.prometheus.enabled: ${monitoring.enabled:false}
storage:
signer-type: 'AWSS3V4SignerType'
bucket-name: 'redaction'
region: 'us-east-1'
endpoint: 'https://s3.amazonaws.com'

View File

@ -0,0 +1,34 @@
package com.iqser.red.service.redaction.v1.server;
import com.iqser.red.storage.commons.exception.StorageObjectDoesNotExist;
import com.iqser.red.storage.commons.service.StorageService;
import org.springframework.core.io.InputStreamResource;
import java.io.ByteArrayInputStream;
import java.util.HashMap;
import java.util.Map;
/**
 * In-memory {@link StorageService} replacement intended for tests.
 * <p>
 * Objects live in a plain {@link HashMap} keyed by object id, with no synchronization.
 * Byte arrays are stored and served by reference; no copies are made.
 */
public class InMemoryStorageService extends StorageService {

    /** Stored payloads, keyed by object id. */
    private final Map<String, byte[]> dataMap = new HashMap<>();

    /**
     * Creates an empty store. Both superclass constructor arguments are passed as {@code null}.
     * NOTE(review): presumably these are the real storage client/settings, which are unused
     * because every accessed method is overridden here - confirm against StorageService.
     */
    public InMemoryStorageService() {
        super(null, null);
    }

    /**
     * Returns the payload previously stored under {@code objectId}.
     *
     * @param objectId id the payload was stored under
     * @return a resource wrapping a fresh stream over the stored bytes
     * @throws StorageObjectDoesNotExist if nothing has been stored under {@code objectId}
     */
    @Override
    public InputStreamResource getObject(String objectId) {
        var res = dataMap.get(objectId);
        if (res == null) {
            // Carry the missing id in the cause so a failing test shows which object was requested.
            throw new StorageObjectDoesNotExist(new RuntimeException("No object stored under id '" + objectId + "'"));
        }
        return new InputStreamResource(new ByteArrayInputStream(res));
    }

    /**
     * Stores {@code data} under {@code objectId}, replacing any previous payload for that id.
     *
     * @param objectId id to store the payload under
     * @param data     payload bytes (kept by reference)
     */
    @Override
    public void storeObject(String objectId, byte[] data) {
        dataMap.put(objectId, data);
    }
}

View File

@ -1,28 +1,20 @@
package com.iqser.red.service.redaction.v1.server;
import static org.assertj.core.api.Assertions.assertThat;
import static org.mockito.Mockito.when;
import static org.springframework.boot.test.context.SpringBootTest.WebEnvironment.RANDOM_PORT;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.time.OffsetDateTime;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import java.util.stream.Collectors;
import com.amazonaws.services.s3.AmazonS3;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.configuration.v1.api.model.*;
import com.iqser.red.service.file.management.v1.api.model.FileType;
import com.iqser.red.service.redaction.v1.model.*;
import com.iqser.red.service.redaction.v1.server.classification.model.SectionText;
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
import com.iqser.red.service.redaction.v1.server.client.ImageClassificationClient;
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
import com.iqser.red.service.redaction.v1.server.controller.RedactionController;
import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
import com.iqser.red.storage.commons.service.StorageService;
import lombok.SneakyThrows;
import org.apache.commons.io.IOUtils;
import org.junit.Before;
import org.junit.Test;
@ -37,40 +29,20 @@ import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.boot.test.context.TestConfiguration;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Primary;
import org.springframework.core.io.ClassPathResource;
import org.springframework.test.context.junit4.SpringRunner;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.configuration.v1.api.model.Colors;
import com.iqser.red.service.configuration.v1.api.model.DictionaryEntry;
import com.iqser.red.service.configuration.v1.api.model.DictionaryResponse;
import com.iqser.red.service.configuration.v1.api.model.RulesResponse;
import com.iqser.red.service.configuration.v1.api.model.TypeResponse;
import com.iqser.red.service.configuration.v1.api.model.TypeResult;
import com.iqser.red.service.redaction.v1.model.AnalyzeRequest;
import com.iqser.red.service.redaction.v1.model.AnalyzeResult;
import com.iqser.red.service.redaction.v1.model.AnnotateRequest;
import com.iqser.red.service.redaction.v1.model.AnnotateResponse;
import com.iqser.red.service.redaction.v1.model.Comment;
import com.iqser.red.service.redaction.v1.model.IdRemoval;
import com.iqser.red.service.redaction.v1.model.ManualForceRedact;
import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry;
import com.iqser.red.service.redaction.v1.model.ManualRedactions;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.ReanalyzeResult;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
import com.iqser.red.service.redaction.v1.model.RedactionRequest;
import com.iqser.red.service.redaction.v1.model.RedactionResult;
import com.iqser.red.service.redaction.v1.model.RenalyzeRequest;
import com.iqser.red.service.redaction.v1.model.SectionText;
import com.iqser.red.service.redaction.v1.model.Status;
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
import com.iqser.red.service.redaction.v1.server.client.ImageClassificationClient;
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
import com.iqser.red.service.redaction.v1.server.controller.RedactionController;
import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
import java.io.*;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.time.OffsetDateTime;
import java.util.*;
import java.util.stream.Collectors;
import static org.assertj.core.api.Assertions.assertThat;
import static org.mockito.Mockito.when;
import static org.springframework.boot.test.context.SpringBootTest.WebEnvironment.RANDOM_PORT;
@RunWith(SpringRunner.class)
@SpringBootTest(webEnvironment = RANDOM_PORT)
@ -116,6 +88,15 @@ public class RedactionIntegrationTest {
@MockBean
private ImageClassificationClient imageClassificationClient;
@Autowired
private RedactionStorageService redactionStorageService;
@Autowired
private StorageService storageService;
@MockBean
private AmazonS3 amazonS3;
private final Map<String, List<String>> dictionary = new HashMap<>();
private final Map<String, String> typeColorMap = new HashMap<>();
private final Map<String, Boolean> hintTypeMap = new HashMap<>();
@ -126,6 +107,8 @@ public class RedactionIntegrationTest {
private final Map<String, Long> reanlysisVersions = new HashMap<>();
private final static String TEST_RULESET_ID = "123";
private final static String TEST_PROJECT_ID = "123";
private final static String TEST_FILE_ID = "123";
@TestConfiguration
public static class RedactionIntegrationTestConfiguration {
@ -146,6 +129,12 @@ public class RedactionIntegrationTest {
return kieServices.newKieContainer(kieModule.getReleaseId());
}
/**
 * Registers an {@link InMemoryStorageService} as a {@link StorageService} bean.
 * {@code @Primary} makes it the preferred candidate if another {@link StorageService}
 * bean is present in the application context.
 *
 * @return a new, empty in-memory storage service
 */
@Bean
@Primary
public StorageService inmemoryStorage() {
return new InMemoryStorageService();
}
}
@ -464,15 +453,16 @@ public class RedactionIntegrationTest {
input.addAll(getPathsRecursively(file));
}
for (File path : input) {
AnalyzeRequest request = AnalyzeRequest.builder()
.ruleSetId(TEST_RULESET_ID)
.document(IOUtils.toByteArray(new FileInputStream(path)))
.build();
AnalyzeRequest request = prepareStorage(new FileInputStream((path)));
System.out.println("Redacting file : " + path.getName());
AnalyzeResult result = redactionController.analyze(request);
Map<String, List<RedactionLogEntry>> duplicates = new HashMap<>();
result.getRedactionLog().getRedactionLogEntry().forEach(entry -> {
var redactionLog = redactionStorageService.getRedactionLog(TEST_PROJECT_ID, TEST_FILE_ID);
redactionLog.getRedactionLogEntry().forEach(entry -> {
duplicates.computeIfAbsent(entry.getId(), v -> new ArrayList<>()).add(entry);
});
@ -484,13 +474,7 @@ public class RedactionIntegrationTest {
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(1L);
long rstart = System.currentTimeMillis();
ReanalyzeResult reanalyzeResult = redactionController.reanalyze(RenalyzeRequest.builder()
.redactionLog(result.getRedactionLog())
.document(IOUtils.toByteArray(new FileInputStream(path)))
.manualRedactions(null)
.text(result.getText())
.ruleSetId(TEST_RULESET_ID)
.build());
redactionController.reanalyze(request);
long rend = System.currentTimeMillis();
System.out.println("reanalysis analysis duration: " + (rend - rstart));
@ -528,15 +512,14 @@ public class RedactionIntegrationTest {
System.out.println("redactionTest");
long start = System.currentTimeMillis();
ClassPathResource pdfFileResource = new ClassPathResource("files/new/Single Study - Oral (Gavage) Mouse.pdf");
AnalyzeRequest request = AnalyzeRequest.builder()
.ruleSetId(TEST_RULESET_ID)
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
.build();
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
AnalyzeResult result = redactionController.analyze(request);
result.getRedactionLog().getRedactionLogEntry().forEach(entry -> {
var redactionLog = redactionStorageService.getRedactionLog(TEST_PROJECT_ID, TEST_FILE_ID);
var text = redactionStorageService.getText(TEST_PROJECT_ID, TEST_FILE_ID);
redactionLog.getRedactionLogEntry().forEach(entry -> {
if (entry.isImage()) {
System.out.println("---->" + entry.getType());
}
@ -547,13 +530,13 @@ public class RedactionIntegrationTest {
System.out.println("first analysis duration: " + (end - start));
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Test.json")) {
fileOutputStream.write(objectMapper.writeValueAsBytes(result.getText()));
fileOutputStream.write(objectMapper.writeValueAsBytes(redactionStorageService.getText(TEST_PROJECT_ID, TEST_FILE_ID)));
}
int correctFound = 0;
loop:
for (RedactionLogEntry redactionLogEntry : result.getRedactionLog().getRedactionLogEntry()) {
for (SectionText sectionText : result.getText().getSectionTexts()) {
for (RedactionLogEntry redactionLogEntry : redactionLog.getRedactionLogEntry()) {
for (SectionText sectionText : text.getSectionTexts()) {
if (redactionLogEntry.isImage()) {
correctFound++;
continue loop;
@ -569,7 +552,7 @@ public class RedactionIntegrationTest {
}
}
}
assertThat(correctFound).isEqualTo(result.getRedactionLog().getRedactionLogEntry().size());
assertThat(correctFound).isEqualTo(redactionLog.getRedactionLogEntry().size());
dictionary.get(AUTHOR).add("properties");
reanlysisVersions.put("properties", 1L);
@ -585,20 +568,14 @@ public class RedactionIntegrationTest {
when(dictionaryClient.getDictionaryForType(VERTEBRATE, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(VERTEBRATE));
start = System.currentTimeMillis();
ReanalyzeResult reanalyzeResult = redactionController.reanalyze(RenalyzeRequest.builder()
.redactionLog(result.getRedactionLog())
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
.text(result.getText())
.ruleSetId(TEST_RULESET_ID)
.build());
AnalyzeResult reanalyzeResult = redactionController.reanalyze(request);
end = System.currentTimeMillis();
System.out.println("reanalysis analysis duration: " + (end - start));
AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder()
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
.redactionLog(reanalyzeResult.getRedactionLog())
.sectionGrid(result.getSectionGrid())
.projectId(TEST_PROJECT_ID)
.fileId(TEST_FILE_ID)
.build());
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Annotated.pdf")) {
@ -613,19 +590,13 @@ public class RedactionIntegrationTest {
System.out.println("testTableRedaction");
long start = System.currentTimeMillis();
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");
AnalyzeRequest request = AnalyzeRequest.builder()
.ruleSetId(TEST_RULESET_ID)
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
.build();
AnalyzeRequest request = prepareStorage("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");
AnalyzeResult result = redactionController.analyze(request);
AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder()
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
.redactionLog(result.getRedactionLog())
.sectionGrid(result.getSectionGrid())
.projectId(TEST_PROJECT_ID)
.fileId(TEST_FILE_ID)
.build());
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Annotated.pdf")) {
@ -680,12 +651,9 @@ public class RedactionIntegrationTest {
// manualRedactions.getEntriesToAdd().add(manualRedactionEntry);
AnalyzeRequest request = AnalyzeRequest.builder()
.ruleSetId(TEST_RULESET_ID)
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
.manualRedactions(manualRedactions)
.build();
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
request.setManualRedactions(manualRedactions);
AnalyzeResult result = redactionController.analyze(request);
manualRedactions.getEntriesToAdd().add(manualRedactionEntry);
@ -694,20 +662,15 @@ public class RedactionIntegrationTest {
.status(Status.APPROVED)
.build()));
ReanalyzeResult reanalyzeResult = redactionController.reanalyze(RenalyzeRequest.builder()
.redactionLog(result.getRedactionLog())
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
.manualRedactions(manualRedactions)
.text(result.getText())
.ruleSetId(TEST_RULESET_ID)
.build());
redactionController.reanalyze(request);
AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder()
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
.redactionLog(reanalyzeResult.getRedactionLog())
.sectionGrid(result.getSectionGrid())
.projectId(TEST_PROJECT_ID)
.fileId(TEST_FILE_ID)
.build());
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Annotated.pdf")) {
fileOutputStream.write(annotateResponse.getDocument());
}
@ -724,11 +687,16 @@ public class RedactionIntegrationTest {
System.out.println("classificationTest");
ClassPathResource pdfFileResource = new ClassPathResource("files/Trinexapac/93 Trinexapac-ethyl_RAR_03_Volume_3CA_B-1_2017-03-31.pdf");
RedactionRequest request = RedactionRequest.builder()
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
RedactionRequest redactionRequest = RedactionRequest.builder()
.projectId(request.getProjectId())
.fileId(request.getFileId())
.ruleSetId(request.getRuleSetId())
.build();
RedactionResult result = redactionController.classify(request);
RedactionResult result = redactionController.classify(redactionRequest);
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Classified.pdf")) {
fileOutputStream.write(result.getDocument());
@ -742,11 +710,15 @@ public class RedactionIntegrationTest {
System.out.println("sectionsTest");
ClassPathResource pdfFileResource = new ClassPathResource("files/Fludioxonil/51 " + "Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf");
RedactionRequest request = RedactionRequest.builder()
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
RedactionRequest redactionRequest = RedactionRequest.builder()
.projectId(request.getProjectId())
.fileId(request.getFileId())
.ruleSetId(request.getRuleSetId())
.build();
RedactionResult result = redactionController.sections(request);
RedactionResult result = redactionController.sections(redactionRequest);
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Sections.pdf")) {
fileOutputStream.write(result.getDocument());
@ -760,11 +732,15 @@ public class RedactionIntegrationTest {
System.out.println("htmlTablesTest");
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");
RedactionRequest request = RedactionRequest.builder()
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
RedactionRequest redactionRequest = RedactionRequest.builder()
.projectId(request.getProjectId())
.fileId(request.getFileId())
.ruleSetId(request.getRuleSetId())
.build();
RedactionResult result = redactionController.htmlTables(request);
RedactionResult result = redactionController.htmlTables(redactionRequest);
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Tables.html")) {
fileOutputStream.write(result.getDocument());
@ -778,11 +754,15 @@ public class RedactionIntegrationTest {
System.out.println("htmlTableRotationTest");
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");
RedactionRequest request = RedactionRequest.builder()
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
RedactionRequest redactionRequest = RedactionRequest.builder()
.projectId(request.getProjectId())
.fileId(request.getFileId())
.ruleSetId(request.getRuleSetId())
.build();
RedactionResult result = redactionController.htmlTables(request);
RedactionResult result = redactionController.htmlTables(redactionRequest);
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Tables.html")) {
fileOutputStream.write(result.getDocument());
@ -795,20 +775,45 @@ public class RedactionIntegrationTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Phantom Cells.pdf");
AnalyzeRequest request = AnalyzeRequest.builder()
.ruleSetId(TEST_RULESET_ID)
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
.build();
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
AnalyzeResult result = redactionController.analyze(request);
result.getRedactionLog().getRedactionLogEntry().forEach(entry -> {
var redactionLog = redactionStorageService.getRedactionLog(TEST_PROJECT_ID, TEST_FILE_ID);
redactionLog.getRedactionLogEntry().forEach(entry -> {
if (!entry.isHint()) {
assertThat(entry.getReason()).isEqualTo("Not redacted because row is not a vertebrate study");
}
});
}
/**
 * Loads the given classpath resource and stores it via {@link #prepareStorage(InputStream)}.
 * The resource stream is closed once the delegate has returned.
 *
 * @param file classpath location of the document to store
 * @return the request produced by {@link #prepareStorage(InputStream)}
 */
@SneakyThrows
private AnalyzeRequest prepareStorage(String file) {

    ClassPathResource pdfFileResource = new ClassPathResource(file);
    // The stream is opened here, so it is closed here; previously it was left open.
    try (InputStream stream = pdfFileResource.getInputStream()) {
        return prepareStorage(stream);
    }
}
/**
 * Reads the whole stream, stores the bytes as the {@code FileType.ORIGIN} object of the
 * test project/file, and returns an {@link AnalyzeRequest} addressing that project/file.
 * This method does not close {@code stream}.
 *
 * @param stream content of the document to store
 * @return request carrying the test rule set, project and file ids and a
 *         {@code lastProcessed} timestamp taken before the bytes are stored
 */
@SneakyThrows
private AnalyzeRequest prepareStorage(InputStream stream) {

    // Built first so the timestamp precedes the storage write, as before.
    AnalyzeRequest analyzeRequest = AnalyzeRequest.builder()
            .ruleSetId(TEST_RULESET_ID)
            .projectId(TEST_PROJECT_ID)
            .fileId(TEST_FILE_ID)
            .lastProcessed(OffsetDateTime.now())
            .build();

    byte[] documentBytes = IOUtils.toByteArray(stream);
    var originStorageId = RedactionStorageService.StorageIdUtils.getStorageId(TEST_PROJECT_ID, TEST_FILE_ID, FileType.ORIGIN);
    storageService.storeObject(originStorageId, documentBytes);

    return analyzeRequest;
}
@Test
public void sponsorCompanyTest() throws IOException {
@ -816,17 +821,14 @@ public class RedactionIntegrationTest {
long start = System.currentTimeMillis();
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/sponsor_companies.pdf");
AnalyzeRequest request = AnalyzeRequest.builder()
.ruleSetId(TEST_RULESET_ID)
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
.build();
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
AnalyzeResult result = redactionController.analyze(request);
AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder()
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
.redactionLog(result.getRedactionLog())
.sectionGrid(result.getSectionGrid())
.projectId(TEST_PROJECT_ID)
.fileId(TEST_FILE_ID)
.build());
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Annotated.pdf")) {
@ -857,4 +859,4 @@ public class RedactionIntegrationTest {
}
}
}
}

View File

@ -1,12 +1,8 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import com.iqser.red.service.configuration.v1.api.model.Colors;
import com.iqser.red.service.configuration.v1.api.model.DictionaryEntry;
import com.iqser.red.service.configuration.v1.api.model.DictionaryResponse;
import com.iqser.red.service.configuration.v1.api.model.RulesResponse;
import com.iqser.red.service.configuration.v1.api.model.TypeResponse;
import com.iqser.red.service.configuration.v1.api.model.TypeResult;
import com.iqser.red.service.redaction.v1.model.RedactionRequest;
import com.amazonaws.services.s3.AmazonS3;
import com.iqser.red.service.configuration.v1.api.model.*;
import com.iqser.red.service.redaction.v1.server.InMemoryStorageService;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
@ -14,7 +10,7 @@ import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader;
import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService;
import org.apache.commons.io.IOUtils;
import com.iqser.red.storage.commons.service.StorageService;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.junit.Before;
import org.junit.Ignore;
@ -30,6 +26,7 @@ import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.boot.test.context.TestConfiguration;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Primary;
import org.springframework.core.io.ClassPathResource;
import org.springframework.test.context.junit4.SpringRunner;
@ -40,15 +37,8 @@ import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.*;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static org.assertj.core.api.Assertions.assertThat;
import static org.mockito.Mockito.when;
@ -80,6 +70,9 @@ public class EntityRedactionServiceTest {
@Autowired
private DroolsExecutionService droolsExecutionService;
@MockBean
private AmazonS3 amazonS3;
private final static String TEST_RULESET_ID = "123";
@TestConfiguration
@ -101,6 +94,13 @@ public class EntityRedactionServiceTest {
return kieServices.newKieContainer(kieModule.getReleaseId());
}
/**
 * Registers an {@link InMemoryStorageService} as a {@link StorageService} bean.
 * {@code @Primary} makes it the preferred candidate if another {@link StorageService}
 * bean is present in the application context.
 *
 * @return a new, empty in-memory storage service
 */
@Bean
@Primary
public StorageService inmemoryStorage() {
return new InMemoryStorageService();
}
}
@ -125,10 +125,6 @@ public class EntityRedactionServiceTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Single Table.pdf");
RedactionRequest redactionRequest = RedactionRequest.builder()
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
.build();
DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
.entries(toDictionaryEntry(Arrays.asList("Casey, H.W.", "OLoughlin, C.K.", "Salamon, C.M.", "Smith, S.H.")))
.build();
@ -144,7 +140,7 @@ public class EntityRedactionServiceTest {
.build();
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);
try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(redactionRequest.getDocument()))) {
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
@ -158,10 +154,6 @@ public class EntityRedactionServiceTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/nested_redaction.pdf");
RedactionRequest redactionRequest = RedactionRequest.builder()
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
.build();
DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
.entries(toDictionaryEntry(Arrays.asList("Casey, H.W.", "OLoughlin, C.K.", "Salamon, C.M.", "Smith, S.H.")))
.build();
@ -176,7 +168,7 @@ public class EntityRedactionServiceTest {
.build();
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);
try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(redactionRequest.getDocument()))) {
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
@ -526,4 +518,4 @@ public class EntityRedactionServiceTest {
return dictionaryEntries;
}
}
}

View File

@ -1,16 +1,14 @@
package com.iqser.red.service.redaction.v1.server.segmentation;
import static org.assertj.core.api.Assertions.assertThat;
import java.io.ByteArrayOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import javax.imageio.ImageIO;
import com.amazonaws.services.s3.AmazonS3;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService;
import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.junit.Ignore;
import org.junit.Test;
@ -22,15 +20,15 @@ import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.core.io.ClassPathResource;
import org.springframework.test.context.junit4.SpringRunner;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService;
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService;
import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService;
import javax.imageio.ImageIO;
import java.io.ByteArrayOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import static org.assertj.core.api.Assertions.assertThat;
@SpringBootTest
@RunWith(SpringRunner.class)
@ -51,6 +49,8 @@ public class PdfSegmentationServiceTest {
@MockBean
private KieContainer kieContainer;
@MockBean
private AmazonS3 amazonS3;
@Test
@Ignore
@ -76,6 +76,29 @@ public class PdfSegmentationServiceTest {
}
/**
 * Parses the "Spanning Cells" sample and checks that the first extracted table is a
 * full 6 x 13 grid: 6 columns, 13 rows, and 6 * 13 cells summed over all rows.
 */
@Test
public void testPDFSegmentationWithComplexTable() throws IOException {

    final int expectedColumns = 6;
    final int expectedRows = 13;

    ClassPathResource spanningCellsPdf = new ClassPathResource("files/Minimal Examples/Spanning Cells.pdf");

    try (PDDocument pdDocument = PDDocument.load(spanningCellsPdf.getInputStream())) {
        Document parsed = pdfSegmentationService.parseDocument(pdDocument);

        // Gather the tables of all paragraphs once; both checks below use this list.
        List<Table> tables = parsed.getParagraphs()
                .stream()
                .flatMap(paragraph -> paragraph.getTables().stream())
                .collect(Collectors.toList());
        assertThat(tables).isNotEmpty();

        Table firstTable = tables.get(0);
        assertThat(firstTable.getColCount()).isEqualTo(expectedColumns);
        assertThat(firstTable.getRowCount()).isEqualTo(expectedRows);

        int totalCells = firstTable.getRows().stream().mapToInt(List::size).sum();
        assertThat(totalCells).isEqualTo(expectedColumns * expectedRows);
    }
}
@Test
public void testTableExtraction() throws IOException {