diff --git a/bamboo-specs/src/main/java/buildjob/PlanSpec.java b/bamboo-specs/src/main/java/buildjob/PlanSpec.java
index d6d0ea67..e2c7fe08 100644
--- a/bamboo-specs/src/main/java/buildjob/PlanSpec.java
+++ b/bamboo-specs/src/main/java/buildjob/PlanSpec.java
@@ -1,7 +1,5 @@
package buildjob;
-import static com.atlassian.bamboo.specs.builders.task.TestParserTask.createJUnitParserTask;
-
import com.atlassian.bamboo.specs.api.BambooSpec;
import com.atlassian.bamboo.specs.api.builders.BambooKey;
import com.atlassian.bamboo.specs.api.builders.docker.DockerConfiguration;
@@ -24,6 +22,8 @@ import com.atlassian.bamboo.specs.builders.trigger.BitbucketServerTrigger;
import com.atlassian.bamboo.specs.model.task.InjectVariablesScope;
import com.atlassian.bamboo.specs.util.BambooServer;
+import static com.atlassian.bamboo.specs.builders.task.TestParserTask.createJUnitParserTask;
+
/**
* Plan configuration for Bamboo.
* Learn more on: https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs
@@ -33,6 +33,8 @@ public class PlanSpec {
private static final String SERVICE_NAME = "redaction-service";
+ private static final String JVM_ARGS =" -Xmx4g -XX:+ExitOnOutOfMemoryError -XX:SurvivorRatio=2 -XX:NewRatio=1 -XX:InitialTenuringThreshold=16 -XX:MaxTenuringThreshold=16 -XX:InitiatingHeapOccupancyPercent=35 ";
+
private static final String SERVICE_KEY = SERVICE_NAME.toUpperCase().replaceAll("-", "");
/**
@@ -82,9 +84,12 @@ public class PlanSpec {
.checkoutItems(new CheckoutItem().defaultRepository()),
new ScriptTask()
.description("Build")
+ .environmentVariables("MAVEN_OPTS="+JVM_ARGS)
.inlineBody("#!/bin/bash\n" +
"set -e\n" +
+ "export MAVEN_OPTS=\"$MAVEN_OPTS "+JVM_ARGS +"\"\n" +
+
"if [[ \"${bamboo.version_tag}\" != \"dev\" ]]; then ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn --no-transfer-progress -f ${bamboo_build_working_directory}/" + SERVICE_NAME + "-v1/pom.xml versions:set -DnewVersion=${bamboo.version_tag}; fi\n" +
"if [[ \"${bamboo.version_tag}\" != \"dev\" ]]; then ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn --no-transfer-progress -f ${bamboo_build_working_directory}/" + SERVICE_NAME + "-image-v1/pom.xml versions:set -DnewVersion=${bamboo.version_tag}; fi\n" +
diff --git a/redaction-service-v1/pom.xml b/redaction-service-v1/pom.xml
index 124e5ae4..501ae19c 100644
--- a/redaction-service-v1/pom.xml
+++ b/redaction-service-v1/pom.xml
@@ -32,7 +32,7 @@
com.iqser.red
platform-commons-dependency
- 1.2.9
+ 1.3.0
import
pom
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Footer.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Footer.java
index 61d12a43..b88a16b7 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Footer.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Footer.java
@@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
+import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import lombok.AllArgsConstructor;
import lombok.Data;
@@ -12,7 +13,7 @@ public class Footer {
private List textBlocks;
-
+ @JsonIgnore
public SearchableText getSearchableText() {
SearchableText searchableText = new SearchableText();
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Header.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Header.java
index f3067452..133e0245 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Header.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Header.java
@@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
+import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import lombok.AllArgsConstructor;
import lombok.Data;
@@ -12,7 +13,7 @@ public class Header {
private List textBlocks;
-
+ @JsonIgnore
public SearchableText getSearchableText() {
SearchableText searchableText = new SearchableText();
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java
index c9c88cec..77649132 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java
@@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
+import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.model.SectionArea;
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
@@ -31,6 +32,12 @@ public class SectionText {
private List cellStarts = new ArrayList<>();
+ /**
+  * Stores the tabular data of this section.
+  *
+  * An entry with a {@code null} key is removed from the given map before it is stored.
+  * The removal is performed on the caller's map instance, which is then kept by
+  * reference (no defensive copy is made).
+  * NOTE(review): presumably the null key is dropped because it cannot be written as a
+  * JSON object key - confirm against the serialization code.
+  *
+  * @param tabularData the tabular data to store; may be {@code null}, in which case the
+  *                    field is reset to {@code null}
+  */
+ public void setTabularData(Map tabularData) {
+     // A null argument used to fail with a NullPointerException on the remove call
+     // below; a setter should accept it, so the cleanup is skipped in that case.
+     if (tabularData != null) {
+         tabularData.remove(null);
+     }
+     this.tabularData = tabularData;
+ }
+
+ @JsonIgnore
public SearchableText getSearchableText() {
SearchableText searchableText = new SearchableText();
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/TextBlock.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/TextBlock.java
index 6da9f6a0..63cfc11c 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/TextBlock.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/TextBlock.java
@@ -1,11 +1,13 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
+import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
+import lombok.NoArgsConstructor;
import java.util.ArrayList;
import java.util.List;
@@ -13,6 +15,7 @@ import java.util.List;
@AllArgsConstructor
@Builder
@Data
+@NoArgsConstructor
public class TextBlock extends AbstractTextContainer {
@Builder.Default
@@ -116,6 +119,7 @@ public class TextBlock extends AbstractTextContainer {
}
@Override
+ @JsonIgnore
public String getText() {
StringBuilder sb = new StringBuilder();
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/UnclassifiedText.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/UnclassifiedText.java
index 79277b9e..0d51a4f8 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/UnclassifiedText.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/UnclassifiedText.java
@@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
+import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import lombok.AllArgsConstructor;
import lombok.Data;
@@ -12,7 +13,7 @@ public class UnclassifiedText {
private List textBlocks;
-
+ @JsonIgnore
public SearchableText getSearchableText() {
SearchableText searchableText = new SearchableText();
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java
index 250001a7..63a212b8 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java
@@ -59,7 +59,7 @@ public class RedactionController implements RedactionResource {
try (PDDocument pdDocument = PDDocument.load(storedObjectStream, MemoryUsageSetting.setupTempFileOnly())) {
pdDocument.setAllSecurityToBeRemoved(true);
-
+
dictionaryService.updateDictionary(redactionLog.getRuleSetId());
annotationService.annotate(pdDocument, redactionLog, sectionsGrid);
@@ -131,7 +131,7 @@ public class RedactionController implements RedactionResource {
try {
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN));
- classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream);
+ classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream, true);
} catch (Exception e) {
throw new RedactionException(e);
}
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedTextPosition.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedTextPosition.java
new file mode 100644
index 00000000..d8e72d22
--- /dev/null
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedTextPosition.java
@@ -0,0 +1,52 @@
+package com.iqser.red.service.redaction.v1.server.parsing.model;
+
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+import lombok.SneakyThrows;
+import org.apache.pdfbox.text.TextPosition;
+import org.springframework.beans.BeanUtils;
+
+/**
+ * Plain data holder for a subset of the properties of a PDFBox {@link TextPosition}.
+ * Getters, setters and the no-argument constructor are generated by Lombok.
+ * Fields annotated with {@link JsonIgnore} are excluded from the JSON representation.
+ * NOTE(review): presumably this exists so that text positions can be stored as JSON
+ * and read back for reanalysis - confirm against the storage code.
+ */
+@Data
+@NoArgsConstructor
+public class RedTextPosition {
+
+    // String form of the source text matrix, set explicitly in fromTextPosition.
+    private String textMatrix;
+
+    // The remaining serialized values are filled by property name from the source object.
+    private int rotation;
+    private float y;
+    private float pageHeight;
+    private float pageWidth;
+    private String unicode;
+    private float XDirAdj;
+    private float YDirAdj;
+    private float width;
+    private float heightDir;
+
+    // Excluded from JSON: not used in reanalysis.
+    @JsonIgnore
+    private float widthOfSpace;
+
+    // Excluded from JSON: not used in reanalysis.
+    @JsonIgnore
+    private float fontSizeInPt;
+
+    // Excluded from JSON: not used in reanalysis.
+    @JsonIgnore
+    private String fontName;
+
+
+    /**
+     * Builds a {@code RedTextPosition} from a PDFBox text position.
+     * Bean properties are first copied with {@link BeanUtils#copyProperties(Object, Object)};
+     * font name, font size and the text matrix string are then set explicitly.
+     *
+     * @param textPosition the PDFBox text position to copy from
+     * @return the populated copy
+     */
+    @SneakyThrows
+    public static RedTextPosition fromTextPosition(TextPosition textPosition) {
+        RedTextPosition copy = new RedTextPosition();
+        BeanUtils.copyProperties(textPosition, copy);
+
+        // Values that are derived from the source rather than copied one-to-one.
+        String sourceFontName = textPosition.getFont().getName();
+        float sourceFontSize = textPosition.getFontSizeInPt();
+        String matrixAsString = textPosition.getTextMatrix().toString();
+
+        copy.setFontName(sourceFontName);
+        copy.setFontSizeInPt(sourceFontSize);
+        copy.setTextMatrix(matrixAsString);
+        return copy;
+    }
+
+
+}
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java
index c6181f4e..10b5abb1 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java
@@ -1,29 +1,52 @@
package com.iqser.red.service.redaction.v1.server.parsing.model;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import lombok.Data;
-import lombok.RequiredArgsConstructor;
+import lombok.NoArgsConstructor;
import org.apache.pdfbox.text.TextPosition;
import java.util.ArrayList;
import java.util.List;
+import java.util.stream.Collectors;
@Data
-@RequiredArgsConstructor
+@NoArgsConstructor
+@JsonIgnoreProperties({ "empty" })
public class TextPositionSequence implements CharSequence {
- private final int page;
- private List textPositions = new ArrayList<>();
+ private int page;
+ private List textPositions = new ArrayList<>();
+
+ private float x1;
+ private float x2;
+
+ public TextPositionSequence(int page) {
+ this.page = page;
+ }
+
+
+ /**
+  * Creates a sequence around text positions that are already available in their
+  * converted form. The given list is stored as-is: it is neither copied nor converted.
+  *
+  * @param textPositions the text positions the new sequence refers to
+  * @param page          the page number assigned to the new sequence
+  * @return the new sequence
+  */
+ public static TextPositionSequence fromData(List textPositions, int page) {
+     TextPositionSequence sequence = new TextPositionSequence(page);
+     sequence.textPositions = textPositions;
+     return sequence;
+ }
public TextPositionSequence(List textPositions, int page) {
- this.textPositions = textPositions;
+ this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList());
this.page = page;
}
+
+
+
@Override
public int length() {
@@ -34,7 +57,7 @@ public class TextPositionSequence implements CharSequence {
@Override
public char charAt(int index) {
- TextPosition textPosition = textPositionAt(index);
+ RedTextPosition textPosition = textPositionAt(index);
String text = textPosition.getUnicode();
return text.charAt(0);
}
@@ -42,7 +65,7 @@ public class TextPositionSequence implements CharSequence {
public char charAt(int index, boolean caseInSensitive) {
- TextPosition textPosition = textPositionAt(index);
+ RedTextPosition textPosition = textPositionAt(index);
String text = textPosition.getUnicode();
return caseInSensitive ? text.toLowerCase().charAt(0) : text.charAt(0);
}
@@ -51,7 +74,7 @@ public class TextPositionSequence implements CharSequence {
@Override
public TextPositionSequence subSequence(int start, int end) {
- return new TextPositionSequence(textPositions.subList(start, end), page);
+ return fromData(textPositions.subList(start, end), page);
}
@@ -66,18 +89,25 @@ public class TextPositionSequence implements CharSequence {
}
- public TextPosition textPositionAt(int index) {
+ public RedTextPosition textPositionAt(int index) {
return textPositions.get(index);
}
- public void add(TextPosition textPosition) {
+ public void add(RedTextPosition textPosition) {
this.textPositions.add(textPosition);
}
+ public void add(TextPosition textPosition) {
+
+ this.textPositions.add(RedTextPosition.fromTextPosition(textPosition));
+ }
+
+
+ @JsonIgnore
public float getX1() {
if (textPositions.get(0).getRotation() == 90) {
@@ -88,6 +118,7 @@ public class TextPositionSequence implements CharSequence {
}
+ @JsonIgnore
public float getX2() {
if (textPositions.get(0).getRotation() == 90) {
@@ -98,13 +129,14 @@ public class TextPositionSequence implements CharSequence {
}
}
-
+ @JsonIgnore
public float getRotationAdjustedY() {
return textPositions.get(0).getY();
}
+ @JsonIgnore
public float getY1() {
if (textPositions.get(0).getRotation() == 90) {
@@ -115,6 +147,7 @@ public class TextPositionSequence implements CharSequence {
}
+ @JsonIgnore
public float getY2() {
if (textPositions.get(0).getRotation() == 90) {
@@ -125,38 +158,40 @@ public class TextPositionSequence implements CharSequence {
}
+ @JsonIgnore
public float getTextHeight() {
return textPositions.get(0).getHeightDir() + 2;
}
+ @JsonIgnore
public float getHeight() {
return getY2() - getY1();
}
+ @JsonIgnore
public float getWidth() {
return getX2() - getX1();
}
+ @JsonIgnore
public String getFont() {
-
- return textPositions.get(0)
- .getFont()
- .toString()
+ return textPositions.get(0).getFontName()
.toLowerCase()
.replaceAll(",bold", "")
.replaceAll(",italic", "");
}
+ @JsonIgnore
public String getFontStyle() {
- String lowercaseFontName = textPositions.get(0).getFont().toString().toLowerCase();
+ String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase();
if (lowercaseFontName.contains("bold") && lowercaseFontName.contains("italic")) {
return "bold, italic";
@@ -170,25 +205,25 @@ public class TextPositionSequence implements CharSequence {
}
-
+ @JsonIgnore
public float getFontSize() {
return textPositions.get(0).getFontSizeInPt();
}
-
+ @JsonIgnore
public float getSpaceWidth() {
return textPositions.get(0).getWidthOfSpace();
}
-
+ @JsonIgnore
public int getRotation() {
return textPositions.get(0).getRotation();
}
-
+ @JsonIgnore
public Rectangle getRectangle() {
float height = getTextHeight();
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/CellValue.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/CellValue.java
index 6d65518c..e38c8cf2 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/CellValue.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/CellValue.java
@@ -3,19 +3,23 @@ package com.iqser.red.service.redaction.v1.server.redaction.model;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
-import lombok.Value;
+import lombok.AllArgsConstructor;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
-@Value
+@Data
+@NoArgsConstructor
+@AllArgsConstructor
public class CellValue {
- private List textBlocks;
+ private List textBlocks = new ArrayList<>();
private int rowSpanStart;
-
@Override
public String toString() {
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Image.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Image.java
index e4e6167a..766d607d 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Image.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Image.java
@@ -5,8 +5,6 @@ import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
-import java.awt.geom.Rectangle2D;
-
@Data
@Builder
@NoArgsConstructor
@@ -14,7 +12,7 @@ import java.awt.geom.Rectangle2D;
public class Image {
private String type;
- private Rectangle2D position;
+ private RedRectangle2D position;
private boolean redaction;
private String redactionReason;
private String legalBasis;
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/PdfImage.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/PdfImage.java
index f7f6ad4f..1631717f 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/PdfImage.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/PdfImage.java
@@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
+import com.fasterxml.jackson.annotation.JsonIgnore;
import lombok.Data;
import lombok.NonNull;
import lombok.RequiredArgsConstructor;
@@ -11,9 +12,10 @@ import java.awt.image.BufferedImage;
@RequiredArgsConstructor
public class PdfImage {
+ @JsonIgnore
private BufferedImage image;
@NonNull
- private Rectangle2D position;
+ private RedRectangle2D position;
private ImageType imageType;
private boolean isAppendedToParagraph;
@@ -22,7 +24,7 @@ public class PdfImage {
public PdfImage(BufferedImage image, Rectangle2D position, int page) {
this.image = image;
- this.position = position;
+ this.position = new RedRectangle2D(position.getX(), position.getY(), position.getWidth(), position.getHeight());
this.page = page;
}
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/RedRectangle2D.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/RedRectangle2D.java
new file mode 100644
index 00000000..601d328c
--- /dev/null
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/RedRectangle2D.java
@@ -0,0 +1,35 @@
+package com.iqser.red.service.redaction.v1.server.redaction.model;
+
+
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import lombok.AllArgsConstructor;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+
+/**
+ * Mutable rectangle described by its origin ({@code x}, {@code y}) and its size
+ * ({@code width}, {@code height}). Getters, setters and both constructors are
+ * generated by Lombok.
+ */
+@Data
+@NoArgsConstructor
+@AllArgsConstructor
+public class RedRectangle2D {
+
+    private double x;
+    private double y;
+    private double width;
+    private double height;
+
+    /**
+     * Tells whether this rectangle has no area. Excluded from the JSON representation.
+     *
+     * @return {@code true} if the width or the height is zero or negative
+     */
+    @JsonIgnore
+    public boolean isEmpty() {
+        if (width <= 0.0f) {
+            return true;
+        }
+        return height <= 0.0f;
+    }
+
+    /**
+     * Tests whether the rectangle given by origin and size lies entirely inside this one.
+     *
+     * @param x the x coordinate of the other rectangle's origin
+     * @param y the y coordinate of the other rectangle's origin
+     * @param w the width of the other rectangle
+     * @param h the height of the other rectangle
+     * @return {@code false} if this rectangle is empty or {@code w}/{@code h} is not
+     *         positive; otherwise whether the other rectangle is fully contained
+     */
+    public boolean contains(double x, double y, double w, double h) {
+        if (isEmpty()) {
+            return false;
+        }
+        if (w <= 0 || h <= 0) {
+            return false;
+        }
+        // The parameters shadow the fields, so the own origin is read through the getters.
+        double ownX = getX();
+        double ownY = getY();
+        // Comparisons are kept in their positive form so NaN inputs still yield false.
+        boolean originInside = x >= ownX && y >= ownY;
+        boolean farCornerInside = (x + w) <= ownX + getWidth() && (y + h) <= ownY + getHeight();
+        return originInside && farCornerInside;
+    }
+}
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java
index 89ccf4a4..5ee4cb3f 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java
@@ -187,6 +187,7 @@ public class EntityRedactionService {
.get(0)
.getPage());
sectionText.getSectionAreas().add(sectionArea);
+ sectionText.getTextBlocks().addAll(cell.getTextBlocks());
addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber.intValue());
int cellStart = start;
@@ -235,6 +236,8 @@ public class EntityRedactionService {
sectionText.setHeadline(table.getHeadline());
sectionText.setSectionNumber(sectionNumber.intValue());
sectionText.setTable(true);
+ sectionText.setTabularData(tabularData);
+ sectionText.setCellStarts(cellStarts);
classifiedDoc.getSectionText().add(sectionText);
}
@@ -267,6 +270,7 @@ public class EntityRedactionService {
.getSequences()
.get(0)
.getPage());
+ sectionText.getTextBlocks().addAll(cell.getTextBlocks());
sectionText.getSectionAreas().add(sectionArea);
}
@@ -325,6 +329,10 @@ public class EntityRedactionService {
sectionText.setHeadline(headline);
sectionText.setSectionNumber(sectionNumber.intValue());
sectionText.setTable(false);
+ sectionText.setImages(images.stream()
+ .map(image -> convert(image, sectionNumber.intValue(), headline))
+ .collect(Collectors.toSet()));
+ sectionText.setTextBlocks(paragraphTextBlocks);
classifiedDoc.getSectionText().add(sectionText);
}
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java
index 2ebc57f4..a5bcd4f3 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java
@@ -12,12 +12,12 @@ import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUti
import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
import lombok.RequiredArgsConstructor;
+import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.kie.api.runtime.KieContainer;
import org.springframework.stereotype.Service;
import org.springframework.web.bind.annotation.RequestBody;
-import java.awt.geom.Rectangle2D;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
@@ -39,7 +39,6 @@ public class ReanalyzeService {
public AnalyzeResult analyze(AnalyzeRequest analyzeRequest) {
-
var pageCount = 0;
Document classifiedDoc;
@@ -74,30 +73,28 @@ public class ReanalyzeService {
return analyzeResponseService.createAnalyzeResponse(pageCount, redactionLog, changeLog);
}
- public AnalyzeResult reanalyze(@RequestBody AnalyzeRequest renalyzeRequest) {
- var text = redactionStorageService.getText(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId());
- // new procedure was not applied, we need a complete analysis
+
+ @SneakyThrows
+ public AnalyzeResult reanalyze(@RequestBody AnalyzeRequest analyzeRequest) {
+
+ var redactionLog = redactionStorageService.getRedactionLog(analyzeRequest.getProjectId(), analyzeRequest.getFileId());
+ var text = redactionStorageService.getText(analyzeRequest.getProjectId(), analyzeRequest.getFileId());
+
+ // not yet ready for reanalysis
if (text.getNumberOfPages() == 0) {
- return analyze(AnalyzeRequest.builder()
- .ruleSetId(renalyzeRequest.getRuleSetId())
- .manualRedactions(renalyzeRequest.getManualRedactions())
- .projectId(renalyzeRequest.getProjectId())
- .fileId(renalyzeRequest.getFileId())
- .build());
+ return analyze(analyzeRequest);
}
- var redactionLog = redactionStorageService.getRedactionLog(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId());
+ DictionaryIncrement dictionaryIncrement = dictionaryService.getDictionaryIncrements(analyzeRequest.getRuleSetId(), redactionLog.getDictionaryVersion());
- DictionaryIncrement dictionaryIncrement = dictionaryService.getDictionaryIncrements(renalyzeRequest.getRuleSetId(), redactionLog.getDictionaryVersion());
-
- Set manualForceAndRemoveIds = getForceAndRemoveIds(renalyzeRequest.getManualRedactions());
+ Set manualForceAndRemoveIds = getForceAndRemoveIds(analyzeRequest.getManualRedactions());
Map> comments = null;
Set manualAdds = null;
- if (renalyzeRequest.getManualRedactions() != null) {
+ if (analyzeRequest.getManualRedactions() != null) {
// TODO comments will be removed from redactionLog, so we ignore this first.
- comments = renalyzeRequest.getManualRedactions().getComments();
- manualAdds = renalyzeRequest.getManualRedactions().getEntriesToAdd();
+ comments = analyzeRequest.getManualRedactions().getComments();
+ manualAdds = analyzeRequest.getManualRedactions().getEntriesToAdd();
}
Set sectionsToReanalyse = new HashSet<>();
@@ -131,115 +128,114 @@ public class ReanalyzeService {
}
}
+ log.info("Should reanalyze {} sections for request: {}", sectionsToReanalyse.size(), analyzeRequest);
+
if (sectionsToReanalyse.isEmpty() && (manualAdds == null || manualAdds.isEmpty())) {
redactionLog.setDictionaryVersion(dictionaryIncrement.getDictionaryVersion());
- var changeLog = redactionChangeLogService.createAndStoreChangeLog(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), redactionLog);
- redactionStorageService.storeObject(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog);
+ var changeLog = redactionChangeLogService.createAndStoreChangeLog(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), redactionLog);
+ redactionStorageService.storeObject(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog);
return analyzeResponseService.createAnalyzeResponse(text.getNumberOfPages(), redactionLog, changeLog);
}
- try {
+ List reanalysisSections = new ArrayList<>();
- List reanalysisSections = new ArrayList<>();
- for (SectionText sectionText : text.getSectionTexts()) {
-
- if (sectionsToReanalyse.contains(sectionText.getSectionNumber())) {
- reanalysisSections.add(sectionText);
- }
+ for (SectionText sectionText : text.getSectionTexts()) {
+ if (sectionsToReanalyse.contains(sectionText.getSectionNumber())) {
+ reanalysisSections.add(sectionText);
}
-
- KieContainer kieContainer = droolsExecutionService.updateRules(renalyzeRequest.getRuleSetId());
-
- Dictionary dictionary = dictionaryService.getDeepCopyDictionary(renalyzeRequest.getRuleSetId());
-
- List sectionSearchableTextPairs = new ArrayList<>();
- for (SectionText reanalysisSection : reanalysisSections) {
-
- Set entities = entityRedactionService.findEntities(reanalysisSection.getSearchableText(), reanalysisSection
- .getHeadline(), reanalysisSection.getSectionNumber(), dictionary, false);
- if (reanalysisSection.getCellStarts() != null) {
- surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary, reanalysisSection
- .getCellStarts());
- } else {
- surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary);
- }
-
- sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder()
- .isLocal(false)
- .dictionaryTypes(dictionary.getTypes())
- .entities(entities)
- .text(reanalysisSection.getSearchableText().getAsStringWithLinebreaks())
- .searchText(reanalysisSection.getSearchableText().toString())
- .headline(reanalysisSection.getHeadline())
- .sectionNumber(reanalysisSection.getSectionNumber())
- .tabularData(reanalysisSection.getTabularData())
- .searchableText(reanalysisSection.getSearchableText())
- .dictionary(dictionary)
- .images(reanalysisSection.getImages())
- .build(), reanalysisSection.getSearchableText()));
- }
-
- Set entities = new HashSet<>();
- Map> imagesPerPage = new HashMap<>();
- sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> {
- Section analysedRowSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair
- .getSection());
- entities.addAll(analysedRowSection.getEntities());
- EntitySearchUtils.removeEntitiesContainedInLarger(entities);
-
- for (Image image : analysedRowSection.getImages()) {
- imagesPerPage.computeIfAbsent(image.getPage(), (a) -> new HashSet<>()).add(image);
- }
-
- });
-
- Map> entitiesPerPage = new HashMap<>();
- for (Entity entity : entities) {
- Map> sequenceOnPage = new HashMap<>();
- for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) {
- sequenceOnPage.computeIfAbsent(entityPositionSequence.getPageNumber(), (x) -> new ArrayList<>())
- .add(entityPositionSequence);
- }
-
- for (Map.Entry> entry : sequenceOnPage.entrySet()) {
- entitiesPerPage.computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>())
- .add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry
- .getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber(), entity
- .getLegalBasis(), entity.isDictionaryEntry(), entity.getTextBefore(), entity.getTextAfter(), entity
- .getStart(), entity.getEnd()));
- }
- }
-
- List newRedactionLogEntries = new ArrayList<>();
- for (int page = 1; page <= text.getNumberOfPages(); page++) {
- if (entitiesPerPage.get(page) != null) {
- newRedactionLogEntries.addAll(redactionLogCreatorService.addEntries(entitiesPerPage, renalyzeRequest
- .getManualRedactions(), page, renalyzeRequest.getRuleSetId()));
- }
-
- if (imagesPerPage.get(page) != null) {
- newRedactionLogEntries.addAll(redactionLogCreatorService.addImageEntries(imagesPerPage, renalyzeRequest
- .getManualRedactions(), page, renalyzeRequest.getRuleSetId()));
- }
-
- newRedactionLogEntries.addAll(redactionLogCreatorService.addManualAddEntries(manualAdds, comments, page, renalyzeRequest
- .getRuleSetId()));
- }
-
- redactionLog.getRedactionLogEntry().removeIf(entry -> sectionsToReanalyse.contains(entry.getSectionNumber()) && !entry.isImage() || entry.getSectionNumber() == 0 && !entry.isImage());
- redactionLog.getRedactionLogEntry().addAll(newRedactionLogEntries);
- redactionLog.setDictionaryVersion(dictionaryIncrement.getDictionaryVersion());
-
- var changeLog = redactionChangeLogService.createAndStoreChangeLog(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), redactionLog);
- redactionStorageService.storeObject(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog);
- return analyzeResponseService.createAnalyzeResponse(text.getNumberOfPages(), redactionLog, changeLog);
-
-
- } catch (Exception e) {
- throw new RedactionException(e);
}
+
+ // re-run entity search and rule execution for the sections selected for reanalysis
+
+ KieContainer kieContainer = droolsExecutionService.updateRules(analyzeRequest.getRuleSetId());
+
+ Dictionary dictionary = dictionaryService.getDeepCopyDictionary(analyzeRequest.getRuleSetId());
+
+ List sectionSearchableTextPairs = new ArrayList<>();
+ for (SectionText reanalysisSection : reanalysisSections) {
+
+ Set entities = entityRedactionService.findEntities(reanalysisSection.getSearchableText(), reanalysisSection
+ .getHeadline(), reanalysisSection.getSectionNumber(), dictionary, false);
+ if (reanalysisSection.getCellStarts() != null) {
+ surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary, reanalysisSection
+ .getCellStarts());
+ } else {
+ surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary);
+ }
+
+ sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder()
+ .isLocal(false)
+ .dictionaryTypes(dictionary.getTypes())
+ .entities(entities)
+ .text(reanalysisSection.getSearchableText().getAsStringWithLinebreaks())
+ .searchText(reanalysisSection.getSearchableText().toString())
+ .headline(reanalysisSection.getHeadline())
+ .sectionNumber(reanalysisSection.getSectionNumber())
+ .tabularData(reanalysisSection.getTabularData())
+ .searchableText(reanalysisSection.getSearchableText())
+ .dictionary(dictionary)
+ .images(reanalysisSection.getImages())
+ .build(), reanalysisSection.getSearchableText()));
+ }
+
+ Set entities = new HashSet<>();
+ Map> imagesPerPage = new HashMap<>();
+ sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> {
+ Section analysedRowSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair
+ .getSection());
+ entities.addAll(analysedRowSection.getEntities());
+ EntitySearchUtils.removeEntitiesContainedInLarger(entities);
+
+ for (Image image : analysedRowSection.getImages()) {
+ imagesPerPage.computeIfAbsent(image.getPage(), (a) -> new HashSet<>()).add(image);
+ }
+
+ });
+
+ Map> entitiesPerPage = new HashMap<>();
+ for (Entity entity : entities) {
+ Map> sequenceOnPage = new HashMap<>();
+ for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) {
+ sequenceOnPage.computeIfAbsent(entityPositionSequence.getPageNumber(), (x) -> new ArrayList<>())
+ .add(entityPositionSequence);
+ }
+
+ for (Map.Entry> entry : sequenceOnPage.entrySet()) {
+ entitiesPerPage.computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>())
+ .add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry
+ .getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber(), entity
+ .getLegalBasis(), entity.isDictionaryEntry(), entity.getTextBefore(), entity.getTextAfter(), entity
+ .getStart(), entity.getEnd()));
+ }
+ }
+
+ List newRedactionLogEntries = new ArrayList<>();
+ for (int page = 1; page <= text.getNumberOfPages(); page++) {
+ if (entitiesPerPage.get(page) != null) {
+ newRedactionLogEntries.addAll(redactionLogCreatorService.addEntries(entitiesPerPage, analyzeRequest
+ .getManualRedactions(), page, analyzeRequest.getRuleSetId()));
+ }
+
+ if (imagesPerPage.get(page) != null) {
+ newRedactionLogEntries.addAll(redactionLogCreatorService.addImageEntries(imagesPerPage, analyzeRequest
+ .getManualRedactions(), page, analyzeRequest.getRuleSetId()));
+ }
+
+ newRedactionLogEntries.addAll(redactionLogCreatorService.addManualAddEntries(manualAdds, comments, page, analyzeRequest
+ .getRuleSetId()));
+ }
+
+
+ redactionLog.getRedactionLogEntry().removeIf(entry -> sectionsToReanalyse.contains(entry.getSectionNumber()) && !entry.isImage() || entry.getSectionNumber() == 0 && !entry.isImage());
+ redactionLog.getRedactionLogEntry().addAll(newRedactionLogEntries);
+ redactionLog.setDictionaryVersion(dictionaryIncrement.getDictionaryVersion());
+
+ var changeLog = redactionChangeLogService.createAndStoreChangeLog(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), redactionLog);
+ redactionStorageService.storeObject(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog);
+ return analyzeResponseService.createAnalyzeResponse(text.getNumberOfPages(), redactionLog, changeLog);
+
}
@@ -262,7 +258,7 @@ public class ReanalyzeService {
return Image.builder()
.type(entry.getType())
- .position(new Rectangle2D.Float(position.getTopLeft().getX(), position.getTopLeft()
+ .position(new RedRectangle2D(position.getTopLeft().getX(), position.getTopLeft()
.getY(), position.getWidth(), position.getHeight()))
.sectionNumber(entry.getSectionNumber())
.section(entry.getSection())
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/RedactionLogCreatorService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/RedactionLogCreatorService.java
index a046fb08..49bfe693 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/RedactionLogCreatorService.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/RedactionLogCreatorService.java
@@ -4,6 +4,7 @@ import com.iqser.red.service.redaction.v1.model.*;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
+import com.iqser.red.service.redaction.v1.server.parsing.model.RedTextPosition;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
@@ -14,7 +15,6 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import lombok.RequiredArgsConstructor;
import org.apache.commons.collections4.CollectionUtils;
-import org.apache.pdfbox.text.TextPosition;
import org.springframework.stereotype.Service;
import java.util.ArrayList;
@@ -272,24 +272,24 @@ public class RedactionLogCreatorService {
}
- private List getRectanglesPerLine(List textPositions, int page) {
+ private List getRectanglesPerLine(List textPositions, int page) {
List rectangles = new ArrayList<>();
if (textPositions.size() == 1) {
- rectangles.add(new TextPositionSequence(textPositions, page).getRectangle());
+ rectangles.add( TextPositionSequence.fromData(textPositions, page).getRectangle());
} else {
float y = textPositions.get(0).getYDirAdj();
int startIndex = 0;
for (int i = 1; i < textPositions.size(); i++) {
float yDirAdj = textPositions.get(i).getYDirAdj();
if (yDirAdj != y) {
- rectangles.add(new TextPositionSequence(textPositions.subList(startIndex, i), page).getRectangle());
+ rectangles.add( TextPositionSequence.fromData(textPositions.subList(startIndex, i), page).getRectangle());
y = yDirAdj;
startIndex = i;
}
}
if (startIndex != textPositions.size()) {
- rectangles.add(new TextPositionSequence(textPositions.subList(startIndex, textPositions.size()), page).getRectangle());
+ rectangles.add( TextPositionSequence.fromData(textPositions.subList(startIndex, textPositions.size()), page).getRectangle());
}
}
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/IdBuilder.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/IdBuilder.java
index 34a712fe..241aa1be 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/IdBuilder.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/IdBuilder.java
@@ -3,9 +3,9 @@ package com.iqser.red.service.redaction.v1.server.redaction.utils;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
+import com.iqser.red.service.redaction.v1.server.redaction.model.RedRectangle2D;
import lombok.experimental.UtilityClass;
-import java.awt.geom.Rectangle2D;
import java.nio.charset.StandardCharsets;
import java.util.List;
@@ -25,12 +25,8 @@ public class IdBuilder {
}
- public String buildId(Rectangle2D rectangle2D, int page) {
-
- StringBuilder sb = new StringBuilder();
- sb.append("x").append(rectangle2D.getX()).append("y").append(rectangle2D.getY()).append("h").append(rectangle2D.getHeight()).append("w").append(rectangle2D.getWidth()).append("p").append(page);
-
- return hashFunction.hashString(sb.toString(), StandardCharsets.UTF_8).toString();
+ public String buildId(RedRectangle2D rectangle2D, int page) {
+ return hashFunction.hashString("x" + rectangle2D.getX() + "y" + rectangle2D.getY() + "h" + rectangle2D.getHeight() + "w" + rectangle2D.getWidth() + "p" + page, StandardCharsets.UTF_8).toString();
}
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java
index 2eb06c3d..be4fa972 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java
@@ -1,21 +1,15 @@
package com.iqser.red.service.redaction.v1.server.segmentation;
-import com.iqser.red.service.redaction.v1.model.SectionArea;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
-import com.iqser.red.service.redaction.v1.server.classification.model.SectionText;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService;
import com.iqser.red.service.redaction.v1.server.classification.service.ClassificationService;
-import com.iqser.red.service.redaction.v1.server.exception.RedactionException;
import com.iqser.red.service.redaction.v1.server.memory.MemoryStats;
-import com.iqser.red.service.redaction.v1.server.parsing.PDFAreaTextStripper;
import com.iqser.red.service.redaction.v1.server.parsing.PDFLinesTextStripper;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
-import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
import com.iqser.red.service.redaction.v1.server.redaction.service.ImageClassificationService;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
-import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService;
import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService;
@@ -28,15 +22,12 @@ import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.springframework.stereotype.Service;
-import java.awt.geom.Rectangle2D;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
-import java.util.HashMap;
import java.util.List;
-import java.util.Map;
@Slf4j
@Service
@@ -53,80 +44,11 @@ public class PdfSegmentationService {
private final ImageClassificationService imageClassificationService;
- private void postProcessSections(PDDocument pdDocument, List texts) {
-
- try {
- for (SectionText sectionText : texts) {
-
- List textBlocks = new ArrayList<>();
-
- Map> sectionAreasPerPage = new HashMap<>();
- for (SectionArea sectionArea : sectionText.getSectionAreas()) {
- sectionAreasPerPage.computeIfAbsent(sectionArea.getPage(), (x) -> new ArrayList<>())
- .add(sectionArea);
- }
-
- Map tabularData = new HashMap<>();
- List cellStarts = new ArrayList<>();
- for (Integer page : sectionAreasPerPage.keySet()) {
- List areasOnPage = sectionAreasPerPage.get(page);
-
- PDPage pdPage = pdDocument.getPage(page - 1);
- PDRectangle cropBox = pdPage.getCropBox();
- PDFAreaTextStripper textStripper = new PDFAreaTextStripper();
- textStripper.setPageNumber(page);
-
- int cellStart = 0;
- for (SectionArea sectionArea : areasOnPage) {
-
- Rectangle2D rect = null;
- if (pdPage.getRotation() == 90) {
- rect = new Rectangle2D.Float(sectionArea.getTopLeft().getY(), sectionArea.getTopLeft()
- .getX(), sectionArea.getHeight(), sectionArea.getWidth() + 0.001f);
- } else {
- rect = new Rectangle2D.Float(sectionArea.getTopLeft().getX(), -sectionArea.getTopLeft()
- .getY() + cropBox.getUpperRightY() - sectionArea.getHeight(), sectionArea.getWidth(), sectionArea
- .getHeight() + 0.001f);
- }
-
- textStripper.addRegion(String.valueOf(1), rect);
- textStripper.extractRegions(pdPage);
- textStripper.getTextForRegion(String.valueOf(1));
- List positions = textStripper.getTextPositionSequences();
-
- TextBlock textBlock = new TextBlock(sectionArea.getTopLeft().getX(), sectionArea.getTopLeft()
- .getX() + sectionArea.getWidth(), sectionArea.getTopLeft()
- .getY(), sectionArea.getTopLeft().getY() + sectionArea.getHeight(), positions, 0);
-
- if (sectionText.isTable()) {
- Cell cell = new Cell();
- cell.addTextBlock(textBlock);
- tabularData.put(sectionArea.getHeader(), new CellValue(cell.getTextBlocks(), cellStart));
- cellStarts.add(cellStart);
- cellStart = cellStart + cell.toString().trim().length() + 1;
- }
-
- textBlocks.add(textBlock);
- textStripper.clearPositions();
- }
-
- }
- sectionText.setTextBlocks(textBlocks);
- sectionText.setTabularData(tabularData);
- if (sectionText.isTable()) {
- sectionText.setCellStarts(cellStarts);
- }
- }
-
-
- } catch (Exception e) {
- throw new RedactionException(e);
- }
-
+ public Document parseDocument(InputStream documentInputStream) throws IOException {
+ return parseDocument(documentInputStream, false);
}
-
- public Document parseDocument(InputStream documentInputStream) throws IOException {
+ public Document parseDocument(InputStream documentInputStream, boolean ignoreImages) throws IOException {
PDDocument pdDocument = null;
try {
//create tempFile
@@ -166,24 +88,23 @@ public class PdfSegmentationService {
Page page = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings
.getVertical());
+
page.setRotation(rotation);
-
- tableExtractionService.extractTables(cleanRulings, page);
-
- buildPageStatistics(page);
-
page.setLandscape(isLandscape || isRotated);
-
page.setPageNumber(pageNumber);
- increaseDocumentStatistics(page, document);
-
page.setImages(stripper.getImages());
- imageClassificationService.classifyImages(page);
+ tableExtractionService.extractTables(cleanRulings, page);
+ buildPageStatistics(page);
+ increaseDocumentStatistics(page, document);
+
+
+ if (!ignoreImages) {
+ imageClassificationService.classifyImages(page);
+ }
pages.add(page);
-
}
document.setPages(pages);
@@ -194,9 +115,6 @@ public class PdfSegmentationService {
pdDocument = reinitializePDDocument(tempFile, pdDocument);
- // This can be improved an done in one pass, but it's complicated to do right away
- postProcessSections(pdDocument, document.getSectionText());
-
IOUtils.close(pdDocument);
tempFile.delete();
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/storage/RedactionStorageService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/storage/RedactionStorageService.java
index c9792c0f..2c96bd05 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/storage/RedactionStorageService.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/storage/RedactionStorageService.java
@@ -50,7 +50,7 @@ public class RedactionStorageService {
try {
return objectMapper.readValue(inputStreamResource.getInputStream(), RedactionLog.class);
} catch (IOException e) {
- throw new RuntimeException("Could not convert Text", e);
+ throw new RuntimeException("Could not convert RedactionLog", e);
}
}
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/AbstractTextContainer.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/AbstractTextContainer.java
index 2f6183ab..b050e27b 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/AbstractTextContainer.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/AbstractTextContainer.java
@@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
+import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import lombok.AllArgsConstructor;
import lombok.Data;
@@ -27,10 +28,12 @@ public abstract class AbstractTextContainer {
return page == other.getPage() && this.minX <= other.getTopLeft().getX() && this.maxX >= other.getTopLeft().getX() + other.getWidth() && this.minY <= other.getTopLeft().getY() && this.maxY >= other.getTopLeft().getY() + other.getHeight();
}
+ @JsonIgnore
public float getHeight() {
return maxY - minY;
}
-
+
+ @JsonIgnore
public float getWidth() {
return maxX - minX;
}
diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/FilySystemBackedStorageService.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/FileSystemBackedStorageService.java
similarity index 87%
rename from redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/FilySystemBackedStorageService.java
rename to redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/FileSystemBackedStorageService.java
index cff5698f..e37034ce 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/FilySystemBackedStorageService.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/FileSystemBackedStorageService.java
@@ -12,11 +12,11 @@ import java.io.FileOutputStream;
import java.util.HashMap;
import java.util.Map;
-public class FilySystemBackedStorageService extends StorageService {
+public class FileSystemBackedStorageService extends StorageService {
- private Map dataMap = new HashMap<>();
+ private final Map dataMap = new HashMap<>();
- public FilySystemBackedStorageService() {
+ public FileSystemBackedStorageService() {
super(null, null);
}
diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java
index 2a998b14..c74b653e 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java
@@ -134,7 +134,7 @@ public class RedactionIntegrationTest {
@Bean
@Primary
public StorageService inmemoryStorage() {
- return new FilySystemBackedStorageService();
+ return new FileSystemBackedStorageService();
}
}
@@ -142,8 +142,8 @@ public class RedactionIntegrationTest {
@After
public void cleanupStorage() {
- if (this.storageService instanceof FilySystemBackedStorageService) {
- ((FilySystemBackedStorageService) this.storageService).clearStorage();
+ if (this.storageService instanceof FileSystemBackedStorageService) {
+ ((FileSystemBackedStorageService) this.storageService).clearStorage();
}
}
diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java
index 8c19e0d6..32fe65ee 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java
@@ -2,7 +2,7 @@ package com.iqser.red.service.redaction.v1.server.redaction.service;
import com.amazonaws.services.s3.AmazonS3;
import com.iqser.red.service.configuration.v1.api.model.*;
-import com.iqser.red.service.redaction.v1.server.FilySystemBackedStorageService;
+import com.iqser.red.service.redaction.v1.server.FileSystemBackedStorageService;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
@@ -97,7 +97,7 @@ public class EntityRedactionServiceTest {
@Bean
@Primary
public StorageService inmemoryStorage() {
- return new FilySystemBackedStorageService();
+ return new FileSystemBackedStorageService();
}
}