diff --git a/bamboo-specs/src/main/java/buildjob/PlanSpec.java b/bamboo-specs/src/main/java/buildjob/PlanSpec.java index d6d0ea67..e2c7fe08 100644 --- a/bamboo-specs/src/main/java/buildjob/PlanSpec.java +++ b/bamboo-specs/src/main/java/buildjob/PlanSpec.java @@ -1,7 +1,5 @@ package buildjob; -import static com.atlassian.bamboo.specs.builders.task.TestParserTask.createJUnitParserTask; - import com.atlassian.bamboo.specs.api.BambooSpec; import com.atlassian.bamboo.specs.api.builders.BambooKey; import com.atlassian.bamboo.specs.api.builders.docker.DockerConfiguration; @@ -24,6 +22,8 @@ import com.atlassian.bamboo.specs.builders.trigger.BitbucketServerTrigger; import com.atlassian.bamboo.specs.model.task.InjectVariablesScope; import com.atlassian.bamboo.specs.util.BambooServer; +import static com.atlassian.bamboo.specs.builders.task.TestParserTask.createJUnitParserTask; + /** * Plan configuration for Bamboo. * Learn more on: https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs @@ -33,6 +33,8 @@ public class PlanSpec { private static final String SERVICE_NAME = "redaction-service"; + private static final String JVM_ARGS =" -Xmx4g -XX:+ExitOnOutOfMemoryError -XX:SurvivorRatio=2 -XX:NewRatio=1 -XX:InitialTenuringThreshold=16 -XX:MaxTenuringThreshold=16 -XX:InitiatingHeapOccupancyPercent=35 "; + private static final String SERVICE_KEY = SERVICE_NAME.toUpperCase().replaceAll("-", ""); /** @@ -82,9 +84,12 @@ public class PlanSpec { .checkoutItems(new CheckoutItem().defaultRepository()), new ScriptTask() .description("Build") + .environmentVariables("MAVEN_OPTS="+JVM_ARGS) .inlineBody("#!/bin/bash\n" + "set -e\n" + + "export MAVEN_OPTS=\"$MAVEN_OPTS "+JVM_ARGS +"\"\n" + + "if [[ \"${bamboo.version_tag}\" != \"dev\" ]]; then ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn --no-transfer-progress -f ${bamboo_build_working_directory}/" + SERVICE_NAME + "-v1/pom.xml versions:set -DnewVersion=${bamboo.version_tag}; fi\n" + "if [[ \"${bamboo.version_tag}\" != \"dev\" ]]; then ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn --no-transfer-progress -f ${bamboo_build_working_directory}/" + SERVICE_NAME + "-image-v1/pom.xml versions:set -DnewVersion=${bamboo.version_tag}; fi\n" + diff --git a/redaction-service-v1/pom.xml b/redaction-service-v1/pom.xml index 124e5ae4..501ae19c 100644 --- a/redaction-service-v1/pom.xml +++ b/redaction-service-v1/pom.xml @@ -32,7 +32,7 @@ com.iqser.red platform-commons-dependency - 1.2.9 + 1.3.0 import pom diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Footer.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Footer.java index 61d12a43..b88a16b7 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Footer.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Footer.java @@ -1,5 +1,6 @@ package com.iqser.red.service.redaction.v1.server.classification.model; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText; import lombok.AllArgsConstructor; import lombok.Data; @@ -12,7 +13,7 @@ public class Footer { private List textBlocks; - + @JsonIgnore public SearchableText getSearchableText() { SearchableText searchableText = new SearchableText(); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Header.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Header.java index f3067452..133e0245 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Header.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Header.java @@ -1,5 +1,6 @@ package com.iqser.red.service.redaction.v1.server.classification.model; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText; import lombok.AllArgsConstructor; import lombok.Data; @@ -12,7 +13,7 @@ public class Header { private List textBlocks; - + @JsonIgnore public SearchableText getSearchableText() { SearchableText searchableText = new SearchableText(); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java index c9c88cec..77649132 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java @@ -1,5 +1,6 @@ package com.iqser.red.service.redaction.v1.server.classification.model; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.iqser.red.service.redaction.v1.model.SectionArea; import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue; import com.iqser.red.service.redaction.v1.server.redaction.model.Image; @@ -31,6 +32,12 @@ public class SectionText { private List cellStarts = new ArrayList<>(); + public void setTabularData(Map tabularData) { + tabularData.remove(null); + this.tabularData = tabularData; + } + + @JsonIgnore public SearchableText getSearchableText() { SearchableText searchableText = new SearchableText(); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/TextBlock.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/TextBlock.java index 6da9f6a0..63cfc11c 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/TextBlock.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/TextBlock.java @@ -1,11 +1,13 @@ package com.iqser.red.service.redaction.v1.server.classification.model; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities; import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; import lombok.AllArgsConstructor; import lombok.Builder; import lombok.Data; +import lombok.NoArgsConstructor; import java.util.ArrayList; import java.util.List; @@ -13,6 +15,7 @@ import java.util.List; @AllArgsConstructor @Builder @Data +@NoArgsConstructor public class TextBlock extends AbstractTextContainer { @Builder.Default @@ -116,6 +119,7 @@ public class TextBlock extends AbstractTextContainer { } @Override + @JsonIgnore public String getText() { StringBuilder sb = new StringBuilder(); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/UnclassifiedText.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/UnclassifiedText.java index 79277b9e..0d51a4f8 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/UnclassifiedText.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/UnclassifiedText.java @@ -1,5 +1,6 @@ package com.iqser.red.service.redaction.v1.server.classification.model; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText; import lombok.AllArgsConstructor; import lombok.Data; @@ -12,7 +13,7 @@ public class UnclassifiedText { private List textBlocks; - + @JsonIgnore public SearchableText getSearchableText() { SearchableText searchableText = new SearchableText(); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java index 250001a7..63a212b8 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java @@ -59,7 +59,7 @@ public class RedactionController implements RedactionResource { try (PDDocument pdDocument = PDDocument.load(storedObjectStream, MemoryUsageSetting.setupTempFileOnly())) { pdDocument.setAllSecurityToBeRemoved(true); - + dictionaryService.updateDictionary(redactionLog.getRuleSetId()); annotationService.annotate(pdDocument, redactionLog, sectionsGrid); @@ -131,7 +131,7 @@ public class RedactionController implements RedactionResource { try { var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN)); - classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream); + classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream, true); } catch (Exception e) { throw new RedactionException(e); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedTextPosition.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedTextPosition.java new file mode 100644 index 00000000..d8e72d22 --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedTextPosition.java @@ -0,0 +1,52 @@ +package com.iqser.red.service.redaction.v1.server.parsing.model; + +import com.fasterxml.jackson.annotation.JsonIgnore; +import lombok.Data; +import lombok.NoArgsConstructor; +import lombok.SneakyThrows; +import org.apache.pdfbox.text.TextPosition; +import org.springframework.beans.BeanUtils; + +@Data +@NoArgsConstructor +public class RedTextPosition { + + private String textMatrix; + private int rotation; + private float y; + private float pageHeight; + private float pageWidth; + private String unicode; + private float XDirAdj; + private float YDirAdj; + private float width; + private float heightDir; + + // not used in reanalysis + @JsonIgnore + private float widthOfSpace; + + // not used in reanalysis + @JsonIgnore + private float fontSizeInPt; + + // not used in reanalysis + @JsonIgnore + private String fontName; + + + @SneakyThrows + public static RedTextPosition fromTextPosition(TextPosition textPosition) { + var pos = new RedTextPosition(); + BeanUtils.copyProperties(textPosition, pos); + pos.setFontName(textPosition.getFont().getName()); + + pos.setFontSizeInPt(textPosition.getFontSizeInPt()); + + pos.setTextMatrix(textPosition.getTextMatrix().toString()); + + return pos; + } + + +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java index c6181f4e..10b5abb1 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java @@ -1,29 +1,52 @@ package com.iqser.red.service.redaction.v1.server.parsing.model; +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; import com.iqser.red.service.redaction.v1.model.Point; import com.iqser.red.service.redaction.v1.model.Rectangle; import lombok.Data; -import lombok.RequiredArgsConstructor; +import lombok.NoArgsConstructor; import org.apache.pdfbox.text.TextPosition; import java.util.ArrayList; import java.util.List; +import java.util.stream.Collectors; @Data -@RequiredArgsConstructor +@NoArgsConstructor +@JsonIgnoreProperties({ "empty" }) public class TextPositionSequence implements CharSequence { - private final int page; - private List textPositions = new ArrayList<>(); + private int page; + private List textPositions = new ArrayList<>(); + + private float x1; + private float x2; + + public TextPositionSequence(int page) { + this.page = page; + } + + + public static TextPositionSequence fromData(List textPositions, int page) { + var textPositionSequence = new TextPositionSequence(); + textPositionSequence.textPositions = textPositions; + textPositionSequence.page = page; + + return textPositionSequence; + } public TextPositionSequence(List textPositions, int page) { - this.textPositions = textPositions; + this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList()); this.page = page; } + + + @Override public int length() { @@ -34,7 +57,7 @@ public class TextPositionSequence implements CharSequence { @Override public char charAt(int index) { - TextPosition textPosition = textPositionAt(index); + RedTextPosition textPosition = textPositionAt(index); String text = textPosition.getUnicode(); return text.charAt(0); } @@ -42,7 +65,7 @@ public class TextPositionSequence implements CharSequence { public char charAt(int index, boolean caseInSensitive) { - TextPosition textPosition = textPositionAt(index); + RedTextPosition textPosition = textPositionAt(index); String text = textPosition.getUnicode(); return caseInSensitive ? text.toLowerCase().charAt(0) : text.charAt(0); } @@ -51,7 +74,7 @@ public class TextPositionSequence implements CharSequence { @Override public TextPositionSequence subSequence(int start, int end) { - return new TextPositionSequence(textPositions.subList(start, end), page); + return fromData(textPositions.subList(start, end), page); } @@ -66,18 +89,25 @@ public class TextPositionSequence implements CharSequence { } - public TextPosition textPositionAt(int index) { + public RedTextPosition textPositionAt(int index) { return textPositions.get(index); } - public void add(TextPosition textPosition) { + public void add(RedTextPosition textPosition) { this.textPositions.add(textPosition); } + public void add(TextPosition textPosition) { + + this.textPositions.add(RedTextPosition.fromTextPosition(textPosition)); + } + + + @JsonIgnore public float getX1() { if (textPositions.get(0).getRotation() == 90) { @@ -88,6 +118,7 @@ public class TextPositionSequence implements CharSequence { } + @JsonIgnore public float getX2() { if (textPositions.get(0).getRotation() == 90) { @@ -98,13 +129,14 @@ public class TextPositionSequence implements CharSequence { } } - + @JsonIgnore public float getRotationAdjustedY() { return textPositions.get(0).getY(); } + @JsonIgnore public float getY1() { if (textPositions.get(0).getRotation() == 90) { @@ -115,6 +147,7 @@ public class TextPositionSequence implements CharSequence { } + @JsonIgnore public float getY2() { if (textPositions.get(0).getRotation() == 90) { @@ -125,38 +158,40 @@ public class TextPositionSequence implements CharSequence { } + @JsonIgnore public float getTextHeight() { return textPositions.get(0).getHeightDir() + 2; } + @JsonIgnore public float getHeight() { return getY2() - getY1(); } + @JsonIgnore public float getWidth() { return getX2() - getX1(); } + @JsonIgnore public String getFont() { - - return textPositions.get(0) - .getFont() - .toString() + return textPositions.get(0).getFontName() .toLowerCase() .replaceAll(",bold", "") .replaceAll(",italic", ""); } + @JsonIgnore public String getFontStyle() { - String lowercaseFontName = textPositions.get(0).getFont().toString().toLowerCase(); + String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase(); if (lowercaseFontName.contains("bold") && lowercaseFontName.contains("italic")) { return "bold, italic"; @@ -170,25 +205,25 @@ public class TextPositionSequence implements CharSequence { } - + @JsonIgnore public float getFontSize() { return textPositions.get(0).getFontSizeInPt(); } - + @JsonIgnore public float getSpaceWidth() { return textPositions.get(0).getWidthOfSpace(); } - + @JsonIgnore public int getRotation() { return textPositions.get(0).getRotation(); } - + @JsonIgnore public Rectangle getRectangle() { float height = getTextHeight(); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/CellValue.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/CellValue.java index 6d65518c..e38c8cf2 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/CellValue.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/CellValue.java @@ -3,19 +3,23 @@ package com.iqser.red.service.redaction.v1.server.redaction.model; import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities; -import lombok.Value; +import lombok.AllArgsConstructor; +import lombok.Data; +import lombok.NoArgsConstructor; +import java.util.ArrayList; import java.util.Iterator; import java.util.List; -@Value +@Data +@NoArgsConstructor +@AllArgsConstructor public class CellValue { - private List textBlocks; + private List textBlocks = new ArrayList<>(); private int rowSpanStart; - @Override public String toString() { diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Image.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Image.java index e4e6167a..766d607d 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Image.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Image.java @@ -5,8 +5,6 @@ import lombok.Builder; import lombok.Data; import lombok.NoArgsConstructor; -import java.awt.geom.Rectangle2D; - @Data @Builder @NoArgsConstructor @@ -14,7 +12,7 @@ import java.awt.geom.Rectangle2D; public class Image { private String type; - private Rectangle2D position; + private RedRectangle2D position; private boolean redaction; private String redactionReason; private String legalBasis; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/PdfImage.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/PdfImage.java index f7f6ad4f..1631717f 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/PdfImage.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/PdfImage.java @@ -1,5 +1,6 @@ package com.iqser.red.service.redaction.v1.server.redaction.model; +import com.fasterxml.jackson.annotation.JsonIgnore; import lombok.Data; import lombok.NonNull; import lombok.RequiredArgsConstructor; @@ -11,9 +12,10 @@ import java.awt.image.BufferedImage; @RequiredArgsConstructor public class PdfImage { + @JsonIgnore private BufferedImage image; @NonNull - private Rectangle2D position; + private RedRectangle2D position; private ImageType imageType; private boolean isAppendedToParagraph; @@ -22,7 +24,7 @@ public class PdfImage { public PdfImage(BufferedImage image, Rectangle2D position, int page) { this.image = image; - this.position = position; + this.position = new RedRectangle2D(position.getX(), position.getY(), position.getWidth(), position.getHeight()); this.page = page; } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/RedRectangle2D.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/RedRectangle2D.java new file mode 100644 index 00000000..601d328c --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/RedRectangle2D.java @@ -0,0 +1,35 @@ +package com.iqser.red.service.redaction.v1.server.redaction.model; + + +import com.fasterxml.jackson.annotation.JsonIgnore; +import lombok.AllArgsConstructor; +import lombok.Data; +import lombok.NoArgsConstructor; + +@Data +@NoArgsConstructor +@AllArgsConstructor +public class RedRectangle2D { + + private double x; + private double y; + private double width; + private double height; + + @JsonIgnore + public boolean isEmpty() { + return width <= 0.0f || height <= 0.0f; + } + + public boolean contains(double x, double y, double w, double h) { + if (isEmpty() || w <= 0 || h <= 0) { + return false; + } + double x0 = getX(); + double y0 = getY(); + return x >= x0 && + y >= y0 && + (x + w) <= x0 + getWidth() && + (y + h) <= y0 + getHeight(); + } +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java index 89ccf4a4..5ee4cb3f 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java @@ -187,6 +187,7 @@ public class EntityRedactionService { .get(0) .getPage()); sectionText.getSectionAreas().add(sectionArea); + sectionText.getTextBlocks().addAll(cell.getTextBlocks()); addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber.intValue()); int cellStart = start; @@ -235,6 +236,8 @@ public class EntityRedactionService { sectionText.setHeadline(table.getHeadline()); sectionText.setSectionNumber(sectionNumber.intValue()); sectionText.setTable(true); + sectionText.setTabularData(tabularData); + sectionText.setCellStarts(cellStarts); classifiedDoc.getSectionText().add(sectionText); } @@ -267,6 +270,7 @@ public class EntityRedactionService { .getSequences() .get(0) .getPage()); + sectionText.getTextBlocks().addAll(cell.getTextBlocks()); sectionText.getSectionAreas().add(sectionArea); } @@ -325,6 +329,10 @@ public class EntityRedactionService { sectionText.setHeadline(headline); sectionText.setSectionNumber(sectionNumber.intValue()); sectionText.setTable(false); + sectionText.setImages(images.stream() + .map(image -> convert(image, sectionNumber.intValue(), headline)) + .collect(Collectors.toSet())); + sectionText.setTextBlocks(paragraphTextBlocks); classifiedDoc.getSectionText().add(sectionText); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java index 2ebc57f4..a5bcd4f3 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java @@ -12,12 +12,12 @@ import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUti import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService; import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService; import lombok.RequiredArgsConstructor; +import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; import org.kie.api.runtime.KieContainer; import org.springframework.stereotype.Service; import org.springframework.web.bind.annotation.RequestBody; -import java.awt.geom.Rectangle2D; import java.util.*; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -39,7 +39,6 @@ public class ReanalyzeService { public AnalyzeResult analyze(AnalyzeRequest analyzeRequest) { - var pageCount = 0; Document classifiedDoc; @@ -74,30 +73,28 @@ public class ReanalyzeService { return analyzeResponseService.createAnalyzeResponse(pageCount, redactionLog, changeLog); } - public AnalyzeResult reanalyze(@RequestBody AnalyzeRequest renalyzeRequest) { - var text = redactionStorageService.getText(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId()); - // new procedure was not applied, we need a complete analysis + + @SneakyThrows + public AnalyzeResult reanalyze(@RequestBody AnalyzeRequest analyzeRequest) { + + var redactionLog = redactionStorageService.getRedactionLog(analyzeRequest.getProjectId(), analyzeRequest.getFileId()); + var text = redactionStorageService.getText(analyzeRequest.getProjectId(), analyzeRequest.getFileId()); + + // not yet ready for reanalysis if (text.getNumberOfPages() == 0) { - return analyze(AnalyzeRequest.builder() - .ruleSetId(renalyzeRequest.getRuleSetId()) - .manualRedactions(renalyzeRequest.getManualRedactions()) - .projectId(renalyzeRequest.getProjectId()) - .fileId(renalyzeRequest.getFileId()) - .build()); + return analyze(analyzeRequest); } - var redactionLog = redactionStorageService.getRedactionLog(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId()); + DictionaryIncrement dictionaryIncrement = dictionaryService.getDictionaryIncrements(analyzeRequest.getRuleSetId(), redactionLog.getDictionaryVersion()); - DictionaryIncrement dictionaryIncrement = dictionaryService.getDictionaryIncrements(renalyzeRequest.getRuleSetId(), redactionLog.getDictionaryVersion()); - - Set manualForceAndRemoveIds = getForceAndRemoveIds(renalyzeRequest.getManualRedactions()); + Set manualForceAndRemoveIds = getForceAndRemoveIds(analyzeRequest.getManualRedactions()); Map> comments = null; Set manualAdds = null; - if (renalyzeRequest.getManualRedactions() != null) { + if (analyzeRequest.getManualRedactions() != null) { // TODO comments will be removed from redactionLog, so we ignore this first. - comments = renalyzeRequest.getManualRedactions().getComments(); - manualAdds = renalyzeRequest.getManualRedactions().getEntriesToAdd(); + comments = analyzeRequest.getManualRedactions().getComments(); + manualAdds = analyzeRequest.getManualRedactions().getEntriesToAdd(); } Set sectionsToReanalyse = new HashSet<>(); @@ -131,115 +128,114 @@ public class ReanalyzeService { } } + log.info("Should reanalyze {} sections for request: {}", sectionsToReanalyse.size(), analyzeRequest); + if (sectionsToReanalyse.isEmpty() && (manualAdds == null || manualAdds.isEmpty())) { redactionLog.setDictionaryVersion(dictionaryIncrement.getDictionaryVersion()); - var changeLog = redactionChangeLogService.createAndStoreChangeLog(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), redactionLog); - redactionStorageService.storeObject(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog); + var changeLog = redactionChangeLogService.createAndStoreChangeLog(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), redactionLog); + redactionStorageService.storeObject(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog); return analyzeResponseService.createAnalyzeResponse(text.getNumberOfPages(), redactionLog, changeLog); } - try { + List reanalysisSections = new ArrayList<>(); - List reanalysisSections = new ArrayList<>(); - for (SectionText sectionText : text.getSectionTexts()) { - - if (sectionsToReanalyse.contains(sectionText.getSectionNumber())) { - reanalysisSections.add(sectionText); - } + for (SectionText sectionText : text.getSectionTexts()) { + if (sectionsToReanalyse.contains(sectionText.getSectionNumber())) { + reanalysisSections.add(sectionText); } - - KieContainer kieContainer = droolsExecutionService.updateRules(renalyzeRequest.getRuleSetId()); - - Dictionary dictionary = dictionaryService.getDeepCopyDictionary(renalyzeRequest.getRuleSetId()); - - List sectionSearchableTextPairs = new ArrayList<>(); - for (SectionText reanalysisSection : reanalysisSections) { - - Set entities = entityRedactionService.findEntities(reanalysisSection.getSearchableText(), reanalysisSection - .getHeadline(), reanalysisSection.getSectionNumber(), dictionary, false); - if (reanalysisSection.getCellStarts() != null) { - surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary, reanalysisSection - .getCellStarts()); - } else { - surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary); - } - - sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder() - .isLocal(false) - .dictionaryTypes(dictionary.getTypes()) - .entities(entities) - .text(reanalysisSection.getSearchableText().getAsStringWithLinebreaks()) - .searchText(reanalysisSection.getSearchableText().toString()) - .headline(reanalysisSection.getHeadline()) - .sectionNumber(reanalysisSection.getSectionNumber()) - .tabularData(reanalysisSection.getTabularData()) - .searchableText(reanalysisSection.getSearchableText()) - .dictionary(dictionary) - .images(reanalysisSection.getImages()) - .build(), reanalysisSection.getSearchableText())); - } - - Set entities = new HashSet<>(); - Map> imagesPerPage = new HashMap<>(); - sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> { - Section analysedRowSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair - .getSection()); - entities.addAll(analysedRowSection.getEntities()); - EntitySearchUtils.removeEntitiesContainedInLarger(entities); - - for (Image image : analysedRowSection.getImages()) { - imagesPerPage.computeIfAbsent(image.getPage(), (a) -> new HashSet<>()).add(image); - } - - }); - - Map> entitiesPerPage = new HashMap<>(); - for (Entity entity : entities) { - Map> sequenceOnPage = new HashMap<>(); - for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) { - sequenceOnPage.computeIfAbsent(entityPositionSequence.getPageNumber(), (x) -> new ArrayList<>()) - .add(entityPositionSequence); - } - - for (Map.Entry> entry : sequenceOnPage.entrySet()) { - entitiesPerPage.computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>()) - .add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry - .getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber(), entity - .getLegalBasis(), entity.isDictionaryEntry(), entity.getTextBefore(), entity.getTextAfter(), entity - .getStart(), entity.getEnd())); - } - } - - List newRedactionLogEntries = new ArrayList<>(); - for (int page = 1; page <= text.getNumberOfPages(); page++) { - if (entitiesPerPage.get(page) != null) { - newRedactionLogEntries.addAll(redactionLogCreatorService.addEntries(entitiesPerPage, renalyzeRequest - .getManualRedactions(), page, renalyzeRequest.getRuleSetId())); - } - - if (imagesPerPage.get(page) != null) { - newRedactionLogEntries.addAll(redactionLogCreatorService.addImageEntries(imagesPerPage, renalyzeRequest - .getManualRedactions(), page, renalyzeRequest.getRuleSetId())); - } - - newRedactionLogEntries.addAll(redactionLogCreatorService.addManualAddEntries(manualAdds, comments, page, renalyzeRequest - .getRuleSetId())); - } - - redactionLog.getRedactionLogEntry().removeIf(entry -> sectionsToReanalyse.contains(entry.getSectionNumber()) && !entry.isImage() || entry.getSectionNumber() == 0 && !entry.isImage()); - redactionLog.getRedactionLogEntry().addAll(newRedactionLogEntries); - redactionLog.setDictionaryVersion(dictionaryIncrement.getDictionaryVersion()); - - var changeLog = redactionChangeLogService.createAndStoreChangeLog(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), redactionLog); - redactionStorageService.storeObject(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog); - return analyzeResponseService.createAnalyzeResponse(text.getNumberOfPages(), redactionLog, changeLog); - - - } catch (Exception e) { - throw new RedactionException(e); } + + //-- + + KieContainer kieContainer = droolsExecutionService.updateRules(analyzeRequest.getRuleSetId()); + + Dictionary dictionary = dictionaryService.getDeepCopyDictionary(analyzeRequest.getRuleSetId()); + + List sectionSearchableTextPairs = new ArrayList<>(); + for (SectionText reanalysisSection : reanalysisSections) { + + Set entities = entityRedactionService.findEntities(reanalysisSection.getSearchableText(), reanalysisSection + .getHeadline(), reanalysisSection.getSectionNumber(), dictionary, false); + if (reanalysisSection.getCellStarts() != null) { + surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary, reanalysisSection + .getCellStarts()); + } else { + surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary); + } + + sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder() + .isLocal(false) + .dictionaryTypes(dictionary.getTypes()) + .entities(entities) + .text(reanalysisSection.getSearchableText().getAsStringWithLinebreaks()) + .searchText(reanalysisSection.getSearchableText().toString()) + .headline(reanalysisSection.getHeadline()) + .sectionNumber(reanalysisSection.getSectionNumber()) + .tabularData(reanalysisSection.getTabularData()) + .searchableText(reanalysisSection.getSearchableText()) + .dictionary(dictionary) + .images(reanalysisSection.getImages()) + .build(), reanalysisSection.getSearchableText())); + } + + Set entities = new HashSet<>(); + Map> imagesPerPage = new HashMap<>(); + sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> { + Section analysedRowSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair + .getSection()); + entities.addAll(analysedRowSection.getEntities()); + EntitySearchUtils.removeEntitiesContainedInLarger(entities); + + for (Image image : analysedRowSection.getImages()) { + imagesPerPage.computeIfAbsent(image.getPage(), (a) -> new HashSet<>()).add(image); + } + + }); + + Map> entitiesPerPage = new HashMap<>(); + for (Entity entity : entities) { + Map> sequenceOnPage = new HashMap<>(); + for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) { + sequenceOnPage.computeIfAbsent(entityPositionSequence.getPageNumber(), (x) -> new ArrayList<>()) + .add(entityPositionSequence); + } + + for (Map.Entry> entry : sequenceOnPage.entrySet()) { + entitiesPerPage.computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>()) + .add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry + .getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber(), entity + .getLegalBasis(), entity.isDictionaryEntry(), entity.getTextBefore(), entity.getTextAfter(), entity + .getStart(), entity.getEnd())); + } + } + + List newRedactionLogEntries = new ArrayList<>(); + for (int page = 1; page <= text.getNumberOfPages(); page++) { + if (entitiesPerPage.get(page) != null) { + newRedactionLogEntries.addAll(redactionLogCreatorService.addEntries(entitiesPerPage, analyzeRequest + .getManualRedactions(), page, analyzeRequest.getRuleSetId())); + } + + if (imagesPerPage.get(page) != null) { + newRedactionLogEntries.addAll(redactionLogCreatorService.addImageEntries(imagesPerPage, analyzeRequest + .getManualRedactions(), page, analyzeRequest.getRuleSetId())); + } + + newRedactionLogEntries.addAll(redactionLogCreatorService.addManualAddEntries(manualAdds, comments, page, analyzeRequest + .getRuleSetId())); + } + + + redactionLog.getRedactionLogEntry().removeIf(entry -> sectionsToReanalyse.contains(entry.getSectionNumber()) && !entry.isImage() || entry.getSectionNumber() == 0 && !entry.isImage()); + redactionLog.getRedactionLogEntry().addAll(newRedactionLogEntries); + redactionLog.setDictionaryVersion(dictionaryIncrement.getDictionaryVersion()); + + var changeLog = redactionChangeLogService.createAndStoreChangeLog(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), redactionLog); + redactionStorageService.storeObject(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog); + return analyzeResponseService.createAnalyzeResponse(text.getNumberOfPages(), redactionLog, changeLog); + } @@ -262,7 +258,7 @@ public class ReanalyzeService { return Image.builder() .type(entry.getType()) - .position(new Rectangle2D.Float(position.getTopLeft().getX(), position.getTopLeft() + .position(new RedRectangle2D(position.getTopLeft().getX(), position.getTopLeft() .getY(), position.getWidth(), position.getHeight())) .sectionNumber(entry.getSectionNumber()) .section(entry.getSection()) diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/RedactionLogCreatorService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/RedactionLogCreatorService.java index a046fb08..49bfe693 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/RedactionLogCreatorService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/RedactionLogCreatorService.java @@ -4,6 +4,7 @@ import com.iqser.red.service.redaction.v1.model.*; import com.iqser.red.service.redaction.v1.server.classification.model.Document; import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph; import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; +import com.iqser.red.service.redaction.v1.server.parsing.model.RedTextPosition; import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; import com.iqser.red.service.redaction.v1.server.redaction.model.Entity; import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence; @@ -14,7 +15,6 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; import lombok.RequiredArgsConstructor; import org.apache.commons.collections4.CollectionUtils; -import org.apache.pdfbox.text.TextPosition; import org.springframework.stereotype.Service; import java.util.ArrayList; @@ -272,24 +272,24 @@ public class RedactionLogCreatorService { } - private List getRectanglesPerLine(List textPositions, int page) { + private List getRectanglesPerLine(List textPositions, int page) { List rectangles = new ArrayList<>(); if (textPositions.size() == 1) { - rectangles.add(new TextPositionSequence(textPositions, page).getRectangle()); + rectangles.add( TextPositionSequence.fromData(textPositions, page).getRectangle()); } else { float y = textPositions.get(0).getYDirAdj(); int startIndex = 0; for (int i = 1; i < textPositions.size(); i++) { float yDirAdj = textPositions.get(i).getYDirAdj(); if (yDirAdj != y) { - rectangles.add(new TextPositionSequence(textPositions.subList(startIndex, i), page).getRectangle()); + rectangles.add( TextPositionSequence.fromData(textPositions.subList(startIndex, i), page).getRectangle()); y = yDirAdj; startIndex = i; } } if (startIndex != textPositions.size()) { - rectangles.add(new TextPositionSequence(textPositions.subList(startIndex, textPositions.size()), page).getRectangle()); + rectangles.add( TextPositionSequence.fromData(textPositions.subList(startIndex, textPositions.size()), page).getRectangle()); } } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/IdBuilder.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/IdBuilder.java index 34a712fe..241aa1be 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/IdBuilder.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/IdBuilder.java @@ -3,9 +3,9 @@ package com.iqser.red.service.redaction.v1.server.redaction.utils; import com.google.common.hash.HashFunction; import com.google.common.hash.Hashing; import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; +import com.iqser.red.service.redaction.v1.server.redaction.model.RedRectangle2D; import lombok.experimental.UtilityClass; -import java.awt.geom.Rectangle2D; import java.nio.charset.StandardCharsets; import java.util.List; @@ -25,12 +25,8 @@ public class IdBuilder { } - public String buildId(Rectangle2D rectangle2D, int page) { - - StringBuilder sb = new StringBuilder(); - sb.append("x").append(rectangle2D.getX()).append("y").append(rectangle2D.getY()).append("h").append(rectangle2D.getHeight()).append("w").append(rectangle2D.getWidth()).append("p").append(page); - - return hashFunction.hashString(sb.toString(), StandardCharsets.UTF_8).toString(); + public String buildId(RedRectangle2D rectangle2D, int page) { + return hashFunction.hashString("x" + rectangle2D.getX() + "y" + rectangle2D.getY() + "h" + rectangle2D.getHeight() + "w" + rectangle2D.getWidth() + "p" + page, StandardCharsets.UTF_8).toString(); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java index 2eb06c3d..be4fa972 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java @@ -1,21 +1,15 @@ package com.iqser.red.service.redaction.v1.server.segmentation; -import com.iqser.red.service.redaction.v1.model.SectionArea; import com.iqser.red.service.redaction.v1.server.classification.model.Document; import com.iqser.red.service.redaction.v1.server.classification.model.Page; -import com.iqser.red.service.redaction.v1.server.classification.model.SectionText; import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService; import com.iqser.red.service.redaction.v1.server.classification.service.ClassificationService; -import com.iqser.red.service.redaction.v1.server.exception.RedactionException; import com.iqser.red.service.redaction.v1.server.memory.MemoryStats; -import com.iqser.red.service.redaction.v1.server.parsing.PDFAreaTextStripper; import com.iqser.red.service.redaction.v1.server.parsing.PDFLinesTextStripper; import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; -import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue; import com.iqser.red.service.redaction.v1.server.redaction.service.ImageClassificationService; import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings; import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService; import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService; @@ -28,15 +22,12 @@ import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.springframework.stereotype.Service; -import java.awt.geom.Rectangle2D; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; -import java.util.HashMap; import java.util.List; -import java.util.Map; @Slf4j @Service @@ -53,80 +44,11 @@ public class PdfSegmentationService { private final ImageClassificationService imageClassificationService; - private void postProcessSections(PDDocument pdDocument, List texts) { - - try { - for (SectionText sectionText : texts) { - - List textBlocks = new ArrayList<>(); - - Map> sectionAreasPerPage = new HashMap<>(); - for (SectionArea sectionArea : sectionText.getSectionAreas()) { - sectionAreasPerPage.computeIfAbsent(sectionArea.getPage(), (x) -> new ArrayList<>()) - .add(sectionArea); - } - - Map tabularData = new HashMap<>(); - List cellStarts = new ArrayList<>(); - for (Integer page : sectionAreasPerPage.keySet()) { - List areasOnPage = sectionAreasPerPage.get(page); - - PDPage pdPage = pdDocument.getPage(page - 1); - PDRectangle cropBox = pdPage.getCropBox(); - PDFAreaTextStripper textStripper = new PDFAreaTextStripper(); - textStripper.setPageNumber(page); - - int cellStart = 0; - for (SectionArea sectionArea : areasOnPage) { - - Rectangle2D rect = null; - if (pdPage.getRotation() == 90) { - rect = new Rectangle2D.Float(sectionArea.getTopLeft().getY(), sectionArea.getTopLeft() - .getX(), sectionArea.getHeight(), sectionArea.getWidth() + 0.001f); - } else { - rect = new Rectangle2D.Float(sectionArea.getTopLeft().getX(), -sectionArea.getTopLeft() - .getY() + cropBox.getUpperRightY() - sectionArea.getHeight(), sectionArea.getWidth(), sectionArea - .getHeight() + 0.001f); - } - - textStripper.addRegion(String.valueOf(1), rect); - textStripper.extractRegions(pdPage); - textStripper.getTextForRegion(String.valueOf(1)); - List positions = textStripper.getTextPositionSequences(); - - TextBlock textBlock = new TextBlock(sectionArea.getTopLeft().getX(), sectionArea.getTopLeft() - .getX() + sectionArea.getWidth(), sectionArea.getTopLeft() - .getY(), sectionArea.getTopLeft().getY() + sectionArea.getHeight(), positions, 0); - - if (sectionText.isTable()) { - Cell cell = new Cell(); - cell.addTextBlock(textBlock); - tabularData.put(sectionArea.getHeader(), new CellValue(cell.getTextBlocks(), cellStart)); - cellStarts.add(cellStart); - cellStart = cellStart + cell.toString().trim().length() + 1; - } - - textBlocks.add(textBlock); - textStripper.clearPositions(); - } - - } - sectionText.setTextBlocks(textBlocks); - sectionText.setTabularData(tabularData); - if (sectionText.isTable()) { - sectionText.setCellStarts(cellStarts); - } - } - - - } catch (Exception e) { - throw new RedactionException(e); - } - + public Document parseDocument(InputStream documentInputStream) throws IOException { + return parseDocument(documentInputStream, false); } - - public Document parseDocument(InputStream documentInputStream) throws IOException { + public Document parseDocument(InputStream documentInputStream, boolean ignoreImages) throws IOException { PDDocument pdDocument = null; try { //create tempFile @@ -166,24 +88,23 @@ public class PdfSegmentationService { Page page = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings .getVertical()); + page.setRotation(rotation); - - tableExtractionService.extractTables(cleanRulings, page); - - buildPageStatistics(page); - page.setLandscape(isLandscape || isRotated); - page.setPageNumber(pageNumber); - increaseDocumentStatistics(page, document); - page.setImages(stripper.getImages()); - imageClassificationService.classifyImages(page); + tableExtractionService.extractTables(cleanRulings, page); + buildPageStatistics(page); + increaseDocumentStatistics(page, document); + + + if (!ignoreImages) { + imageClassificationService.classifyImages(page); + } pages.add(page); - } document.setPages(pages); @@ -194,9 +115,6 @@ public class PdfSegmentationService { pdDocument = reinitializePDDocument(tempFile, pdDocument); - // This can be improved an done in one pass, but it's complicated to do right away - postProcessSections(pdDocument, document.getSectionText()); - IOUtils.close(pdDocument); tempFile.delete(); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/storage/RedactionStorageService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/storage/RedactionStorageService.java index c9792c0f..2c96bd05 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/storage/RedactionStorageService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/storage/RedactionStorageService.java @@ -50,7 +50,7 @@ public class RedactionStorageService { try { return objectMapper.readValue(inputStreamResource.getInputStream(), RedactionLog.class); } catch (IOException e) { - throw new RuntimeException("Could not convert Text", e); + throw new RuntimeException("Could not convert RedactionLog", e); } } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/AbstractTextContainer.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/AbstractTextContainer.java index 2f6183ab..b050e27b 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/AbstractTextContainer.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/AbstractTextContainer.java @@ -1,5 +1,6 @@ package com.iqser.red.service.redaction.v1.server.tableextraction.model; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.iqser.red.service.redaction.v1.model.Rectangle; import lombok.AllArgsConstructor; import lombok.Data; @@ -27,10 +28,12 @@ public abstract class AbstractTextContainer { return page == other.getPage() && this.minX <= other.getTopLeft().getX() && this.maxX >= other.getTopLeft().getX() + other.getWidth() && this.minY <= other.getTopLeft().getY() && this.maxY >= other.getTopLeft().getY() + other.getHeight(); } + @JsonIgnore public float getHeight() { return maxY - minY; } - + + @JsonIgnore public float getWidth() { return maxX - minX; } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/FilySystemBackedStorageService.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/FileSystemBackedStorageService.java similarity index 87% rename from redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/FilySystemBackedStorageService.java rename to redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/FileSystemBackedStorageService.java index cff5698f..e37034ce 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/FilySystemBackedStorageService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/FileSystemBackedStorageService.java @@ -12,11 +12,11 @@ import java.io.FileOutputStream; import java.util.HashMap; import java.util.Map; -public class FilySystemBackedStorageService extends StorageService { +public class FileSystemBackedStorageService extends StorageService { - private Map dataMap = new HashMap<>(); + private final Map dataMap = new HashMap<>(); - public FilySystemBackedStorageService() { + public FileSystemBackedStorageService() { super(null, null); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java index 2a998b14..c74b653e 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java @@ -134,7 +134,7 @@ public class RedactionIntegrationTest { @Bean @Primary public StorageService inmemoryStorage() { - return new FilySystemBackedStorageService(); + return new FileSystemBackedStorageService(); } } @@ -142,8 +142,8 @@ public class RedactionIntegrationTest { @After public void cleanupStorage() { - if (this.storageService instanceof FilySystemBackedStorageService) { - ((FilySystemBackedStorageService) this.storageService).clearStorage(); + if (this.storageService instanceof FileSystemBackedStorageService) { + ((FileSystemBackedStorageService) this.storageService).clearStorage(); } } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java index 8c19e0d6..32fe65ee 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java @@ -2,7 +2,7 @@ package com.iqser.red.service.redaction.v1.server.redaction.service; import com.amazonaws.services.s3.AmazonS3; import com.iqser.red.service.configuration.v1.api.model.*; -import com.iqser.red.service.redaction.v1.server.FilySystemBackedStorageService; +import com.iqser.red.service.redaction.v1.server.FileSystemBackedStorageService; import com.iqser.red.service.redaction.v1.server.classification.model.Document; import com.iqser.red.service.redaction.v1.server.client.DictionaryClient; import com.iqser.red.service.redaction.v1.server.client.RulesClient; @@ -97,7 +97,7 @@ public class EntityRedactionServiceTest { @Bean @Primary public StorageService inmemoryStorage() { - return new FilySystemBackedStorageService(); + return new FileSystemBackedStorageService(); } }