From 5c2596e268e860ddabb871dc642ec85a08bf9bcd Mon Sep 17 00:00:00 2001 From: Timo Date: Mon, 19 Apr 2021 13:08:32 +0300 Subject: [PATCH 01/10] Serialization of text --- .../classification/model/SectionText.java | 6 +++ .../classification/model/TextBlock.java | 2 + .../ObjectMapperConfiguration.java | 43 ++++++++++++++++ .../v1/server/parsing/model/RedMatrix.java | 10 ++++ .../server/parsing/model/RedTextPosition.java | 48 +++++++++++++++++ .../parsing/model/TextPositionSequence.java | 51 ++++++++++++++----- .../redaction/service/ReanalyzeService.java | 15 ++++++ .../service/RedactionLogCreatorService.java | 10 ++-- .../segmentation/PdfSegmentationService.java | 5 +- 9 files changed, 167 insertions(+), 23 deletions(-) create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/configuration/ObjectMapperConfiguration.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedMatrix.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedTextPosition.java diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java index c9c88cec..f62091a7 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java @@ -31,6 +31,12 @@ public class SectionText { private List cellStarts = new ArrayList<>(); + public void setTabularData(Map tabularData) { + tabularData.remove(null); + this.tabularData = tabularData; + } + + public SearchableText getSearchableText() { SearchableText searchableText = new SearchableText(); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/TextBlock.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/TextBlock.java index 6da9f6a0..d40f8671 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/TextBlock.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/TextBlock.java @@ -6,6 +6,7 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractT import lombok.AllArgsConstructor; import lombok.Builder; import lombok.Data; +import lombok.NoArgsConstructor; import java.util.ArrayList; import java.util.List; @@ -13,6 +14,7 @@ import java.util.List; @AllArgsConstructor @Builder @Data +@NoArgsConstructor public class TextBlock extends AbstractTextContainer { @Builder.Default diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/configuration/ObjectMapperConfiguration.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/configuration/ObjectMapperConfiguration.java new file mode 100644 index 00000000..44d3c2b7 --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/configuration/ObjectMapperConfiguration.java @@ -0,0 +1,43 @@ +package com.iqser.red.service.redaction.v1.server.configuration; + +import com.fasterxml.jackson.core.JsonGenerator; +import com.fasterxml.jackson.core.Version; +import com.fasterxml.jackson.databind.JsonSerializer; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.SerializerProvider; +import com.fasterxml.jackson.databind.module.SimpleModule; +import org.apache.pdfbox.pdmodel.font.PDFont; +import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; +import org.springframework.context.annotation.Primary; + +import java.io.IOException; + +@Configuration +public class ObjectMapperConfiguration { + + + @Bean + @Primary + public ObjectMapper objectMapper() { + var objectMapper = new ObjectMapper(); + SimpleModule simpleModule = new SimpleModule("SimpleModule", + new Version(1, 0, 0, null)); +// simpleModule.addSerializer(new ItemSerializer()); + simpleModule.addSerializer(PDFont.class, new PDFontSerializer()); + simpleModule.addSerializer(PDTrueTypeFont.class, new PDFontSerializer()); + objectMapper.registerModule(simpleModule); + + return objectMapper; + } + + + public static class PDFontSerializer extends JsonSerializer { + + @Override + public void serialize(PDFont t, JsonGenerator jsonGenerator, SerializerProvider serializerProvider) throws IOException { + jsonGenerator.writeNull(); + } + } +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedMatrix.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedMatrix.java new file mode 100644 index 00000000..f58c7475 --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedMatrix.java @@ -0,0 +1,10 @@ +package com.iqser.red.service.redaction.v1.server.parsing.model; + +import lombok.Data; + + +@Data +public class RedMatrix { + + private float[] single; +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedTextPosition.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedTextPosition.java new file mode 100644 index 00000000..effafad8 --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedTextPosition.java @@ -0,0 +1,48 @@ +package com.iqser.red.service.redaction.v1.server.parsing.model; + +import lombok.Data; +import lombok.SneakyThrows; +import org.apache.pdfbox.text.TextPosition; +import org.apache.pdfbox.util.Matrix; +import org.springframework.beans.BeanUtils; + +@Data +public class RedTextPosition { + + private Matrix textMatrix; + private float endX; + private float endY; + private float maxHeight; + private int rotation; + private float x; + private float y; + private float pageHeight; + private float pageWidth; + private float widthOfSpace; + private int[] charCodes; + private float fontSize; + private float fontSizePt; + private float[] widths; + private String unicode; + private float direction = -1.0F; + private float XDirAdj; + private float YDirAdj; + private float width; + private float heightDir; + private float fontSizeInPt; + private String fontName; + + + @SneakyThrows + public static RedTextPosition fromTextPosition(TextPosition textPosition) { + var pos = new RedTextPosition(); + BeanUtils.copyProperties(textPosition, pos); + pos.setFontName(textPosition.getFont().getName()); + + pos.setCharCodes(textPosition.getCharacterCodes()); + pos.setWidths(textPosition.getIndividualWidths()); + pos.setFontSizePt(textPosition.getFontSizeInPt()); + + return pos; + } +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java index c6181f4e..786e1483 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java @@ -3,26 +3,45 @@ package com.iqser.red.service.redaction.v1.server.parsing.model; import com.iqser.red.service.redaction.v1.model.Point; import com.iqser.red.service.redaction.v1.model.Rectangle; import lombok.Data; -import lombok.RequiredArgsConstructor; +import lombok.NoArgsConstructor; import org.apache.pdfbox.text.TextPosition; import java.util.ArrayList; import java.util.List; +import java.util.stream.Collectors; @Data -@RequiredArgsConstructor +@NoArgsConstructor public class TextPositionSequence implements CharSequence { - private final int page; - private List textPositions = new ArrayList<>(); + private int page; + private List textPositions = new ArrayList<>(); + + + public TextPositionSequence(int page) { + this.page = page; + } + + + public static TextPositionSequence fromData(List textPositions, int page) { + var textPositionSequence = new TextPositionSequence(); + textPositionSequence.textPositions = textPositions; + textPositionSequence.page = page; + + return textPositionSequence; + } public TextPositionSequence(List textPositions, int page) { - this.textPositions = textPositions; + this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList()); this.page = page; } + public void setTextPositions(List textPositions) { + this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList()); + } + @Override public int length() { @@ -34,7 +53,7 @@ public class TextPositionSequence implements CharSequence { @Override public char charAt(int index) { - TextPosition textPosition = textPositionAt(index); + RedTextPosition textPosition = textPositionAt(index); String text = textPosition.getUnicode(); return text.charAt(0); } @@ -42,7 +61,7 @@ public class TextPositionSequence implements CharSequence { public char charAt(int index, boolean caseInSensitive) { - TextPosition textPosition = textPositionAt(index); + RedTextPosition textPosition = textPositionAt(index); String text = textPosition.getUnicode(); return caseInSensitive ? text.toLowerCase().charAt(0) : text.charAt(0); } @@ -51,7 +70,7 @@ public class TextPositionSequence implements CharSequence { @Override public TextPositionSequence subSequence(int start, int end) { - return new TextPositionSequence(textPositions.subList(start, end), page); + return fromData(textPositions.subList(start, end), page); } @@ -66,18 +85,24 @@ public class TextPositionSequence implements CharSequence { } - public TextPosition textPositionAt(int index) { + public RedTextPosition textPositionAt(int index) { return textPositions.get(index); } - public void add(TextPosition textPosition) { + public void add(RedTextPosition textPosition) { this.textPositions.add(textPosition); } + public void add(TextPosition textPosition) { + + this.textPositions.add(RedTextPosition.fromTextPosition(textPosition)); + } + + public float getX1() { if (textPositions.get(0).getRotation() == 90) { @@ -145,9 +170,7 @@ public class TextPositionSequence implements CharSequence { public String getFont() { - return textPositions.get(0) - .getFont() - .toString() + return textPositions.get(0).getFontName() .toLowerCase() .replaceAll(",bold", "") .replaceAll(",italic", ""); @@ -156,7 +179,7 @@ public class TextPositionSequence implements CharSequence { public String getFontStyle() { - String lowercaseFontName = textPositions.get(0).getFont().toString().toLowerCase(); + String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase(); if (lowercaseFontName.contains("bold") && lowercaseFontName.contains("italic")) { return "bold, italic"; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java index 2ebc57f4..d610e94e 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java @@ -13,11 +13,14 @@ import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationSer import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; +import org.apache.pdfbox.io.MemoryUsageSetting; +import org.apache.pdfbox.pdmodel.PDDocument; import org.kie.api.runtime.KieContainer; import org.springframework.stereotype.Service; import org.springframework.web.bind.annotation.RequestBody; import java.awt.geom.Rectangle2D; +import java.io.IOException; import java.util.*; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -53,6 +56,18 @@ public class ReanalyzeService { log.info("Document structure analysis successful, starting redaction analysis..."); entityRedactionService.processDocument(classifiedDoc, analyzeRequest.getRuleSetId(), analyzeRequest.getManualRedactions()); + + var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.ORIGIN)); + + // TODO move this to where it makes sense - or remove completly + try (PDDocument pdDocument = PDDocument.load(storedObjectStream, MemoryUsageSetting.setupTempFileOnly())) { + pdDocument.setAllSecurityToBeRemoved(true); + pdfSegmentationService.postProcessSections(pdDocument, classifiedDoc.getSectionText()); + } catch (IOException e) { + e.printStackTrace(); + } + + redactionLogCreatorService.createRedactionLog(classifiedDoc, pageCount, analyzeRequest.getManualRedactions(), analyzeRequest .getRuleSetId()); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/RedactionLogCreatorService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/RedactionLogCreatorService.java index a046fb08..49bfe693 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/RedactionLogCreatorService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/RedactionLogCreatorService.java @@ -4,6 +4,7 @@ import com.iqser.red.service.redaction.v1.model.*; import com.iqser.red.service.redaction.v1.server.classification.model.Document; import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph; import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; +import com.iqser.red.service.redaction.v1.server.parsing.model.RedTextPosition; import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; import com.iqser.red.service.redaction.v1.server.redaction.model.Entity; import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence; @@ -14,7 +15,6 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; import lombok.RequiredArgsConstructor; import org.apache.commons.collections4.CollectionUtils; -import org.apache.pdfbox.text.TextPosition; import org.springframework.stereotype.Service; import java.util.ArrayList; @@ -272,24 +272,24 @@ public class RedactionLogCreatorService { } - private List getRectanglesPerLine(List textPositions, int page) { + private List getRectanglesPerLine(List textPositions, int page) { List rectangles = new ArrayList<>(); if (textPositions.size() == 1) { - rectangles.add(new TextPositionSequence(textPositions, page).getRectangle()); + rectangles.add( TextPositionSequence.fromData(textPositions, page).getRectangle()); } else { float y = textPositions.get(0).getYDirAdj(); int startIndex = 0; for (int i = 1; i < textPositions.size(); i++) { float yDirAdj = textPositions.get(i).getYDirAdj(); if (yDirAdj != y) { - rectangles.add(new TextPositionSequence(textPositions.subList(startIndex, i), page).getRectangle()); + rectangles.add( TextPositionSequence.fromData(textPositions.subList(startIndex, i), page).getRectangle()); y = yDirAdj; startIndex = i; } } if (startIndex != textPositions.size()) { - rectangles.add(new TextPositionSequence(textPositions.subList(startIndex, textPositions.size()), page).getRectangle()); + rectangles.add( TextPositionSequence.fromData(textPositions.subList(startIndex, textPositions.size()), page).getRectangle()); } } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java index 2eb06c3d..12fc519a 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java @@ -53,7 +53,7 @@ public class PdfSegmentationService { private final ImageClassificationService imageClassificationService; - private void postProcessSections(PDDocument pdDocument, List texts) { + public void postProcessSections(PDDocument pdDocument, List texts) { try { for (SectionText sectionText : texts) { @@ -194,9 +194,6 @@ public class PdfSegmentationService { pdDocument = reinitializePDDocument(tempFile, pdDocument); - // This can be improved an done in one pass, but it's complicated to do right away - postProcessSections(pdDocument, document.getSectionText()); - IOUtils.close(pdDocument); tempFile.delete(); From 1d4708ad134eeeae84fb07550985865afd7fc598 Mon Sep 17 00:00:00 2001 From: Timo Date: Tue, 20 Apr 2021 09:51:50 +0300 Subject: [PATCH 02/10] reworked reanalysis and text storage --- .../server/classification/model/Footer.java | 3 +- .../server/classification/model/Header.java | 3 +- .../classification/model/SectionText.java | 3 +- .../classification/model/TextBlock.java | 2 + .../model/UnclassifiedText.java | 3 +- .../v1/server/parsing/model/RedMatrix.java | 10 - .../server/parsing/model/RedTextPosition.java | 32 +-- .../parsing/model/TextPositionSequence.java | 30 ++- .../v1/server/redaction/model/Image.java | 4 +- .../v1/server/redaction/model/PdfImage.java | 6 +- .../redaction/model/RedRectangle2D.java | 35 +++ .../service/EntityRedactionService.java | 8 + .../redaction/service/ReanalyzeService.java | 246 ++++++++---------- .../v1/server/redaction/utils/IdBuilder.java | 10 +- .../segmentation/PdfSegmentationService.java | 83 +----- .../model/AbstractTextContainer.java | 5 +- .../FilySystemBackedStorageService.java | 2 +- .../v1/server/RedactionIntegrationTest.java | 10 + 18 files changed, 228 insertions(+), 267 deletions(-) delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedMatrix.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/RedRectangle2D.java diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Footer.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Footer.java index 61d12a43..b88a16b7 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Footer.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Footer.java @@ -1,5 +1,6 @@ package com.iqser.red.service.redaction.v1.server.classification.model; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText; import lombok.AllArgsConstructor; import lombok.Data; @@ -12,7 +13,7 @@ public class Footer { private List textBlocks; - + @JsonIgnore public SearchableText getSearchableText() { SearchableText searchableText = new SearchableText(); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Header.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Header.java index f3067452..133e0245 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Header.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Header.java @@ -1,5 +1,6 @@ package com.iqser.red.service.redaction.v1.server.classification.model; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText; import lombok.AllArgsConstructor; import lombok.Data; @@ -12,7 +13,7 @@ public class Header { private List textBlocks; - + @JsonIgnore public SearchableText getSearchableText() { SearchableText searchableText = new SearchableText(); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java index f62091a7..77649132 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java @@ -1,5 +1,6 @@ package com.iqser.red.service.redaction.v1.server.classification.model; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.iqser.red.service.redaction.v1.model.SectionArea; import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue; import com.iqser.red.service.redaction.v1.server.redaction.model.Image; @@ -36,7 +37,7 @@ public class SectionText { this.tabularData = tabularData; } - + @JsonIgnore public SearchableText getSearchableText() { SearchableText searchableText = new SearchableText(); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/TextBlock.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/TextBlock.java index d40f8671..63cfc11c 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/TextBlock.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/TextBlock.java @@ -1,5 +1,6 @@ package com.iqser.red.service.redaction.v1.server.classification.model; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities; import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; @@ -118,6 +119,7 @@ public class TextBlock extends AbstractTextContainer { } @Override + @JsonIgnore public String getText() { StringBuilder sb = new StringBuilder(); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/UnclassifiedText.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/UnclassifiedText.java index 79277b9e..0d51a4f8 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/UnclassifiedText.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/UnclassifiedText.java @@ -1,5 +1,6 @@ package com.iqser.red.service.redaction.v1.server.classification.model; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText; import lombok.AllArgsConstructor; import lombok.Data; @@ -12,7 +13,7 @@ public class UnclassifiedText { private List textBlocks; - + @JsonIgnore public SearchableText getSearchableText() { SearchableText searchableText = new SearchableText(); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedMatrix.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedMatrix.java deleted file mode 100644 index f58c7475..00000000 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedMatrix.java +++ /dev/null @@ -1,10 +0,0 @@ -package com.iqser.red.service.redaction.v1.server.parsing.model; - -import lombok.Data; - - -@Data -public class RedMatrix { - - private float[] single; -} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedTextPosition.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedTextPosition.java index effafad8..04e394f8 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedTextPosition.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedTextPosition.java @@ -1,35 +1,35 @@ package com.iqser.red.service.redaction.v1.server.parsing.model; +import com.fasterxml.jackson.annotation.JsonIgnore; import lombok.Data; import lombok.SneakyThrows; import org.apache.pdfbox.text.TextPosition; -import org.apache.pdfbox.util.Matrix; import org.springframework.beans.BeanUtils; @Data public class RedTextPosition { - private Matrix textMatrix; - private float endX; - private float endY; - private float maxHeight; + private String textMatrix; private int rotation; - private float x; private float y; private float pageHeight; private float pageWidth; - private float widthOfSpace; - private int[] charCodes; - private float fontSize; - private float fontSizePt; - private float[] widths; private String unicode; - private float direction = -1.0F; private float XDirAdj; private float YDirAdj; private float width; private float heightDir; + + // not used in reanalysis + @JsonIgnore + private float widthOfSpace; + + // not used in reanalysis + @JsonIgnore private float fontSizeInPt; + + // not used in reanalysis + @JsonIgnore private String fontName; @@ -39,10 +39,12 @@ public class RedTextPosition { BeanUtils.copyProperties(textPosition, pos); pos.setFontName(textPosition.getFont().getName()); - pos.setCharCodes(textPosition.getCharacterCodes()); - pos.setWidths(textPosition.getIndividualWidths()); - pos.setFontSizePt(textPosition.getFontSizeInPt()); + pos.setFontSizeInPt(textPosition.getFontSizeInPt()); + + pos.setTextMatrix(textPosition.getTextMatrix().toString()); return pos; } + + } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java index 786e1483..10b5abb1 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java @@ -1,5 +1,7 @@ package com.iqser.red.service.redaction.v1.server.parsing.model; +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; import com.iqser.red.service.redaction.v1.model.Point; import com.iqser.red.service.redaction.v1.model.Rectangle; import lombok.Data; @@ -12,11 +14,14 @@ import java.util.stream.Collectors; @Data @NoArgsConstructor +@JsonIgnoreProperties({ "empty" }) public class TextPositionSequence implements CharSequence { private int page; private List textPositions = new ArrayList<>(); + private float x1; + private float x2; public TextPositionSequence(int page) { this.page = page; @@ -38,9 +43,8 @@ public class TextPositionSequence implements CharSequence { this.page = page; } - public void setTextPositions(List textPositions) { - this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList()); - } + + @Override @@ -103,6 +107,7 @@ public class TextPositionSequence implements CharSequence { } + @JsonIgnore public float getX1() { if (textPositions.get(0).getRotation() == 90) { @@ -113,6 +118,7 @@ public class TextPositionSequence implements CharSequence { } + @JsonIgnore public float getX2() { if (textPositions.get(0).getRotation() == 90) { @@ -123,13 +129,14 @@ public class TextPositionSequence implements CharSequence { } } - + @JsonIgnore public float getRotationAdjustedY() { return textPositions.get(0).getY(); } + @JsonIgnore public float getY1() { if (textPositions.get(0).getRotation() == 90) { @@ -140,6 +147,7 @@ public class TextPositionSequence implements CharSequence { } + @JsonIgnore public float getY2() { if (textPositions.get(0).getRotation() == 90) { @@ -150,26 +158,29 @@ public class TextPositionSequence implements CharSequence { } + @JsonIgnore public float getTextHeight() { return textPositions.get(0).getHeightDir() + 2; } + @JsonIgnore public float getHeight() { return getY2() - getY1(); } + @JsonIgnore public float getWidth() { return getX2() - getX1(); } + @JsonIgnore public String getFont() { - return textPositions.get(0).getFontName() .toLowerCase() .replaceAll(",bold", "") @@ -177,6 +188,7 @@ public class TextPositionSequence implements CharSequence { } + @JsonIgnore public String getFontStyle() { String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase(); @@ -193,25 +205,25 @@ public class TextPositionSequence implements CharSequence { } - + @JsonIgnore public float getFontSize() { return textPositions.get(0).getFontSizeInPt(); } - + @JsonIgnore public float getSpaceWidth() { return textPositions.get(0).getWidthOfSpace(); } - + @JsonIgnore public int getRotation() { return textPositions.get(0).getRotation(); } - + @JsonIgnore public Rectangle getRectangle() { float height = getTextHeight(); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Image.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Image.java index e4e6167a..766d607d 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Image.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Image.java @@ -5,8 +5,6 @@ import lombok.Builder; import lombok.Data; import lombok.NoArgsConstructor; -import java.awt.geom.Rectangle2D; - @Data @Builder @NoArgsConstructor @@ -14,7 +12,7 @@ import java.awt.geom.Rectangle2D; public class Image { private String type; - private Rectangle2D position; + private RedRectangle2D position; private boolean redaction; private String redactionReason; private String legalBasis; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/PdfImage.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/PdfImage.java index f7f6ad4f..1631717f 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/PdfImage.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/PdfImage.java @@ -1,5 +1,6 @@ package com.iqser.red.service.redaction.v1.server.redaction.model; +import com.fasterxml.jackson.annotation.JsonIgnore; import lombok.Data; import lombok.NonNull; import lombok.RequiredArgsConstructor; @@ -11,9 +12,10 @@ import java.awt.image.BufferedImage; @RequiredArgsConstructor public class PdfImage { + @JsonIgnore private BufferedImage image; @NonNull - private Rectangle2D position; + private RedRectangle2D position; private ImageType imageType; private boolean isAppendedToParagraph; @@ -22,7 +24,7 @@ public class PdfImage { public PdfImage(BufferedImage image, Rectangle2D position, int page) { this.image = image; - this.position = position; + this.position = new RedRectangle2D(position.getX(), position.getY(), position.getWidth(), position.getHeight()); this.page = page; } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/RedRectangle2D.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/RedRectangle2D.java new file mode 100644 index 00000000..d42b76ff --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/RedRectangle2D.java @@ -0,0 +1,35 @@ +package com.iqser.red.service.redaction.v1.server.redaction.model; + + +import com.fasterxml.jackson.annotation.JsonIgnore; +import lombok.AllArgsConstructor; +import lombok.Data; +import lombok.NoArgsConstructor; + +@Data +@NoArgsConstructor +@AllArgsConstructor +public class RedRectangle2D { + + private double x; + private double y; + private double width; + private double height; + + @JsonIgnore + public boolean isEmpty() { + return (width <= 0.0f) || (height <= 0.0f); + } + + public boolean contains(double x, double y, double w, double h) { + if (isEmpty() || w <= 0 || h <= 0) { + return false; + } + double x0 = getX(); + double y0 = getY(); + return (x >= x0 && + y >= y0 && + (x + w) <= x0 + getWidth() && + (y + h) <= y0 + getHeight()); + } +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java index 89ccf4a4..5ee4cb3f 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java @@ -187,6 +187,7 @@ public class EntityRedactionService { .get(0) .getPage()); sectionText.getSectionAreas().add(sectionArea); + sectionText.getTextBlocks().addAll(cell.getTextBlocks()); addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber.intValue()); int cellStart = start; @@ -235,6 +236,8 @@ public class EntityRedactionService { sectionText.setHeadline(table.getHeadline()); sectionText.setSectionNumber(sectionNumber.intValue()); sectionText.setTable(true); + sectionText.setTabularData(tabularData); + sectionText.setCellStarts(cellStarts); classifiedDoc.getSectionText().add(sectionText); } @@ -267,6 +270,7 @@ public class EntityRedactionService { .getSequences() .get(0) .getPage()); + sectionText.getTextBlocks().addAll(cell.getTextBlocks()); sectionText.getSectionAreas().add(sectionArea); } @@ -325,6 +329,10 @@ public class EntityRedactionService { sectionText.setHeadline(headline); sectionText.setSectionNumber(sectionNumber.intValue()); sectionText.setTable(false); + sectionText.setImages(images.stream() + .map(image -> convert(image, sectionNumber.intValue(), headline)) + .collect(Collectors.toSet())); + sectionText.setTextBlocks(paragraphTextBlocks); classifiedDoc.getSectionText().add(sectionText); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java index d610e94e..90c964fb 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java @@ -12,15 +12,12 @@ import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUti import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService; import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService; import lombok.RequiredArgsConstructor; +import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; -import org.apache.pdfbox.io.MemoryUsageSetting; -import org.apache.pdfbox.pdmodel.PDDocument; import org.kie.api.runtime.KieContainer; import org.springframework.stereotype.Service; import org.springframework.web.bind.annotation.RequestBody; -import java.awt.geom.Rectangle2D; -import java.io.IOException; import java.util.*; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -42,7 +39,6 @@ public class ReanalyzeService { public AnalyzeResult analyze(AnalyzeRequest analyzeRequest) { - var pageCount = 0; Document classifiedDoc; @@ -56,18 +52,6 @@ public class ReanalyzeService { log.info("Document structure analysis successful, starting redaction analysis..."); entityRedactionService.processDocument(classifiedDoc, analyzeRequest.getRuleSetId(), analyzeRequest.getManualRedactions()); - - var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.ORIGIN)); - - // TODO move this to where it makes sense - or remove completly - try (PDDocument pdDocument = PDDocument.load(storedObjectStream, MemoryUsageSetting.setupTempFileOnly())) { - pdDocument.setAllSecurityToBeRemoved(true); - pdfSegmentationService.postProcessSections(pdDocument, classifiedDoc.getSectionText()); - } catch (IOException e) { - e.printStackTrace(); - } - - redactionLogCreatorService.createRedactionLog(classifiedDoc, pageCount, analyzeRequest.getManualRedactions(), analyzeRequest .getRuleSetId()); @@ -89,30 +73,28 @@ public class ReanalyzeService { return analyzeResponseService.createAnalyzeResponse(pageCount, redactionLog, changeLog); } - public AnalyzeResult reanalyze(@RequestBody AnalyzeRequest renalyzeRequest) { - var text = redactionStorageService.getText(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId()); - // new procedure was not applied, we need a complete analysis + + @SneakyThrows + public AnalyzeResult reanalyze(@RequestBody AnalyzeRequest analyzeRequest) { + + var redactionLog = redactionStorageService.getRedactionLog(analyzeRequest.getProjectId(), analyzeRequest.getFileId()); + var text = redactionStorageService.getText(analyzeRequest.getProjectId(), analyzeRequest.getFileId()); + + // not yet ready for reanalysis if (text.getNumberOfPages() == 0) { - return analyze(AnalyzeRequest.builder() - .ruleSetId(renalyzeRequest.getRuleSetId()) - .manualRedactions(renalyzeRequest.getManualRedactions()) - .projectId(renalyzeRequest.getProjectId()) - .fileId(renalyzeRequest.getFileId()) - .build()); + return analyze(analyzeRequest); } - var redactionLog = redactionStorageService.getRedactionLog(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId()); + DictionaryIncrement dictionaryIncrement = dictionaryService.getDictionaryIncrements(analyzeRequest.getRuleSetId(), redactionLog.getDictionaryVersion()); - DictionaryIncrement dictionaryIncrement = dictionaryService.getDictionaryIncrements(renalyzeRequest.getRuleSetId(), redactionLog.getDictionaryVersion()); - - Set manualForceAndRemoveIds = getForceAndRemoveIds(renalyzeRequest.getManualRedactions()); + Set manualForceAndRemoveIds = getForceAndRemoveIds(analyzeRequest.getManualRedactions()); Map> comments = null; Set manualAdds = null; - if (renalyzeRequest.getManualRedactions() != null) { + if (analyzeRequest.getManualRedactions() != null) { // TODO comments will be removed from redactionLog, so we ignore this first. - comments = renalyzeRequest.getManualRedactions().getComments(); - manualAdds = renalyzeRequest.getManualRedactions().getEntriesToAdd(); + comments = analyzeRequest.getManualRedactions().getComments(); + manualAdds = analyzeRequest.getManualRedactions().getEntriesToAdd(); } Set sectionsToReanalyse = new HashSet<>(); @@ -146,115 +128,113 @@ public class ReanalyzeService { } } + if (sectionsToReanalyse.isEmpty() && (manualAdds == null || manualAdds.isEmpty())) { redactionLog.setDictionaryVersion(dictionaryIncrement.getDictionaryVersion()); - var changeLog = redactionChangeLogService.createAndStoreChangeLog(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), redactionLog); - redactionStorageService.storeObject(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog); + var changeLog = redactionChangeLogService.createAndStoreChangeLog(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), redactionLog); + redactionStorageService.storeObject(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog); return analyzeResponseService.createAnalyzeResponse(text.getNumberOfPages(), redactionLog, changeLog); } - try { + List reanalysisSections = new ArrayList<>(); - List reanalysisSections = new ArrayList<>(); - for (SectionText sectionText : text.getSectionTexts()) { - - if (sectionsToReanalyse.contains(sectionText.getSectionNumber())) { - reanalysisSections.add(sectionText); - } + for (SectionText sectionText : text.getSectionTexts()) { + if (sectionsToReanalyse.contains(sectionText.getSectionNumber())) { + reanalysisSections.add(sectionText); } - - KieContainer kieContainer = droolsExecutionService.updateRules(renalyzeRequest.getRuleSetId()); - - Dictionary dictionary = dictionaryService.getDeepCopyDictionary(renalyzeRequest.getRuleSetId()); - - List sectionSearchableTextPairs = new ArrayList<>(); - for (SectionText reanalysisSection : reanalysisSections) { - - Set entities = entityRedactionService.findEntities(reanalysisSection.getSearchableText(), reanalysisSection - .getHeadline(), reanalysisSection.getSectionNumber(), dictionary, false); - if (reanalysisSection.getCellStarts() != null) { - surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary, reanalysisSection - .getCellStarts()); - } else { - surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary); - } - - sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder() - .isLocal(false) - .dictionaryTypes(dictionary.getTypes()) - .entities(entities) - .text(reanalysisSection.getSearchableText().getAsStringWithLinebreaks()) - .searchText(reanalysisSection.getSearchableText().toString()) - .headline(reanalysisSection.getHeadline()) - .sectionNumber(reanalysisSection.getSectionNumber()) - .tabularData(reanalysisSection.getTabularData()) - .searchableText(reanalysisSection.getSearchableText()) - .dictionary(dictionary) - .images(reanalysisSection.getImages()) - .build(), reanalysisSection.getSearchableText())); - } - - Set entities = new HashSet<>(); - Map> imagesPerPage = new HashMap<>(); - sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> { - Section analysedRowSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair - .getSection()); - entities.addAll(analysedRowSection.getEntities()); - EntitySearchUtils.removeEntitiesContainedInLarger(entities); - - for (Image image : analysedRowSection.getImages()) { - imagesPerPage.computeIfAbsent(image.getPage(), (a) -> new HashSet<>()).add(image); - } - - }); - - Map> entitiesPerPage = new HashMap<>(); - for (Entity entity : entities) { - Map> sequenceOnPage = new HashMap<>(); - for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) { - sequenceOnPage.computeIfAbsent(entityPositionSequence.getPageNumber(), (x) -> new ArrayList<>()) - .add(entityPositionSequence); - } - - for (Map.Entry> entry : sequenceOnPage.entrySet()) { - entitiesPerPage.computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>()) - .add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry - .getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber(), entity - .getLegalBasis(), entity.isDictionaryEntry(), entity.getTextBefore(), entity.getTextAfter(), entity - .getStart(), entity.getEnd())); - } - } - - List newRedactionLogEntries = new ArrayList<>(); - for (int page = 1; page <= text.getNumberOfPages(); page++) { - if (entitiesPerPage.get(page) != null) { - newRedactionLogEntries.addAll(redactionLogCreatorService.addEntries(entitiesPerPage, renalyzeRequest - .getManualRedactions(), page, renalyzeRequest.getRuleSetId())); - } - - if (imagesPerPage.get(page) != null) { - newRedactionLogEntries.addAll(redactionLogCreatorService.addImageEntries(imagesPerPage, renalyzeRequest - .getManualRedactions(), page, renalyzeRequest.getRuleSetId())); - } - - newRedactionLogEntries.addAll(redactionLogCreatorService.addManualAddEntries(manualAdds, comments, page, renalyzeRequest - .getRuleSetId())); - } - - redactionLog.getRedactionLogEntry().removeIf(entry -> sectionsToReanalyse.contains(entry.getSectionNumber()) && !entry.isImage() || entry.getSectionNumber() == 0 && !entry.isImage()); - redactionLog.getRedactionLogEntry().addAll(newRedactionLogEntries); - redactionLog.setDictionaryVersion(dictionaryIncrement.getDictionaryVersion()); - - var changeLog = redactionChangeLogService.createAndStoreChangeLog(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), redactionLog); - redactionStorageService.storeObject(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog); - return analyzeResponseService.createAnalyzeResponse(text.getNumberOfPages(), redactionLog, changeLog); - - - } catch (Exception e) { - throw new RedactionException(e); } + + //-- + + KieContainer kieContainer = droolsExecutionService.updateRules(analyzeRequest.getRuleSetId()); + + Dictionary dictionary = dictionaryService.getDeepCopyDictionary(analyzeRequest.getRuleSetId()); + + List sectionSearchableTextPairs = new ArrayList<>(); + for (SectionText reanalysisSection : reanalysisSections) { + + Set entities = entityRedactionService.findEntities(reanalysisSection.getSearchableText(), reanalysisSection + .getHeadline(), reanalysisSection.getSectionNumber(), dictionary, false); + if (reanalysisSection.getCellStarts() != null) { + surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary, reanalysisSection + .getCellStarts()); + } else { + surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary); + } + + sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder() + .isLocal(false) + .dictionaryTypes(dictionary.getTypes()) + .entities(entities) + .text(reanalysisSection.getSearchableText().getAsStringWithLinebreaks()) + .searchText(reanalysisSection.getSearchableText().toString()) + .headline(reanalysisSection.getHeadline()) + .sectionNumber(reanalysisSection.getSectionNumber()) + .tabularData(reanalysisSection.getTabularData()) + .searchableText(reanalysisSection.getSearchableText()) + .dictionary(dictionary) + .images(reanalysisSection.getImages()) + .build(), reanalysisSection.getSearchableText())); + } + + Set entities = new HashSet<>(); + Map> imagesPerPage = new HashMap<>(); + sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> { + Section analysedRowSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair + .getSection()); + entities.addAll(analysedRowSection.getEntities()); + EntitySearchUtils.removeEntitiesContainedInLarger(entities); + + for (Image image : analysedRowSection.getImages()) { + imagesPerPage.computeIfAbsent(image.getPage(), (a) -> new HashSet<>()).add(image); + } + + }); + + Map> entitiesPerPage = new HashMap<>(); + for (Entity entity : entities) { + Map> sequenceOnPage = new HashMap<>(); + for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) { + sequenceOnPage.computeIfAbsent(entityPositionSequence.getPageNumber(), (x) -> new ArrayList<>()) + .add(entityPositionSequence); + } + + for (Map.Entry> entry : sequenceOnPage.entrySet()) { + entitiesPerPage.computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>()) + .add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry + .getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber(), entity + .getLegalBasis(), entity.isDictionaryEntry(), entity.getTextBefore(), entity.getTextAfter(), entity + .getStart(), entity.getEnd())); + } + } + + List newRedactionLogEntries = new ArrayList<>(); + for (int page = 1; page <= text.getNumberOfPages(); page++) { + if (entitiesPerPage.get(page) != null) { + newRedactionLogEntries.addAll(redactionLogCreatorService.addEntries(entitiesPerPage, analyzeRequest + .getManualRedactions(), page, analyzeRequest.getRuleSetId())); + } + + if (imagesPerPage.get(page) != null) { + newRedactionLogEntries.addAll(redactionLogCreatorService.addImageEntries(imagesPerPage, analyzeRequest + .getManualRedactions(), page, analyzeRequest.getRuleSetId())); + } + + newRedactionLogEntries.addAll(redactionLogCreatorService.addManualAddEntries(manualAdds, comments, page, analyzeRequest + .getRuleSetId())); + } + + + redactionLog.getRedactionLogEntry().removeIf(entry -> sectionsToReanalyse.contains(entry.getSectionNumber()) && !entry.isImage() || entry.getSectionNumber() == 0 && !entry.isImage()); + redactionLog.getRedactionLogEntry().addAll(newRedactionLogEntries); + redactionLog.setDictionaryVersion(dictionaryIncrement.getDictionaryVersion()); + + var changeLog = redactionChangeLogService.createAndStoreChangeLog(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), redactionLog); + redactionStorageService.storeObject(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog); + return analyzeResponseService.createAnalyzeResponse(text.getNumberOfPages(), redactionLog, changeLog); + } @@ -277,7 +257,7 @@ public class ReanalyzeService { return Image.builder() .type(entry.getType()) - .position(new Rectangle2D.Float(position.getTopLeft().getX(), position.getTopLeft() + .position(new RedRectangle2D(position.getTopLeft().getX(), position.getTopLeft() .getY(), position.getWidth(), position.getHeight())) .sectionNumber(entry.getSectionNumber()) .section(entry.getSection()) diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/IdBuilder.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/IdBuilder.java index 34a712fe..241aa1be 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/IdBuilder.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/IdBuilder.java @@ -3,9 +3,9 @@ package com.iqser.red.service.redaction.v1.server.redaction.utils; import com.google.common.hash.HashFunction; import com.google.common.hash.Hashing; import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; +import com.iqser.red.service.redaction.v1.server.redaction.model.RedRectangle2D; import lombok.experimental.UtilityClass; -import java.awt.geom.Rectangle2D; import java.nio.charset.StandardCharsets; import java.util.List; @@ -25,12 +25,8 @@ public class IdBuilder { } - public String buildId(Rectangle2D rectangle2D, int page) { - - StringBuilder sb = new StringBuilder(); - sb.append("x").append(rectangle2D.getX()).append("y").append(rectangle2D.getY()).append("h").append(rectangle2D.getHeight()).append("w").append(rectangle2D.getWidth()).append("p").append(page); - - return hashFunction.hashString(sb.toString(), StandardCharsets.UTF_8).toString(); + public String buildId(RedRectangle2D rectangle2D, int page) { + return hashFunction.hashString("x" + rectangle2D.getX() + "y" + rectangle2D.getY() + "h" + rectangle2D.getHeight() + "w" + rectangle2D.getWidth() + "p" + page, StandardCharsets.UTF_8).toString(); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java index 12fc519a..80129054 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java @@ -1,21 +1,15 @@ package com.iqser.red.service.redaction.v1.server.segmentation; -import com.iqser.red.service.redaction.v1.model.SectionArea; import com.iqser.red.service.redaction.v1.server.classification.model.Document; import com.iqser.red.service.redaction.v1.server.classification.model.Page; -import com.iqser.red.service.redaction.v1.server.classification.model.SectionText; import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService; import com.iqser.red.service.redaction.v1.server.classification.service.ClassificationService; -import com.iqser.red.service.redaction.v1.server.exception.RedactionException; import com.iqser.red.service.redaction.v1.server.memory.MemoryStats; -import com.iqser.red.service.redaction.v1.server.parsing.PDFAreaTextStripper; import com.iqser.red.service.redaction.v1.server.parsing.PDFLinesTextStripper; import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; -import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue; import com.iqser.red.service.redaction.v1.server.redaction.service.ImageClassificationService; import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings; import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService; import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService; @@ -28,15 +22,12 @@ import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.springframework.stereotype.Service; -import java.awt.geom.Rectangle2D; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; -import java.util.HashMap; import java.util.List; -import java.util.Map; @Slf4j @Service @@ -53,79 +44,6 @@ public class PdfSegmentationService { private final ImageClassificationService imageClassificationService; - public void postProcessSections(PDDocument pdDocument, List texts) { - - try { - for (SectionText sectionText : texts) { - - List textBlocks = new ArrayList<>(); - - Map> sectionAreasPerPage = new HashMap<>(); - for (SectionArea sectionArea : sectionText.getSectionAreas()) { - sectionAreasPerPage.computeIfAbsent(sectionArea.getPage(), (x) -> new ArrayList<>()) - .add(sectionArea); - } - - Map tabularData = new HashMap<>(); - List cellStarts = new ArrayList<>(); - for (Integer page : sectionAreasPerPage.keySet()) { - List areasOnPage = sectionAreasPerPage.get(page); - - PDPage pdPage = pdDocument.getPage(page - 1); - PDRectangle cropBox = pdPage.getCropBox(); - PDFAreaTextStripper textStripper = new PDFAreaTextStripper(); - textStripper.setPageNumber(page); - - int cellStart = 0; - for (SectionArea sectionArea : areasOnPage) { - - Rectangle2D rect = null; - if (pdPage.getRotation() == 90) { - rect = new Rectangle2D.Float(sectionArea.getTopLeft().getY(), sectionArea.getTopLeft() - .getX(), sectionArea.getHeight(), sectionArea.getWidth() + 0.001f); - } else { - rect = new Rectangle2D.Float(sectionArea.getTopLeft().getX(), -sectionArea.getTopLeft() - .getY() + cropBox.getUpperRightY() - sectionArea.getHeight(), sectionArea.getWidth(), sectionArea - .getHeight() + 0.001f); - } - - textStripper.addRegion(String.valueOf(1), rect); - textStripper.extractRegions(pdPage); - textStripper.getTextForRegion(String.valueOf(1)); - List positions = textStripper.getTextPositionSequences(); - - TextBlock textBlock = new TextBlock(sectionArea.getTopLeft().getX(), sectionArea.getTopLeft() - .getX() + sectionArea.getWidth(), sectionArea.getTopLeft() - .getY(), sectionArea.getTopLeft().getY() + sectionArea.getHeight(), positions, 0); - - if (sectionText.isTable()) { - Cell cell = new Cell(); - cell.addTextBlock(textBlock); - tabularData.put(sectionArea.getHeader(), new CellValue(cell.getTextBlocks(), cellStart)); - cellStarts.add(cellStart); - cellStart = cellStart + cell.toString().trim().length() + 1; - } - - textBlocks.add(textBlock); - textStripper.clearPositions(); - } - - } - sectionText.setTextBlocks(textBlocks); - sectionText.setTabularData(tabularData); - if (sectionText.isTable()) { - sectionText.setCellStarts(cellStarts); - } - } - - - } catch (Exception e) { - throw new RedactionException(e); - } - - } - - public Document parseDocument(InputStream documentInputStream) throws IOException { PDDocument pdDocument = null; try { @@ -141,6 +59,7 @@ public class PdfSegmentationService { pdDocument = reinitializePDDocument(tempFile, null); long pageCount = pdDocument.getNumberOfPages(); + long t1= System.currentTimeMillis(); for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) { if (pageNumber % MAX_PAGES_BEFORE_GC == 0) { diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/AbstractTextContainer.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/AbstractTextContainer.java index 2f6183ab..b050e27b 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/AbstractTextContainer.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/AbstractTextContainer.java @@ -1,5 +1,6 @@ package com.iqser.red.service.redaction.v1.server.tableextraction.model; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.iqser.red.service.redaction.v1.model.Rectangle; import lombok.AllArgsConstructor; import lombok.Data; @@ -27,10 +28,12 @@ public abstract class AbstractTextContainer { return page == other.getPage() && this.minX <= other.getTopLeft().getX() && this.maxX >= other.getTopLeft().getX() + other.getWidth() && this.minY <= other.getTopLeft().getY() && this.maxY >= other.getTopLeft().getY() + other.getHeight(); } + @JsonIgnore public float getHeight() { return maxY - minY; } - + + @JsonIgnore public float getWidth() { return maxX - minX; } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/FilySystemBackedStorageService.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/FilySystemBackedStorageService.java index cff5698f..9cc8537f 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/FilySystemBackedStorageService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/FilySystemBackedStorageService.java @@ -44,7 +44,7 @@ public class FilySystemBackedStorageService extends StorageService { public void clearStorage() { this.dataMap.forEach((k, v) -> { - v.delete(); + // v.delete(); }); this.dataMap.clear(); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java index 2a998b14..7200b7a7 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java @@ -458,6 +458,16 @@ public class RedactionIntegrationTest { assertThat(result).isNotNull(); } + @Test + public void testXXX() { + AnalyzeRequest request = prepareStorage("files/Metolachlor/S-Metolachlor_RAR_08_Volume_3CA_B-6_2018-09-06.pdf"); + MemoryStats.printMemoryStats(); + AnalyzeResult result = redactionController.analyze(request); + assertThat(result).isNotNull(); + } + + + @Test public void noExceptionShouldBeThrownForAnyFiles() throws IOException { From ba28a3e0d31d9489e0be8be34ddfc58e8940fc5d Mon Sep 17 00:00:00 2001 From: Timo Date: Tue, 20 Apr 2021 10:26:27 +0300 Subject: [PATCH 03/10] code format, dependecy and test update, logging for reanalysis --- redaction-service-v1/pom.xml | 2 +- .../v1/server/parsing/model/RedTextPosition.java | 2 ++ .../v1/server/redaction/model/CellValue.java | 12 ++++++++---- .../redaction/service/ReanalyzeService.java | 1 + ....java => FileSystemBackedStorageService.java} | 8 ++++---- .../v1/server/RedactionIntegrationTest.java | 16 +++------------- .../service/EntityRedactionServiceTest.java | 4 ++-- 7 files changed, 21 insertions(+), 24 deletions(-) rename redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/{FilySystemBackedStorageService.java => FileSystemBackedStorageService.java} (85%) diff --git a/redaction-service-v1/pom.xml b/redaction-service-v1/pom.xml index 124e5ae4..501ae19c 100644 --- a/redaction-service-v1/pom.xml +++ b/redaction-service-v1/pom.xml @@ -32,7 +32,7 @@ com.iqser.red platform-commons-dependency - 1.2.9 + 1.3.0 import pom diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedTextPosition.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedTextPosition.java index 04e394f8..d8e72d22 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedTextPosition.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedTextPosition.java @@ -2,11 +2,13 @@ package com.iqser.red.service.redaction.v1.server.parsing.model; import com.fasterxml.jackson.annotation.JsonIgnore; import lombok.Data; +import lombok.NoArgsConstructor; import lombok.SneakyThrows; import org.apache.pdfbox.text.TextPosition; import org.springframework.beans.BeanUtils; @Data +@NoArgsConstructor public class RedTextPosition { private String textMatrix; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/CellValue.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/CellValue.java index 6d65518c..e38c8cf2 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/CellValue.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/CellValue.java @@ -3,19 +3,23 @@ package com.iqser.red.service.redaction.v1.server.redaction.model; import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities; -import lombok.Value; +import lombok.AllArgsConstructor; +import lombok.Data; +import lombok.NoArgsConstructor; +import java.util.ArrayList; import java.util.Iterator; import java.util.List; -@Value +@Data +@NoArgsConstructor +@AllArgsConstructor public class CellValue { - private List textBlocks; + private List textBlocks = new ArrayList<>(); private int rowSpanStart; - @Override public String toString() { diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java index 90c964fb..a5bcd4f3 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java @@ -128,6 +128,7 @@ public class ReanalyzeService { } } + log.info("Should reanalyze {} sections for request: {}", sectionsToReanalyse.size(), analyzeRequest); if (sectionsToReanalyse.isEmpty() && (manualAdds == null || manualAdds.isEmpty())) { redactionLog.setDictionaryVersion(dictionaryIncrement.getDictionaryVersion()); diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/FilySystemBackedStorageService.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/FileSystemBackedStorageService.java similarity index 85% rename from redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/FilySystemBackedStorageService.java rename to redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/FileSystemBackedStorageService.java index 9cc8537f..e37034ce 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/FilySystemBackedStorageService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/FileSystemBackedStorageService.java @@ -12,11 +12,11 @@ import java.io.FileOutputStream; import java.util.HashMap; import java.util.Map; -public class FilySystemBackedStorageService extends StorageService { +public class FileSystemBackedStorageService extends StorageService { - private Map dataMap = new HashMap<>(); + private final Map dataMap = new HashMap<>(); - public FilySystemBackedStorageService() { + public FileSystemBackedStorageService() { super(null, null); } @@ -44,7 +44,7 @@ public class FilySystemBackedStorageService extends StorageService { public void clearStorage() { this.dataMap.forEach((k, v) -> { - // v.delete(); + v.delete(); }); this.dataMap.clear(); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java index 7200b7a7..c74b653e 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java @@ -134,7 +134,7 @@ public class RedactionIntegrationTest { @Bean @Primary public StorageService inmemoryStorage() { - return new FilySystemBackedStorageService(); + return new FileSystemBackedStorageService(); } } @@ -142,8 +142,8 @@ public class RedactionIntegrationTest { @After public void cleanupStorage() { - if (this.storageService instanceof FilySystemBackedStorageService) { - ((FilySystemBackedStorageService) this.storageService).clearStorage(); + if (this.storageService instanceof FileSystemBackedStorageService) { + ((FileSystemBackedStorageService) this.storageService).clearStorage(); } } @@ -458,16 +458,6 @@ public class RedactionIntegrationTest { assertThat(result).isNotNull(); } - @Test - public void testXXX() { - AnalyzeRequest request = prepareStorage("files/Metolachlor/S-Metolachlor_RAR_08_Volume_3CA_B-6_2018-09-06.pdf"); - MemoryStats.printMemoryStats(); - AnalyzeResult result = redactionController.analyze(request); - assertThat(result).isNotNull(); - } - - - @Test public void noExceptionShouldBeThrownForAnyFiles() throws IOException { diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java index 8c19e0d6..32fe65ee 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java @@ -2,7 +2,7 @@ package com.iqser.red.service.redaction.v1.server.redaction.service; import com.amazonaws.services.s3.AmazonS3; import com.iqser.red.service.configuration.v1.api.model.*; -import com.iqser.red.service.redaction.v1.server.FilySystemBackedStorageService; +import com.iqser.red.service.redaction.v1.server.FileSystemBackedStorageService; import com.iqser.red.service.redaction.v1.server.classification.model.Document; import com.iqser.red.service.redaction.v1.server.client.DictionaryClient; import com.iqser.red.service.redaction.v1.server.client.RulesClient; @@ -97,7 +97,7 @@ public class EntityRedactionServiceTest { @Bean @Primary public StorageService inmemoryStorage() { - return new FilySystemBackedStorageService(); + return new FileSystemBackedStorageService(); } } From 15f911ee73e653f9c8aab63475fd15c2f483255c Mon Sep 17 00:00:00 2001 From: Timo Date: Tue, 20 Apr 2021 10:28:40 +0300 Subject: [PATCH 04/10] run tests with real life jvm args to detect oom issues early --- bamboo-specs/src/main/java/buildjob/PlanSpec.java | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/bamboo-specs/src/main/java/buildjob/PlanSpec.java b/bamboo-specs/src/main/java/buildjob/PlanSpec.java index d6d0ea67..e45c395b 100644 --- a/bamboo-specs/src/main/java/buildjob/PlanSpec.java +++ b/bamboo-specs/src/main/java/buildjob/PlanSpec.java @@ -1,7 +1,5 @@ package buildjob; -import static com.atlassian.bamboo.specs.builders.task.TestParserTask.createJUnitParserTask; - import com.atlassian.bamboo.specs.api.BambooSpec; import com.atlassian.bamboo.specs.api.builders.BambooKey; import com.atlassian.bamboo.specs.api.builders.docker.DockerConfiguration; @@ -24,6 +22,8 @@ import com.atlassian.bamboo.specs.builders.trigger.BitbucketServerTrigger; import com.atlassian.bamboo.specs.model.task.InjectVariablesScope; import com.atlassian.bamboo.specs.util.BambooServer; +import static com.atlassian.bamboo.specs.builders.task.TestParserTask.createJUnitParserTask; + /** * Plan configuration for Bamboo. * Learn more on: https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs @@ -33,6 +33,8 @@ public class PlanSpec { private static final String SERVICE_NAME = "redaction-service"; + private static final String JVM_ARGS =" -Xmx4g -XX:+ExitOnOutOfMemoryError -XX:SurvivorRatio=2 -XX:NewRatio=1 -XX:InitialTenuringThreshold=16 -XX:MaxTenuringThreshold=16 -XX:InitiatingHeapOccupancyPercent=35 "; + private static final String SERVICE_KEY = SERVICE_NAME.toUpperCase().replaceAll("-", ""); /** @@ -88,7 +90,7 @@ public class PlanSpec { "if [[ \"${bamboo.version_tag}\" != \"dev\" ]]; then ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn --no-transfer-progress -f ${bamboo_build_working_directory}/" + SERVICE_NAME + "-v1/pom.xml versions:set -DnewVersion=${bamboo.version_tag}; fi\n" + "if [[ \"${bamboo.version_tag}\" != \"dev\" ]]; then ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn --no-transfer-progress -f ${bamboo_build_working_directory}/" + SERVICE_NAME + "-image-v1/pom.xml versions:set -DnewVersion=${bamboo.version_tag}; fi\n" + - "if [[ \"${bamboo.version_tag}\" = \"dev\" ]]; then ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn -f ${bamboo_build_working_directory}/" + SERVICE_NAME + "-v1/pom.xml --no-transfer-progress clean install -Djava.security.egd=file:/dev/./urandom; fi\n" + + "if [[ \"${bamboo.version_tag}\" = \"dev\" ]]; then ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn -f ${bamboo_build_working_directory}/" + SERVICE_NAME + "-v1/pom.xml --no-transfer-progress clean install "+JVM_ARGS+" -Djava.security.egd=file:/dev/./urandom; fi\n" + "if [[ \"${bamboo.version_tag}\" != \"dev\" ]]; then ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn -f ${bamboo_build_working_directory}/" + SERVICE_NAME + "-v1/pom.xml --no-transfer-progress clean deploy -e -DdeployAtEnd=true -Dmaven.wagon.http.ssl.insecure=true -Dmaven.wagon.http.ssl.allowall=true -Dmaven.wagon.http.ssl.ignore.validity.dates=true -DaltDeploymentRepository=iqser_release::default::https://nexus.iqser.com/repository/red-platform-releases; fi\n" + "${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn --no-transfer-progress -f ${bamboo_build_working_directory}/" + SERVICE_NAME + "-image-v1/pom.xml package\n" + From 7af51f992abf83570ad053904f78ab1feeff7a72 Mon Sep 17 00:00:00 2001 From: Timo Date: Tue, 20 Apr 2021 10:32:11 +0300 Subject: [PATCH 05/10] set mvn opts --- bamboo-specs/src/main/java/buildjob/PlanSpec.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bamboo-specs/src/main/java/buildjob/PlanSpec.java b/bamboo-specs/src/main/java/buildjob/PlanSpec.java index e45c395b..71bac0d9 100644 --- a/bamboo-specs/src/main/java/buildjob/PlanSpec.java +++ b/bamboo-specs/src/main/java/buildjob/PlanSpec.java @@ -87,10 +87,12 @@ public class PlanSpec { .inlineBody("#!/bin/bash\n" + "set -e\n" + + "export MAVEN_OPTS=\"$MAVEN_OPTS "+JVM_ARGS +"\" -e\n" + + "if [[ \"${bamboo.version_tag}\" != \"dev\" ]]; then ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn --no-transfer-progress -f ${bamboo_build_working_directory}/" + SERVICE_NAME + "-v1/pom.xml versions:set -DnewVersion=${bamboo.version_tag}; fi\n" + "if [[ \"${bamboo.version_tag}\" != \"dev\" ]]; then ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn --no-transfer-progress -f ${bamboo_build_working_directory}/" + SERVICE_NAME + "-image-v1/pom.xml versions:set -DnewVersion=${bamboo.version_tag}; fi\n" + - "if [[ \"${bamboo.version_tag}\" = \"dev\" ]]; then ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn -f ${bamboo_build_working_directory}/" + SERVICE_NAME + "-v1/pom.xml --no-transfer-progress clean install "+JVM_ARGS+" -Djava.security.egd=file:/dev/./urandom; fi\n" + + "if [[ \"${bamboo.version_tag}\" = \"dev\" ]]; then ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn -f ${bamboo_build_working_directory}/" + SERVICE_NAME + "-v1/pom.xml --no-transfer-progress clean install -Djava.security.egd=file:/dev/./urandom; fi\n" + "if [[ \"${bamboo.version_tag}\" != \"dev\" ]]; then ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn -f ${bamboo_build_working_directory}/" + SERVICE_NAME + "-v1/pom.xml --no-transfer-progress clean deploy -e -DdeployAtEnd=true -Dmaven.wagon.http.ssl.insecure=true -Dmaven.wagon.http.ssl.allowall=true -Dmaven.wagon.http.ssl.ignore.validity.dates=true -DaltDeploymentRepository=iqser_release::default::https://nexus.iqser.com/repository/red-platform-releases; fi\n" + "${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn --no-transfer-progress -f ${bamboo_build_working_directory}/" + SERVICE_NAME + "-image-v1/pom.xml package\n" + From e48e4e1797462473a0ef4e75d7e9987d8573249f Mon Sep 17 00:00:00 2001 From: Timo Date: Tue, 20 Apr 2021 10:32:54 +0300 Subject: [PATCH 06/10] updated redrect --- .../redaction/v1/server/redaction/model/RedRectangle2D.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/RedRectangle2D.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/RedRectangle2D.java index d42b76ff..601d328c 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/RedRectangle2D.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/RedRectangle2D.java @@ -18,7 +18,7 @@ public class RedRectangle2D { @JsonIgnore public boolean isEmpty() { - return (width <= 0.0f) || (height <= 0.0f); + return width <= 0.0f || height <= 0.0f; } public boolean contains(double x, double y, double w, double h) { @@ -27,9 +27,9 @@ public class RedRectangle2D { } double x0 = getX(); double y0 = getY(); - return (x >= x0 && + return x >= x0 && y >= y0 && (x + w) <= x0 + getWidth() && - (y + h) <= y0 + getHeight()); + (y + h) <= y0 + getHeight(); } } From f1ce52a19a3b3273a4c5644c0e693ed10a0a8ca2 Mon Sep 17 00:00:00 2001 From: Timo Date: Tue, 20 Apr 2021 10:33:30 +0300 Subject: [PATCH 07/10] updated planspec --- bamboo-specs/src/main/java/buildjob/PlanSpec.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bamboo-specs/src/main/java/buildjob/PlanSpec.java b/bamboo-specs/src/main/java/buildjob/PlanSpec.java index 71bac0d9..75fb16c3 100644 --- a/bamboo-specs/src/main/java/buildjob/PlanSpec.java +++ b/bamboo-specs/src/main/java/buildjob/PlanSpec.java @@ -87,7 +87,7 @@ public class PlanSpec { .inlineBody("#!/bin/bash\n" + "set -e\n" + - "export MAVEN_OPTS=\"$MAVEN_OPTS "+JVM_ARGS +"\" -e\n" + + "export MAVEN_OPTS=\"$MAVEN_OPTS "+JVM_ARGS +"\"\n" + "if [[ \"${bamboo.version_tag}\" != \"dev\" ]]; then ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn --no-transfer-progress -f ${bamboo_build_working_directory}/" + SERVICE_NAME + "-v1/pom.xml versions:set -DnewVersion=${bamboo.version_tag}; fi\n" + "if [[ \"${bamboo.version_tag}\" != \"dev\" ]]; then ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn --no-transfer-progress -f ${bamboo_build_working_directory}/" + SERVICE_NAME + "-image-v1/pom.xml versions:set -DnewVersion=${bamboo.version_tag}; fi\n" + From 385dff63ce816140bad20710e928e39eba622c6b Mon Sep 17 00:00:00 2001 From: Timo Date: Tue, 20 Apr 2021 10:37:53 +0300 Subject: [PATCH 08/10] fixed pmd --- .../redaction/v1/server/segmentation/PdfSegmentationService.java | 1 - 1 file changed, 1 deletion(-) diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java index 80129054..21276881 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java @@ -59,7 +59,6 @@ public class PdfSegmentationService { pdDocument = reinitializePDDocument(tempFile, null); long pageCount = pdDocument.getNumberOfPages(); - long t1= System.currentTimeMillis(); for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) { if (pageNumber % MAX_PAGES_BEFORE_GC == 0) { From 5b24c3a52c934cfe7fd0d2c89c39176ffab3bab8 Mon Sep 17 00:00:00 2001 From: Timo Date: Tue, 20 Apr 2021 10:56:19 +0300 Subject: [PATCH 09/10] fixed some test issues --- .../ObjectMapperConfiguration.java | 43 ------------------- .../controller/RedactionController.java | 4 +- .../segmentation/PdfSegmentationService.java | 23 +++++----- .../storage/RedactionStorageService.java | 2 +- 4 files changed, 16 insertions(+), 56 deletions(-) delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/configuration/ObjectMapperConfiguration.java diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/configuration/ObjectMapperConfiguration.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/configuration/ObjectMapperConfiguration.java deleted file mode 100644 index 44d3c2b7..00000000 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/configuration/ObjectMapperConfiguration.java +++ /dev/null @@ -1,43 +0,0 @@ -package com.iqser.red.service.redaction.v1.server.configuration; - -import com.fasterxml.jackson.core.JsonGenerator; -import com.fasterxml.jackson.core.Version; -import com.fasterxml.jackson.databind.JsonSerializer; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.databind.SerializerProvider; -import com.fasterxml.jackson.databind.module.SimpleModule; -import org.apache.pdfbox.pdmodel.font.PDFont; -import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont; -import org.springframework.context.annotation.Bean; -import org.springframework.context.annotation.Configuration; -import org.springframework.context.annotation.Primary; - -import java.io.IOException; - -@Configuration -public class ObjectMapperConfiguration { - - - @Bean - @Primary - public ObjectMapper objectMapper() { - var objectMapper = new ObjectMapper(); - SimpleModule simpleModule = new SimpleModule("SimpleModule", - new Version(1, 0, 0, null)); -// simpleModule.addSerializer(new ItemSerializer()); - simpleModule.addSerializer(PDFont.class, new PDFontSerializer()); - simpleModule.addSerializer(PDTrueTypeFont.class, new PDFontSerializer()); - objectMapper.registerModule(simpleModule); - - return objectMapper; - } - - - public static class PDFontSerializer extends JsonSerializer { - - @Override - public void serialize(PDFont t, JsonGenerator jsonGenerator, SerializerProvider serializerProvider) throws IOException { - jsonGenerator.writeNull(); - } - } -} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java index 250001a7..63a212b8 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java @@ -59,7 +59,7 @@ public class RedactionController implements RedactionResource { try (PDDocument pdDocument = PDDocument.load(storedObjectStream, MemoryUsageSetting.setupTempFileOnly())) { pdDocument.setAllSecurityToBeRemoved(true); - + dictionaryService.updateDictionary(redactionLog.getRuleSetId()); annotationService.annotate(pdDocument, redactionLog, sectionsGrid); @@ -131,7 +131,7 @@ public class RedactionController implements RedactionResource { try { var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN)); - classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream); + classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream, true); } catch (Exception e) { throw new RedactionException(e); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java index 21276881..be4fa972 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java @@ -45,6 +45,10 @@ public class PdfSegmentationService { public Document parseDocument(InputStream documentInputStream) throws IOException { + return parseDocument(documentInputStream, false); + } + + public Document parseDocument(InputStream documentInputStream, boolean ignoreImages) throws IOException { PDDocument pdDocument = null; try { //create tempFile @@ -84,24 +88,23 @@ public class PdfSegmentationService { Page page = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings .getVertical()); + page.setRotation(rotation); - - tableExtractionService.extractTables(cleanRulings, page); - - buildPageStatistics(page); - page.setLandscape(isLandscape || isRotated); - page.setPageNumber(pageNumber); - increaseDocumentStatistics(page, document); - page.setImages(stripper.getImages()); - imageClassificationService.classifyImages(page); + tableExtractionService.extractTables(cleanRulings, page); + buildPageStatistics(page); + increaseDocumentStatistics(page, document); + + + if (!ignoreImages) { + imageClassificationService.classifyImages(page); + } pages.add(page); - } document.setPages(pages); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/storage/RedactionStorageService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/storage/RedactionStorageService.java index c9792c0f..2c96bd05 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/storage/RedactionStorageService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/storage/RedactionStorageService.java @@ -50,7 +50,7 @@ public class RedactionStorageService { try { return objectMapper.readValue(inputStreamResource.getInputStream(), RedactionLog.class); } catch (IOException e) { - throw new RuntimeException("Could not convert Text", e); + throw new RuntimeException("Could not convert RedactionLog", e); } } From b34fc673c4d5a4440d6e5f2391db4420c4d2acf9 Mon Sep 17 00:00:00 2001 From: Timo Bejan Date: Tue, 20 Apr 2021 09:57:25 +0200 Subject: [PATCH 10/10] bamboo-specs/src/main/java/buildjob/PlanSpec.java edited online with Bitbucket --- bamboo-specs/src/main/java/buildjob/PlanSpec.java | 1 + 1 file changed, 1 insertion(+) diff --git a/bamboo-specs/src/main/java/buildjob/PlanSpec.java b/bamboo-specs/src/main/java/buildjob/PlanSpec.java index 75fb16c3..e2c7fe08 100644 --- a/bamboo-specs/src/main/java/buildjob/PlanSpec.java +++ b/bamboo-specs/src/main/java/buildjob/PlanSpec.java @@ -84,6 +84,7 @@ public class PlanSpec { .checkoutItems(new CheckoutItem().defaultRepository()), new ScriptTask() .description("Build") + .environmentVariables("MAVEN_OPTS="+JVM_ARGS) .inlineBody("#!/bin/bash\n" + "set -e\n" +