diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Footer.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Footer.java index 61d12a43..b88a16b7 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Footer.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Footer.java @@ -1,5 +1,6 @@ package com.iqser.red.service.redaction.v1.server.classification.model; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText; import lombok.AllArgsConstructor; import lombok.Data; @@ -12,7 +13,7 @@ public class Footer { private List textBlocks; - + @JsonIgnore public SearchableText getSearchableText() { SearchableText searchableText = new SearchableText(); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Header.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Header.java index f3067452..133e0245 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Header.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Header.java @@ -1,5 +1,6 @@ package com.iqser.red.service.redaction.v1.server.classification.model; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText; import lombok.AllArgsConstructor; import lombok.Data; @@ -12,7 +13,7 @@ public class Header { private List textBlocks; - + @JsonIgnore public SearchableText getSearchableText() { SearchableText searchableText = new SearchableText(); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java index f62091a7..77649132 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java @@ -1,5 +1,6 @@ package com.iqser.red.service.redaction.v1.server.classification.model; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.iqser.red.service.redaction.v1.model.SectionArea; import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue; import com.iqser.red.service.redaction.v1.server.redaction.model.Image; @@ -36,7 +37,7 @@ public class SectionText { this.tabularData = tabularData; } - + @JsonIgnore public SearchableText getSearchableText() { SearchableText searchableText = new SearchableText(); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/TextBlock.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/TextBlock.java index d40f8671..63cfc11c 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/TextBlock.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/TextBlock.java @@ -1,5 +1,6 @@ package com.iqser.red.service.redaction.v1.server.classification.model; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities; import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; @@ -118,6 +119,7 @@ public class TextBlock extends AbstractTextContainer { } @Override + @JsonIgnore public String getText() { StringBuilder sb = new StringBuilder(); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/UnclassifiedText.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/UnclassifiedText.java index 79277b9e..0d51a4f8 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/UnclassifiedText.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/UnclassifiedText.java @@ -1,5 +1,6 @@ package com.iqser.red.service.redaction.v1.server.classification.model; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText; import lombok.AllArgsConstructor; import lombok.Data; @@ -12,7 +13,7 @@ public class UnclassifiedText { private List textBlocks; - + @JsonIgnore public SearchableText getSearchableText() { SearchableText searchableText = new SearchableText(); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedMatrix.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedMatrix.java deleted file mode 100644 index f58c7475..00000000 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedMatrix.java +++ /dev/null @@ -1,10 +0,0 @@ -package com.iqser.red.service.redaction.v1.server.parsing.model; - -import lombok.Data; - - -@Data -public class RedMatrix { - - private float[] single; -} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedTextPosition.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedTextPosition.java index effafad8..04e394f8 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedTextPosition.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedTextPosition.java @@ -1,35 +1,35 @@ package com.iqser.red.service.redaction.v1.server.parsing.model; +import com.fasterxml.jackson.annotation.JsonIgnore; import lombok.Data; import lombok.SneakyThrows; import org.apache.pdfbox.text.TextPosition; -import org.apache.pdfbox.util.Matrix; import org.springframework.beans.BeanUtils; @Data public class RedTextPosition { - private Matrix textMatrix; - private float endX; - private float endY; - private float maxHeight; + private String textMatrix; private int rotation; - private float x; private float y; private float pageHeight; private float pageWidth; - private float widthOfSpace; - private int[] charCodes; - private float fontSize; - private float fontSizePt; - private float[] widths; private String unicode; - private float direction = -1.0F; private float XDirAdj; private float YDirAdj; private float width; private float heightDir; + + // not used in reanalysis + @JsonIgnore + private float widthOfSpace; + + // not used in reanalysis + @JsonIgnore private float fontSizeInPt; + + // not used in reanalysis + @JsonIgnore private String fontName; @@ -39,10 +39,12 @@ public class RedTextPosition { BeanUtils.copyProperties(textPosition, pos); pos.setFontName(textPosition.getFont().getName()); - pos.setCharCodes(textPosition.getCharacterCodes()); - pos.setWidths(textPosition.getIndividualWidths()); - pos.setFontSizePt(textPosition.getFontSizeInPt()); + pos.setFontSizeInPt(textPosition.getFontSizeInPt()); + + pos.setTextMatrix(textPosition.getTextMatrix().toString()); return pos; } + + } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java index 786e1483..10b5abb1 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java @@ -1,5 +1,7 @@ package com.iqser.red.service.redaction.v1.server.parsing.model; +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; import com.iqser.red.service.redaction.v1.model.Point; import com.iqser.red.service.redaction.v1.model.Rectangle; import lombok.Data; @@ -12,11 +14,14 @@ import java.util.stream.Collectors; @Data @NoArgsConstructor +@JsonIgnoreProperties({ "empty" }) public class TextPositionSequence implements CharSequence { private int page; private List textPositions = new ArrayList<>(); + private float x1; + private float x2; public TextPositionSequence(int page) { this.page = page; @@ -38,9 +43,8 @@ public class TextPositionSequence implements CharSequence { this.page = page; } - public void setTextPositions(List textPositions) { - this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList()); - } + + @Override @@ -103,6 +107,7 @@ public class TextPositionSequence implements CharSequence { } + @JsonIgnore public float getX1() { if (textPositions.get(0).getRotation() == 90) { @@ -113,6 +118,7 @@ public class TextPositionSequence implements CharSequence { } + @JsonIgnore public float getX2() { if (textPositions.get(0).getRotation() == 90) { @@ -123,13 +129,14 @@ public class TextPositionSequence implements CharSequence { } } - + @JsonIgnore public float getRotationAdjustedY() { return textPositions.get(0).getY(); } + @JsonIgnore public float getY1() { if (textPositions.get(0).getRotation() == 90) { @@ -140,6 +147,7 @@ public class TextPositionSequence implements CharSequence { } + @JsonIgnore public float getY2() { if (textPositions.get(0).getRotation() == 90) { @@ -150,26 +158,29 @@ public class TextPositionSequence implements CharSequence { } + @JsonIgnore public float getTextHeight() { return textPositions.get(0).getHeightDir() + 2; } + @JsonIgnore public float getHeight() { return getY2() - getY1(); } + @JsonIgnore public float getWidth() { return getX2() - getX1(); } + @JsonIgnore public String getFont() { - return textPositions.get(0).getFontName() .toLowerCase() .replaceAll(",bold", "") @@ -177,6 +188,7 @@ public class TextPositionSequence implements CharSequence { } + @JsonIgnore public String getFontStyle() { String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase(); @@ -193,25 +205,25 @@ public class TextPositionSequence implements CharSequence { } - + @JsonIgnore public float getFontSize() { return textPositions.get(0).getFontSizeInPt(); } - + @JsonIgnore public float getSpaceWidth() { return textPositions.get(0).getWidthOfSpace(); } - + @JsonIgnore public int getRotation() { return textPositions.get(0).getRotation(); } - + @JsonIgnore public Rectangle getRectangle() { float height = getTextHeight(); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Image.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Image.java index e4e6167a..766d607d 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Image.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Image.java @@ -5,8 +5,6 @@ import lombok.Builder; import lombok.Data; import lombok.NoArgsConstructor; -import java.awt.geom.Rectangle2D; - @Data @Builder @NoArgsConstructor @@ -14,7 +12,7 @@ import java.awt.geom.Rectangle2D; public class Image { private String type; - private Rectangle2D position; + private RedRectangle2D position; private boolean redaction; private String redactionReason; private String legalBasis; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/PdfImage.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/PdfImage.java index f7f6ad4f..1631717f 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/PdfImage.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/PdfImage.java @@ -1,5 +1,6 @@ package com.iqser.red.service.redaction.v1.server.redaction.model; +import com.fasterxml.jackson.annotation.JsonIgnore; import lombok.Data; import lombok.NonNull; import lombok.RequiredArgsConstructor; @@ -11,9 +12,10 @@ import java.awt.image.BufferedImage; @RequiredArgsConstructor public class PdfImage { + @JsonIgnore private BufferedImage image; @NonNull - private Rectangle2D position; + private RedRectangle2D position; private ImageType imageType; private boolean isAppendedToParagraph; @@ -22,7 +24,7 @@ public class PdfImage { public PdfImage(BufferedImage image, Rectangle2D position, int page) { this.image = image; - this.position = position; + this.position = new RedRectangle2D(position.getX(), position.getY(), position.getWidth(), position.getHeight()); this.page = page; } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/RedRectangle2D.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/RedRectangle2D.java new file mode 100644 index 00000000..d42b76ff --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/RedRectangle2D.java @@ -0,0 +1,35 @@ +package com.iqser.red.service.redaction.v1.server.redaction.model; + + +import com.fasterxml.jackson.annotation.JsonIgnore; +import lombok.AllArgsConstructor; +import lombok.Data; +import lombok.NoArgsConstructor; + +@Data +@NoArgsConstructor +@AllArgsConstructor +public class RedRectangle2D { + + private double x; + private double y; + private double width; + private double height; + + @JsonIgnore + public boolean isEmpty() { + return (width <= 0.0f) || (height <= 0.0f); + } + + public boolean contains(double x, double y, double w, double h) { + if (isEmpty() || w <= 0 || h <= 0) { + return false; + } + double x0 = getX(); + double y0 = getY(); + return (x >= x0 && + y >= y0 && + (x + w) <= x0 + getWidth() && + (y + h) <= y0 + getHeight()); + } +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java index 89ccf4a4..5ee4cb3f 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java @@ -187,6 +187,7 @@ public class EntityRedactionService { .get(0) .getPage()); sectionText.getSectionAreas().add(sectionArea); + sectionText.getTextBlocks().addAll(cell.getTextBlocks()); addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber.intValue()); int cellStart = start; @@ -235,6 +236,8 @@ public class EntityRedactionService { sectionText.setHeadline(table.getHeadline()); sectionText.setSectionNumber(sectionNumber.intValue()); sectionText.setTable(true); + sectionText.setTabularData(tabularData); + sectionText.setCellStarts(cellStarts); classifiedDoc.getSectionText().add(sectionText); } @@ -267,6 +270,7 @@ public class EntityRedactionService { .getSequences() .get(0) .getPage()); + sectionText.getTextBlocks().addAll(cell.getTextBlocks()); sectionText.getSectionAreas().add(sectionArea); } @@ -325,6 +329,10 @@ public class EntityRedactionService { sectionText.setHeadline(headline); sectionText.setSectionNumber(sectionNumber.intValue()); sectionText.setTable(false); + sectionText.setImages(images.stream() + .map(image -> convert(image, sectionNumber.intValue(), headline)) + .collect(Collectors.toSet())); + sectionText.setTextBlocks(paragraphTextBlocks); classifiedDoc.getSectionText().add(sectionText); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java index d610e94e..90c964fb 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java @@ -12,15 +12,12 @@ import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUti import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService; import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService; import lombok.RequiredArgsConstructor; +import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; -import org.apache.pdfbox.io.MemoryUsageSetting; -import org.apache.pdfbox.pdmodel.PDDocument; import org.kie.api.runtime.KieContainer; import org.springframework.stereotype.Service; import org.springframework.web.bind.annotation.RequestBody; -import java.awt.geom.Rectangle2D; -import java.io.IOException; import java.util.*; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -42,7 +39,6 @@ public class ReanalyzeService { public AnalyzeResult analyze(AnalyzeRequest analyzeRequest) { - var pageCount = 0; Document classifiedDoc; @@ -56,18 +52,6 @@ public class ReanalyzeService { log.info("Document structure analysis successful, starting redaction analysis..."); entityRedactionService.processDocument(classifiedDoc, analyzeRequest.getRuleSetId(), analyzeRequest.getManualRedactions()); - - var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.ORIGIN)); - - // TODO move this to where it makes sense - or remove completly - try (PDDocument pdDocument = PDDocument.load(storedObjectStream, MemoryUsageSetting.setupTempFileOnly())) { - pdDocument.setAllSecurityToBeRemoved(true); - pdfSegmentationService.postProcessSections(pdDocument, classifiedDoc.getSectionText()); - } catch (IOException e) { - e.printStackTrace(); - } - - redactionLogCreatorService.createRedactionLog(classifiedDoc, pageCount, analyzeRequest.getManualRedactions(), analyzeRequest .getRuleSetId()); @@ -89,30 +73,28 @@ public class ReanalyzeService { return analyzeResponseService.createAnalyzeResponse(pageCount, redactionLog, changeLog); } - public AnalyzeResult reanalyze(@RequestBody AnalyzeRequest renalyzeRequest) { - var text = redactionStorageService.getText(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId()); - // new procedure was not applied, we need a complete analysis + + @SneakyThrows + public AnalyzeResult reanalyze(@RequestBody AnalyzeRequest analyzeRequest) { + + var redactionLog = redactionStorageService.getRedactionLog(analyzeRequest.getProjectId(), analyzeRequest.getFileId()); + var text = redactionStorageService.getText(analyzeRequest.getProjectId(), analyzeRequest.getFileId()); + + // not yet ready for reanalysis if (text.getNumberOfPages() == 0) { - return analyze(AnalyzeRequest.builder() - .ruleSetId(renalyzeRequest.getRuleSetId()) - .manualRedactions(renalyzeRequest.getManualRedactions()) - .projectId(renalyzeRequest.getProjectId()) - .fileId(renalyzeRequest.getFileId()) - .build()); + return analyze(analyzeRequest); } - var redactionLog = redactionStorageService.getRedactionLog(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId()); + DictionaryIncrement dictionaryIncrement = dictionaryService.getDictionaryIncrements(analyzeRequest.getRuleSetId(), redactionLog.getDictionaryVersion()); - DictionaryIncrement dictionaryIncrement = dictionaryService.getDictionaryIncrements(renalyzeRequest.getRuleSetId(), redactionLog.getDictionaryVersion()); - - Set manualForceAndRemoveIds = getForceAndRemoveIds(renalyzeRequest.getManualRedactions()); + Set manualForceAndRemoveIds = getForceAndRemoveIds(analyzeRequest.getManualRedactions()); Map> comments = null; Set manualAdds = null; - if (renalyzeRequest.getManualRedactions() != null) { + if (analyzeRequest.getManualRedactions() != null) { // TODO comments will be removed from redactionLog, so we ignore this first. - comments = renalyzeRequest.getManualRedactions().getComments(); - manualAdds = renalyzeRequest.getManualRedactions().getEntriesToAdd(); + comments = analyzeRequest.getManualRedactions().getComments(); + manualAdds = analyzeRequest.getManualRedactions().getEntriesToAdd(); } Set sectionsToReanalyse = new HashSet<>(); @@ -146,115 +128,113 @@ public class ReanalyzeService { } } + if (sectionsToReanalyse.isEmpty() && (manualAdds == null || manualAdds.isEmpty())) { redactionLog.setDictionaryVersion(dictionaryIncrement.getDictionaryVersion()); - var changeLog = redactionChangeLogService.createAndStoreChangeLog(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), redactionLog); - redactionStorageService.storeObject(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog); + var changeLog = redactionChangeLogService.createAndStoreChangeLog(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), redactionLog); + redactionStorageService.storeObject(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog); return analyzeResponseService.createAnalyzeResponse(text.getNumberOfPages(), redactionLog, changeLog); } - try { + List reanalysisSections = new ArrayList<>(); - List reanalysisSections = new ArrayList<>(); - for (SectionText sectionText : text.getSectionTexts()) { - - if (sectionsToReanalyse.contains(sectionText.getSectionNumber())) { - reanalysisSections.add(sectionText); - } + for (SectionText sectionText : text.getSectionTexts()) { + if (sectionsToReanalyse.contains(sectionText.getSectionNumber())) { + reanalysisSections.add(sectionText); } - - KieContainer kieContainer = droolsExecutionService.updateRules(renalyzeRequest.getRuleSetId()); - - Dictionary dictionary = dictionaryService.getDeepCopyDictionary(renalyzeRequest.getRuleSetId()); - - List sectionSearchableTextPairs = new ArrayList<>(); - for (SectionText reanalysisSection : reanalysisSections) { - - Set entities = entityRedactionService.findEntities(reanalysisSection.getSearchableText(), reanalysisSection - .getHeadline(), reanalysisSection.getSectionNumber(), dictionary, false); - if (reanalysisSection.getCellStarts() != null) { - surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary, reanalysisSection - .getCellStarts()); - } else { - surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary); - } - - sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder() - .isLocal(false) - .dictionaryTypes(dictionary.getTypes()) - .entities(entities) - .text(reanalysisSection.getSearchableText().getAsStringWithLinebreaks()) - .searchText(reanalysisSection.getSearchableText().toString()) - .headline(reanalysisSection.getHeadline()) - .sectionNumber(reanalysisSection.getSectionNumber()) - .tabularData(reanalysisSection.getTabularData()) - .searchableText(reanalysisSection.getSearchableText()) - .dictionary(dictionary) - .images(reanalysisSection.getImages()) - .build(), reanalysisSection.getSearchableText())); - } - - Set entities = new HashSet<>(); - Map> imagesPerPage = new HashMap<>(); - sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> { - Section analysedRowSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair - .getSection()); - entities.addAll(analysedRowSection.getEntities()); - EntitySearchUtils.removeEntitiesContainedInLarger(entities); - - for (Image image : analysedRowSection.getImages()) { - imagesPerPage.computeIfAbsent(image.getPage(), (a) -> new HashSet<>()).add(image); - } - - }); - - Map> entitiesPerPage = new HashMap<>(); - for (Entity entity : entities) { - Map> sequenceOnPage = new HashMap<>(); - for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) { - sequenceOnPage.computeIfAbsent(entityPositionSequence.getPageNumber(), (x) -> new ArrayList<>()) - .add(entityPositionSequence); - } - - for (Map.Entry> entry : sequenceOnPage.entrySet()) { - entitiesPerPage.computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>()) - .add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry - .getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber(), entity - .getLegalBasis(), entity.isDictionaryEntry(), entity.getTextBefore(), entity.getTextAfter(), entity - .getStart(), entity.getEnd())); - } - } - - List newRedactionLogEntries = new ArrayList<>(); - for (int page = 1; page <= text.getNumberOfPages(); page++) { - if (entitiesPerPage.get(page) != null) { - newRedactionLogEntries.addAll(redactionLogCreatorService.addEntries(entitiesPerPage, renalyzeRequest - .getManualRedactions(), page, renalyzeRequest.getRuleSetId())); - } - - if (imagesPerPage.get(page) != null) { - newRedactionLogEntries.addAll(redactionLogCreatorService.addImageEntries(imagesPerPage, renalyzeRequest - .getManualRedactions(), page, renalyzeRequest.getRuleSetId())); - } - - newRedactionLogEntries.addAll(redactionLogCreatorService.addManualAddEntries(manualAdds, comments, page, renalyzeRequest - .getRuleSetId())); - } - - redactionLog.getRedactionLogEntry().removeIf(entry -> sectionsToReanalyse.contains(entry.getSectionNumber()) && !entry.isImage() || entry.getSectionNumber() == 0 && !entry.isImage()); - redactionLog.getRedactionLogEntry().addAll(newRedactionLogEntries); - redactionLog.setDictionaryVersion(dictionaryIncrement.getDictionaryVersion()); - - var changeLog = redactionChangeLogService.createAndStoreChangeLog(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), redactionLog); - redactionStorageService.storeObject(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog); - return analyzeResponseService.createAnalyzeResponse(text.getNumberOfPages(), redactionLog, changeLog); - - - } catch (Exception e) { - throw new RedactionException(e); } + + //-- + + KieContainer kieContainer = droolsExecutionService.updateRules(analyzeRequest.getRuleSetId()); + + Dictionary dictionary = dictionaryService.getDeepCopyDictionary(analyzeRequest.getRuleSetId()); + + List sectionSearchableTextPairs = new ArrayList<>(); + for (SectionText reanalysisSection : reanalysisSections) { + + Set entities = entityRedactionService.findEntities(reanalysisSection.getSearchableText(), reanalysisSection + .getHeadline(), reanalysisSection.getSectionNumber(), dictionary, false); + if (reanalysisSection.getCellStarts() != null) { + surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary, reanalysisSection + .getCellStarts()); + } else { + surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary); + } + + sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder() + .isLocal(false) + .dictionaryTypes(dictionary.getTypes()) + .entities(entities) + .text(reanalysisSection.getSearchableText().getAsStringWithLinebreaks()) + .searchText(reanalysisSection.getSearchableText().toString()) + .headline(reanalysisSection.getHeadline()) + .sectionNumber(reanalysisSection.getSectionNumber()) + .tabularData(reanalysisSection.getTabularData()) + .searchableText(reanalysisSection.getSearchableText()) + .dictionary(dictionary) + .images(reanalysisSection.getImages()) + .build(), reanalysisSection.getSearchableText())); + } + + Set entities = new HashSet<>(); + Map> imagesPerPage = new HashMap<>(); + sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> { + Section analysedRowSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair + .getSection()); + entities.addAll(analysedRowSection.getEntities()); + EntitySearchUtils.removeEntitiesContainedInLarger(entities); + + for (Image image : analysedRowSection.getImages()) { + imagesPerPage.computeIfAbsent(image.getPage(), (a) -> new HashSet<>()).add(image); + } + + }); + + Map> entitiesPerPage = new HashMap<>(); + for (Entity entity : entities) { + Map> sequenceOnPage = new HashMap<>(); + for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) { + sequenceOnPage.computeIfAbsent(entityPositionSequence.getPageNumber(), (x) -> new ArrayList<>()) + .add(entityPositionSequence); + } + + for (Map.Entry> entry : sequenceOnPage.entrySet()) { + entitiesPerPage.computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>()) + .add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry + .getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber(), entity + .getLegalBasis(), entity.isDictionaryEntry(), entity.getTextBefore(), entity.getTextAfter(), entity + .getStart(), entity.getEnd())); + } + } + + List newRedactionLogEntries = new ArrayList<>(); + for (int page = 1; page <= text.getNumberOfPages(); page++) { + if (entitiesPerPage.get(page) != null) { + newRedactionLogEntries.addAll(redactionLogCreatorService.addEntries(entitiesPerPage, analyzeRequest + .getManualRedactions(), page, analyzeRequest.getRuleSetId())); + } + + if (imagesPerPage.get(page) != null) { + newRedactionLogEntries.addAll(redactionLogCreatorService.addImageEntries(imagesPerPage, analyzeRequest + .getManualRedactions(), page, analyzeRequest.getRuleSetId())); + } + + newRedactionLogEntries.addAll(redactionLogCreatorService.addManualAddEntries(manualAdds, comments, page, analyzeRequest + .getRuleSetId())); + } + + + redactionLog.getRedactionLogEntry().removeIf(entry -> sectionsToReanalyse.contains(entry.getSectionNumber()) && !entry.isImage() || entry.getSectionNumber() == 0 && !entry.isImage()); + redactionLog.getRedactionLogEntry().addAll(newRedactionLogEntries); + redactionLog.setDictionaryVersion(dictionaryIncrement.getDictionaryVersion()); + + var changeLog = redactionChangeLogService.createAndStoreChangeLog(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), redactionLog); + redactionStorageService.storeObject(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog); + return analyzeResponseService.createAnalyzeResponse(text.getNumberOfPages(), redactionLog, changeLog); + } @@ -277,7 +257,7 @@ public class ReanalyzeService { return Image.builder() .type(entry.getType()) - .position(new Rectangle2D.Float(position.getTopLeft().getX(), position.getTopLeft() + .position(new RedRectangle2D(position.getTopLeft().getX(), position.getTopLeft() .getY(), position.getWidth(), position.getHeight())) .sectionNumber(entry.getSectionNumber()) .section(entry.getSection()) diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/IdBuilder.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/IdBuilder.java index 34a712fe..241aa1be 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/IdBuilder.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/IdBuilder.java @@ -3,9 +3,9 @@ package com.iqser.red.service.redaction.v1.server.redaction.utils; import com.google.common.hash.HashFunction; import com.google.common.hash.Hashing; import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; +import com.iqser.red.service.redaction.v1.server.redaction.model.RedRectangle2D; import lombok.experimental.UtilityClass; -import java.awt.geom.Rectangle2D; import java.nio.charset.StandardCharsets; import java.util.List; @@ -25,12 +25,8 @@ public class IdBuilder { } - public String buildId(Rectangle2D rectangle2D, int page) { - - StringBuilder sb = new StringBuilder(); - sb.append("x").append(rectangle2D.getX()).append("y").append(rectangle2D.getY()).append("h").append(rectangle2D.getHeight()).append("w").append(rectangle2D.getWidth()).append("p").append(page); - - return hashFunction.hashString(sb.toString(), StandardCharsets.UTF_8).toString(); + public String buildId(RedRectangle2D rectangle2D, int page) { + return hashFunction.hashString("x" + rectangle2D.getX() + "y" + rectangle2D.getY() + "h" + rectangle2D.getHeight() + "w" + rectangle2D.getWidth() + "p" + page, StandardCharsets.UTF_8).toString(); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java index 12fc519a..80129054 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java @@ -1,21 +1,15 @@ package com.iqser.red.service.redaction.v1.server.segmentation; -import com.iqser.red.service.redaction.v1.model.SectionArea; import com.iqser.red.service.redaction.v1.server.classification.model.Document; import com.iqser.red.service.redaction.v1.server.classification.model.Page; -import com.iqser.red.service.redaction.v1.server.classification.model.SectionText; import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService; import com.iqser.red.service.redaction.v1.server.classification.service.ClassificationService; -import com.iqser.red.service.redaction.v1.server.exception.RedactionException; import com.iqser.red.service.redaction.v1.server.memory.MemoryStats; -import com.iqser.red.service.redaction.v1.server.parsing.PDFAreaTextStripper; import com.iqser.red.service.redaction.v1.server.parsing.PDFLinesTextStripper; import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; -import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue; import com.iqser.red.service.redaction.v1.server.redaction.service.ImageClassificationService; import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings; import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService; import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService; @@ -28,15 +22,12 @@ import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.springframework.stereotype.Service; -import java.awt.geom.Rectangle2D; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; -import java.util.HashMap; import java.util.List; -import java.util.Map; @Slf4j @Service @@ -53,79 +44,6 @@ public class PdfSegmentationService { private final ImageClassificationService imageClassificationService; - public void postProcessSections(PDDocument pdDocument, List texts) { - - try { - for (SectionText sectionText : texts) { - - List textBlocks = new ArrayList<>(); - - Map> sectionAreasPerPage = new HashMap<>(); - for (SectionArea sectionArea : sectionText.getSectionAreas()) { - sectionAreasPerPage.computeIfAbsent(sectionArea.getPage(), (x) -> new ArrayList<>()) - .add(sectionArea); - } - - Map tabularData = new HashMap<>(); - List cellStarts = new ArrayList<>(); - for (Integer page : sectionAreasPerPage.keySet()) { - List areasOnPage = sectionAreasPerPage.get(page); - - PDPage pdPage = pdDocument.getPage(page - 1); - PDRectangle cropBox = pdPage.getCropBox(); - PDFAreaTextStripper textStripper = new PDFAreaTextStripper(); - textStripper.setPageNumber(page); - - int cellStart = 0; - for (SectionArea sectionArea : areasOnPage) { - - Rectangle2D rect = null; - if (pdPage.getRotation() == 90) { - rect = new Rectangle2D.Float(sectionArea.getTopLeft().getY(), sectionArea.getTopLeft() - .getX(), sectionArea.getHeight(), sectionArea.getWidth() + 0.001f); - } else { - rect = new Rectangle2D.Float(sectionArea.getTopLeft().getX(), -sectionArea.getTopLeft() - .getY() + cropBox.getUpperRightY() - sectionArea.getHeight(), sectionArea.getWidth(), sectionArea - .getHeight() + 0.001f); - } - - textStripper.addRegion(String.valueOf(1), rect); - textStripper.extractRegions(pdPage); - textStripper.getTextForRegion(String.valueOf(1)); - List positions = textStripper.getTextPositionSequences(); - - TextBlock textBlock = new TextBlock(sectionArea.getTopLeft().getX(), sectionArea.getTopLeft() - .getX() + sectionArea.getWidth(), sectionArea.getTopLeft() - .getY(), sectionArea.getTopLeft().getY() + sectionArea.getHeight(), positions, 0); - - if (sectionText.isTable()) { - Cell cell = new Cell(); - cell.addTextBlock(textBlock); - tabularData.put(sectionArea.getHeader(), new CellValue(cell.getTextBlocks(), cellStart)); - cellStarts.add(cellStart); - cellStart = cellStart + cell.toString().trim().length() + 1; - } - - textBlocks.add(textBlock); - textStripper.clearPositions(); - } - - } - sectionText.setTextBlocks(textBlocks); - sectionText.setTabularData(tabularData); - if (sectionText.isTable()) { - sectionText.setCellStarts(cellStarts); - } - } - - - } catch (Exception e) { - throw new RedactionException(e); - } - - } - - public Document parseDocument(InputStream documentInputStream) throws IOException { PDDocument pdDocument = null; try { @@ -141,6 +59,7 @@ public class PdfSegmentationService { pdDocument = reinitializePDDocument(tempFile, null); long pageCount = pdDocument.getNumberOfPages(); + long t1= System.currentTimeMillis(); for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) { if (pageNumber % MAX_PAGES_BEFORE_GC == 0) { diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/AbstractTextContainer.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/AbstractTextContainer.java index 2f6183ab..b050e27b 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/AbstractTextContainer.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/AbstractTextContainer.java @@ -1,5 +1,6 @@ package com.iqser.red.service.redaction.v1.server.tableextraction.model; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.iqser.red.service.redaction.v1.model.Rectangle; import lombok.AllArgsConstructor; import lombok.Data; @@ -27,10 +28,12 @@ public abstract class AbstractTextContainer { return page == other.getPage() && this.minX <= other.getTopLeft().getX() && this.maxX >= other.getTopLeft().getX() + other.getWidth() && this.minY <= other.getTopLeft().getY() && this.maxY >= other.getTopLeft().getY() + other.getHeight(); } + @JsonIgnore public float getHeight() { return maxY - minY; } - + + @JsonIgnore public float getWidth() { return maxX - minX; } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/FilySystemBackedStorageService.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/FilySystemBackedStorageService.java index cff5698f..9cc8537f 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/FilySystemBackedStorageService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/FilySystemBackedStorageService.java @@ -44,7 +44,7 @@ public class FilySystemBackedStorageService extends StorageService { public void clearStorage() { this.dataMap.forEach((k, v) -> { - v.delete(); + // v.delete(); }); this.dataMap.clear(); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java index 2a998b14..7200b7a7 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java @@ -458,6 +458,16 @@ public class RedactionIntegrationTest { assertThat(result).isNotNull(); } + @Test + public void testXXX() { + AnalyzeRequest request = prepareStorage("files/Metolachlor/S-Metolachlor_RAR_08_Volume_3CA_B-6_2018-09-06.pdf"); + MemoryStats.printMemoryStats(); + AnalyzeResult result = redactionController.analyze(request); + assertThat(result).isNotNull(); + } + + + @Test public void noExceptionShouldBeThrownForAnyFiles() throws IOException {