From 2b93ae57d5930ed4f4a9f4fa31438fa5ee3001de Mon Sep 17 00:00:00 2001 From: deiflaender Date: Thu, 13 Aug 2020 15:34:03 +0200 Subject: [PATCH] RED-88: Enabled manual redaction --- .../v1/model/ManualRedactionEntry.java | 23 ++ .../redaction/v1/model/ManualRedactions.java | 17 + .../redaction/v1/model/RedactionLogEntry.java | 5 + .../redaction/v1/model/RedactionRequest.java | 1 + .../redaction-service-server-v1/pom.xml | 5 + .../classification/model/Paragraph.java | 12 + .../controller/RedactionController.java | 4 +- .../parsing/model/TextPositionSequence.java | 85 ++++- .../model/EntityPositionSequence.java | 3 +- .../redaction/model/SearchableText.java | 4 +- .../service/EntityRedactionService.java | 32 +- .../v1/server/redaction/utils/IdBuilder.java | 31 ++ .../model/AbstractTextContainer.java | 6 + .../service/AnnotationHighlightService.java | 318 +++++++++++------- .../v1/server/RedactionIntegrationTest.java | 39 +++ .../service/EntityRedactionServiceTest.java | 2 +- 16 files changed, 441 insertions(+), 146 deletions(-) create mode 100644 redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/ManualRedactionEntry.java create mode 100644 redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/ManualRedactions.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/IdBuilder.java diff --git a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/ManualRedactionEntry.java b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/ManualRedactionEntry.java new file mode 100644 index 00000000..06f5c7d8 --- /dev/null +++ b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/ManualRedactionEntry.java @@ -0,0 +1,23 @@ +package com.iqser.red.service.redaction.v1.model; + +import java.util.ArrayList; +import java.util.List; + +import lombok.AllArgsConstructor; +import lombok.Data; +import lombok.NoArgsConstructor; + +@Data +@AllArgsConstructor +@NoArgsConstructor +public class ManualRedactionEntry { + + private String type; + private String value; + private String reason; + private List positions = new ArrayList<>(); + + private String section; + private int sectionNumber; + +} diff --git a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/ManualRedactions.java b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/ManualRedactions.java new file mode 100644 index 00000000..8e65ad9b --- /dev/null +++ b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/ManualRedactions.java @@ -0,0 +1,17 @@ +package com.iqser.red.service.redaction.v1.model; + +import java.util.HashSet; +import java.util.Set; + +import lombok.AllArgsConstructor; +import lombok.Data; +import lombok.NoArgsConstructor; + +@Data +@AllArgsConstructor +@NoArgsConstructor +public class ManualRedactions { + + Set idsToRemove = new HashSet<>(); + Set entriesToAdd = new HashSet<>(); +} diff --git a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/RedactionLogEntry.java b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/RedactionLogEntry.java index 90c6c263..6eb15ae5 100644 --- a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/RedactionLogEntry.java +++ b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/RedactionLogEntry.java @@ -3,9 +3,11 @@ package com.iqser.red.service.redaction.v1.model; import java.util.ArrayList; import java.util.List; +import lombok.Builder; import lombok.Data; @Data +@Builder public class RedactionLogEntry { private String id; @@ -16,7 +18,10 @@ public class RedactionLogEntry { private boolean isHint; private String section; private float[] color; + + @Builder.Default private List positions = new ArrayList<>(); private int sectionNumber; + private boolean manual; } diff --git a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/RedactionRequest.java b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/RedactionRequest.java index 956c4a12..d61cfb13 100644 --- a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/RedactionRequest.java +++ b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/RedactionRequest.java @@ -13,4 +13,5 @@ public class RedactionRequest { private byte[] document; private boolean flatRedaction; + private ManualRedactions manualRedactions; } diff --git a/redaction-service-v1/redaction-service-server-v1/pom.xml b/redaction-service-v1/redaction-service-server-v1/pom.xml index 3829ce86..0ed718a8 100644 --- a/redaction-service-v1/redaction-service-server-v1/pom.xml +++ b/redaction-service-v1/redaction-service-server-v1/pom.xml @@ -56,6 +56,11 @@ jts-core 1.16.1 + + com.google.guava + guava + 29.0-jre + com.iqser.gin4.commons diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Paragraph.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Paragraph.java index 5f92dde0..081bc187 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Paragraph.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Paragraph.java @@ -41,4 +41,16 @@ public class Paragraph { return tables; } + + public List getTextBlocks() { + + List textBlocks = new ArrayList<>(); + pageBlocks.forEach(block -> { + if (block instanceof TextBlock) { + textBlocks.add((TextBlock) block); + } + }); + return textBlocks; + } + } \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java index baac0c9e..53495750 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java @@ -44,8 +44,8 @@ public class RedactionController implements RedactionResource { pdDocument.setAllSecurityToBeRemoved(true); Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); - entityRedactionService.processDocument(classifiedDoc); - annotationHighlightService.highlight(pdDocument, classifiedDoc, redactionRequest.isFlatRedaction()); + entityRedactionService.processDocument(classifiedDoc, redactionRequest.getManualRedactions()); + annotationHighlightService.highlight(pdDocument, classifiedDoc, redactionRequest.isFlatRedaction(), redactionRequest.getManualRedactions()); if (redactionRequest.isFlatRedaction()) { PDDocument flatDocument = pdfFlattenService.flattenPDF(pdDocument); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java index 9b39e5fd..cf09ddfc 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java @@ -5,6 +5,9 @@ import java.util.List; import org.apache.pdfbox.text.TextPosition; +import com.iqser.red.service.redaction.v1.model.Point; +import com.iqser.red.service.redaction.v1.model.Rectangle; + import lombok.Data; import lombok.Getter; import lombok.RequiredArgsConstructor; @@ -22,36 +25,48 @@ public class TextPositionSequence implements CharSequence { private final int page; - public TextPositionSequence(List textPositions, int page){ + + public TextPositionSequence(List textPositions, int page) { + this.textPositions = textPositions; this.page = page; } + @Override public int length() { + return textPositions.size(); } + @Override public char charAt(int index) { + TextPosition textPosition = textPositionAt(index); String text = textPosition.getUnicode(); return text.charAt(0); } + public char charAt(int index, boolean caseInSensitive) { + TextPosition textPosition = textPositionAt(index); String text = textPosition.getUnicode(); return caseInSensitive ? text.toLowerCase().charAt(0) : text.charAt(0); } + @Override public TextPositionSequence subSequence(int start, int end) { + return new TextPositionSequence(textPositions.subList(start, end), page); } + @Override public String toString() { + StringBuilder builder = new StringBuilder(length()); for (int i = 0; i < length(); i++) { builder.append(charAt(i)); @@ -59,15 +74,21 @@ public class TextPositionSequence implements CharSequence { return builder.toString(); } + public TextPosition textPositionAt(int index) { + return textPositions.get(index); } + public void add(TextPosition textPosition) { + this.textPositions.add(textPosition); } + public float getX1() { + if (textPositions.get(0).getRotation() == 90) { return textPositions.get(0).getYDirAdj() - getTextHeight(); } else { @@ -75,15 +96,20 @@ public class TextPositionSequence implements CharSequence { } } + public float getX2() { + if (textPositions.get(0).getRotation() == 90) { return textPositions.get(0).getYDirAdj(); } else { - return textPositions.get(textPositions.size() - 1).getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidth() + 1; + return textPositions.get(textPositions.size() - 1) + .getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidth() + 1; } } + public float getY1() { + if (textPositions.get(0).getRotation() == 90) { return textPositions.get(0).getXDirAdj(); } else { @@ -91,30 +117,46 @@ public class TextPositionSequence implements CharSequence { } } + public float getY2() { + if (textPositions.get(0).getRotation() == 90) { - return textPositions.get(textPositions.size() - 1).getXDirAdj() + getTextHeight() -2 ; + return textPositions.get(textPositions.size() - 1).getXDirAdj() + getTextHeight() - 2; } else { return textPositions.get(0).getPageHeight() - textPositions.get(0).getYDirAdj() + getTextHeight(); } } + public float getTextHeight() { + return textPositions.get(0).getHeightDir() + 2; } + public float getHeight() { + return getY2() - getY1(); } + public float getWidth() { + return getX2() - getX1(); } + public String getFont() { - return textPositions.get(0).getFont().toString().toLowerCase().replaceAll(",bold", "").replaceAll(",italic", ""); + + return textPositions.get(0) + .getFont() + .toString() + .toLowerCase() + .replaceAll(",bold", "") + .replaceAll(",italic", ""); } + public String getFontStyle() { String lowercaseFontName = textPositions.get(0).getFont().toString().toLowerCase(); @@ -131,16 +173,51 @@ public class TextPositionSequence implements CharSequence { } + public float getFontSize() { + return textPositions.get(0).getFontSizeInPt(); } + public float getSpaceWidth() { + return textPositions.get(0).getWidthOfSpace(); } + public int getRotation() { + return textPositions.get(0).getRotation(); } + + public Rectangle getRectangle() { + + float height = textPositions.get(0).getHeightDir() + 2; + + float posXInit; + float posXEnd; + float posYInit; + float posYEnd; + + if (textPositions.get(0).getRotation() == 90) { + + posXEnd = textPositions.get(0).getYDirAdj() + 2; + posXInit = textPositions.get(0).getYDirAdj() - height; + posYInit = textPositions.get(0).getXDirAdj(); + posYEnd = textPositions.get(textPositions.size() - 1).getXDirAdj() - height + 4; + } else { + + posXInit = textPositions.get(0).getXDirAdj(); + posXEnd = textPositions.get(textPositions.size() - 1) + .getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidth() + 1; + posYInit = textPositions.get(0).getPageHeight() - textPositions.get(0).getYDirAdj() - 2; + posYEnd = textPositions.get(0).getPageHeight() - textPositions.get(textPositions.size() - 1) + .getYDirAdj() + 2; + } + + return new Rectangle(new Point(posXInit, posYInit), posXEnd - posXInit, posYEnd - posYInit + height, page); + } + } \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/EntityPositionSequence.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/EntityPositionSequence.java index 96421f53..9bd0fb38 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/EntityPositionSequence.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/EntityPositionSequence.java @@ -2,7 +2,6 @@ package com.iqser.red.service.redaction.v1.server.redaction.model; import java.util.ArrayList; import java.util.List; -import java.util.UUID; import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; @@ -20,6 +19,6 @@ public class EntityPositionSequence { @EqualsAndHashCode.Exclude private List sequences = new ArrayList<>(); private int pageNumber; - private final UUID id; + private final String id; } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java index a8207e08..41ee4548 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java @@ -2,10 +2,10 @@ package com.iqser.red.service.redaction.v1.server.redaction.model; import java.util.ArrayList; import java.util.List; -import java.util.UUID; import java.util.regex.Pattern; import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; +import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder; import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities; public class SearchableText { @@ -114,7 +114,7 @@ public class SearchableText { private List buildEntityPositionSequence(List crossSequenceParts) { - UUID id = UUID.randomUUID(); + String id = IdBuilder.buildId(crossSequenceParts); List result = new ArrayList<>(); int currentPage = -1; EntityPositionSequence entityPositionSequence = new EntityPositionSequence(id); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java index e5593157..a8e148cd 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java @@ -12,6 +12,9 @@ import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.lang3.StringUtils; import org.springframework.stereotype.Service; +import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry; +import com.iqser.red.service.redaction.v1.model.ManualRedactions; +import com.iqser.red.service.redaction.v1.model.Rectangle; import com.iqser.red.service.redaction.v1.server.classification.model.Document; import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph; import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; @@ -34,7 +37,7 @@ public class EntityRedactionService { private final DroolsExecutionService droolsExecutionService; - public void processDocument(Document classifiedDoc) { + public void processDocument(Document classifiedDoc, ManualRedactions manualRedactions) { dictionaryService.updateDictionary(); droolsExecutionService.updateRules(); @@ -58,6 +61,7 @@ public class EntityRedactionService { continue; } cellValues.add(cell.getTextBlocks().get(0).getText()); + addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber); for (TextBlock textBlock : cell.getTextBlocks()) { searchableRow.addAll(textBlock.getSequences()); } @@ -80,6 +84,7 @@ public class EntityRedactionService { sectionNumber++; } + addSectionToManualRedactions(paragraph.getTextBlocks(), manualRedactions, paragraph.getHeadline(), sectionNumber); Set entities = findEntities(searchableText, paragraph.getHeadline(), sectionNumber); Section analysedSection = droolsExecutionService.executeRules(Section.builder() .entities(entities) @@ -103,8 +108,7 @@ public class EntityRedactionService { for (Map.Entry> entry : sequenceOnPage.entrySet()) { classifiedDoc.getEntities() .computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>()) - .add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), - entity.getRedactionReason(), entry + .add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry .getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber())); } } @@ -184,8 +188,7 @@ public class EntityRedactionService { if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString .charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) { - found.add(new Entity(inputString.substring(startIndex, stopIndex), type, startIndex, stopIndex, - headline, sectionNumber)); + found.add(new Entity(inputString.substring(startIndex, stopIndex), type, startIndex, stopIndex, headline, sectionNumber)); } } while (startIndex > -1); } @@ -213,4 +216,23 @@ public class EntityRedactionService { entities.removeAll(wordsToRemove); } + + private void addSectionToManualRedactions(List textBlocks, ManualRedactions manualRedactions, String section, int sectionNumber) { + + if (manualRedactions == null) { + return; + } + + for (TextBlock textBlock : textBlocks) { + for (ManualRedactionEntry manualRedactionEntry : manualRedactions.getEntriesToAdd()) { + for (Rectangle rectangle : manualRedactionEntry.getPositions()) { + if (textBlock.contains(rectangle)) { + manualRedactionEntry.setSection(section); + manualRedactionEntry.setSectionNumber(sectionNumber); + } + } + } + } + } + } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/IdBuilder.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/IdBuilder.java new file mode 100644 index 00000000..b159a194 --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/IdBuilder.java @@ -0,0 +1,31 @@ +package com.iqser.red.service.redaction.v1.server.redaction.utils; + +import java.util.List; + +import com.google.common.hash.HashFunction; +import com.google.common.hash.Hashing; +import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry; +import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; + +import lombok.experimental.UtilityClass; + +@UtilityClass +public class IdBuilder { + + private final HashFunction hashFunction = Hashing.murmur3_128(); + + public String buildId(List crossSequenceParts) { + + StringBuilder sb = new StringBuilder(); + crossSequenceParts.forEach(sequencePart -> sequencePart.getTextPositions().forEach(textPosition -> { + sb.append(textPosition.getTextMatrix()); + })); + + return hashFunction.hashBytes(sb.toString().getBytes()).toString(); + } + + + public String buildId(ManualRedactionEntry manualRedactionEntry) { + return hashFunction.hashBytes(manualRedactionEntry.toString().getBytes()).toString(); + } +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/AbstractTextContainer.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/AbstractTextContainer.java index 6cb155dd..b4e36e07 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/AbstractTextContainer.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/AbstractTextContainer.java @@ -1,5 +1,7 @@ package com.iqser.red.service.redaction.v1.server.tableextraction.model; +import com.iqser.red.service.redaction.v1.model.Rectangle; + import lombok.AllArgsConstructor; import lombok.Data; import lombok.NoArgsConstructor; @@ -22,4 +24,8 @@ public abstract class AbstractTextContainer { return this.minX <= other.minX && this.maxX >= other.maxX && this.minY >= other.minY && this.maxY <= other.maxY; } + public boolean contains(Rectangle other) { + return page == other.getPage() && this.minX <= other.getTopLeft().getX() && this.maxX >= other.getTopLeft().getX() + other.getWidth() && this.minY <= other.getTopLeft().getY() && this.maxY >= other.getTopLeft().getY() + other.getHeight(); + } + } \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/visualization/service/AnnotationHighlightService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/visualization/service/AnnotationHighlightService.java index baa7bf86..7355cf84 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/visualization/service/AnnotationHighlightService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/visualization/service/AnnotationHighlightService.java @@ -16,7 +16,8 @@ import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation; import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationTextMarkup; import org.springframework.stereotype.Service; -import com.iqser.red.service.redaction.v1.model.Point; +import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry; +import com.iqser.red.service.redaction.v1.model.ManualRedactions; import com.iqser.red.service.redaction.v1.model.Rectangle; import com.iqser.red.service.redaction.v1.model.RedactionLogEntry; import com.iqser.red.service.redaction.v1.server.classification.model.Document; @@ -26,6 +27,7 @@ import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSeque import com.iqser.red.service.redaction.v1.server.redaction.model.Entity; import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence; import com.iqser.red.service.redaction.v1.server.redaction.service.DictionaryService; +import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder; import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; @@ -41,139 +43,171 @@ public class AnnotationHighlightService { private final DictionaryService dictionaryService; - public void highlight(PDDocument document, Document classifiedDoc, boolean flatRedaction) throws IOException { + public void highlight(PDDocument document, Document classifiedDoc, boolean flatRedaction, ManualRedactions manualRedactions) throws IOException { for (int page = 1; page <= document.getNumberOfPages(); page++) { PDPage pdPage = document.getPage(page - 1); - if (!flatRedaction) { - PDPageContentStream contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true); - - for (Paragraph paragraph : classifiedDoc.getParagraphs()) { - - for (int i = 0; i <= paragraph.getPageBlocks().size() - 1; i++) { - - AbstractTextContainer textBlock = paragraph.getPageBlocks().get(i); - - if (textBlock.getPage() != page) { - continue; - } - if (textBlock instanceof TextBlock) { - textBlock.setClassification((i + 1) + "/" + paragraph.getPageBlocks().size()); - visualizeTextBlock((TextBlock) textBlock, contentStream); - } else if (textBlock instanceof Table) { - textBlock.setClassification((i + 1) + "/" + paragraph.getPageBlocks().size()); - visualizeTable((Table) textBlock, contentStream); - } - - } - } - - contentStream.close(); - } + drawSectionFrames(document, classifiedDoc, flatRedaction, pdPage, page); if (classifiedDoc.getEntities().get(page) == null) { continue; } - for (Entity entity : classifiedDoc.getEntities().get(page)) { + addAnnotations(pdPage, classifiedDoc, flatRedaction, manualRedactions, page); + addManualAnnotations(pdPage, classifiedDoc, manualRedactions, page); + } + } - RedactionLogEntry redactionLogEntry = new RedactionLogEntry(); - for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) { + private void addAnnotations(PDPage pdPage, Document classifiedDoc, boolean flatRedaction, ManualRedactions manualRedactions, int page) throws IOException { - if (flatRedaction && !isRedactionType(entity)) { - continue; - } + List annotations = pdPage.getAnnotations(); - for (TextPositionSequence textPositions : entityPositionSequence.getSequences()) { + for (Entity entity : classifiedDoc.getEntities().get(page)) { - float height = textPositions.getTextPositions().get(0).getHeightDir() + 2; - - float posXInit; - float posXEnd; - float posYInit; - float posYEnd; - - if (textPositions.getTextPositions().get(0).getRotation() == 90) { - - posXEnd = textPositions.getTextPositions().get(0).getYDirAdj() + 2; - posXInit = textPositions.getTextPositions().get(0).getYDirAdj() - height; - posYInit = textPositions.getTextPositions().get(0).getXDirAdj(); - posYEnd = textPositions.getTextPositions() - .get(textPositions.getTextPositions().size() - 1) - .getXDirAdj() - height + 4; - } else { - - posXInit = textPositions.getTextPositions().get(0).getXDirAdj(); - posXEnd = textPositions.getTextPositions() - .get(textPositions.getTextPositions().size() - 1) - .getXDirAdj() + textPositions.getTextPositions() - .get(textPositions.getTextPositions().size() - 1) - .getWidth() + 1; - posYInit = textPositions.getTextPositions() - .get(0) - .getPageHeight() - textPositions.getTextPositions().get(0).getYDirAdj() - 2; - posYEnd = textPositions.getTextPositions() - .get(0) - .getPageHeight() - textPositions.getTextPositions() - .get(textPositions.getTextPositions().size() - 1) - .getYDirAdj() + 2; - } - - Rectangle textHighlightRectangle = new Rectangle(new Point(posXInit, posYInit), posXEnd - posXInit, posYEnd - posYInit + height, page); - - List annotations = pdPage.getAnnotations(); - PDAnnotationTextMarkup highlight = new PDAnnotationTextMarkup(PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT); - highlight.constructAppearances(); - - PDRectangle annotationPosition = new PDRectangle(); - annotationPosition.setLowerLeftX(posXInit); - annotationPosition.setLowerLeftY(posYEnd); - annotationPosition.setUpperRightX(posXEnd); - annotationPosition.setUpperRightY(posYEnd + height); - - highlight.setRectangle(annotationPosition); - if (!flatRedaction && !isHint(entity)) { - highlight.setAnnotationName(entityPositionSequence.getId().toString()); - highlight.setTitlePopup(entityPositionSequence.getId().toString()); - highlight.setContents("\nRule " + entity.getMatchedRule() + " matched\n\n" + entity.getRedactionReason() + "\n\n" + "In Section : \"" + entity - .getHeadline() + "\""); - } - - highlight.setQuadPoints(toQuadPoints(textHighlightRectangle)); - - PDColor color; - if (flatRedaction) { - color = new PDColor(new float[]{0, 0, 0}, PDDeviceRGB.INSTANCE); - } else { - color = new PDColor(getColor(entity), PDDeviceRGB.INSTANCE); - } - - highlight.setColor(color); - annotations.add(highlight); - - redactionLogEntry.getPositions().add(textHighlightRectangle); - - } - redactionLogEntry.setId(entityPositionSequence.getId().toString()); - } - redactionLogEntry.setColor(getColor(entity)); - redactionLogEntry.setReason(entity.getRedactionReason()); - redactionLogEntry.setValue(entity.getWord()); - redactionLogEntry.setType(entity.getType()); - redactionLogEntry.setRedacted(entity.isRedaction()); - redactionLogEntry.setSection(entity.getHeadline()); - redactionLogEntry.setHint(isHint(entity)); - classifiedDoc.getRedactionLogEntities().add(redactionLogEntry); - redactionLogEntry.setSectionNumber(entity.getSectionNumber()); + if (flatRedaction && !isRedactionType(entity)) { + continue; } + RedactionLogEntry redactionLogEntry = createRedactionLogEntry(entity); + + for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) { + + if (manualRedactions != null && manualRedactions.getIdsToRemove() + .contains(entityPositionSequence.getId())) { + entity.setRedaction(false); + entity.setRedactionReason(entity.getRedactionReason() + ", removed by manual override"); + redactionLogEntry.setManual(true); + } + + for (TextPositionSequence textPositions : entityPositionSequence.getSequences()) { + + Rectangle rectangle = textPositions.getRectangle(); + redactionLogEntry.getPositions().add(rectangle); + annotations.add(createAnnotation(rectangle, entityPositionSequence.getId(), createAnnotationContent(entity), getColor(entity), !flatRedaction && !isHint(entity))); + } + redactionLogEntry.setId(entityPositionSequence.getId()); + } + classifiedDoc.getRedactionLogEntities().add(redactionLogEntry); } } + private void addManualAnnotations(PDPage pdPage, Document classifiedDoc, ManualRedactions manualRedactions, int page) throws IOException { + + if (manualRedactions == null) { + return; + } + + List annotations = pdPage.getAnnotations(); + + for (ManualRedactionEntry manualRedactionEntry : manualRedactions.getEntriesToAdd()) { + + String id = IdBuilder.buildId(manualRedactionEntry); + + RedactionLogEntry redactionLogEntry = createRedactionLogEntry(manualRedactionEntry); + + for (Rectangle rectangle : manualRedactionEntry.getPositions()) { + + if (page != rectangle.getPage()) { + continue; + } + + PDAnnotationTextMarkup highlight = createAnnotation(rectangle, id, createAnnotationContent(manualRedactionEntry), getColor(manualRedactionEntry + .getType()), true); + annotations.add(highlight); + + redactionLogEntry.getPositions().add(rectangle); + } + classifiedDoc.getRedactionLogEntities().add(redactionLogEntry); + } + } + + + private RedactionLogEntry createRedactionLogEntry(ManualRedactionEntry manualRedactionEntry) { + + return RedactionLogEntry.builder() + .color(getColor(manualRedactionEntry.getType())) + .reason(manualRedactionEntry.getReason()) + .value(manualRedactionEntry.getValue()) + .type(manualRedactionEntry.getType()) + .redacted(true) + .isHint(false) + .section(manualRedactionEntry.getSection()) + .sectionNumber(manualRedactionEntry.getSectionNumber()) + .manual(true) + .build(); + } + + + private RedactionLogEntry createRedactionLogEntry(Entity entity) { + + return RedactionLogEntry.builder() + .color(getColor(entity)) + .reason(entity.getRedactionReason()) + .value(entity.getWord()) + .type(entity.getType()) + .redacted(entity.isRedaction()) + .isHint(isHint(entity)) + .section(entity.getHeadline()) + .sectionNumber(entity.getSectionNumber()) + .build(); + } + + + private PDAnnotationTextMarkup createAnnotation(Rectangle rectangle, String id, String content, float[] color, boolean popup) { + + PDAnnotationTextMarkup annotation = new PDAnnotationTextMarkup(PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT); + annotation.constructAppearances(); + annotation.setRectangle(toPDRectangle(rectangle)); + annotation.setQuadPoints(toQuadPoints(rectangle)); + if (popup) { + annotation.setAnnotationName(id); + annotation.setTitlePopup(id); + annotation.setContents(content); + } + annotation.setColor(new PDColor(color, PDDeviceRGB.INSTANCE)); + return annotation; + } + + + private String createAnnotationContent(Entity entity) { + + return new StringBuilder().append("\nRule ") + .append(entity.getMatchedRule()) + .append(" matched") + .append("\n\n") + .append(entity.getRedactionReason()) + .append("\n\nIn Section : \"") + .append(entity.getHeadline()) + .append("\"") + .toString(); + } + + + private String createAnnotationContent(ManualRedactionEntry entry) { + + return new StringBuilder().append("\nManual Redaction") + .append("\n\nIn Section : \"") + .append(entry.getSection()) + .append("\"") + .toString(); + } + + + private PDRectangle toPDRectangle(Rectangle rectangle) { + + PDRectangle annotationPosition = new PDRectangle(); + annotationPosition.setLowerLeftX(rectangle.getTopLeft().getX()); + annotationPosition.setLowerLeftY(rectangle.getTopLeft().getY() + rectangle.getHeight()); + annotationPosition.setUpperRightX(rectangle.getTopLeft().getX() + rectangle.getWidth()); + annotationPosition.setUpperRightY(rectangle.getTopLeft().getY()); + return annotationPosition; + } + + private float[] toQuadPoints(Rectangle rectangle) { // quadPoints is array of x,y coordinates in Z-like order (top-left, top-right, bottom-left,bottom-right) @@ -202,15 +236,24 @@ public class AnnotationHighlightService { if (!entity.isRedaction() && !isHint(entity)) { return new float[]{0.627f, 0.627f, 0.627f}; } - if (!dictionaryService.getEntryColors().containsKey(entity.getType())) { return dictionaryService.getDefaultColor(); } - return dictionaryService.getEntryColors().get(entity.getType()); } + + private float[] getColor(String type) { + + if (!dictionaryService.getEntryColors().containsKey(type)) { + return dictionaryService.getDefaultColor(); + } + return dictionaryService.getEntryColors().get(type); + } + + private boolean isHint(Entity entity) { + List hintTypes = dictionaryService.getHintTypes(); if (CollectionUtils.isNotEmpty(hintTypes) && hintTypes.contains(entity.getType())) { return true; @@ -218,24 +261,49 @@ public class AnnotationHighlightService { return false; } + + private void drawSectionFrames(PDDocument document, Document classifiedDoc, boolean flatRedaction, PDPage pdPage, int page) throws IOException { + + if (flatRedaction) { + return; + } + + PDPageContentStream contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true); + for (Paragraph paragraph : classifiedDoc.getParagraphs()) { + + for (int i = 0; i <= paragraph.getPageBlocks().size() - 1; i++) { + + AbstractTextContainer textBlock = paragraph.getPageBlocks().get(i); + + if (textBlock.getPage() != page) { + continue; + } + if (textBlock instanceof TextBlock) { + textBlock.setClassification((i + 1) + "/" + paragraph.getPageBlocks().size()); + visualizeTextBlock((TextBlock) textBlock, contentStream); + } else if (textBlock instanceof Table) { + textBlock.setClassification((i + 1) + "/" + paragraph.getPageBlocks().size()); + visualizeTable((Table) textBlock, contentStream); + } + } + } + contentStream.close(); + } + + private void visualizeTextBlock(TextBlock textBlock, PDPageContentStream contentStream) throws IOException { contentStream.setStrokingColor(Color.LIGHT_GRAY); contentStream.setLineWidth(0.5f); - contentStream.addRect(textBlock.getMinX(), textBlock.getMinY(), textBlock.getWidth(), textBlock.getHeight()); contentStream.stroke(); if (textBlock.getClassification() != null) { contentStream.beginText(); - contentStream.setNonStrokingColor(Color.DARK_GRAY); contentStream.setFont(PDType1Font.TIMES_ROMAN, 8f); - contentStream.newLineAtOffset(textBlock.getMinX(), textBlock.getMaxY()); - contentStream.showText(textBlock.getClassification()); - contentStream.endText(); } } @@ -252,26 +320,16 @@ public class AnnotationHighlightService { contentStream.addRect((float) cell.getX(), (float) cell.getY(), (float) cell.getWidth(), (float) cell .getHeight()); contentStream.stroke(); - -// contentStream.setStrokingColor(Color.GREEN); -// for (TextBlock textBlock : cell.getTextBlocks()) { -// contentStream.addRect(textBlock.getMinX(), textBlock.getMinY(), textBlock.getWidth(), textBlock.getHeight()); -// contentStream.stroke(); -// } } } } if (table.getClassification() != null) { contentStream.beginText(); - contentStream.setNonStrokingColor(Color.DARK_GRAY); contentStream.setFont(PDType1Font.TIMES_ROMAN, 8f); - contentStream.newLineAtOffset(table.getMinX(), table.getMinY()); - contentStream.showText(table.getClassification()); - contentStream.endText(); } } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java index 1683b86f..04003f7f 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java @@ -17,6 +17,7 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; @@ -41,6 +42,10 @@ import com.iqser.red.service.configuration.v1.api.model.DictionaryResponse; import com.iqser.red.service.configuration.v1.api.model.RulesResponse; import com.iqser.red.service.configuration.v1.api.model.TypeResponse; import com.iqser.red.service.configuration.v1.api.model.TypeResult; +import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry; +import com.iqser.red.service.redaction.v1.model.ManualRedactions; +import com.iqser.red.service.redaction.v1.model.Point; +import com.iqser.red.service.redaction.v1.model.Rectangle; import com.iqser.red.service.redaction.v1.model.RedactionRequest; import com.iqser.red.service.redaction.v1.model.RedactionResult; import com.iqser.red.service.redaction.v1.server.client.DictionaryClient; @@ -303,6 +308,40 @@ public class RedactionIntegrationTest { } + @Test + public void testManualRedaction() throws IOException { + + long start = System.currentTimeMillis(); + ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Single Table.pdf"); + + ManualRedactions manualRedactions = new ManualRedactions(); + manualRedactions.setIdsToRemove(Set.of("0836727c3508a0b2ea271da69c04cc2f")); + + ManualRedactionEntry manualRedactionEntry = new ManualRedactionEntry(); + manualRedactionEntry.setType("name"); + manualRedactionEntry.setValue("O'Loughlin C.K."); + manualRedactionEntry.setReason("Manual Redaction"); + manualRedactionEntry.setPositions(List.of(new Rectangle(new Point(375.61096f, 241.282f), 7.648041f, 43.72262f, 1), new Rectangle(new Point(384.83517f, 241.282f), 7.648041f, 17.043358f, 1))); + + manualRedactions.getEntriesToAdd().add(manualRedactionEntry); + + RedactionRequest request = RedactionRequest.builder() + .document(IOUtils.toByteArray(pdfFileResource.getInputStream())) + .manualRedactions(manualRedactions) + .build(); + + RedactionResult result = redactionController.redact(request); + + try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Redacted.pdf")) { + fileOutputStream.write(result.getDocument()); + } + long end = System.currentTimeMillis(); + + System.out.println("duration: " + (end - start)); + System.out.println("numberOfPages: " + result.getNumberOfPages()); + } + + @Test public void classificationTest() throws IOException { diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java index 9d2471f1..ed2999c6 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java @@ -147,7 +147,7 @@ public class EntityRedactionServiceTest { when(dictionaryClient.getDefaultColor()).thenReturn(new DefaultColor()); try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(redactionRequest.getDocument()))) { Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); - entityRedactionService.processDocument(classifiedDoc); + entityRedactionService.processDocument(classifiedDoc, null); assertThat(classifiedDoc.getEntities()).hasSize(1); // one page assertThat(classifiedDoc.getEntities().get(1)).hasSize(5); // 4 out of 5 entities recognized on page 1 }