RED-88: Enabled manual redaction

This commit is contained in:
deiflaender 2020-08-13 15:34:03 +02:00
parent 954765759c
commit 2b93ae57d5
16 changed files with 441 additions and 146 deletions

View File

@ -0,0 +1,23 @@
package com.iqser.red.service.redaction.v1.model;
import java.util.ArrayList;
import java.util.List;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
@Data
@AllArgsConstructor
@NoArgsConstructor
public class ManualRedactionEntry {
private String type;
private String value;
private String reason;
private List<Rectangle> positions = new ArrayList<>();
private String section;
private int sectionNumber;
}

View File

@ -0,0 +1,17 @@
package com.iqser.red.service.redaction.v1.model;
import java.util.HashSet;
import java.util.Set;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Manual overrides that accompany a redaction request.
 *
 * <p>Access goes through the Lombok-generated getters and setters; the fields themselves are
 * private like those of the other model classes in this package.
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
public class ManualRedactions {

    /** Ids of automatically found entity position sequences whose redaction is withdrawn. */
    private Set<String> idsToRemove = new HashSet<>();

    /** Redactions to add on top of the automatically found ones. */
    private Set<ManualRedactionEntry> entriesToAdd = new HashSet<>();
}

View File

@ -3,9 +3,11 @@ package com.iqser.red.service.redaction.v1.model;
import java.util.ArrayList;
import java.util.List;
import lombok.Builder;
import lombok.Data;
@Data
@Builder
public class RedactionLogEntry {
private String id;
@ -16,7 +18,10 @@ public class RedactionLogEntry {
private boolean isHint;
private String section;
private float[] color;
@Builder.Default
private List<Rectangle> positions = new ArrayList<>();
private int sectionNumber;
private boolean manual;
}

View File

@ -13,4 +13,5 @@ public class RedactionRequest {
private byte[] document;
private boolean flatRedaction;
private ManualRedactions manualRedactions;
}

View File

@ -56,6 +56,11 @@
<artifactId>jts-core</artifactId>
<version>1.16.1</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>29.0-jre</version>
</dependency>
<!-- commons -->
<dependency>
<groupId>com.iqser.gin4.commons</groupId>

View File

@ -41,4 +41,16 @@ public class Paragraph {
return tables;
}
/**
 * Collects the page blocks of this paragraph that are {@link TextBlock}s.
 *
 * @return a new, mutable list holding the text blocks in their original order
 */
public List<TextBlock> getTextBlocks() {
    List<TextBlock> onlyTextBlocks = new ArrayList<>();
    pageBlocks.stream()
            .filter(TextBlock.class::isInstance)
            .map(TextBlock.class::cast)
            .forEachOrdered(onlyTextBlocks::add);
    return onlyTextBlocks;
}
}

View File

@ -44,8 +44,8 @@ public class RedactionController implements RedactionResource {
pdDocument.setAllSecurityToBeRemoved(true);
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
entityRedactionService.processDocument(classifiedDoc);
annotationHighlightService.highlight(pdDocument, classifiedDoc, redactionRequest.isFlatRedaction());
entityRedactionService.processDocument(classifiedDoc, redactionRequest.getManualRedactions());
annotationHighlightService.highlight(pdDocument, classifiedDoc, redactionRequest.isFlatRedaction(), redactionRequest.getManualRedactions());
if (redactionRequest.isFlatRedaction()) {
PDDocument flatDocument = pdfFlattenService.flattenPDF(pdDocument);

View File

@ -5,6 +5,9 @@ import java.util.List;
import org.apache.pdfbox.text.TextPosition;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import lombok.Data;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
@ -22,36 +25,48 @@ public class TextPositionSequence implements CharSequence {
private final int page;
public TextPositionSequence(List<TextPosition> textPositions, int page){
/**
 * Creates a sequence backed by the given list (the list is stored as-is, not copied).
 *
 * @param textPositions glyph positions that make up the sequence
 * @param page          page number the sequence belongs to
 */
public TextPositionSequence(List<TextPosition> textPositions, int page) {
    this.page = page;
    this.textPositions = textPositions;
}
/** Returns the number of text positions in this sequence. */
@Override
public int length() {
    final int positionCount = textPositions.size();
    return positionCount;
}
/**
 * Returns the first character of the unicode string of the text position at {@code index}.
 */
@Override
public char charAt(int index) {
    return textPositionAt(index).getUnicode().charAt(0);
}
/**
 * Returns the first character of the text position at {@code index}, optionally lower-cased.
 *
 * <p>Lower-casing uses {@link java.util.Locale#ROOT} so that the result does not depend on the
 * default locale of the JVM (e.g. the dotless i produced under a Turkish locale).
 *
 * @param index           position inside the sequence
 * @param caseInSensitive when {@code true} the unicode string is lower-cased before its first
 *                        character is taken
 * @return the (possibly lower-cased) first character
 */
public char charAt(int index, boolean caseInSensitive) {
    String text = textPositionAt(index).getUnicode();
    if (caseInSensitive) {
        return text.toLowerCase(java.util.Locale.ROOT).charAt(0);
    }
    return text.charAt(0);
}
/**
 * Returns a sequence over the {@code subList(start, end)} view of the text positions, on the
 * same page as this sequence.
 */
@Override
public TextPositionSequence subSequence(int start, int end) {
    List<TextPosition> slice = textPositions.subList(start, end);
    return new TextPositionSequence(slice, page);
}
@Override
public String toString() {
StringBuilder builder = new StringBuilder(length());
for (int i = 0; i < length(); i++) {
builder.append(charAt(i));
@ -59,15 +74,21 @@ public class TextPositionSequence implements CharSequence {
return builder.toString();
}
/** Returns the text position stored at {@code index}. */
public TextPosition textPositionAt(int index) {
    final TextPosition position = textPositions.get(index);
    return position;
}
/** Appends {@code textPosition} to the end of this sequence. */
public void add(TextPosition textPosition) {
    textPositions.add(textPosition);
}
public float getX1() {
if (textPositions.get(0).getRotation() == 90) {
return textPositions.get(0).getYDirAdj() - getTextHeight();
} else {
@ -75,15 +96,20 @@ public class TextPositionSequence implements CharSequence {
}
}
/**
 * Returns the second x coordinate of the sequence.
 *
 * <p>For a rotation of 90 this is the adjusted y of the first position; otherwise it is the
 * adjusted x of the last position plus its width plus 1.
 */
public float getX2() {
    TextPosition first = textPositions.get(0);
    if (first.getRotation() != 90) {
        TextPosition last = textPositions.get(textPositions.size() - 1);
        return last.getXDirAdj() + last.getWidth() + 1;
    }
    return first.getYDirAdj();
}
public float getY1() {
if (textPositions.get(0).getRotation() == 90) {
return textPositions.get(0).getXDirAdj();
} else {
@ -91,30 +117,46 @@ public class TextPositionSequence implements CharSequence {
}
}
/**
 * Returns the second y coordinate of the sequence.
 *
 * <p>For a rotation of 90 this is the adjusted x of the last position plus the text height
 * minus 2; otherwise it is page height minus the adjusted y of the first position plus the
 * text height.
 */
public float getY2() {
    TextPosition first = textPositions.get(0);
    if (first.getRotation() != 90) {
        return first.getPageHeight() - first.getYDirAdj() + getTextHeight();
    }
    TextPosition last = textPositions.get(textPositions.size() - 1);
    return last.getXDirAdj() + getTextHeight() - 2;
}
/** Returns the directional height of the first text position plus 2. */
public float getTextHeight() {
    float firstGlyphHeight = textPositions.get(0).getHeightDir();
    return firstGlyphHeight + 2;
}
/** Returns {@code getY2() - getY1()}. */
public float getHeight() {
    float y2 = getY2();
    float y1 = getY1();
    return y2 - y1;
}
/** Returns {@code getX2() - getX1()}. */
public float getWidth() {
    float x2 = getX2();
    float x1 = getX1();
    return x2 - x1;
}
/**
 * Returns the lower-cased string form of the first position's font with the
 * {@code ",bold"} and {@code ",italic"} markers removed.
 *
 * <p>Lower-casing uses {@link java.util.Locale#ROOT} so the result does not vary with the
 * default locale, and the markers are removed with literal {@code replace} instead of
 * regex-based {@code replaceAll}, since they are plain text.
 */
public String getFont() {
    String fontName = textPositions.get(0).getFont().toString().toLowerCase(java.util.Locale.ROOT);
    return fontName.replace(",bold", "").replace(",italic", "");
}
public String getFontStyle() {
String lowercaseFontName = textPositions.get(0).getFont().toString().toLowerCase();
@ -131,16 +173,51 @@ public class TextPositionSequence implements CharSequence {
}
/** Returns the font size in points of the first text position. */
public float getFontSize() {
    TextPosition first = textPositions.get(0);
    return first.getFontSizeInPt();
}
/** Returns the width of a space as reported by the first text position. */
public float getSpaceWidth() {
    TextPosition first = textPositions.get(0);
    return first.getWidthOfSpace();
}
/** Returns the rotation reported by the first text position. */
public int getRotation() {
    TextPosition first = textPositions.get(0);
    return first.getRotation();
}
/**
 * Builds the highlight rectangle for this sequence on its page.
 *
 * <p>The coordinates are derived from the first and last text position; a rotation of 90 swaps
 * the roles of the adjusted x and y values. The small constants (+2, +4, +1, -2) are padding
 * carried over unchanged.
 *
 * @return rectangle with its anchor point, width, height and the page of this sequence
 */
public Rectangle getRectangle() {
    TextPosition first = textPositions.get(0);
    TextPosition last = textPositions.get(textPositions.size() - 1);
    float height = first.getHeightDir() + 2;

    float posXInit;
    float posXEnd;
    float posYInit;
    float posYEnd;
    if (first.getRotation() != 90) {
        posXInit = first.getXDirAdj();
        posXEnd = last.getXDirAdj() + last.getWidth() + 1;
        posYInit = first.getPageHeight() - first.getYDirAdj() - 2;
        posYEnd = first.getPageHeight() - last.getYDirAdj() + 2;
    } else {
        posXEnd = first.getYDirAdj() + 2;
        posXInit = first.getYDirAdj() - height;
        posYInit = first.getXDirAdj();
        posYEnd = last.getXDirAdj() - height + 4;
    }

    Point anchor = new Point(posXInit, posYInit);
    return new Rectangle(anchor, posXEnd - posXInit, posYEnd - posYInit + height, page);
}
}

View File

@ -2,7 +2,6 @@ package com.iqser.red.service.redaction.v1.server.redaction.model;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
@ -20,6 +19,6 @@ public class EntityPositionSequence {
@EqualsAndHashCode.Exclude
private List<TextPositionSequence> sequences = new ArrayList<>();
private int pageNumber;
private final UUID id;
private final String id;
}

View File

@ -2,10 +2,10 @@ package com.iqser.red.service.redaction.v1.server.redaction.model;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;
import java.util.regex.Pattern;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
public class SearchableText {
@ -114,7 +114,7 @@ public class SearchableText {
private List<EntityPositionSequence> buildEntityPositionSequence(List<TextPositionSequence> crossSequenceParts) {
UUID id = UUID.randomUUID();
String id = IdBuilder.buildId(crossSequenceParts);
List<EntityPositionSequence> result = new ArrayList<>();
int currentPage = -1;
EntityPositionSequence entityPositionSequence = new EntityPositionSequence(id);

View File

@ -12,6 +12,9 @@ import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry;
import com.iqser.red.service.redaction.v1.model.ManualRedactions;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
@ -34,7 +37,7 @@ public class EntityRedactionService {
private final DroolsExecutionService droolsExecutionService;
public void processDocument(Document classifiedDoc) {
public void processDocument(Document classifiedDoc, ManualRedactions manualRedactions) {
dictionaryService.updateDictionary();
droolsExecutionService.updateRules();
@ -58,6 +61,7 @@ public class EntityRedactionService {
continue;
}
cellValues.add(cell.getTextBlocks().get(0).getText());
addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber);
for (TextBlock textBlock : cell.getTextBlocks()) {
searchableRow.addAll(textBlock.getSequences());
}
@ -80,6 +84,7 @@ public class EntityRedactionService {
sectionNumber++;
}
addSectionToManualRedactions(paragraph.getTextBlocks(), manualRedactions, paragraph.getHeadline(), sectionNumber);
Set<Entity> entities = findEntities(searchableText, paragraph.getHeadline(), sectionNumber);
Section analysedSection = droolsExecutionService.executeRules(Section.builder()
.entities(entities)
@ -103,8 +108,7 @@ public class EntityRedactionService {
for (Map.Entry<Integer, List<EntityPositionSequence>> entry : sequenceOnPage.entrySet()) {
classifiedDoc.getEntities()
.computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>())
.add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(),
entity.getRedactionReason(), entry
.add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry
.getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber()));
}
}
@ -184,8 +188,7 @@ public class EntityRedactionService {
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString
.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
found.add(new Entity(inputString.substring(startIndex, stopIndex), type, startIndex, stopIndex,
headline, sectionNumber));
found.add(new Entity(inputString.substring(startIndex, stopIndex), type, startIndex, stopIndex, headline, sectionNumber));
}
} while (startIndex > -1);
}
@ -213,4 +216,23 @@ public class EntityRedactionService {
entities.removeAll(wordsToRemove);
}
/**
 * Stamps section information onto every manual redaction entry that has at least one
 * rectangle contained in one of the given text blocks.
 *
 * @param textBlocks       text blocks of the section currently being processed
 * @param manualRedactions manual redactions of the request; {@code null} means nothing to do
 * @param section          headline to store on matching entries
 * @param sectionNumber    section number to store on matching entries
 */
private void addSectionToManualRedactions(List<TextBlock> textBlocks, ManualRedactions manualRedactions, String section, int sectionNumber) {
    if (manualRedactions != null) {
        for (TextBlock block : textBlocks) {
            for (ManualRedactionEntry entry : manualRedactions.getEntriesToAdd()) {
                for (Rectangle position : entry.getPositions()) {
                    if (!block.contains(position)) {
                        continue;
                    }
                    entry.setSection(section);
                    entry.setSectionNumber(sectionNumber);
                }
            }
        }
    }
}
}

View File

@ -0,0 +1,31 @@
package com.iqser.red.service.redaction.v1.server.redaction.utils;
import java.util.List;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import lombok.experimental.UtilityClass;
/**
 * Builds ids for redactions by hashing their content with murmur3 (128 bit).
 *
 * <p>Strings are always encoded as UTF-8 before hashing. Ids are handed to clients and sent
 * back in later requests, so they must not depend on the default charset of the JVM that
 * computed them.
 */
@UtilityClass
public class IdBuilder {

    private final HashFunction hashFunction = Hashing.murmur3_128();


    /**
     * Builds an id from the text matrices of all text positions in the given sequences.
     *
     * @param crossSequenceParts sequences that together form one entity occurrence
     * @return string form of the hash code
     */
    public String buildId(List<TextPositionSequence> crossSequenceParts) {

        StringBuilder sb = new StringBuilder();
        crossSequenceParts.forEach(sequencePart -> sequencePart.getTextPositions().forEach(textPosition -> {
            sb.append(textPosition.getTextMatrix());
        }));

        return hashFunction.hashBytes(sb.toString().getBytes(java.nio.charset.StandardCharsets.UTF_8)).toString();
    }


    /**
     * Builds an id from the {@code toString()} form of a manual redaction entry.
     *
     * <p>NOTE(review): the id therefore changes whenever any field that takes part in
     * {@code toString()} changes — confirm this is intended for fields assigned during
     * processing.
     *
     * @param manualRedactionEntry entry to identify
     * @return string form of the hash code
     */
    public String buildId(ManualRedactionEntry manualRedactionEntry) {

        return hashFunction.hashBytes(manualRedactionEntry.toString().getBytes(java.nio.charset.StandardCharsets.UTF_8)).toString();
    }

}

View File

@ -1,5 +1,7 @@
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
@ -22,4 +24,8 @@ public abstract class AbstractTextContainer {
return this.minX <= other.minX && this.maxX >= other.maxX && this.minY >= other.minY && this.maxY <= other.maxY;
}
/**
 * Tells whether {@code other} lies on the same page as this container and within its
 * min/max bounds on both axes.
 *
 * @param other rectangle given by its anchor point, width, height and page
 * @return {@code true} when the rectangle is fully inside this container's bounds
 */
public boolean contains(Rectangle other) {
    if (page != other.getPage()) {
        return false;
    }
    boolean insideHorizontally = this.minX <= other.getTopLeft().getX() && this.maxX >= other.getTopLeft().getX() + other.getWidth();
    if (!insideHorizontally) {
        return false;
    }
    return this.minY <= other.getTopLeft().getY() && this.maxY >= other.getTopLeft().getY() + other.getHeight();
}
}

View File

@ -16,7 +16,8 @@ import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationTextMarkup;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry;
import com.iqser.red.service.redaction.v1.model.ManualRedactions;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
@ -26,6 +27,7 @@ import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSeque
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.service.DictionaryService;
import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
@ -41,139 +43,171 @@ public class AnnotationHighlightService {
private final DictionaryService dictionaryService;
public void highlight(PDDocument document, Document classifiedDoc, boolean flatRedaction) throws IOException {
public void highlight(PDDocument document, Document classifiedDoc, boolean flatRedaction, ManualRedactions manualRedactions) throws IOException {
for (int page = 1; page <= document.getNumberOfPages(); page++) {
PDPage pdPage = document.getPage(page - 1);
if (!flatRedaction) {
PDPageContentStream contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true);
for (Paragraph paragraph : classifiedDoc.getParagraphs()) {
for (int i = 0; i <= paragraph.getPageBlocks().size() - 1; i++) {
AbstractTextContainer textBlock = paragraph.getPageBlocks().get(i);
if (textBlock.getPage() != page) {
continue;
}
if (textBlock instanceof TextBlock) {
textBlock.setClassification((i + 1) + "/" + paragraph.getPageBlocks().size());
visualizeTextBlock((TextBlock) textBlock, contentStream);
} else if (textBlock instanceof Table) {
textBlock.setClassification((i + 1) + "/" + paragraph.getPageBlocks().size());
visualizeTable((Table) textBlock, contentStream);
}
}
}
contentStream.close();
}
drawSectionFrames(document, classifiedDoc, flatRedaction, pdPage, page);
if (classifiedDoc.getEntities().get(page) == null) {
continue;
}
for (Entity entity : classifiedDoc.getEntities().get(page)) {
addAnnotations(pdPage, classifiedDoc, flatRedaction, manualRedactions, page);
addManualAnnotations(pdPage, classifiedDoc, manualRedactions, page);
}
}
RedactionLogEntry redactionLogEntry = new RedactionLogEntry();
for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) {
private void addAnnotations(PDPage pdPage, Document classifiedDoc, boolean flatRedaction, ManualRedactions manualRedactions, int page) throws IOException {
if (flatRedaction && !isRedactionType(entity)) {
continue;
}
List<PDAnnotation> annotations = pdPage.getAnnotations();
for (TextPositionSequence textPositions : entityPositionSequence.getSequences()) {
for (Entity entity : classifiedDoc.getEntities().get(page)) {
float height = textPositions.getTextPositions().get(0).getHeightDir() + 2;
float posXInit;
float posXEnd;
float posYInit;
float posYEnd;
if (textPositions.getTextPositions().get(0).getRotation() == 90) {
posXEnd = textPositions.getTextPositions().get(0).getYDirAdj() + 2;
posXInit = textPositions.getTextPositions().get(0).getYDirAdj() - height;
posYInit = textPositions.getTextPositions().get(0).getXDirAdj();
posYEnd = textPositions.getTextPositions()
.get(textPositions.getTextPositions().size() - 1)
.getXDirAdj() - height + 4;
} else {
posXInit = textPositions.getTextPositions().get(0).getXDirAdj();
posXEnd = textPositions.getTextPositions()
.get(textPositions.getTextPositions().size() - 1)
.getXDirAdj() + textPositions.getTextPositions()
.get(textPositions.getTextPositions().size() - 1)
.getWidth() + 1;
posYInit = textPositions.getTextPositions()
.get(0)
.getPageHeight() - textPositions.getTextPositions().get(0).getYDirAdj() - 2;
posYEnd = textPositions.getTextPositions()
.get(0)
.getPageHeight() - textPositions.getTextPositions()
.get(textPositions.getTextPositions().size() - 1)
.getYDirAdj() + 2;
}
Rectangle textHighlightRectangle = new Rectangle(new Point(posXInit, posYInit), posXEnd - posXInit, posYEnd - posYInit + height, page);
List<PDAnnotation> annotations = pdPage.getAnnotations();
PDAnnotationTextMarkup highlight = new PDAnnotationTextMarkup(PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT);
highlight.constructAppearances();
PDRectangle annotationPosition = new PDRectangle();
annotationPosition.setLowerLeftX(posXInit);
annotationPosition.setLowerLeftY(posYEnd);
annotationPosition.setUpperRightX(posXEnd);
annotationPosition.setUpperRightY(posYEnd + height);
highlight.setRectangle(annotationPosition);
if (!flatRedaction && !isHint(entity)) {
highlight.setAnnotationName(entityPositionSequence.getId().toString());
highlight.setTitlePopup(entityPositionSequence.getId().toString());
highlight.setContents("\nRule " + entity.getMatchedRule() + " matched\n\n" + entity.getRedactionReason() + "\n\n" + "In Section : \"" + entity
.getHeadline() + "\"");
}
highlight.setQuadPoints(toQuadPoints(textHighlightRectangle));
PDColor color;
if (flatRedaction) {
color = new PDColor(new float[]{0, 0, 0}, PDDeviceRGB.INSTANCE);
} else {
color = new PDColor(getColor(entity), PDDeviceRGB.INSTANCE);
}
highlight.setColor(color);
annotations.add(highlight);
redactionLogEntry.getPositions().add(textHighlightRectangle);
}
redactionLogEntry.setId(entityPositionSequence.getId().toString());
}
redactionLogEntry.setColor(getColor(entity));
redactionLogEntry.setReason(entity.getRedactionReason());
redactionLogEntry.setValue(entity.getWord());
redactionLogEntry.setType(entity.getType());
redactionLogEntry.setRedacted(entity.isRedaction());
redactionLogEntry.setSection(entity.getHeadline());
redactionLogEntry.setHint(isHint(entity));
classifiedDoc.getRedactionLogEntities().add(redactionLogEntry);
redactionLogEntry.setSectionNumber(entity.getSectionNumber());
if (flatRedaction && !isRedactionType(entity)) {
continue;
}
RedactionLogEntry redactionLogEntry = createRedactionLogEntry(entity);
for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) {
if (manualRedactions != null && manualRedactions.getIdsToRemove()
.contains(entityPositionSequence.getId())) {
entity.setRedaction(false);
entity.setRedactionReason(entity.getRedactionReason() + ", removed by manual override");
redactionLogEntry.setManual(true);
}
for (TextPositionSequence textPositions : entityPositionSequence.getSequences()) {
Rectangle rectangle = textPositions.getRectangle();
redactionLogEntry.getPositions().add(rectangle);
annotations.add(createAnnotation(rectangle, entityPositionSequence.getId(), createAnnotationContent(entity), getColor(entity), !flatRedaction && !isHint(entity)));
}
redactionLogEntry.setId(entityPositionSequence.getId());
}
classifiedDoc.getRedactionLogEntities().add(redactionLogEntry);
}
}
/**
 * Adds highlight annotations for the manual redaction rectangles that lie on the given page
 * and records them in the redaction log.
 *
 * <p>This method runs once per page. A log entry is only recorded when the manual entry has at
 * least one rectangle on this page, so no empty entries are logged for other pages. The log
 * entry carries the same id as the annotations.
 *
 * @param pdPage           page that receives the annotations
 * @param classifiedDoc    classification result that owns the redaction log
 * @param manualRedactions manual overrides of the request; {@code null} means nothing to do
 * @param page             1-based page number that rectangles are matched against
 * @throws IOException if the page annotations cannot be read
 */
private void addManualAnnotations(PDPage pdPage, Document classifiedDoc, ManualRedactions manualRedactions, int page) throws IOException {
    if (manualRedactions == null) {
        return;
    }

    List<PDAnnotation> annotations = pdPage.getAnnotations();

    for (ManualRedactionEntry manualRedactionEntry : manualRedactions.getEntriesToAdd()) {

        String id = IdBuilder.buildId(manualRedactionEntry);

        RedactionLogEntry redactionLogEntry = createRedactionLogEntry(manualRedactionEntry);
        redactionLogEntry.setId(id);

        for (Rectangle rectangle : manualRedactionEntry.getPositions()) {
            if (page != rectangle.getPage()) {
                continue;
            }

            PDAnnotationTextMarkup highlight = createAnnotation(rectangle, id, createAnnotationContent(manualRedactionEntry), getColor(manualRedactionEntry
                    .getType()), true);
            annotations.add(highlight);

            redactionLogEntry.getPositions().add(rectangle);
        }

        // Skip entries that have nothing on this page.
        if (!redactionLogEntry.getPositions().isEmpty()) {
            classifiedDoc.getRedactionLogEntities().add(redactionLogEntry);
        }
    }
}
/**
 * Creates the redaction log entry for a manual redaction: always redacted, never a hint,
 * flagged as manual, colored by the entry type. Positions and id are not set here.
 */
private RedactionLogEntry createRedactionLogEntry(ManualRedactionEntry manualRedactionEntry) {
    String entryType = manualRedactionEntry.getType();
    float[] entryColor = getColor(entryType);

    return RedactionLogEntry.builder()
            .manual(true)
            .redacted(true)
            .isHint(false)
            .type(entryType)
            .color(entryColor)
            .value(manualRedactionEntry.getValue())
            .reason(manualRedactionEntry.getReason())
            .section(manualRedactionEntry.getSection())
            .sectionNumber(manualRedactionEntry.getSectionNumber())
            .build();
}
/**
 * Creates the redaction log entry for an automatically found entity from its current state.
 * Positions and id are not set here.
 */
private RedactionLogEntry createRedactionLogEntry(Entity entity) {
    float[] entityColor = getColor(entity);
    boolean hint = isHint(entity);

    return RedactionLogEntry.builder()
            .type(entity.getType())
            .value(entity.getWord())
            .reason(entity.getRedactionReason())
            .redacted(entity.isRedaction())
            .isHint(hint)
            .color(entityColor)
            .section(entity.getHeadline())
            .sectionNumber(entity.getSectionNumber())
            .build();
}
/**
 * Creates a highlight text-markup annotation covering {@code rectangle}.
 *
 * @param rectangle area to highlight
 * @param id        used as annotation name and popup title when {@code popup} is set
 * @param content   annotation contents, only applied when {@code popup} is set
 * @param color     RGB components of the highlight color
 * @param popup     whether name, popup title and contents are written to the annotation
 * @return the configured annotation (not yet attached to a page)
 */
private PDAnnotationTextMarkup createAnnotation(Rectangle rectangle, String id, String content, float[] color, boolean popup) {
    PDColor highlightColor = new PDColor(color, PDDeviceRGB.INSTANCE);

    PDAnnotationTextMarkup markup = new PDAnnotationTextMarkup(PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT);
    markup.constructAppearances();
    markup.setRectangle(toPDRectangle(rectangle));
    markup.setQuadPoints(toQuadPoints(rectangle));

    if (popup) {
        markup.setAnnotationName(id);
        markup.setTitlePopup(id);
        markup.setContents(content);
    }

    markup.setColor(highlightColor);
    return markup;
}
/**
 * Builds the annotation text for an entity: matched rule, redaction reason and section headline.
 */
private String createAnnotationContent(Entity entity) {
    return "\nRule " + entity.getMatchedRule() + " matched" + "\n\n"
            + entity.getRedactionReason()
            + "\n\nIn Section : \"" + entity.getHeadline() + "\"";
}
/** Builds the annotation text for a manual redaction: a fixed title and the section. */
private String createAnnotationContent(ManualRedactionEntry entry) {
    return "\nManual Redaction" + "\n\nIn Section : \"" + entry.getSection() + "\"";
}
/**
 * Converts a model rectangle into a {@link PDRectangle}.
 *
 * <p>The lower-left corner is set to (x, y + height) and the upper-right corner to
 * (x + width, y), where (x, y) is the rectangle's anchor point.
 */
private PDRectangle toPDRectangle(Rectangle rectangle) {
    float anchorX = rectangle.getTopLeft().getX();
    float anchorY = rectangle.getTopLeft().getY();

    PDRectangle pdRectangle = new PDRectangle();
    pdRectangle.setLowerLeftX(anchorX);
    pdRectangle.setLowerLeftY(anchorY + rectangle.getHeight());
    pdRectangle.setUpperRightX(anchorX + rectangle.getWidth());
    pdRectangle.setUpperRightY(anchorY);
    return pdRectangle;
}
private float[] toQuadPoints(Rectangle rectangle) {
// quadPoints is array of x,y coordinates in Z-like order (top-left, top-right, bottom-left,bottom-right)
@ -202,15 +236,24 @@ public class AnnotationHighlightService {
if (!entity.isRedaction() && !isHint(entity)) {
return new float[]{0.627f, 0.627f, 0.627f};
}
if (!dictionaryService.getEntryColors().containsKey(entity.getType())) {
return dictionaryService.getDefaultColor();
}
return dictionaryService.getEntryColors().get(entity.getType());
}
/**
 * Returns the color configured for {@code type}, or the default color when the dictionary
 * service has no entry for it.
 */
private float[] getColor(String type) {
    return dictionaryService.getEntryColors().containsKey(type)
            ? dictionaryService.getEntryColors().get(type)
            : dictionaryService.getDefaultColor();
}
private boolean isHint(Entity entity) {
List<String> hintTypes = dictionaryService.getHintTypes();
if (CollectionUtils.isNotEmpty(hintTypes) && hintTypes.contains(entity.getType())) {
return true;
@ -218,24 +261,49 @@ public class AnnotationHighlightService {
return false;
}
/**
 * Draws the frames of all text blocks and tables that lie on the given page and labels each
 * with its position inside its paragraph ("index/total"). Does nothing for flat redaction.
 *
 * <p>The content stream is opened in a try-with-resources block so that it is closed even
 * when drawing fails.
 *
 * @param document      document that owns the page
 * @param classifiedDoc classification result providing the paragraphs
 * @param flatRedaction when {@code true} no frames are drawn
 * @param pdPage        page to draw on
 * @param page          1-based page number that blocks are matched against
 * @throws IOException if writing to the page content stream fails
 */
private void drawSectionFrames(PDDocument document, Document classifiedDoc, boolean flatRedaction, PDPage pdPage, int page) throws IOException {
    if (flatRedaction) {
        return;
    }

    try (PDPageContentStream contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true)) {
        for (Paragraph paragraph : classifiedDoc.getParagraphs()) {
            for (int i = 0; i <= paragraph.getPageBlocks().size() - 1; i++) {

                AbstractTextContainer textBlock = paragraph.getPageBlocks().get(i);
                if (textBlock.getPage() != page) {
                    continue;
                }

                // Only text blocks and tables are labelled and drawn; other blocks stay untouched.
                if (textBlock instanceof TextBlock) {
                    textBlock.setClassification((i + 1) + "/" + paragraph.getPageBlocks().size());
                    visualizeTextBlock((TextBlock) textBlock, contentStream);
                } else if (textBlock instanceof Table) {
                    textBlock.setClassification((i + 1) + "/" + paragraph.getPageBlocks().size());
                    visualizeTable((Table) textBlock, contentStream);
                }
            }
        }
    }
}
/**
 * Strokes a light gray frame around the text block and, when the block has a classification,
 * writes it in dark gray 8pt Times Roman at (minX, maxY).
 */
private void visualizeTextBlock(TextBlock textBlock, PDPageContentStream contentStream) throws IOException {
    contentStream.setStrokingColor(Color.LIGHT_GRAY);
    contentStream.setLineWidth(0.5f);
    contentStream.addRect(textBlock.getMinX(), textBlock.getMinY(), textBlock.getWidth(), textBlock.getHeight());
    contentStream.stroke();

    String label = textBlock.getClassification();
    if (label == null) {
        return;
    }

    contentStream.beginText();
    contentStream.setNonStrokingColor(Color.DARK_GRAY);
    contentStream.setFont(PDType1Font.TIMES_ROMAN, 8f);
    contentStream.newLineAtOffset(textBlock.getMinX(), textBlock.getMaxY());
    contentStream.showText(label);
    contentStream.endText();
}
@ -252,26 +320,16 @@ public class AnnotationHighlightService {
contentStream.addRect((float) cell.getX(), (float) cell.getY(), (float) cell.getWidth(), (float) cell
.getHeight());
contentStream.stroke();
// contentStream.setStrokingColor(Color.GREEN);
// for (TextBlock textBlock : cell.getTextBlocks()) {
// contentStream.addRect(textBlock.getMinX(), textBlock.getMinY(), textBlock.getWidth(), textBlock.getHeight());
// contentStream.stroke();
// }
}
}
}
if (table.getClassification() != null) {
contentStream.beginText();
contentStream.setNonStrokingColor(Color.DARK_GRAY);
contentStream.setFont(PDType1Font.TIMES_ROMAN, 8f);
contentStream.newLineAtOffset(table.getMinX(), table.getMinY());
contentStream.showText(table.getClassification());
contentStream.endText();
}
}

View File

@ -17,6 +17,7 @@ import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
@ -41,6 +42,10 @@ import com.iqser.red.service.configuration.v1.api.model.DictionaryResponse;
import com.iqser.red.service.configuration.v1.api.model.RulesResponse;
import com.iqser.red.service.configuration.v1.api.model.TypeResponse;
import com.iqser.red.service.configuration.v1.api.model.TypeResult;
import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry;
import com.iqser.red.service.redaction.v1.model.ManualRedactions;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.iqser.red.service.redaction.v1.model.RedactionRequest;
import com.iqser.red.service.redaction.v1.model.RedactionResult;
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
@ -303,6 +308,40 @@ public class RedactionIntegrationTest {
}
/**
 * Runs a redaction of the single-table example with one id to remove and one manual entry to
 * add, writes the result to /tmp/Redacted.pdf and prints duration and page count.
 */
@Test
public void testManualRedaction() throws IOException {

    final long startedAt = System.currentTimeMillis();

    List<Rectangle> manualPositions = List.of(
            new Rectangle(new Point(375.61096f, 241.282f), 7.648041f, 43.72262f, 1),
            new Rectangle(new Point(384.83517f, 241.282f), 7.648041f, 17.043358f, 1));

    ManualRedactionEntry addedEntry = new ManualRedactionEntry();
    addedEntry.setType("name");
    addedEntry.setValue("O'Loughlin C.K.");
    addedEntry.setReason("Manual Redaction");
    addedEntry.setPositions(manualPositions);

    ManualRedactions overrides = new ManualRedactions();
    overrides.setIdsToRemove(Set.of("0836727c3508a0b2ea271da69c04cc2f"));
    overrides.getEntriesToAdd().add(addedEntry);

    ClassPathResource examplePdf = new ClassPathResource("files/Minimal Examples/Single Table.pdf");
    RedactionRequest request = RedactionRequest.builder()
            .document(IOUtils.toByteArray(examplePdf.getInputStream()))
            .manualRedactions(overrides)
            .build();

    RedactionResult result = redactionController.redact(request);

    try (FileOutputStream redactedOut = new FileOutputStream("/tmp/Redacted.pdf")) {
        redactedOut.write(result.getDocument());
    }

    final long finishedAt = System.currentTimeMillis();

    System.out.println("duration: " + (finishedAt - startedAt));
    System.out.println("numberOfPages: " + result.getNumberOfPages());
}
@Test
public void classificationTest() throws IOException {

View File

@ -147,7 +147,7 @@ public class EntityRedactionServiceTest {
when(dictionaryClient.getDefaultColor()).thenReturn(new DefaultColor());
try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(redactionRequest.getDocument()))) {
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
entityRedactionService.processDocument(classifiedDoc);
entityRedactionService.processDocument(classifiedDoc, null);
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1)).hasSize(5); // 4 out of 5 entities recognized on page 1
}