RED-88: Enabled manual redaction
This commit is contained in:
parent
954765759c
commit
2b93ae57d5
@ -0,0 +1,23 @@
|
||||
package com.iqser.red.service.redaction.v1.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
public class ManualRedactionEntry {
|
||||
|
||||
private String type;
|
||||
private String value;
|
||||
private String reason;
|
||||
private List<Rectangle> positions = new ArrayList<>();
|
||||
|
||||
private String section;
|
||||
private int sectionNumber;
|
||||
|
||||
}
|
||||
@ -0,0 +1,17 @@
|
||||
package com.iqser.red.service.redaction.v1.model;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
public class ManualRedactions {
|
||||
|
||||
Set<String> idsToRemove = new HashSet<>();
|
||||
Set<ManualRedactionEntry> entriesToAdd = new HashSet<>();
|
||||
}
|
||||
@ -3,9 +3,11 @@ package com.iqser.red.service.redaction.v1.model;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
public class RedactionLogEntry {
|
||||
|
||||
private String id;
|
||||
@ -16,7 +18,10 @@ public class RedactionLogEntry {
|
||||
private boolean isHint;
|
||||
private String section;
|
||||
private float[] color;
|
||||
|
||||
@Builder.Default
|
||||
private List<Rectangle> positions = new ArrayList<>();
|
||||
private int sectionNumber;
|
||||
private boolean manual;
|
||||
|
||||
}
|
||||
|
||||
@ -13,4 +13,5 @@ public class RedactionRequest {
|
||||
|
||||
private byte[] document;
|
||||
private boolean flatRedaction;
|
||||
private ManualRedactions manualRedactions;
|
||||
}
|
||||
|
||||
@ -56,6 +56,11 @@
|
||||
<artifactId>jts-core</artifactId>
|
||||
<version>1.16.1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.google.guava</groupId>
|
||||
<artifactId>guava</artifactId>
|
||||
<version>29.0-jre</version>
|
||||
</dependency>
|
||||
<!-- commons -->
|
||||
<dependency>
|
||||
<groupId>com.iqser.gin4.commons</groupId>
|
||||
|
||||
@ -41,4 +41,16 @@ public class Paragraph {
|
||||
return tables;
|
||||
}
|
||||
|
||||
|
||||
public List<TextBlock> getTextBlocks() {
|
||||
|
||||
List<TextBlock> textBlocks = new ArrayList<>();
|
||||
pageBlocks.forEach(block -> {
|
||||
if (block instanceof TextBlock) {
|
||||
textBlocks.add((TextBlock) block);
|
||||
}
|
||||
});
|
||||
return textBlocks;
|
||||
}
|
||||
|
||||
}
|
||||
@ -44,8 +44,8 @@ public class RedactionController implements RedactionResource {
|
||||
pdDocument.setAllSecurityToBeRemoved(true);
|
||||
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
entityRedactionService.processDocument(classifiedDoc);
|
||||
annotationHighlightService.highlight(pdDocument, classifiedDoc, redactionRequest.isFlatRedaction());
|
||||
entityRedactionService.processDocument(classifiedDoc, redactionRequest.getManualRedactions());
|
||||
annotationHighlightService.highlight(pdDocument, classifiedDoc, redactionRequest.isFlatRedaction(), redactionRequest.getManualRedactions());
|
||||
|
||||
if (redactionRequest.isFlatRedaction()) {
|
||||
PDDocument flatDocument = pdfFlattenService.flattenPDF(pdDocument);
|
||||
|
||||
@ -5,6 +5,9 @@ import java.util.List;
|
||||
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.Point;
|
||||
import com.iqser.red.service.redaction.v1.model.Rectangle;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.Getter;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
@ -22,36 +25,48 @@ public class TextPositionSequence implements CharSequence {
|
||||
|
||||
private final int page;
|
||||
|
||||
public TextPositionSequence(List<TextPosition> textPositions, int page){
|
||||
|
||||
public TextPositionSequence(List<TextPosition> textPositions, int page) {
|
||||
|
||||
this.textPositions = textPositions;
|
||||
this.page = page;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int length() {
|
||||
|
||||
return textPositions.size();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public char charAt(int index) {
|
||||
|
||||
TextPosition textPosition = textPositionAt(index);
|
||||
String text = textPosition.getUnicode();
|
||||
return text.charAt(0);
|
||||
}
|
||||
|
||||
|
||||
public char charAt(int index, boolean caseInSensitive) {
|
||||
|
||||
TextPosition textPosition = textPositionAt(index);
|
||||
String text = textPosition.getUnicode();
|
||||
return caseInSensitive ? text.toLowerCase().charAt(0) : text.charAt(0);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TextPositionSequence subSequence(int start, int end) {
|
||||
|
||||
return new TextPositionSequence(textPositions.subList(start, end), page);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
StringBuilder builder = new StringBuilder(length());
|
||||
for (int i = 0; i < length(); i++) {
|
||||
builder.append(charAt(i));
|
||||
@ -59,15 +74,21 @@ public class TextPositionSequence implements CharSequence {
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
|
||||
public TextPosition textPositionAt(int index) {
|
||||
|
||||
return textPositions.get(index);
|
||||
}
|
||||
|
||||
|
||||
public void add(TextPosition textPosition) {
|
||||
|
||||
this.textPositions.add(textPosition);
|
||||
}
|
||||
|
||||
|
||||
public float getX1() {
|
||||
|
||||
if (textPositions.get(0).getRotation() == 90) {
|
||||
return textPositions.get(0).getYDirAdj() - getTextHeight();
|
||||
} else {
|
||||
@ -75,15 +96,20 @@ public class TextPositionSequence implements CharSequence {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public float getX2() {
|
||||
|
||||
if (textPositions.get(0).getRotation() == 90) {
|
||||
return textPositions.get(0).getYDirAdj();
|
||||
} else {
|
||||
return textPositions.get(textPositions.size() - 1).getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidth() + 1;
|
||||
return textPositions.get(textPositions.size() - 1)
|
||||
.getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidth() + 1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public float getY1() {
|
||||
|
||||
if (textPositions.get(0).getRotation() == 90) {
|
||||
return textPositions.get(0).getXDirAdj();
|
||||
} else {
|
||||
@ -91,30 +117,46 @@ public class TextPositionSequence implements CharSequence {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public float getY2() {
|
||||
|
||||
if (textPositions.get(0).getRotation() == 90) {
|
||||
return textPositions.get(textPositions.size() - 1).getXDirAdj() + getTextHeight() -2 ;
|
||||
return textPositions.get(textPositions.size() - 1).getXDirAdj() + getTextHeight() - 2;
|
||||
} else {
|
||||
return textPositions.get(0).getPageHeight() - textPositions.get(0).getYDirAdj() + getTextHeight();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public float getTextHeight() {
|
||||
|
||||
return textPositions.get(0).getHeightDir() + 2;
|
||||
}
|
||||
|
||||
|
||||
public float getHeight() {
|
||||
|
||||
return getY2() - getY1();
|
||||
}
|
||||
|
||||
|
||||
public float getWidth() {
|
||||
|
||||
return getX2() - getX1();
|
||||
}
|
||||
|
||||
|
||||
public String getFont() {
|
||||
return textPositions.get(0).getFont().toString().toLowerCase().replaceAll(",bold", "").replaceAll(",italic", "");
|
||||
|
||||
return textPositions.get(0)
|
||||
.getFont()
|
||||
.toString()
|
||||
.toLowerCase()
|
||||
.replaceAll(",bold", "")
|
||||
.replaceAll(",italic", "");
|
||||
}
|
||||
|
||||
|
||||
public String getFontStyle() {
|
||||
|
||||
String lowercaseFontName = textPositions.get(0).getFont().toString().toLowerCase();
|
||||
@ -131,16 +173,51 @@ public class TextPositionSequence implements CharSequence {
|
||||
|
||||
}
|
||||
|
||||
|
||||
public float getFontSize() {
|
||||
|
||||
return textPositions.get(0).getFontSizeInPt();
|
||||
}
|
||||
|
||||
|
||||
public float getSpaceWidth() {
|
||||
|
||||
return textPositions.get(0).getWidthOfSpace();
|
||||
}
|
||||
|
||||
|
||||
public int getRotation() {
|
||||
|
||||
return textPositions.get(0).getRotation();
|
||||
}
|
||||
|
||||
|
||||
public Rectangle getRectangle() {
|
||||
|
||||
float height = textPositions.get(0).getHeightDir() + 2;
|
||||
|
||||
float posXInit;
|
||||
float posXEnd;
|
||||
float posYInit;
|
||||
float posYEnd;
|
||||
|
||||
if (textPositions.get(0).getRotation() == 90) {
|
||||
|
||||
posXEnd = textPositions.get(0).getYDirAdj() + 2;
|
||||
posXInit = textPositions.get(0).getYDirAdj() - height;
|
||||
posYInit = textPositions.get(0).getXDirAdj();
|
||||
posYEnd = textPositions.get(textPositions.size() - 1).getXDirAdj() - height + 4;
|
||||
} else {
|
||||
|
||||
posXInit = textPositions.get(0).getXDirAdj();
|
||||
posXEnd = textPositions.get(textPositions.size() - 1)
|
||||
.getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidth() + 1;
|
||||
posYInit = textPositions.get(0).getPageHeight() - textPositions.get(0).getYDirAdj() - 2;
|
||||
posYEnd = textPositions.get(0).getPageHeight() - textPositions.get(textPositions.size() - 1)
|
||||
.getYDirAdj() + 2;
|
||||
}
|
||||
|
||||
return new Rectangle(new Point(posXInit, posYInit), posXEnd - posXInit, posYEnd - posYInit + height, page);
|
||||
}
|
||||
|
||||
}
|
||||
@ -2,7 +2,6 @@ package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
|
||||
@ -20,6 +19,6 @@ public class EntityPositionSequence {
|
||||
@EqualsAndHashCode.Exclude
|
||||
private List<TextPositionSequence> sequences = new ArrayList<>();
|
||||
private int pageNumber;
|
||||
private final UUID id;
|
||||
private final String id;
|
||||
|
||||
}
|
||||
|
||||
@ -2,10 +2,10 @@ package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
||||
|
||||
public class SearchableText {
|
||||
@ -114,7 +114,7 @@ public class SearchableText {
|
||||
|
||||
private List<EntityPositionSequence> buildEntityPositionSequence(List<TextPositionSequence> crossSequenceParts) {
|
||||
|
||||
UUID id = UUID.randomUUID();
|
||||
String id = IdBuilder.buildId(crossSequenceParts);
|
||||
List<EntityPositionSequence> result = new ArrayList<>();
|
||||
int currentPage = -1;
|
||||
EntityPositionSequence entityPositionSequence = new EntityPositionSequence(id);
|
||||
|
||||
@ -12,6 +12,9 @@ import org.apache.commons.collections4.CollectionUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry;
|
||||
import com.iqser.red.service.redaction.v1.model.ManualRedactions;
|
||||
import com.iqser.red.service.redaction.v1.model.Rectangle;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
@ -34,7 +37,7 @@ public class EntityRedactionService {
|
||||
private final DroolsExecutionService droolsExecutionService;
|
||||
|
||||
|
||||
public void processDocument(Document classifiedDoc) {
|
||||
public void processDocument(Document classifiedDoc, ManualRedactions manualRedactions) {
|
||||
|
||||
dictionaryService.updateDictionary();
|
||||
droolsExecutionService.updateRules();
|
||||
@ -58,6 +61,7 @@ public class EntityRedactionService {
|
||||
continue;
|
||||
}
|
||||
cellValues.add(cell.getTextBlocks().get(0).getText());
|
||||
addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber);
|
||||
for (TextBlock textBlock : cell.getTextBlocks()) {
|
||||
searchableRow.addAll(textBlock.getSequences());
|
||||
}
|
||||
@ -80,6 +84,7 @@ public class EntityRedactionService {
|
||||
sectionNumber++;
|
||||
}
|
||||
|
||||
addSectionToManualRedactions(paragraph.getTextBlocks(), manualRedactions, paragraph.getHeadline(), sectionNumber);
|
||||
Set<Entity> entities = findEntities(searchableText, paragraph.getHeadline(), sectionNumber);
|
||||
Section analysedSection = droolsExecutionService.executeRules(Section.builder()
|
||||
.entities(entities)
|
||||
@ -103,8 +108,7 @@ public class EntityRedactionService {
|
||||
for (Map.Entry<Integer, List<EntityPositionSequence>> entry : sequenceOnPage.entrySet()) {
|
||||
classifiedDoc.getEntities()
|
||||
.computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>())
|
||||
.add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(),
|
||||
entity.getRedactionReason(), entry
|
||||
.add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry
|
||||
.getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber()));
|
||||
}
|
||||
}
|
||||
@ -184,8 +188,7 @@ public class EntityRedactionService {
|
||||
|
||||
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString
|
||||
.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
|
||||
found.add(new Entity(inputString.substring(startIndex, stopIndex), type, startIndex, stopIndex,
|
||||
headline, sectionNumber));
|
||||
found.add(new Entity(inputString.substring(startIndex, stopIndex), type, startIndex, stopIndex, headline, sectionNumber));
|
||||
}
|
||||
} while (startIndex > -1);
|
||||
}
|
||||
@ -213,4 +216,23 @@ public class EntityRedactionService {
|
||||
entities.removeAll(wordsToRemove);
|
||||
}
|
||||
|
||||
|
||||
private void addSectionToManualRedactions(List<TextBlock> textBlocks, ManualRedactions manualRedactions, String section, int sectionNumber) {
|
||||
|
||||
if (manualRedactions == null) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (TextBlock textBlock : textBlocks) {
|
||||
for (ManualRedactionEntry manualRedactionEntry : manualRedactions.getEntriesToAdd()) {
|
||||
for (Rectangle rectangle : manualRedactionEntry.getPositions()) {
|
||||
if (textBlock.contains(rectangle)) {
|
||||
manualRedactionEntry.setSection(section);
|
||||
manualRedactionEntry.setSectionNumber(sectionNumber);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,31 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.utils;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.google.common.hash.HashFunction;
|
||||
import com.google.common.hash.Hashing;
|
||||
import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class IdBuilder {
|
||||
|
||||
private final HashFunction hashFunction = Hashing.murmur3_128();
|
||||
|
||||
public String buildId(List<TextPositionSequence> crossSequenceParts) {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
crossSequenceParts.forEach(sequencePart -> sequencePart.getTextPositions().forEach(textPosition -> {
|
||||
sb.append(textPosition.getTextMatrix());
|
||||
}));
|
||||
|
||||
return hashFunction.hashBytes(sb.toString().getBytes()).toString();
|
||||
}
|
||||
|
||||
|
||||
public String buildId(ManualRedactionEntry manualRedactionEntry) {
|
||||
return hashFunction.hashBytes(manualRedactionEntry.toString().getBytes()).toString();
|
||||
}
|
||||
}
|
||||
@ -1,5 +1,7 @@
|
||||
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.Rectangle;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
@ -22,4 +24,8 @@ public abstract class AbstractTextContainer {
|
||||
return this.minX <= other.minX && this.maxX >= other.maxX && this.minY >= other.minY && this.maxY <= other.maxY;
|
||||
}
|
||||
|
||||
public boolean contains(Rectangle other) {
|
||||
return page == other.getPage() && this.minX <= other.getTopLeft().getX() && this.maxX >= other.getTopLeft().getX() + other.getWidth() && this.minY <= other.getTopLeft().getY() && this.maxY >= other.getTopLeft().getY() + other.getHeight();
|
||||
}
|
||||
|
||||
}
|
||||
@ -16,7 +16,8 @@ import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
|
||||
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationTextMarkup;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.Point;
|
||||
import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry;
|
||||
import com.iqser.red.service.redaction.v1.model.ManualRedactions;
|
||||
import com.iqser.red.service.redaction.v1.model.Rectangle;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
@ -26,6 +27,7 @@ import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSeque
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.service.DictionaryService;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
||||
@ -41,139 +43,171 @@ public class AnnotationHighlightService {
|
||||
private final DictionaryService dictionaryService;
|
||||
|
||||
|
||||
public void highlight(PDDocument document, Document classifiedDoc, boolean flatRedaction) throws IOException {
|
||||
public void highlight(PDDocument document, Document classifiedDoc, boolean flatRedaction, ManualRedactions manualRedactions) throws IOException {
|
||||
|
||||
for (int page = 1; page <= document.getNumberOfPages(); page++) {
|
||||
|
||||
PDPage pdPage = document.getPage(page - 1);
|
||||
|
||||
if (!flatRedaction) {
|
||||
PDPageContentStream contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true);
|
||||
|
||||
for (Paragraph paragraph : classifiedDoc.getParagraphs()) {
|
||||
|
||||
for (int i = 0; i <= paragraph.getPageBlocks().size() - 1; i++) {
|
||||
|
||||
AbstractTextContainer textBlock = paragraph.getPageBlocks().get(i);
|
||||
|
||||
if (textBlock.getPage() != page) {
|
||||
continue;
|
||||
}
|
||||
if (textBlock instanceof TextBlock) {
|
||||
textBlock.setClassification((i + 1) + "/" + paragraph.getPageBlocks().size());
|
||||
visualizeTextBlock((TextBlock) textBlock, contentStream);
|
||||
} else if (textBlock instanceof Table) {
|
||||
textBlock.setClassification((i + 1) + "/" + paragraph.getPageBlocks().size());
|
||||
visualizeTable((Table) textBlock, contentStream);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
contentStream.close();
|
||||
}
|
||||
drawSectionFrames(document, classifiedDoc, flatRedaction, pdPage, page);
|
||||
|
||||
if (classifiedDoc.getEntities().get(page) == null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (Entity entity : classifiedDoc.getEntities().get(page)) {
|
||||
addAnnotations(pdPage, classifiedDoc, flatRedaction, manualRedactions, page);
|
||||
addManualAnnotations(pdPage, classifiedDoc, manualRedactions, page);
|
||||
}
|
||||
}
|
||||
|
||||
RedactionLogEntry redactionLogEntry = new RedactionLogEntry();
|
||||
|
||||
for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) {
|
||||
private void addAnnotations(PDPage pdPage, Document classifiedDoc, boolean flatRedaction, ManualRedactions manualRedactions, int page) throws IOException {
|
||||
|
||||
if (flatRedaction && !isRedactionType(entity)) {
|
||||
continue;
|
||||
}
|
||||
List<PDAnnotation> annotations = pdPage.getAnnotations();
|
||||
|
||||
for (TextPositionSequence textPositions : entityPositionSequence.getSequences()) {
|
||||
for (Entity entity : classifiedDoc.getEntities().get(page)) {
|
||||
|
||||
float height = textPositions.getTextPositions().get(0).getHeightDir() + 2;
|
||||
|
||||
float posXInit;
|
||||
float posXEnd;
|
||||
float posYInit;
|
||||
float posYEnd;
|
||||
|
||||
if (textPositions.getTextPositions().get(0).getRotation() == 90) {
|
||||
|
||||
posXEnd = textPositions.getTextPositions().get(0).getYDirAdj() + 2;
|
||||
posXInit = textPositions.getTextPositions().get(0).getYDirAdj() - height;
|
||||
posYInit = textPositions.getTextPositions().get(0).getXDirAdj();
|
||||
posYEnd = textPositions.getTextPositions()
|
||||
.get(textPositions.getTextPositions().size() - 1)
|
||||
.getXDirAdj() - height + 4;
|
||||
} else {
|
||||
|
||||
posXInit = textPositions.getTextPositions().get(0).getXDirAdj();
|
||||
posXEnd = textPositions.getTextPositions()
|
||||
.get(textPositions.getTextPositions().size() - 1)
|
||||
.getXDirAdj() + textPositions.getTextPositions()
|
||||
.get(textPositions.getTextPositions().size() - 1)
|
||||
.getWidth() + 1;
|
||||
posYInit = textPositions.getTextPositions()
|
||||
.get(0)
|
||||
.getPageHeight() - textPositions.getTextPositions().get(0).getYDirAdj() - 2;
|
||||
posYEnd = textPositions.getTextPositions()
|
||||
.get(0)
|
||||
.getPageHeight() - textPositions.getTextPositions()
|
||||
.get(textPositions.getTextPositions().size() - 1)
|
||||
.getYDirAdj() + 2;
|
||||
}
|
||||
|
||||
Rectangle textHighlightRectangle = new Rectangle(new Point(posXInit, posYInit), posXEnd - posXInit, posYEnd - posYInit + height, page);
|
||||
|
||||
List<PDAnnotation> annotations = pdPage.getAnnotations();
|
||||
PDAnnotationTextMarkup highlight = new PDAnnotationTextMarkup(PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT);
|
||||
highlight.constructAppearances();
|
||||
|
||||
PDRectangle annotationPosition = new PDRectangle();
|
||||
annotationPosition.setLowerLeftX(posXInit);
|
||||
annotationPosition.setLowerLeftY(posYEnd);
|
||||
annotationPosition.setUpperRightX(posXEnd);
|
||||
annotationPosition.setUpperRightY(posYEnd + height);
|
||||
|
||||
highlight.setRectangle(annotationPosition);
|
||||
if (!flatRedaction && !isHint(entity)) {
|
||||
highlight.setAnnotationName(entityPositionSequence.getId().toString());
|
||||
highlight.setTitlePopup(entityPositionSequence.getId().toString());
|
||||
highlight.setContents("\nRule " + entity.getMatchedRule() + " matched\n\n" + entity.getRedactionReason() + "\n\n" + "In Section : \"" + entity
|
||||
.getHeadline() + "\"");
|
||||
}
|
||||
|
||||
highlight.setQuadPoints(toQuadPoints(textHighlightRectangle));
|
||||
|
||||
PDColor color;
|
||||
if (flatRedaction) {
|
||||
color = new PDColor(new float[]{0, 0, 0}, PDDeviceRGB.INSTANCE);
|
||||
} else {
|
||||
color = new PDColor(getColor(entity), PDDeviceRGB.INSTANCE);
|
||||
}
|
||||
|
||||
highlight.setColor(color);
|
||||
annotations.add(highlight);
|
||||
|
||||
redactionLogEntry.getPositions().add(textHighlightRectangle);
|
||||
|
||||
}
|
||||
redactionLogEntry.setId(entityPositionSequence.getId().toString());
|
||||
}
|
||||
redactionLogEntry.setColor(getColor(entity));
|
||||
redactionLogEntry.setReason(entity.getRedactionReason());
|
||||
redactionLogEntry.setValue(entity.getWord());
|
||||
redactionLogEntry.setType(entity.getType());
|
||||
redactionLogEntry.setRedacted(entity.isRedaction());
|
||||
redactionLogEntry.setSection(entity.getHeadline());
|
||||
redactionLogEntry.setHint(isHint(entity));
|
||||
classifiedDoc.getRedactionLogEntities().add(redactionLogEntry);
|
||||
redactionLogEntry.setSectionNumber(entity.getSectionNumber());
|
||||
if (flatRedaction && !isRedactionType(entity)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
RedactionLogEntry redactionLogEntry = createRedactionLogEntry(entity);
|
||||
|
||||
for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) {
|
||||
|
||||
if (manualRedactions != null && manualRedactions.getIdsToRemove()
|
||||
.contains(entityPositionSequence.getId())) {
|
||||
entity.setRedaction(false);
|
||||
entity.setRedactionReason(entity.getRedactionReason() + ", removed by manual override");
|
||||
redactionLogEntry.setManual(true);
|
||||
}
|
||||
|
||||
for (TextPositionSequence textPositions : entityPositionSequence.getSequences()) {
|
||||
|
||||
Rectangle rectangle = textPositions.getRectangle();
|
||||
redactionLogEntry.getPositions().add(rectangle);
|
||||
annotations.add(createAnnotation(rectangle, entityPositionSequence.getId(), createAnnotationContent(entity), getColor(entity), !flatRedaction && !isHint(entity)));
|
||||
}
|
||||
redactionLogEntry.setId(entityPositionSequence.getId());
|
||||
}
|
||||
classifiedDoc.getRedactionLogEntities().add(redactionLogEntry);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void addManualAnnotations(PDPage pdPage, Document classifiedDoc, ManualRedactions manualRedactions, int page) throws IOException {
|
||||
|
||||
if (manualRedactions == null) {
|
||||
return;
|
||||
}
|
||||
|
||||
List<PDAnnotation> annotations = pdPage.getAnnotations();
|
||||
|
||||
for (ManualRedactionEntry manualRedactionEntry : manualRedactions.getEntriesToAdd()) {
|
||||
|
||||
String id = IdBuilder.buildId(manualRedactionEntry);
|
||||
|
||||
RedactionLogEntry redactionLogEntry = createRedactionLogEntry(manualRedactionEntry);
|
||||
|
||||
for (Rectangle rectangle : manualRedactionEntry.getPositions()) {
|
||||
|
||||
if (page != rectangle.getPage()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
PDAnnotationTextMarkup highlight = createAnnotation(rectangle, id, createAnnotationContent(manualRedactionEntry), getColor(manualRedactionEntry
|
||||
.getType()), true);
|
||||
annotations.add(highlight);
|
||||
|
||||
redactionLogEntry.getPositions().add(rectangle);
|
||||
}
|
||||
classifiedDoc.getRedactionLogEntities().add(redactionLogEntry);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private RedactionLogEntry createRedactionLogEntry(ManualRedactionEntry manualRedactionEntry) {
|
||||
|
||||
return RedactionLogEntry.builder()
|
||||
.color(getColor(manualRedactionEntry.getType()))
|
||||
.reason(manualRedactionEntry.getReason())
|
||||
.value(manualRedactionEntry.getValue())
|
||||
.type(manualRedactionEntry.getType())
|
||||
.redacted(true)
|
||||
.isHint(false)
|
||||
.section(manualRedactionEntry.getSection())
|
||||
.sectionNumber(manualRedactionEntry.getSectionNumber())
|
||||
.manual(true)
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
private RedactionLogEntry createRedactionLogEntry(Entity entity) {
|
||||
|
||||
return RedactionLogEntry.builder()
|
||||
.color(getColor(entity))
|
||||
.reason(entity.getRedactionReason())
|
||||
.value(entity.getWord())
|
||||
.type(entity.getType())
|
||||
.redacted(entity.isRedaction())
|
||||
.isHint(isHint(entity))
|
||||
.section(entity.getHeadline())
|
||||
.sectionNumber(entity.getSectionNumber())
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
private PDAnnotationTextMarkup createAnnotation(Rectangle rectangle, String id, String content, float[] color, boolean popup) {
|
||||
|
||||
PDAnnotationTextMarkup annotation = new PDAnnotationTextMarkup(PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT);
|
||||
annotation.constructAppearances();
|
||||
annotation.setRectangle(toPDRectangle(rectangle));
|
||||
annotation.setQuadPoints(toQuadPoints(rectangle));
|
||||
if (popup) {
|
||||
annotation.setAnnotationName(id);
|
||||
annotation.setTitlePopup(id);
|
||||
annotation.setContents(content);
|
||||
}
|
||||
annotation.setColor(new PDColor(color, PDDeviceRGB.INSTANCE));
|
||||
return annotation;
|
||||
}
|
||||
|
||||
|
||||
private String createAnnotationContent(Entity entity) {
|
||||
|
||||
return new StringBuilder().append("\nRule ")
|
||||
.append(entity.getMatchedRule())
|
||||
.append(" matched")
|
||||
.append("\n\n")
|
||||
.append(entity.getRedactionReason())
|
||||
.append("\n\nIn Section : \"")
|
||||
.append(entity.getHeadline())
|
||||
.append("\"")
|
||||
.toString();
|
||||
}
|
||||
|
||||
|
||||
private String createAnnotationContent(ManualRedactionEntry entry) {
|
||||
|
||||
return new StringBuilder().append("\nManual Redaction")
|
||||
.append("\n\nIn Section : \"")
|
||||
.append(entry.getSection())
|
||||
.append("\"")
|
||||
.toString();
|
||||
}
|
||||
|
||||
|
||||
private PDRectangle toPDRectangle(Rectangle rectangle) {
|
||||
|
||||
PDRectangle annotationPosition = new PDRectangle();
|
||||
annotationPosition.setLowerLeftX(rectangle.getTopLeft().getX());
|
||||
annotationPosition.setLowerLeftY(rectangle.getTopLeft().getY() + rectangle.getHeight());
|
||||
annotationPosition.setUpperRightX(rectangle.getTopLeft().getX() + rectangle.getWidth());
|
||||
annotationPosition.setUpperRightY(rectangle.getTopLeft().getY());
|
||||
return annotationPosition;
|
||||
}
|
||||
|
||||
|
||||
private float[] toQuadPoints(Rectangle rectangle) {
|
||||
|
||||
// quadPoints is array of x,y coordinates in Z-like order (top-left, top-right, bottom-left,bottom-right)
|
||||
@ -202,15 +236,24 @@ public class AnnotationHighlightService {
|
||||
if (!entity.isRedaction() && !isHint(entity)) {
|
||||
return new float[]{0.627f, 0.627f, 0.627f};
|
||||
}
|
||||
|
||||
if (!dictionaryService.getEntryColors().containsKey(entity.getType())) {
|
||||
return dictionaryService.getDefaultColor();
|
||||
}
|
||||
|
||||
return dictionaryService.getEntryColors().get(entity.getType());
|
||||
}
|
||||
|
||||
|
||||
private float[] getColor(String type) {
|
||||
|
||||
if (!dictionaryService.getEntryColors().containsKey(type)) {
|
||||
return dictionaryService.getDefaultColor();
|
||||
}
|
||||
return dictionaryService.getEntryColors().get(type);
|
||||
}
|
||||
|
||||
|
||||
private boolean isHint(Entity entity) {
|
||||
|
||||
List<String> hintTypes = dictionaryService.getHintTypes();
|
||||
if (CollectionUtils.isNotEmpty(hintTypes) && hintTypes.contains(entity.getType())) {
|
||||
return true;
|
||||
@ -218,24 +261,49 @@ public class AnnotationHighlightService {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private void drawSectionFrames(PDDocument document, Document classifiedDoc, boolean flatRedaction, PDPage pdPage, int page) throws IOException {
|
||||
|
||||
if (flatRedaction) {
|
||||
return;
|
||||
}
|
||||
|
||||
PDPageContentStream contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true);
|
||||
for (Paragraph paragraph : classifiedDoc.getParagraphs()) {
|
||||
|
||||
for (int i = 0; i <= paragraph.getPageBlocks().size() - 1; i++) {
|
||||
|
||||
AbstractTextContainer textBlock = paragraph.getPageBlocks().get(i);
|
||||
|
||||
if (textBlock.getPage() != page) {
|
||||
continue;
|
||||
}
|
||||
if (textBlock instanceof TextBlock) {
|
||||
textBlock.setClassification((i + 1) + "/" + paragraph.getPageBlocks().size());
|
||||
visualizeTextBlock((TextBlock) textBlock, contentStream);
|
||||
} else if (textBlock instanceof Table) {
|
||||
textBlock.setClassification((i + 1) + "/" + paragraph.getPageBlocks().size());
|
||||
visualizeTable((Table) textBlock, contentStream);
|
||||
}
|
||||
}
|
||||
}
|
||||
contentStream.close();
|
||||
}
|
||||
|
||||
|
||||
private void visualizeTextBlock(TextBlock textBlock, PDPageContentStream contentStream) throws IOException {
|
||||
|
||||
contentStream.setStrokingColor(Color.LIGHT_GRAY);
|
||||
contentStream.setLineWidth(0.5f);
|
||||
|
||||
contentStream.addRect(textBlock.getMinX(), textBlock.getMinY(), textBlock.getWidth(), textBlock.getHeight());
|
||||
contentStream.stroke();
|
||||
|
||||
if (textBlock.getClassification() != null) {
|
||||
contentStream.beginText();
|
||||
|
||||
contentStream.setNonStrokingColor(Color.DARK_GRAY);
|
||||
contentStream.setFont(PDType1Font.TIMES_ROMAN, 8f);
|
||||
|
||||
contentStream.newLineAtOffset(textBlock.getMinX(), textBlock.getMaxY());
|
||||
|
||||
contentStream.showText(textBlock.getClassification());
|
||||
|
||||
contentStream.endText();
|
||||
}
|
||||
}
|
||||
@ -252,26 +320,16 @@ public class AnnotationHighlightService {
|
||||
contentStream.addRect((float) cell.getX(), (float) cell.getY(), (float) cell.getWidth(), (float) cell
|
||||
.getHeight());
|
||||
contentStream.stroke();
|
||||
|
||||
// contentStream.setStrokingColor(Color.GREEN);
|
||||
// for (TextBlock textBlock : cell.getTextBlocks()) {
|
||||
// contentStream.addRect(textBlock.getMinX(), textBlock.getMinY(), textBlock.getWidth(), textBlock.getHeight());
|
||||
// contentStream.stroke();
|
||||
// }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (table.getClassification() != null) {
|
||||
contentStream.beginText();
|
||||
|
||||
contentStream.setNonStrokingColor(Color.DARK_GRAY);
|
||||
contentStream.setFont(PDType1Font.TIMES_ROMAN, 8f);
|
||||
|
||||
contentStream.newLineAtOffset(table.getMinX(), table.getMinY());
|
||||
|
||||
contentStream.showText(table.getClassification());
|
||||
|
||||
contentStream.endText();
|
||||
}
|
||||
}
|
||||
|
||||
@ -17,6 +17,7 @@ import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
@ -41,6 +42,10 @@ import com.iqser.red.service.configuration.v1.api.model.DictionaryResponse;
|
||||
import com.iqser.red.service.configuration.v1.api.model.RulesResponse;
|
||||
import com.iqser.red.service.configuration.v1.api.model.TypeResponse;
|
||||
import com.iqser.red.service.configuration.v1.api.model.TypeResult;
|
||||
import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry;
|
||||
import com.iqser.red.service.redaction.v1.model.ManualRedactions;
|
||||
import com.iqser.red.service.redaction.v1.model.Point;
|
||||
import com.iqser.red.service.redaction.v1.model.Rectangle;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionRequest;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionResult;
|
||||
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
|
||||
@ -303,6 +308,40 @@ public class RedactionIntegrationTest {
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testManualRedaction() throws IOException {
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Single Table.pdf");
|
||||
|
||||
ManualRedactions manualRedactions = new ManualRedactions();
|
||||
manualRedactions.setIdsToRemove(Set.of("0836727c3508a0b2ea271da69c04cc2f"));
|
||||
|
||||
ManualRedactionEntry manualRedactionEntry = new ManualRedactionEntry();
|
||||
manualRedactionEntry.setType("name");
|
||||
manualRedactionEntry.setValue("O'Loughlin C.K.");
|
||||
manualRedactionEntry.setReason("Manual Redaction");
|
||||
manualRedactionEntry.setPositions(List.of(new Rectangle(new Point(375.61096f, 241.282f), 7.648041f, 43.72262f, 1), new Rectangle(new Point(384.83517f, 241.282f), 7.648041f, 17.043358f, 1)));
|
||||
|
||||
manualRedactions.getEntriesToAdd().add(manualRedactionEntry);
|
||||
|
||||
RedactionRequest request = RedactionRequest.builder()
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
.manualRedactions(manualRedactions)
|
||||
.build();
|
||||
|
||||
RedactionResult result = redactionController.redact(request);
|
||||
|
||||
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Redacted.pdf")) {
|
||||
fileOutputStream.write(result.getDocument());
|
||||
}
|
||||
long end = System.currentTimeMillis();
|
||||
|
||||
System.out.println("duration: " + (end - start));
|
||||
System.out.println("numberOfPages: " + result.getNumberOfPages());
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void classificationTest() throws IOException {
|
||||
|
||||
|
||||
@ -147,7 +147,7 @@ public class EntityRedactionServiceTest {
|
||||
when(dictionaryClient.getDefaultColor()).thenReturn(new DefaultColor());
|
||||
try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(redactionRequest.getDocument()))) {
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
entityRedactionService.processDocument(classifiedDoc);
|
||||
entityRedactionService.processDocument(classifiedDoc, null);
|
||||
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
|
||||
assertThat(classifiedDoc.getEntities().get(1)).hasSize(5); // 4 out of 5 entities recognized on page 1
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user