RED-125: Section must know its headlines
RED-156: Return RedactionLog
This commit is contained in:
parent
0ed8530cb5
commit
8389a92820
@ -0,0 +1,15 @@
|
||||
package com.iqser.red.service.redaction.v1.model;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
/**
 * A two-dimensional coordinate made of an {@code x} and a {@code y} float component.
 *
 * <p>Lombok's {@code @Data} generates getters, setters, {@code equals}, {@code hashCode} and
 * {@code toString}; {@code @AllArgsConstructor} generates a constructor taking the fields in
 * declaration order ({@code x}, then {@code y}); {@code @NoArgsConstructor} generates the
 * empty constructor.
 */
@Data
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
public class Point {
|
||||
|
||||
/** The x component of the coordinate. */
private float x;
|
||||
/** The y component of the coordinate. */
private float y;
|
||||
|
||||
}
|
||||
@ -0,0 +1,17 @@
|
||||
package com.iqser.red.service.redaction.v1.model;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
/**
 * An axis-aligned rectangle on a specific page, described by an anchor point plus a width
 * and a height.
 *
 * <p>The Lombok-generated all-arguments constructor takes the fields in declaration order:
 * {@code (topLeft, width, height, page)}.
 */
@Data
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
public class Rectangle {
|
||||
|
||||
/**
 * Anchor corner from which {@code width} and {@code height} extend.
 * NOTE(review): whether this is visually the top-left corner depends on the direction of
 * the y axis used by the producer of the coordinates - confirm against the caller.
 */
private Point topLeft;
|
||||
/** Horizontal extent, added to the anchor's x to reach the opposite edge. */
private float width;
|
||||
/** Vertical extent, added to the anchor's y to reach the opposite edge. */
private float height;
|
||||
|
||||
/** Page the rectangle belongs to. NOTE(review): 0- or 1-based is not visible here - confirm. */
private int page;
|
||||
}
|
||||
@ -0,0 +1,16 @@
|
||||
package com.iqser.red.service.redaction.v1.model;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
/**
 * Container for the log entries produced while redacting a document.
 *
 * <p>Lombok generates the accessors as well as an all-arguments and a no-arguments
 * constructor.
 */
@Data
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
public class RedactionLog {
|
||||
|
||||
/**
 * All log entries of the document. May be {@code null} when the no-arguments constructor
 * was used and nothing was assigned.
 * NOTE(review): the field holds a list but has a singular name, so the generated accessor
 * is {@code getRedactionLogEntry()}; renaming would change the serialized/public API.
 */
private List<RedactionLogEntry> redactionLogEntry;
|
||||
|
||||
}
|
||||
@ -0,0 +1,20 @@
|
||||
package com.iqser.red.service.redaction.v1.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
/**
 * One entry of the redaction log: a single found entity together with the decision taken
 * for it and the page areas it occupies.
 *
 * <p>Lombok's {@code @Data} generates the accessors, {@code equals}, {@code hashCode} and
 * {@code toString}. No explicit constructor is declared.
 */
@Data
|
||||
public class RedactionLogEntry {
|
||||
|
||||
/** Identifier of the entry, stored as a string. */
private String id;
|
||||
/** Type of the entity this entry describes. */
private String type;
|
||||
/** The matched text of the entity. */
private String value;
|
||||
/** Reason given for the redaction decision. */
private String reason;
|
||||
/** {@code true} if the entity is marked for redaction. */
private boolean redacted;
|
||||
/** Headline of the section in which the entity was found. */
private String section;
|
||||
/** Highlight color. NOTE(review): component layout (e.g. RGB, value range) is not visible here - confirm. */
private float[] color;
|
||||
/** Page areas covered by the entity; starts as an empty mutable list so callers can add to it. */
private List<Rectangle> positions = new ArrayList<>();
|
||||
|
||||
}
|
||||
@ -13,4 +13,6 @@ public class RedactionResult {
|
||||
|
||||
private byte[] document;
|
||||
private int numberOfPages;
|
||||
private RedactionLog redactionLog;
|
||||
|
||||
}
|
||||
|
||||
@ -6,6 +6,7 @@ import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
|
||||
|
||||
import lombok.Data;
|
||||
@ -23,4 +24,6 @@ public class Document {
|
||||
private StringFrequencyCounter fontCounter= new StringFrequencyCounter();
|
||||
private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter();
|
||||
private boolean headlines;
|
||||
|
||||
private List<RedactionLogEntry> redactionLogEntities = new ArrayList<>();
|
||||
}
|
||||
|
||||
@ -16,6 +16,7 @@ import lombok.NoArgsConstructor;
|
||||
public class Paragraph {
|
||||
|
||||
private List<AbstractTextContainer> pageBlocks = new ArrayList<>();
|
||||
private String headline;
|
||||
|
||||
public SearchableText getSearchableText(){
|
||||
SearchableText searchableText = new SearchableText();
|
||||
|
||||
@ -8,6 +8,7 @@ import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.springframework.web.bind.annotation.RequestBody;
|
||||
import org.springframework.web.bind.annotation.RestController;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionLog;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionRequest;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionResult;
|
||||
import com.iqser.red.service.redaction.v1.resources.RedactionResource;
|
||||
@ -48,10 +49,10 @@ public class RedactionController implements RedactionResource {
|
||||
|
||||
if (redactionRequest.isFlatRedaction()) {
|
||||
PDDocument flatDocument = pdfFlattenService.flattenPDF(pdDocument);
|
||||
return convert(flatDocument, classifiedDoc.getPages().size());
|
||||
return convert(flatDocument, classifiedDoc.getPages().size(), new RedactionLog(classifiedDoc.getRedactionLogEntities()));
|
||||
}
|
||||
|
||||
return convert(pdDocument, classifiedDoc.getPages().size());
|
||||
return convert(pdDocument, classifiedDoc.getPages().size(), new RedactionLog(classifiedDoc.getRedactionLogEntities()));
|
||||
|
||||
} catch (IOException e) {
|
||||
throw new RedactionException(e);
|
||||
@ -124,16 +125,22 @@ public class RedactionController implements RedactionResource {
|
||||
droolsExecutionService.updateRules(rules);
|
||||
}
|
||||
|
||||
|
||||
/**
 * Converts a PDF document into a {@link RedactionResult} that carries no redaction log.
 *
 * <p>Delegates to the three-argument overload, passing {@code null} as the log.
 *
 * @param document      the PDF document to serialize
 * @param numberOfPages the page count to report in the result
 * @return the result containing the serialized document and the page count
 * @throws IOException propagated from the three-argument overload
 */
private RedactionResult convert(PDDocument document, int numberOfPages) throws IOException {
|
||||
return convert(document, numberOfPages, null);
|
||||
}
|
||||
|
||||
/**
 * Serializes a PDF document into memory and wraps it, together with the page count and the
 * redaction log, in a {@link RedactionResult}.
 *
 * <p>This method does not close {@code document}; only the temporary byte stream is closed.
 *
 * @param document      the PDF document to serialize
 * @param numberOfPages the page count to report in the result
 * @param redactionLog  the log to attach to the result; passed through unchanged, so
 *                      {@code null} yields a result without a log
 * @return the built result
 * @throws IOException if writing the document to the in-memory stream fails
 */
private RedactionResult convert(PDDocument document, int numberOfPages, RedactionLog redactionLog) throws IOException {
|
||||
|
||||
// try-with-resources: the buffer is released as soon as the bytes have been copied out.
try (ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream()) {
|
||||
document.save(byteArrayOutputStream);
|
||||
return RedactionResult.builder()
|
||||
.document(byteArrayOutputStream.toByteArray())
|
||||
.numberOfPages(numberOfPages)
|
||||
.redactionLog(redactionLog)
|
||||
.build();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -16,19 +16,24 @@ public class Entity {
|
||||
private List<EntityPositionSequence> positionSequences = new ArrayList<>();
|
||||
private Integer start;
|
||||
private Integer end;
|
||||
private String headline;
|
||||
private int matchedRule;
|
||||
|
||||
public Entity(String word, String type, boolean redaction, String redactionReason, List<EntityPositionSequence> positionSequences) {
|
||||
public Entity(String word, String type, boolean redaction, String redactionReason, List<EntityPositionSequence> positionSequences, String headline, int matchedRule) {
|
||||
this.word = word;
|
||||
this.type = type;
|
||||
this.redaction = redaction;
|
||||
this.redactionReason = redactionReason;
|
||||
this.positionSequences = positionSequences;
|
||||
this.headline = headline;
|
||||
this.matchedRule = matchedRule;
|
||||
}
|
||||
|
||||
public Entity(String word, String type, Integer start, Integer end) {
|
||||
public Entity(String word, String type, Integer start, Integer end, String headline) {
|
||||
this.word = word;
|
||||
this.type = type;
|
||||
this.start = start;
|
||||
this.end = end;
|
||||
this.headline = headline;
|
||||
}
|
||||
}
|
||||
|
||||
@ -23,6 +23,8 @@ public class Section {
|
||||
//This does not contain linebreaks and must always be used for correct offsets.
|
||||
private String searchText;
|
||||
|
||||
private String headline;
|
||||
|
||||
public boolean contains(String type) {
|
||||
return entities.stream().anyMatch(entity -> entity.getType().equals(type));
|
||||
}
|
||||
@ -31,7 +33,8 @@ public class Section {
|
||||
entities.forEach(entity -> {
|
||||
if(entity.getType().equals(type)){
|
||||
entity.setRedaction(true);
|
||||
entity.setRedactionReason("\nRule " + ruleNumber + " matched\n\n" +reason);
|
||||
entity.setMatchedRule(ruleNumber);
|
||||
entity.setRedactionReason(reason);
|
||||
}
|
||||
});
|
||||
}
|
||||
@ -40,7 +43,8 @@ public class Section {
|
||||
entities.forEach(entity -> {
|
||||
if(entity.getType().equals(type)){
|
||||
entity.setRedaction(false);
|
||||
entity.setRedactionReason("\nRule " + ruleNumber + " matched\n\n" +reason);
|
||||
entity.setMatchedRule(ruleNumber);
|
||||
entity.setRedactionReason(reason);
|
||||
}
|
||||
});
|
||||
}
|
||||
@ -67,7 +71,8 @@ public class Section {
|
||||
entities.forEach(entity -> {
|
||||
if(entity.getType().equals(asType)){
|
||||
entity.setRedaction(true);
|
||||
entity.setRedactionReason("\nRule " + ruleNumber + " matched\n\n" +reason);
|
||||
entity.setMatchedRule(ruleNumber);
|
||||
entity.setRedactionReason(reason);
|
||||
}
|
||||
});
|
||||
|
||||
@ -88,7 +93,8 @@ public class Section {
|
||||
entities.forEach(entity -> {
|
||||
if(entity.getType().equals(asType)){
|
||||
entity.setRedaction(true);
|
||||
entity.setRedactionReason("\nRule " + ruleNumber + " matched\n\n" +reason);
|
||||
entity.setMatchedRule(ruleNumber);
|
||||
entity.setRedactionReason(reason);
|
||||
}
|
||||
});
|
||||
}
|
||||
@ -109,7 +115,7 @@ public class Section {
|
||||
if (startIndex > -1 &&
|
||||
(startIndex == 0 || Character.isWhitespace(searchText.charAt(startIndex - 1)) || isSeparator(searchText.charAt(startIndex - 1))) &&
|
||||
(stopIndex == searchText.length() || isSeparator(searchText.charAt(stopIndex)))) {
|
||||
found.add(new Entity(searchText.substring(startIndex, stopIndex), asType, startIndex, stopIndex));
|
||||
found.add(new Entity(searchText.substring(startIndex, stopIndex), asType, startIndex, stopIndex, headline));
|
||||
}
|
||||
} while (startIndex > -1);
|
||||
|
||||
|
||||
@ -27,6 +27,7 @@ public class EntityRedactionService {
|
||||
private final DictionaryService dictionaryService;
|
||||
private final DroolsExecutionService droolsExecutionService;
|
||||
|
||||
|
||||
public void processDocument(Document classifiedDoc) {
|
||||
|
||||
dictionaryService.updateDictionary();
|
||||
@ -55,12 +56,13 @@ public class EntityRedactionService {
|
||||
}
|
||||
}
|
||||
|
||||
Set<Entity> entities = findEntities(searchableText);
|
||||
Set<Entity> entities = findEntities(searchableText, paragraph.getHeadline());
|
||||
Section analysedSection = droolsExecutionService.executeRules(Section
|
||||
.builder()
|
||||
.entities(entities)
|
||||
.text(searchableText.getAsStringWithLinebreaks())
|
||||
.searchText(searchableText.toString())
|
||||
.headline(paragraph.getHeadline())
|
||||
.build());
|
||||
|
||||
for (Entity entity : analysedSection.getEntities()) {
|
||||
@ -70,13 +72,14 @@ public class EntityRedactionService {
|
||||
documentEntities.addAll(analysedSection.getEntities());
|
||||
|
||||
for (SearchableText searchableRow : searchableRows) {
|
||||
Set<Entity> rowEntities = findEntities(searchableRow);
|
||||
Set<Entity> rowEntities = findEntities(searchableRow, "//TODO TableHeader");
|
||||
|
||||
Section analysedRowSection = droolsExecutionService.executeRules(Section
|
||||
.builder()
|
||||
.entities(rowEntities)
|
||||
.text(searchableRow.getAsStringWithLinebreaks())
|
||||
.searchText(searchableRow.toString())
|
||||
.headline("//TODO TableHeader")
|
||||
.build());
|
||||
|
||||
for (Entity entity : analysedRowSection.getEntities()) {
|
||||
@ -89,13 +92,14 @@ public class EntityRedactionService {
|
||||
documentEntities.forEach(entity -> {
|
||||
entity.getPositionSequences().forEach(sequence -> {
|
||||
classifiedDoc.getEntities().computeIfAbsent(sequence.getPageNumber(), (x) -> new HashSet<>()).add(
|
||||
new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), List.of(sequence))
|
||||
new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), List.of(sequence), entity.getHeadline(), entity.getMatchedRule())
|
||||
);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
private Set<Entity> findEntities(SearchableText searchableText) {
|
||||
|
||||
private Set<Entity> findEntities(SearchableText searchableText, String headline) {
|
||||
|
||||
String normalizedInputString = searchableText.toString();
|
||||
|
||||
@ -111,7 +115,7 @@ public class EntityRedactionService {
|
||||
if (startIndex > -1 &&
|
||||
(startIndex == 0 || Character.isWhitespace(normalizedInputString.charAt(startIndex - 1)) || isSeparator(normalizedInputString.charAt(startIndex - 1))) &&
|
||||
(stopIndex == normalizedInputString.length() || isSeparator(normalizedInputString.charAt(stopIndex)))) {
|
||||
found.add(new Entity(normalizedInputString.substring(startIndex, stopIndex), entry.getKey(), startIndex, stopIndex));
|
||||
found.add(new Entity(normalizedInputString.substring(startIndex, stopIndex), entry.getKey(), startIndex, stopIndex, headline));
|
||||
}
|
||||
} while (startIndex > -1);
|
||||
}
|
||||
@ -126,6 +130,7 @@ public class EntityRedactionService {
|
||||
return Character.isWhitespace(c) || Pattern.matches("\\p{Punct}", String.valueOf(c)) || c == '\"' || c == '‘' || c == '’';
|
||||
}
|
||||
|
||||
|
||||
public void removeEntitiesContainedInLarger(Set<Entity> entities) {
|
||||
List<Entity> wordsToRemove = new ArrayList<>();
|
||||
for (Entity word : entities) {
|
||||
@ -138,4 +143,5 @@ public class EntityRedactionService {
|
||||
entities.removeAll(wordsToRemove);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -21,10 +21,11 @@ public class SectionsBuilderService {
|
||||
public void buildSections(Document document) {
|
||||
|
||||
List<AbstractTextContainer> chunkWords = new ArrayList<>();
|
||||
List<Paragraph> chunkBlockList1 = new ArrayList<>();
|
||||
List<Paragraph> chunkBlockList = new ArrayList<>();
|
||||
|
||||
AbstractTextContainer prev = null;
|
||||
|
||||
String lastHeadline = "";
|
||||
for (Page page : document.getPages()) {
|
||||
for (AbstractTextContainer current : page.getTextBlocks()) {
|
||||
|
||||
@ -37,7 +38,9 @@ public class SectionsBuilderService {
|
||||
if (prev != null && current.getClassification().startsWith("H ") || !document.isHeadlines()) {
|
||||
|
||||
Paragraph cb1 = buildTextBlock(chunkWords);
|
||||
chunkBlockList1.add(cb1);
|
||||
cb1.setHeadline(lastHeadline);
|
||||
lastHeadline = current.getText();
|
||||
chunkBlockList.add(cb1);
|
||||
chunkWords = new ArrayList<>();
|
||||
|
||||
}
|
||||
@ -50,10 +53,11 @@ public class SectionsBuilderService {
|
||||
|
||||
Paragraph cb1 = buildTextBlock(chunkWords);
|
||||
if (cb1 != null) {
|
||||
chunkBlockList1.add(cb1);
|
||||
chunkBlockList.add(cb1);
|
||||
cb1.setHeadline(lastHeadline);
|
||||
}
|
||||
|
||||
document.setParagraphs(chunkBlockList1);
|
||||
document.setParagraphs(chunkBlockList);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -20,6 +20,9 @@ import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
|
||||
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationTextMarkup;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.Point;
|
||||
import com.iqser.red.service.redaction.v1.model.Rectangle;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
@ -77,6 +80,8 @@ public class AnnotationHighlightService {
|
||||
|
||||
for (Entity entity : classifiedDoc.getEntities().get(page)) {
|
||||
|
||||
RedactionLogEntry redactionLogEntry = new RedactionLogEntry();
|
||||
|
||||
for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) {
|
||||
|
||||
if (flatRedaction && !isRedactionType(entity)) {
|
||||
@ -91,47 +96,42 @@ public class AnnotationHighlightService {
|
||||
float posXEnd;
|
||||
float posYInit;
|
||||
float posYEnd;
|
||||
float[] quadPoints;
|
||||
|
||||
if (textPositions.getTextPositions().get(0).getRotation() == 90) {
|
||||
|
||||
posXEnd = textPositions.getTextPositions().get(0).getYDirAdj() + 2;
|
||||
posXInit = textPositions.getTextPositions().get(0).getYDirAdj() - height;
|
||||
posYInit = textPositions.getTextPositions().get(0).getXDirAdj();
|
||||
posYEnd = textPositions.getTextPositions().get(textPositions.getTextPositions().size() - 1).getXDirAdj() - height + 2;
|
||||
|
||||
quadPoints = new float[]{posXInit, posYInit, posXInit, posYEnd + height + 2, posXEnd, posYInit, posXEnd, posYEnd + height + 2};
|
||||
posYEnd = textPositions.getTextPositions().get(textPositions.getTextPositions().size() - 1).getXDirAdj() - height + 4;
|
||||
} else {
|
||||
|
||||
posXInit = textPositions.getTextPositions().get(0).getXDirAdj();
|
||||
posXEnd = textPositions.getTextPositions().get(textPositions.getTextPositions().size() - 1).getXDirAdj() + textPositions.getTextPositions().get(textPositions.getTextPositions().size() - 1).getWidth() + 1;
|
||||
posYInit = textPositions.getTextPositions().get(0).getPageHeight() - textPositions.getTextPositions().get(0).getYDirAdj();
|
||||
posYEnd = textPositions.getTextPositions().get(0).getPageHeight() - textPositions.getTextPositions().get(textPositions.getTextPositions().size() - 1).getYDirAdj();
|
||||
quadPoints = new float[]{posXInit, posYEnd + height + 2, posXEnd, posYEnd + height + 2, posXInit, posYInit - 2, posXEnd, posYEnd - 2};
|
||||
posYInit = textPositions.getTextPositions().get(0).getPageHeight() - textPositions.getTextPositions().get(0).getYDirAdj() - 2;
|
||||
posYEnd = textPositions.getTextPositions().get(0).getPageHeight() - textPositions.getTextPositions().get(textPositions.getTextPositions().size() - 1).getYDirAdj() + 2;
|
||||
}
|
||||
|
||||
Rectangle textHighlightRectangle = new Rectangle(new Point(posXInit, posYInit), posXEnd - posXInit, posYEnd - posYInit + height, page);
|
||||
|
||||
|
||||
List<PDAnnotation> annotations = pdPage.getAnnotations();
|
||||
PDAnnotationTextMarkup highlight = new PDAnnotationTextMarkup(PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT);
|
||||
highlight.constructAppearances();
|
||||
|
||||
PDRectangle position = new PDRectangle();
|
||||
position.setLowerLeftX(posXInit);
|
||||
position.setLowerLeftY(posYEnd);
|
||||
position.setUpperRightX(posXEnd);
|
||||
position.setUpperRightY(posYEnd + height);
|
||||
PDRectangle annotationPosition = new PDRectangle();
|
||||
annotationPosition.setLowerLeftX(posXInit);
|
||||
annotationPosition.setLowerLeftY(posYEnd);
|
||||
annotationPosition.setUpperRightX(posXEnd);
|
||||
annotationPosition.setUpperRightY(posYEnd + height);
|
||||
|
||||
highlight.setRectangle(position);
|
||||
highlight.setRectangle(annotationPosition);
|
||||
if (!flatRedaction) {
|
||||
highlight.setAnnotationName(entityPositionSequence.getId().toString());
|
||||
highlight.setTitlePopup(entityPositionSequence.getId().toString());
|
||||
highlight.setContents(entity.getRedactionReason());
|
||||
highlight.setContents("\nRule " + entity.getMatchedRule() + " matched\n\n" + entity.getRedactionReason() + "\n\n" + "In Section : \"" + entity.getHeadline() + "\"");
|
||||
}
|
||||
|
||||
// quadPoints is array of x,y coordinates in Z-like order (top-left, top-right, bottom-left,bottom-right)
|
||||
// of the area to be highlighted
|
||||
|
||||
highlight.setQuadPoints(quadPoints);
|
||||
highlight.setQuadPoints(toQuadPoints(textHighlightRectangle));
|
||||
|
||||
PDColor color;
|
||||
if (flatRedaction) {
|
||||
@ -142,14 +142,37 @@ public class AnnotationHighlightService {
|
||||
|
||||
highlight.setColor(color);
|
||||
annotations.add(highlight);
|
||||
|
||||
redactionLogEntry.getPositions().add(textHighlightRectangle);
|
||||
|
||||
}
|
||||
redactionLogEntry.setId(entityPositionSequence.getId().toString());
|
||||
}
|
||||
redactionLogEntry.setColor(getColor(entity));
|
||||
redactionLogEntry.setReason(entity.getRedactionReason());
|
||||
redactionLogEntry.setValue(entity.getWord());
|
||||
redactionLogEntry.setType(entity.getType());
|
||||
redactionLogEntry.setRedacted(entity.isRedaction());
|
||||
redactionLogEntry.setSection(entity.getHeadline());
|
||||
classifiedDoc.getRedactionLogEntities().add(redactionLogEntry);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
 * Expands a rectangle into the eight-element quad-points array (four x,y pairs).
 *
 * <p>The corners are emitted in this order: the anchor ({@code topLeft}), the anchor shifted
 * by {@code width} in x, the anchor shifted by {@code height} in y, and the anchor shifted by
 * both.
 *
 * @param rectangle the rectangle to convert; its {@code topLeft} must not be {@code null}
 * @return a new array {@code {x, y, x + w, y, x, y + h, x + w, y + h}}
 */
private float[] toQuadPoints(Rectangle rectangle) {
|
||||
|
||||
// quadPoints is array of x,y coordinates in Z-like order (top-left, top-right, bottom-left,bottom-right)
|
||||
// of the area to be highlighted
// NOTE(review): the "top"/"bottom" labels above only hold if y grows downwards; in PDF user
// space y grows upwards, in which case y + height is the upper edge - confirm the intended order.
|
||||
return new float[]{
|
||||
rectangle.getTopLeft().getX(), rectangle.getTopLeft().getY(),
|
||||
rectangle.getTopLeft().getX() + rectangle.getWidth(), rectangle.getTopLeft().getY(),
|
||||
rectangle.getTopLeft().getX(), rectangle.getTopLeft().getY() + rectangle.getHeight(),
|
||||
rectangle.getTopLeft().getX() + rectangle.getWidth(), rectangle.getTopLeft().getY() + rectangle.getHeight()};
|
||||
}
|
||||
|
||||
|
||||
private boolean isRedactionType(Entity entity) {
|
||||
if (!entity.isRedaction()) {
|
||||
return false;
|
||||
|
||||
@ -174,4 +174,4 @@ public class RedactionIntegrationTest {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user