RED-125: Section must know its headlines

RED-156: Return RedactionLog
This commit is contained in:
deiflaender 2020-07-17 11:24:04 +02:00
parent 0ed8530cb5
commit 8389a92820
14 changed files with 164 additions and 39 deletions

View File

@ -0,0 +1,15 @@
package com.iqser.red.service.redaction.v1.model;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Simple mutable value holder for a two-dimensional coordinate.
 *
 * <p>Lombok generates the accessors, {@code equals}/{@code hashCode}/{@code toString}
 * ({@code @Data}), a constructor taking the fields in declaration order
 * ({@code @AllArgsConstructor}) and a no-argument constructor ({@code @NoArgsConstructor}).
 *
 * <p>NOTE(review): the coordinate system and units are not defined in this class —
 * presumably PDF page coordinates; confirm against the producers of this type.
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
public class Point {
/** Horizontal coordinate. */
private float x;
/** Vertical coordinate. */
private float y;
}

View File

@ -0,0 +1,17 @@
package com.iqser.red.service.redaction.v1.model;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Mutable description of a rectangular area on a document page, expressed as an anchor
 * point plus a width and a height.
 *
 * <p>Lombok generates the accessors, {@code equals}/{@code hashCode}/{@code toString}
 * ({@code @Data}), a constructor taking the fields in declaration order
 * ({@code @AllArgsConstructor}) and a no-argument constructor ({@code @NoArgsConstructor}).
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
public class Rectangle {
/**
 * Anchor corner of the rectangle.
 * NOTE(review): whether this corner is visually the top depends on the orientation of the
 * y-axis used by the caller — confirm.
 */
private Point topLeft;
/** Horizontal extent, measured from {@code topLeft.x}. */
private float width;
/** Vertical extent, measured from {@code topLeft.y}. */
private float height;
/**
 * Page the rectangle belongs to.
 * NOTE(review): 0- or 1-based numbering is not visible here — confirm with the caller.
 */
private int page;
}

View File

@ -0,0 +1,16 @@
package com.iqser.red.service.redaction.v1.model;
import java.util.List;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Container for the list of {@link RedactionLogEntry} objects produced for a document.
 *
 * <p>Lombok generates the accessors, {@code equals}/{@code hashCode}/{@code toString}
 * ({@code @Data}), a single-argument constructor taking the list
 * ({@code @AllArgsConstructor}) and a no-argument constructor ({@code @NoArgsConstructor}).
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
public class RedactionLog {
/**
 * All log entries; despite the singular name this is a list.
 * It is {@code null} after the no-argument constructor until explicitly set.
 */
private List<RedactionLogEntry> redactionLogEntry;
}

View File

@ -0,0 +1,20 @@
package com.iqser.red.service.redaction.v1.model;
import java.util.ArrayList;
import java.util.List;
import lombok.Data;
/**
 * One entry of the redaction log: a single value found in the document together with its
 * classification, the redaction decision and the page areas it occupies.
 *
 * <p>Lombok {@code @Data} generates getters, setters, {@code equals}, {@code hashCode} and
 * {@code toString}; no constructor annotations are present, so only the default constructor
 * exists.
 */
@Data
public class RedactionLogEntry {
/** Identifier of the entry. */
private String id;
/** Type (category) of the found value. */
private String type;
/** The text that was found. */
private String value;
/** Human-readable reason for the redaction decision. */
private String reason;
/** {@code true} when the value is to be redacted. */
private boolean redacted;
/** Section the value was found in — presumably the section headline; confirm with the producer. */
private String section;
/**
 * Color associated with the entry.
 * NOTE(review): number and meaning of the components (e.g. RGB) are not visible here — confirm.
 */
private float[] color;
/** Page areas covered by the value; starts as an empty, mutable list so callers can add to it. */
private List<Rectangle> positions = new ArrayList<>();
}

View File

@ -13,4 +13,6 @@ public class RedactionResult {
private byte[] document;
private int numberOfPages;
private RedactionLog redactionLog;
}

View File

@ -6,6 +6,7 @@ import java.util.List;
import java.util.Map;
import java.util.Set;
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import lombok.Data;
@ -23,4 +24,6 @@ public class Document {
private StringFrequencyCounter fontCounter= new StringFrequencyCounter();
private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter();
private boolean headlines;
private List<RedactionLogEntry> redactionLogEntities = new ArrayList<>();
}

View File

@ -16,6 +16,7 @@ import lombok.NoArgsConstructor;
public class Paragraph {
private List<AbstractTextContainer> pageBlocks = new ArrayList<>();
private String headline;
public SearchableText getSearchableText(){
SearchableText searchableText = new SearchableText();

View File

@ -8,6 +8,7 @@ import org.apache.pdfbox.pdmodel.PDDocument;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RestController;
import com.iqser.red.service.redaction.v1.model.RedactionLog;
import com.iqser.red.service.redaction.v1.model.RedactionRequest;
import com.iqser.red.service.redaction.v1.model.RedactionResult;
import com.iqser.red.service.redaction.v1.resources.RedactionResource;
@ -48,10 +49,10 @@ public class RedactionController implements RedactionResource {
if (redactionRequest.isFlatRedaction()) {
PDDocument flatDocument = pdfFlattenService.flattenPDF(pdDocument);
return convert(flatDocument, classifiedDoc.getPages().size());
return convert(flatDocument, classifiedDoc.getPages().size(), new RedactionLog(classifiedDoc.getRedactionLogEntities()));
}
return convert(pdDocument, classifiedDoc.getPages().size());
return convert(pdDocument, classifiedDoc.getPages().size(), new RedactionLog(classifiedDoc.getRedactionLogEntities()));
} catch (IOException e) {
throw new RedactionException(e);
@ -124,16 +125,22 @@ public class RedactionController implements RedactionResource {
droolsExecutionService.updateRules(rules);
}
/**
 * Convenience overload that builds a {@link RedactionResult} without a redaction log.
 *
 * @param document      PDF document to serialize
 * @param numberOfPages page count to report in the result
 * @return the result produced by the three-argument overload with a {@code null} redaction log
 * @throws IOException propagated from the three-argument overload
 */
private RedactionResult convert(PDDocument document, int numberOfPages) throws IOException {
// Delegate with no log; the resulting RedactionResult carries a null redactionLog.
return convert(document, numberOfPages, null);
}
/**
 * Serializes the given PDF into memory and packs it, together with the page count and the
 * redaction log, into a {@link RedactionResult}.
 *
 * <p>The document itself is not closed by this method; only the temporary in-memory
 * buffer is.
 *
 * @param document      PDF document to save
 * @param numberOfPages page count to report in the result
 * @param redactionLog  log to attach to the result; passed through unchanged (may be {@code null})
 * @return result holding the saved PDF bytes, the page count and the log
 * @throws IOException if saving the document or closing the buffer fails
 */
private RedactionResult convert(PDDocument document, int numberOfPages, RedactionLog redactionLog) throws IOException {
    try (ByteArrayOutputStream pdfBuffer = new ByteArrayOutputStream()) {
        document.save(pdfBuffer);
        // Snapshot the bytes while the buffer is still open.
        byte[] savedPdf = pdfBuffer.toByteArray();
        return RedactionResult.builder()
                .document(savedPdf)
                .numberOfPages(numberOfPages)
                .redactionLog(redactionLog)
                .build();
    }
}
}
}

View File

@ -16,19 +16,24 @@ public class Entity {
private List<EntityPositionSequence> positionSequences = new ArrayList<>();
private Integer start;
private Integer end;
private String headline;
private int matchedRule;
public Entity(String word, String type, boolean redaction, String redactionReason, List<EntityPositionSequence> positionSequences) {
public Entity(String word, String type, boolean redaction, String redactionReason, List<EntityPositionSequence> positionSequences, String headline, int matchedRule) {
this.word = word;
this.type = type;
this.redaction = redaction;
this.redactionReason = redactionReason;
this.positionSequences = positionSequences;
this.headline = headline;
this.matchedRule = matchedRule;
}
public Entity(String word, String type, Integer start, Integer end) {
public Entity(String word, String type, Integer start, Integer end, String headline) {
this.word = word;
this.type = type;
this.start = start;
this.end = end;
this.headline = headline;
}
}

View File

@ -23,6 +23,8 @@ public class Section {
//This does not contain linebreaks and must always be used for correct offsets.
private String searchText;
private String headline;
/**
 * Tells whether this section holds at least one entity of the given type.
 *
 * @param type entity type to look for
 * @return {@code true} if any entity's type equals {@code type}, otherwise {@code false}
 */
public boolean contains(String type) {
    return entities.stream()
            .map(entity -> entity.getType())
            // Compare from the entity's side, so a null argument simply never matches.
            .anyMatch(entityType -> entityType.equals(type));
}
@ -31,7 +33,8 @@ public class Section {
entities.forEach(entity -> {
if(entity.getType().equals(type)){
entity.setRedaction(true);
entity.setRedactionReason("\nRule " + ruleNumber + " matched\n\n" +reason);
entity.setMatchedRule(ruleNumber);
entity.setRedactionReason(reason);
}
});
}
@ -40,7 +43,8 @@ public class Section {
entities.forEach(entity -> {
if(entity.getType().equals(type)){
entity.setRedaction(false);
entity.setRedactionReason("\nRule " + ruleNumber + " matched\n\n" +reason);
entity.setMatchedRule(ruleNumber);
entity.setRedactionReason(reason);
}
});
}
@ -67,7 +71,8 @@ public class Section {
entities.forEach(entity -> {
if(entity.getType().equals(asType)){
entity.setRedaction(true);
entity.setRedactionReason("\nRule " + ruleNumber + " matched\n\n" +reason);
entity.setMatchedRule(ruleNumber);
entity.setRedactionReason(reason);
}
});
@ -88,7 +93,8 @@ public class Section {
entities.forEach(entity -> {
if(entity.getType().equals(asType)){
entity.setRedaction(true);
entity.setRedactionReason("\nRule " + ruleNumber + " matched\n\n" +reason);
entity.setMatchedRule(ruleNumber);
entity.setRedactionReason(reason);
}
});
}
@ -109,7 +115,7 @@ public class Section {
if (startIndex > -1 &&
(startIndex == 0 || Character.isWhitespace(searchText.charAt(startIndex - 1)) || isSeparator(searchText.charAt(startIndex - 1))) &&
(stopIndex == searchText.length() || isSeparator(searchText.charAt(stopIndex)))) {
found.add(new Entity(searchText.substring(startIndex, stopIndex), asType, startIndex, stopIndex));
found.add(new Entity(searchText.substring(startIndex, stopIndex), asType, startIndex, stopIndex, headline));
}
} while (startIndex > -1);

View File

@ -27,6 +27,7 @@ public class EntityRedactionService {
private final DictionaryService dictionaryService;
private final DroolsExecutionService droolsExecutionService;
public void processDocument(Document classifiedDoc) {
dictionaryService.updateDictionary();
@ -55,12 +56,13 @@ public class EntityRedactionService {
}
}
Set<Entity> entities = findEntities(searchableText);
Set<Entity> entities = findEntities(searchableText, paragraph.getHeadline());
Section analysedSection = droolsExecutionService.executeRules(Section
.builder()
.entities(entities)
.text(searchableText.getAsStringWithLinebreaks())
.searchText(searchableText.toString())
.headline(paragraph.getHeadline())
.build());
for (Entity entity : analysedSection.getEntities()) {
@ -70,13 +72,14 @@ public class EntityRedactionService {
documentEntities.addAll(analysedSection.getEntities());
for (SearchableText searchableRow : searchableRows) {
Set<Entity> rowEntities = findEntities(searchableRow);
Set<Entity> rowEntities = findEntities(searchableRow, "//TODO TableHeader");
Section analysedRowSection = droolsExecutionService.executeRules(Section
.builder()
.entities(rowEntities)
.text(searchableRow.getAsStringWithLinebreaks())
.searchText(searchableRow.toString())
.headline("//TODO TableHeader")
.build());
for (Entity entity : analysedRowSection.getEntities()) {
@ -89,13 +92,14 @@ public class EntityRedactionService {
documentEntities.forEach(entity -> {
entity.getPositionSequences().forEach(sequence -> {
classifiedDoc.getEntities().computeIfAbsent(sequence.getPageNumber(), (x) -> new HashSet<>()).add(
new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), List.of(sequence))
new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), List.of(sequence), entity.getHeadline(), entity.getMatchedRule())
);
});
});
}
private Set<Entity> findEntities(SearchableText searchableText) {
private Set<Entity> findEntities(SearchableText searchableText, String headline) {
String normalizedInputString = searchableText.toString();
@ -111,7 +115,7 @@ public class EntityRedactionService {
if (startIndex > -1 &&
(startIndex == 0 || Character.isWhitespace(normalizedInputString.charAt(startIndex - 1)) || isSeparator(normalizedInputString.charAt(startIndex - 1))) &&
(stopIndex == normalizedInputString.length() || isSeparator(normalizedInputString.charAt(stopIndex)))) {
found.add(new Entity(normalizedInputString.substring(startIndex, stopIndex), entry.getKey(), startIndex, stopIndex));
found.add(new Entity(normalizedInputString.substring(startIndex, stopIndex), entry.getKey(), startIndex, stopIndex, headline));
}
} while (startIndex > -1);
}
@ -126,6 +130,7 @@ public class EntityRedactionService {
return Character.isWhitespace(c) || Pattern.matches("\\p{Punct}", String.valueOf(c)) || c == '\"' || c == '' || c == '';
}
public void removeEntitiesContainedInLarger(Set<Entity> entities) {
List<Entity> wordsToRemove = new ArrayList<>();
for (Entity word : entities) {
@ -138,4 +143,5 @@ public class EntityRedactionService {
entities.removeAll(wordsToRemove);
}
}
}

View File

@ -21,10 +21,11 @@ public class SectionsBuilderService {
public void buildSections(Document document) {
List<AbstractTextContainer> chunkWords = new ArrayList<>();
List<Paragraph> chunkBlockList1 = new ArrayList<>();
List<Paragraph> chunkBlockList = new ArrayList<>();
AbstractTextContainer prev = null;
String lastHeadline = "";
for (Page page : document.getPages()) {
for (AbstractTextContainer current : page.getTextBlocks()) {
@ -37,7 +38,9 @@ public class SectionsBuilderService {
if (prev != null && current.getClassification().startsWith("H ") || !document.isHeadlines()) {
Paragraph cb1 = buildTextBlock(chunkWords);
chunkBlockList1.add(cb1);
cb1.setHeadline(lastHeadline);
lastHeadline = current.getText();
chunkBlockList.add(cb1);
chunkWords = new ArrayList<>();
}
@ -50,10 +53,11 @@ public class SectionsBuilderService {
Paragraph cb1 = buildTextBlock(chunkWords);
if (cb1 != null) {
chunkBlockList1.add(cb1);
chunkBlockList.add(cb1);
cb1.setHeadline(lastHeadline);
}
document.setParagraphs(chunkBlockList1);
document.setParagraphs(chunkBlockList);
}

View File

@ -20,6 +20,9 @@ import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationTextMarkup;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
@ -77,6 +80,8 @@ public class AnnotationHighlightService {
for (Entity entity : classifiedDoc.getEntities().get(page)) {
RedactionLogEntry redactionLogEntry = new RedactionLogEntry();
for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) {
if (flatRedaction && !isRedactionType(entity)) {
@ -91,47 +96,42 @@ public class AnnotationHighlightService {
float posXEnd;
float posYInit;
float posYEnd;
float[] quadPoints;
if (textPositions.getTextPositions().get(0).getRotation() == 90) {
posXEnd = textPositions.getTextPositions().get(0).getYDirAdj() + 2;
posXInit = textPositions.getTextPositions().get(0).getYDirAdj() - height;
posYInit = textPositions.getTextPositions().get(0).getXDirAdj();
posYEnd = textPositions.getTextPositions().get(textPositions.getTextPositions().size() - 1).getXDirAdj() - height + 2;
quadPoints = new float[]{posXInit, posYInit, posXInit, posYEnd + height + 2, posXEnd, posYInit, posXEnd, posYEnd + height + 2};
posYEnd = textPositions.getTextPositions().get(textPositions.getTextPositions().size() - 1).getXDirAdj() - height + 4;
} else {
posXInit = textPositions.getTextPositions().get(0).getXDirAdj();
posXEnd = textPositions.getTextPositions().get(textPositions.getTextPositions().size() - 1).getXDirAdj() + textPositions.getTextPositions().get(textPositions.getTextPositions().size() - 1).getWidth() + 1;
posYInit = textPositions.getTextPositions().get(0).getPageHeight() - textPositions.getTextPositions().get(0).getYDirAdj();
posYEnd = textPositions.getTextPositions().get(0).getPageHeight() - textPositions.getTextPositions().get(textPositions.getTextPositions().size() - 1).getYDirAdj();
quadPoints = new float[]{posXInit, posYEnd + height + 2, posXEnd, posYEnd + height + 2, posXInit, posYInit - 2, posXEnd, posYEnd - 2};
posYInit = textPositions.getTextPositions().get(0).getPageHeight() - textPositions.getTextPositions().get(0).getYDirAdj() - 2;
posYEnd = textPositions.getTextPositions().get(0).getPageHeight() - textPositions.getTextPositions().get(textPositions.getTextPositions().size() - 1).getYDirAdj() + 2;
}
Rectangle textHighlightRectangle = new Rectangle(new Point(posXInit, posYInit), posXEnd - posXInit, posYEnd - posYInit + height, page);
List<PDAnnotation> annotations = pdPage.getAnnotations();
PDAnnotationTextMarkup highlight = new PDAnnotationTextMarkup(PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT);
highlight.constructAppearances();
PDRectangle position = new PDRectangle();
position.setLowerLeftX(posXInit);
position.setLowerLeftY(posYEnd);
position.setUpperRightX(posXEnd);
position.setUpperRightY(posYEnd + height);
PDRectangle annotationPosition = new PDRectangle();
annotationPosition.setLowerLeftX(posXInit);
annotationPosition.setLowerLeftY(posYEnd);
annotationPosition.setUpperRightX(posXEnd);
annotationPosition.setUpperRightY(posYEnd + height);
highlight.setRectangle(position);
highlight.setRectangle(annotationPosition);
if (!flatRedaction) {
highlight.setAnnotationName(entityPositionSequence.getId().toString());
highlight.setTitlePopup(entityPositionSequence.getId().toString());
highlight.setContents(entity.getRedactionReason());
highlight.setContents("\nRule " + entity.getMatchedRule() + " matched\n\n" + entity.getRedactionReason() + "\n\n" + "In Section : \"" + entity.getHeadline() + "\"");
}
// quadPoints is array of x,y coordinates in Z-like order (top-left, top-right, bottom-left,bottom-right)
// of the area to be highlighted
highlight.setQuadPoints(quadPoints);
highlight.setQuadPoints(toQuadPoints(textHighlightRectangle));
PDColor color;
if (flatRedaction) {
@ -142,14 +142,37 @@ public class AnnotationHighlightService {
highlight.setColor(color);
annotations.add(highlight);
redactionLogEntry.getPositions().add(textHighlightRectangle);
}
redactionLogEntry.setId(entityPositionSequence.getId().toString());
}
redactionLogEntry.setColor(getColor(entity));
redactionLogEntry.setReason(entity.getRedactionReason());
redactionLogEntry.setValue(entity.getWord());
redactionLogEntry.setType(entity.getType());
redactionLogEntry.setRedacted(entity.isRedaction());
redactionLogEntry.setSection(entity.getHeadline());
classifiedDoc.getRedactionLogEntities().add(redactionLogEntry);
}
}
}
/**
 * Expands a rectangle into the eight-float quad-point array used for a highlight annotation.
 *
 * <p>The corners are emitted as (x, y) pairs in this order: anchor point, anchor shifted by
 * the width, anchor shifted by the height, anchor shifted by both.
 *
 * @param rectangle area to convert; its {@code topLeft} must not be {@code null}
 * @return array {@code {x, y, x + w, y, x, y + h, x + w, y + h}}
 */
private float[] toQuadPoints(Rectangle rectangle) {
    // quadPoints is array of x,y coordinates in Z-like order (top-left, top-right, bottom-left,bottom-right)
    // of the area to be highlighted
    float startX = rectangle.getTopLeft().getX();
    float startY = rectangle.getTopLeft().getY();
    float endX = startX + rectangle.getWidth();
    float endY = startY + rectangle.getHeight();

    float[] quad = new float[8];
    quad[0] = startX;
    quad[1] = startY;
    quad[2] = endX;
    quad[3] = startY;
    quad[4] = startX;
    quad[5] = endY;
    quad[6] = endX;
    quad[7] = endY;
    return quad;
}
private boolean isRedactionType(Entity entity) {
if (!entity.isRedaction()) {
return false;