RED-882: Added textBefore and textAfter to redaction log

This commit is contained in:
Dominique Eifländer 2020-12-18 14:29:40 +01:00
parent add196f913
commit caf6277de9
13 changed files with 193 additions and 127 deletions

View File

@ -34,4 +34,7 @@ public class RedactionLogEntry {
private ManualRedactionType manualRedactionType;
private boolean isDictionaryEntry;
private String textBefore;
private String textAfter;
}

View File

@ -24,7 +24,6 @@ import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationSer
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import com.iqser.red.service.redaction.v1.server.visualization.service.AnnotationHighlightService;
import com.iqser.red.service.redaction.v1.server.visualization.service.PdfFlattenService;
import com.iqser.red.service.redaction.v1.server.visualization.service.PdfVisualisationService;
import lombok.RequiredArgsConstructor;
@ -37,7 +36,6 @@ public class RedactionController implements RedactionResource {
private final PdfSegmentationService pdfSegmentationService;
private final AnnotationHighlightService annotationHighlightService;
private final EntityRedactionService entityRedactionService;
private final PdfFlattenService pdfFlattenService;
private final DroolsExecutionService droolsExecutionService;
@ -52,12 +50,6 @@ public class RedactionController implements RedactionResource {
annotationHighlightService.highlight(pdDocument, classifiedDoc, redactionRequest.isFlatRedaction(), redactionRequest
.getManualRedactions());
if (redactionRequest.isFlatRedaction()) {
PDDocument flatDocument = pdfFlattenService.flattenPDF(pdDocument);
return convert(flatDocument, classifiedDoc.getPages()
.size(), classifiedDoc.getRedactionLogEntities(), classifiedDoc.getSectionGrid(), classifiedDoc.getDictionaryVersion(), classifiedDoc.getRulesVersion());
}
return convert(pdDocument, classifiedDoc.getPages()
.size(), classifiedDoc.getRedactionLogEntities(), classifiedDoc.getSectionGrid(), classifiedDoc.getDictionaryVersion(), classifiedDoc.getRulesVersion());

View File

@ -35,8 +35,11 @@ public class Entity {
private boolean isDictionaryEntry;
private String textBefore;
private String textAfter;
public Entity(String word, String type, boolean redaction, String redactionReason, List<EntityPositionSequence> positionSequences, String headline, int matchedRule, int sectionNumber, String legalBasis, boolean isDictionaryEntry) {
public Entity(String word, String type, boolean redaction, String redactionReason, List<EntityPositionSequence> positionSequences, String headline, int matchedRule, int sectionNumber, String legalBasis, boolean isDictionaryEntry, String textBefore, String textAfter) {
this.word = word;
this.type = type;
@ -48,6 +51,8 @@ public class Entity {
this.sectionNumber = sectionNumber;
this.legalBasis = legalBasis;
this.isDictionaryEntry = isDictionaryEntry;
this.textBefore = textBefore;
this.textAfter = textAfter;
}

View File

@ -41,6 +41,7 @@ public class EntityRedactionService {
private final DictionaryService dictionaryService;
private final DroolsExecutionService droolsExecutionService;
private final SurroundingWordsService surroundingWordsService;
public void processDocument(Document classifiedDoc, ManualRedactions manualRedactions) {
@ -83,7 +84,7 @@ public class EntityRedactionService {
.computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>())
.add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry
.getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber(), entity
.getLegalBasis(), entity.isDictionaryEntry()));
.getLegalBasis(), entity.isDictionaryEntry(), entity.getTextBefore(), entity.getTextAfter()));
}
}
@ -94,9 +95,8 @@ public class EntityRedactionService {
}
private Set<Entity> findEntities(Document classifiedDoc, ManualRedactions manualRedactions,
Dictionary dictionary, boolean local,
Map<Integer, Set<Entity>> hintsPerSectionNumber) {
private Set<Entity> findEntities(Document classifiedDoc, ManualRedactions manualRedactions, Dictionary dictionary,
boolean local, Map<Integer, Set<Entity>> hintsPerSectionNumber) {
Set<Entity> documentEntities = new HashSet<>();
int sectionNumber = 1;
@ -113,6 +113,7 @@ public class EntityRedactionService {
SearchableText searchableRow = new SearchableText();
Map<String, CellValue> tabularData = new HashMap<>();
int start = 0;
List<Integer> cellStarts = new ArrayList<>();
for (Cell cell : row) {
if (!singleCellTable && cell.isHeaderCell() || CollectionUtils.isEmpty(cell.getTextBlocks())) {
continue;
@ -128,12 +129,17 @@ public class EntityRedactionService {
.replaceAll("-", "");
tabularData.put(headerName, new CellValue(cell.getTextBlocks(), cellStart));
});
start = start + cell.toString().length() + 1;
for (TextBlock textBlock : cell.getTextBlocks()) {
// TODO avoid cell overlap merging.
searchableRow.addAll(textBlock.getSequences());
}
cellStarts.add(cellStart);
start = start + cell.toString().trim().length() + 1;
}
Set<Entity> rowEntities = findEntities(searchableRow, table.getHeadline(), sectionNumber, dictionary.getDictionaryModels(), local);
Set<Entity> rowEntities = findEntities(searchableRow, table.getHeadline(), sectionNumber, dictionary
.getDictionaryModels(), local);
surroundingWordsService.addSurroundingText(rowEntities, searchableRow, dictionary, cellStarts);
sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder()
.isLocal(local)
@ -155,6 +161,7 @@ public class EntityRedactionService {
addSectionToManualRedactions(paragraph.getTextBlocks(), manualRedactions, paragraph.getHeadline(), sectionNumber);
Set<Entity> entities = findEntities(searchableText, paragraph.getHeadline(), sectionNumber, dictionary.getDictionaryModels(), local);
surroundingWordsService.addSurroundingText(entities, searchableText, dictionary);
sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder()
.isLocal(local)
@ -176,20 +183,20 @@ public class EntityRedactionService {
documentEntities.addAll(clearAndFindPositions(analysedRowSection.getEntities(), sectionSearchableTextPair.getSearchableText(), dictionary));
analysedRowSection.getLocalDictionaryAdds().keySet().forEach(key -> {
if (dictionary.isRecommendation(key)){
if (dictionary.isRecommendation(key)) {
analysedRowSection.getLocalDictionaryAdds().get(key).forEach(value -> {
if (!dictionary.containsValue(key, value)){
if (!dictionary.containsValue(key, value)) {
dictionary.getLocalAccessMap().get(key).getLocalEntries().add(value);
}
});
} else {
analysedRowSection.getLocalDictionaryAdds().get(key).forEach( value -> {
analysedRowSection.getLocalDictionaryAdds().get(key).forEach(value -> {
if(dictionary.getLocalAccessMap().get(key) == null){
if (dictionary.getLocalAccessMap().get(key) == null) {
log.warn("Dictionary {} is null", key);
}
if(dictionary.getLocalAccessMap().get(key).getLocalEntries() == null){
if (dictionary.getLocalAccessMap().get(key).getLocalEntries() == null) {
log.warn("Dictionary {} localEntries is null", key);
}
@ -198,7 +205,6 @@ public class EntityRedactionService {
}
});
});
return documentEntities;
@ -243,13 +249,14 @@ public class EntityRedactionService {
}
private Set<Entity> find(String inputString, Set<String> values, String type, String headline, int sectionNumber, boolean local) {
private Set<Entity> find(String inputString, Set<String> values, String type, String headline, int sectionNumber,
boolean local) {
Set<Entity> found = new HashSet<>();
for (String value : values) {
if(value.trim().length() <= 2) {
if (value.trim().length() <= 2) {
continue;
}

View File

@ -0,0 +1,140 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import java.util.List;
import java.util.Set;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
/**
 * Enriches found entities with the words that directly precede and follow them in the text
 * they were found in ({@code textBefore} / {@code textAfter}).
 *
 * <p>Entities whose type is a hint according to the {@link Dictionary} are skipped. The lookup is
 * best effort: any failure is logged as a warning and never propagated to the caller.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class SurroundingWordsService {

    private final RedactionServiceSettings redactionServiceSettings;


    /**
     * Sets the surrounding words for every non-hint entity, using the whole searchable text as context.
     *
     * @param entities       entities to enrich; their start/end offsets index into the searchable text
     * @param searchableText text the entities were found in
     * @param dictionary     dictionary used to decide whether an entity type is a hint
     */
    public void addSurroundingText(Set<Entity> entities, SearchableText searchableText, Dictionary dictionary) {

        if (entities.isEmpty()) {
            return;
        }

        try {
            // The text is the same for every entity, so build it only once.
            String text = searchableText.toString();
            for (Entity entity : entities) {
                if (dictionary.isHint(entity.getType())) {
                    continue;
                }
                findSurroundingWords(entity, text, entity.getStart(), entity.getEnd());
            }
        } catch (Exception e) {
            // Best effort only, but keep the cause so failures can be diagnosed.
            log.warn("Could not get surrounding text!", e);
        }
    }


    /**
     * Sets the surrounding words for every non-hint entity, restricting the context of each entity
     * to the cell it lies in, so that words of neighbouring cells are not reported.
     *
     * @param entities       entities to enrich; their start/end offsets index into the searchable text
     * @param searchableText text of the whole row the entities were found in
     * @param dictionary     dictionary used to decide whether an entity type is a hint
     * @param cellstarts     start offset of each cell within the searchable text, in ascending order;
     *                       when {@code null} nothing is done
     */
    public void addSurroundingText(Set<Entity> entities, SearchableText searchableText, Dictionary dictionary,
                                   List<Integer> cellstarts) {

        if (entities.isEmpty()) {
            return;
        }

        try {
            String searchableString = searchableText.toString();
            if (cellstarts == null) {
                return;
            }

            for (int i = 0; i < cellstarts.size(); i++) {
                int startOffset = cellstarts.get(i);
                // A cell ends where the next one starts; the last cell runs to the very end of the
                // text (exclusive end offset, so the final character belongs to the cell as well).
                int endOffset = i + 1 < cellstarts.size() ? cellstarts.get(i + 1) : searchableString.length();
                String cellText = searchableString.substring(startOffset, endOffset);

                for (Entity entity : entities) {
                    if (dictionary.isHint(entity.getType())) {
                        continue;
                    }
                    // Only entities that lie completely inside this cell are handled here.
                    if (entity.getStart() >= startOffset && entity.getEnd() <= endOffset) {
                        // Translate the row-based offsets into offsets relative to the cell text.
                        findSurroundingWords(entity, cellText, entity.getStart() - startOffset, entity.getEnd() - startOffset);
                    }
                }
            }
        } catch (Exception e) {
            // Best effort only, but keep the cause so failures can be diagnosed.
            log.warn("Could not get surrounding text!", e);
        }
    }


    /**
     * Looks at a window of configured size on both sides of the entity and stores the configured
     * number of words closest to the entity as its text before / text after.
     *
     * @param entity            entity to enrich
     * @param text              text the offsets refer to
     * @param entityStartOffset offset of the first character of the entity in {@code text}
     * @param entityEndOffset   offset directly behind the last character of the entity in {@code text}
     */
    private void findSurroundingWords(Entity entity, String text, int entityStartOffset, int entityEndOffset) {

        int window = redactionServiceSettings.getSurroundingWordsOffsetWindow();
        int maxWords = redactionServiceSettings.getNumberOfSurroundingWords();

        // Clamp the window to the bounds of the text.
        int windowStart = Math.max(0, entityStartOffset - window);
        String textBefore = text.substring(windowStart, entityStartOffset);
        if (!textBefore.isBlank()) {
            entity.setTextBefore(concatWordsBefore(textBefore.split(" "), maxWords));
        }

        int windowEnd = Math.min(text.length(), entityEndOffset + window);
        String textAfter = text.substring(entityEndOffset, windowEnd);
        if (!textAfter.isBlank()) {
            entity.setTextAfter(concatWordsAfter(textAfter.split(" "), maxWords));
        }
    }


    /**
     * Joins the last {@code number} non-empty words with single spaces, keeping their order.
     * Empty tokens (produced by leading or repeated spaces) are not counted as words.
     */
    private String concatWordsBefore(String[] words, int number) {

        StringBuilder sb = new StringBuilder();
        int taken = 0;
        // Walk backwards so that the words closest to the entity are the ones kept.
        for (int i = words.length - 1; i >= 0 && taken < number; i--) {
            if (words[i].isEmpty()) {
                continue;
            }
            if (sb.length() > 0) {
                sb.insert(0, ' ');
            }
            sb.insert(0, words[i]);
            taken++;
        }
        return sb.toString().trim();
    }


    /**
     * Joins the first {@code number} non-empty words with single spaces.
     * Empty tokens (produced by leading or repeated spaces) are not counted as words.
     */
    private String concatWordsAfter(String[] words, int number) {

        StringBuilder sb = new StringBuilder();
        int taken = 0;
        for (int i = 0; i < words.length && taken < number; i++) {
            if (words[i].isEmpty()) {
                continue;
            }
            if (sb.length() > 0) {
                sb.append(' ');
            }
            sb.append(words[i]);
            taken++;
        }
        return sb.toString().trim();
    }

}

View File

@ -11,7 +11,7 @@ public class TextNormalizationUtilities {
* @return Text without line-break hyphenation.
*/
public static String removeHyphenLineBreaks(String text) {
return text.replaceAll("\\s(\\S+)[\\-\\u00AD]\\R|\n\r(.+ )", "\n$1$2");
return text.replaceAll("([^\\s\\d\\-]{2,})[\\-\\u00AD]\\R|\n\r(.+ )", "$1$2");
}
}

View File

@ -7,12 +7,9 @@ import lombok.Data;
/**
 * Settings of the redaction service, bound to configuration properties with the prefix
 * {@code redaction-service}. The initialisers below are the values used when a property is not set.
 */
@Data
@ConfigurationProperties("redaction-service")
public class RedactionServiceSettings {
// Upper bound for the number of words SurroundingWordsService keeps before and after an entity.
private int numberOfSurroundingWords = 3;
/**
* Tenant used in single tenant mode.
*/
private String defaultTenant = "iqser-id";
// Resolution (DPI) PdfFlattenService passes to the renderer when turning pages into images.
private int flattenImageDpi = 100;
// Number of characters on each side of an entity that SurroundingWordsService scans for surrounding words.
private int surroundingWordsOffsetWindow = 100;
}

View File

@ -42,10 +42,13 @@ public class Cell extends Rectangle {
StringBuilder sb = new StringBuilder();
Iterator<TextBlock> itty = textBlocks.iterator();
TextPositionSequence previous = null;
while (itty.hasNext()) {
TextBlock textBlock = itty.next();
TextPositionSequence previous = null;
for (TextPositionSequence word : textBlock.getSequences()) {
if (previous != null) {
if (Math.abs(previous.getRotationAdjustedY() - word.getRotationAdjustedY()) > word.getTextHeight()) {
@ -57,9 +60,7 @@ public class Cell extends Rectangle {
sb.append(word.toString());
previous = word;
}
if (itty.hasNext()) {
sb.append(' ');
}
}
return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString())
@ -67,4 +68,8 @@ public class Cell extends Rectangle {
.replaceAll(" {2}", " ");
}
}

View File

@ -276,6 +276,8 @@ public class AnnotationHighlightService {
.sectionNumber(entity.getSectionNumber())
.matchedRule(entity.getMatchedRule())
.isDictionaryEntry(entity.isDictionaryEntry())
.textAfter(entity.getTextAfter())
.textBefore(entity.getTextBefore())
.build();
}

View File

@ -1,68 +0,0 @@
package com.iqser.red.service.redaction.v1.server.visualization.service;
import java.awt.image.BufferedImage;
import java.io.IOException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
/**
 * Flattens a PDF by rendering every page to an image and building a new document that
 * contains only those images.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class PdfFlattenService {

    private final RedactionServiceSettings settings;


    /**
     * Renders each page of {@code sourceDoc} with the configured DPI and returns a new document
     * with one image-only page per source page.
     *
     * <p>On success the source document is closed and the caller owns the returned document.
     * On failure the partially built result is closed and the source document is left untouched.
     *
     * @param sourceDoc document to flatten
     * @return new in-memory document consisting of the rendered page images
     * @throws IOException if rendering a page or writing an image into the new document fails
     */
    public PDDocument flattenPDF(PDDocument sourceDoc) throws IOException {

        PDDocument destDoc = new PDDocument();
        try {
            PDFRenderer pdfRenderer = new PDFRenderer(sourceDoc);
            final int pageCount = sourceDoc.getDocumentCatalog().getPages().getCount();
            final int dpi = settings.getFlattenImageDpi();

            log.info("{} page{} to flatten.", pageCount, pageCount == 1 ? "" : "s");
            for (int i = 0; i < pageCount; i += 1) {
                log.info("Flattening page {} of {}...", i + 1, pageCount);
                BufferedImage img = pdfRenderer.renderImageWithDPI(i, dpi, ImageType.RGB);
                log.info("Image rendered in memory ({}x{} {}DPI). Adding to PDF...", img.getWidth(), img.getHeight(), dpi);

                // The new page has exactly the size of the rendered image.
                PDPage imagePage = new PDPage(new PDRectangle(img.getWidth(), img.getHeight()));
                destDoc.addPage(imagePage);
                PDImageXObject imgObj = LosslessFactory.createFromImage(destDoc, img);

                // try-with-resources: the content stream is closed even if drawing fails.
                try (PDPageContentStream imagePageContentStream = new PDPageContentStream(destDoc, imagePage)) {
                    imagePageContentStream.drawImage(imgObj, 0, 0);
                    log.info("Image added successfully.");
                }
                img.flush();
            }
            log.info("New flattened PDF created in memory.");
            sourceDoc.close();
            return destDoc;
        } catch (IOException | RuntimeException e) {
            // Nobody else holds a reference to the half-built document, so release it here.
            try {
                destDoc.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }

}

View File

@ -382,7 +382,7 @@ public class RedactionIntegrationTest {
System.out.println("redactionTest");
long start = System.currentTimeMillis();
ClassPathResource pdfFileResource = new ClassPathResource("files/Fludioxonil/51 Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf");
ClassPathResource pdfFileResource = new ClassPathResource("files/Fludioxonil/52 Fludioxonil_RAR_07_Volume_3CA_B-5_2018-02-21.pdf");
RedactionRequest request = RedactionRequest.builder()
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
@ -391,11 +391,11 @@ public class RedactionIntegrationTest {
RedactionResult result = redactionController.redact(request);
result.getRedactionLog().getRedactionLogEntry().forEach(entry -> {
if(entry.isDictionaryEntry()){
System.out.println(entry.getValue());
}
});
// result.getRedactionLog().getRedactionLogEntry().forEach(entry -> {
// if(!entry.isHint()){
// System.out.println(entry.getPositions().get(0).getPage() +":"+ entry.getTextBefore() +"--->"+ entry.getValue() + "--->" + entry.getTextAfter());
// }
// });
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Redacted.pdf")) {
fileOutputStream.write(result.getDocument());

View File

@ -1,17 +0,0 @@
package com.iqser.red.service.redaction.v1.server.redaction.utils;
import java.util.regex.Pattern;
import lombok.experimental.UtilityClass;
/**
 * Static helpers for normalising extracted text. Not instantiable.
 */
public final class TextNormalizationUtilities {

    /**
     * Compiled once instead of on every call. Two alternatives:
     * a whitespace character, a word fragment (group 1), a hyphen or soft hyphen and a line break;
     * or the sequence {@code \n\r} followed by text up to a space (group 2).
     */
    private static final Pattern HYPHEN_LINE_BREAK = Pattern.compile("\\s(\\S+)[\\-\\u00AD]\\R|\n\r(.+ )");


    private TextNormalizationUtilities() {
        throw new UnsupportedOperationException("This is a utility class and cannot be instantiated");
    }


    /**
     * Revert hyphenation due to line breaks.
     *
     * <p>A word fragment that ends with a hyphen (or soft hyphen) directly before a line break is
     * joined with the text on the next line; the whitespace in front of the fragment is replaced
     * by a line feed. Hyphenated fragments that are not preceded by whitespace are left unchanged.
     *
     * @param text Text to be processed.
     * @return Text without line-break hyphenation.
     */
    public static String removeHyphenLineBreaks(String text) {
        // Only one of the two groups takes part in a match; the other one expands to nothing.
        return HYPHEN_LINE_BREAK.matcher(text).replaceAll("\n$1$2");
    }

}

View File

@ -10,11 +10,11 @@ public class TextNormalizationUtilitiesTest {
String test = "Without these peo-\nple, this conference would not happen";
Assertions.assertThat(TextNormalizationUtilities.removeHyphenLineBreaks(test))
.contains("\npeople");
.contains("people");
test = "Die\t\nFreiwillige\t Versicherung\t endet\t zudem\t für\t den\t ein\u00AD\nzelnen\tVersicherten\tmit\tder\tAufhebung\tdes\tVertra-\nges,\t seiner\t Unterstellung\t unter\t die\t obligatorische\t\nVersicherung\t oder\t seinem\t Ausschluss.";
Assertions.assertThat(TextNormalizationUtilities.removeHyphenLineBreaks(test))
.contains("\neinzelnen", "\nVertrages");
.contains("einzelnen", "Vertrages");
}