RED-299: Redact complete Author(s) Field in Vertebrate study tables

This commit is contained in:
deiflaender 2020-09-23 15:20:42 +02:00
parent 564d74e39d
commit 664b9b4206
6 changed files with 107 additions and 24 deletions

View File

@ -1,6 +1,8 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
import lombok.RequiredArgsConstructor;
import lombok.Value;
@ -13,4 +15,29 @@ public class CellValue {
int rowSpanStart;
/**
 * Renders the words of this cell value's text block as a single line of text.
 *
 * <p>Consecutive words are joined with a space, or with a line break when their
 * Y1 coordinates differ by more than the height of the later word. The joined
 * text is passed through {@code TextNormalizationUtilities.removeHyphenLineBreaks}
 * (presumably re-joining words hyphenated across lines; verify against that
 * helper), then remaining line breaks become spaces.
 *
 * @return the flattened, single-line text of the cell value
 */
@Override
public String toString() {
    StringBuilder joined = new StringBuilder();
    TextPositionSequence last = null;
    for (TextPositionSequence current : textBlock.getSequences()) {
        if (last != null) {
            // A vertical jump larger than the current word's height is treated as a new line.
            boolean startsNewLine = Math.abs(last.getY1() - current.getY1()) > current.getTextHeight();
            joined.append(startsNewLine ? '\n' : ' ');
        }
        joined.append(current.toString());
        last = current;
    }
    String withoutHyphenBreaks = TextNormalizationUtilities.removeHyphenLineBreaks(joined.toString());
    String singleLine = withoutHyphenBreaks.replaceAll("\n", " ");
    // Single pass: every pair of spaces becomes one space, so a run of three or
    // more spaces is shortened but not necessarily collapsed to exactly one.
    return singleLine.replaceAll(" {2}", " ");
}
}

View File

@ -33,13 +33,14 @@ public class Section {
private Map<String, CellValue> tabularData;
public boolean rowEquals(String headerName, String value){
String cleanHeaderName = headerName.replaceAll("\n", "")
.replaceAll(" ", "")
.replaceAll("-", "");
public boolean rowEquals(String headerName, String value) {
return tabularData != null && tabularData.containsKey(cleanHeaderName)
&& tabularData.get(cleanHeaderName).getTextBlock().getText().equals(value);
String cleanHeaderName = headerName.replaceAll("\n", "").replaceAll(" ", "").replaceAll("-", "");
return tabularData != null && tabularData.containsKey(cleanHeaderName) && tabularData.get(cleanHeaderName)
.getTextBlock()
.getText()
.equals(value);
}
@ -172,25 +173,46 @@ public class Section {
/**
 * Annotates the cell found under the given header without redacting it.
 *
 * <p>Delegates to {@code annotateCell} with the redact flag set to {@code false}
 * and no redaction reason.
 *
 * @param cellHeader header name of the cell to annotate
 * @param ruleNumber number of the rule that triggered the annotation
 * @param type entity type to assign to the annotation
 */
public void highlightCell(String cellHeader, int ruleNumber, String type) {
    // The header is normalised inside annotateCell, so no local cleaned copy is needed here.
    annotateCell(cellHeader, ruleNumber, type, false, null);
}
/**
 * Annotates the cell found under the given header and marks it for redaction.
 *
 * <p>Delegates to {@code annotateCell} with the redact flag set to {@code true}.
 *
 * @param cellHeader header name of the cell to annotate
 * @param ruleNumber number of the rule that triggered the annotation
 * @param type entity type to assign to the annotation
 * @param reason redaction reason passed on to the annotation
 */
public void redactCell(String cellHeader, int ruleNumber, String type, String reason) {
    final boolean redact = true;
    annotateCell(cellHeader, ruleNumber, type, redact, reason);
}
/**
 * Annotates the cell found under the given header as explicitly not redacted,
 * while still recording a reason.
 *
 * <p>Delegates to {@code annotateCell} with the redact flag set to {@code false}.
 *
 * @param cellHeader header name of the cell to annotate
 * @param ruleNumber number of the rule that triggered the annotation
 * @param type entity type to assign to the annotation
 * @param reason reason passed on to the annotation
 */
public void redactNotCell(String cellHeader, int ruleNumber, String type, String reason) {
    final boolean redact = false;
    annotateCell(cellHeader, ruleNumber, type, redact, reason);
}
private void annotateCell(String cellHeader, int ruleNumber, String type, boolean redact, String reason) {
String cleanHeaderName = cellHeader.replaceAll("\n", "").replaceAll(" ", "").replaceAll("-", "");
CellValue value = tabularData.get(cleanHeaderName);
if (value == null) {
log.warn("Could not find any data for {}.", cellHeader);
} else {
Entity entity = new Entity(value.getTextBlock()
.getText(), type, value.getRowSpanStart(), value.getRowSpanStart() + value.getTextBlock()
.getText()
Entity entity = new Entity(value.toString(), type, value.getRowSpanStart(), value.getRowSpanStart() + value.toString()
.length(), headline, sectionNumber);
entity.setRedaction(false);
entity.setRedaction(redact);
entity.setMatchedRule(ruleNumber);
entity.setRedactionReason(cellHeader);
entity.setTargetSequences(value.getTextBlock().getSequences()); // Make sure no other cells with same content are highlighted
entities.add(entity);
}
entity.setRedactionReason(reason);
entity.setTargetSequences(value.getTextBlock()
.getSequences()); // Make sure no other cells with same content are highlighted
// HashSet keeps the older value, but we want the new only.
if(entities.contains(entity)){
entities.remove(entity);
}
entities.add(entity);
entities = removeEntitiesContainedInLarger(entities);
}
}
}

View File

@ -72,7 +72,7 @@ public class EntityRedactionService {
.replaceAll("-", "");
tabularData.put(headerName, new CellValue(cell.getTextBlocks().get(0), cellStart));
});
start = start + cell.getTextBlocks().get(0).toString().length();
start = start + cell.toString().length();
for (TextBlock textBlock : cell.getTextBlocks()) {
searchableRow.addAll(textBlock.getSequences());
}

View File

@ -42,7 +42,9 @@ public class SectionsBuilderService {
if (prev != null && current.getClassification().startsWith("H ") || !document.isHeadlines()) {
Paragraph chunkBlock = buildTextBlock(chunkWords, lastHeadline);
chunkBlock.setHeadline(lastHeadline);
lastHeadline = current.getText();
if(document.isHeadlines()) {
lastHeadline = current.getText();
}
chunkBlockList.add(chunkBlock);
chunkWords = new ArrayList<>();
if (CollectionUtils.isNotEmpty(chunkBlock.getTables())) {

View File

@ -5,6 +5,8 @@ import java.util.ArrayList;
import java.util.List;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
import lombok.Data;
import lombok.EqualsAndHashCode;
@ -20,10 +22,10 @@ public class Cell extends Rectangle {
private boolean isHeaderCell;
public Cell(Point2D topLeft, Point2D bottomRight) {
super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()),
(float) (bottomRight
super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()), (float) (bottomRight
.getY() - topLeft.getY()));
}
@ -33,4 +35,34 @@ public class Cell extends Rectangle {
textBlocks.add(textBlock);
}
}
/**
 * Renders the words of all text blocks in this cell as a single line of text.
 *
 * <p>Words are visited block by block, in the order of {@code textBlocks}.
 * Consecutive words (also across block boundaries) are joined with a space, or
 * with a line break when their Y1 coordinates differ by more than the height of
 * the later word. The joined text is passed through
 * {@code TextNormalizationUtilities.removeHyphenLineBreaks} (presumably
 * re-joining words hyphenated across lines; verify against that helper), then
 * remaining line breaks become spaces.
 *
 * @return the flattened, single-line text of the cell
 */
@Override
public String toString() {
    StringBuilder joined = new StringBuilder();
    TextPositionSequence last = null;
    for (TextBlock block : textBlocks) {
        for (TextPositionSequence current : block.getSequences()) {
            if (last != null) {
                // A vertical jump larger than the current word's height is treated as a new line.
                boolean startsNewLine = Math.abs(last.getY1() - current.getY1()) > current.getTextHeight();
                joined.append(startsNewLine ? '\n' : ' ');
            }
            joined.append(current.toString());
            last = current;
        }
    }
    String withoutHyphenBreaks = TextNormalizationUtilities.removeHyphenLineBreaks(joined.toString());
    String singleLine = withoutHyphenBreaks.replaceAll("\n", " ");
    // Single pass: every pair of spaces becomes one space, so a run of three or
    // more spaces is shortened but not necessarily collapsed to exactly one.
    return singleLine.replaceAll(" {2}", " ");
}
}

View File

@ -101,7 +101,7 @@ rule "8: Not redacted because Vertebrate Study = N"
when
Section(rowEquals("Vertebrate study Y/N", "N") || rowEquals("Vertebrate study Y/N", "No"))
then
section.redactNot("name", 8, "Not redacted because row is not a vertebrate study");
section.redactNotCell("Author(s)", 8, "name", "Not redacted because row is not a vertebrate study");
section.redactNot("address", 8, "Not redacted because row is not a vertebrate study");
section.highlightCell("Vertebrate study Y/N", 8, "hint_only");
end
@ -120,7 +120,7 @@ rule "10: Redact Authors and Addresses in Reference Table if it is a Vertebrate
when
Section(rowEquals("Vertebrate study Y/N", "Y") || rowEquals("Vertebrate study Y/N", "Yes"))
then
section.redact("name", 10, "Redacted because row is a vertebrate study");
section.redactCell("Author(s)", 10, "name", "Redacted because row is a vertebrate study");
section.redact("address", 10, "Redacted because row is a vertebrate study");
section.highlightCell("Vertebrate study Y/N", 10, "must_redact");
end