RED-299: Redact complete Author(s) Field in Vertebrate study tables
This commit is contained in:
parent
564d74e39d
commit
664b9b4206
@ -1,6 +1,8 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.Value;
|
||||
@ -13,4 +15,29 @@ public class CellValue {
|
||||
|
||||
int rowSpanStart;
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
|
||||
TextPositionSequence previous = null;
|
||||
for (TextPositionSequence word : textBlock.getSequences()) {
|
||||
|
||||
if (previous != null) {
|
||||
if (Math.abs(previous.getY1() - word.getY1()) > word.getTextHeight()) {
|
||||
sb.append('\n');
|
||||
} else {
|
||||
sb.append(' ');
|
||||
}
|
||||
}
|
||||
sb.append(word.toString());
|
||||
previous = word;
|
||||
}
|
||||
|
||||
return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString())
|
||||
.replaceAll("\n", " ")
|
||||
.replaceAll(" {2}", " ");
|
||||
}
|
||||
|
||||
}
|
||||
@ -33,13 +33,14 @@ public class Section {
|
||||
private Map<String, CellValue> tabularData;
|
||||
|
||||
|
||||
public boolean rowEquals(String headerName, String value){
|
||||
String cleanHeaderName = headerName.replaceAll("\n", "")
|
||||
.replaceAll(" ", "")
|
||||
.replaceAll("-", "");
|
||||
public boolean rowEquals(String headerName, String value) {
|
||||
|
||||
return tabularData != null && tabularData.containsKey(cleanHeaderName)
|
||||
&& tabularData.get(cleanHeaderName).getTextBlock().getText().equals(value);
|
||||
String cleanHeaderName = headerName.replaceAll("\n", "").replaceAll(" ", "").replaceAll("-", "");
|
||||
|
||||
return tabularData != null && tabularData.containsKey(cleanHeaderName) && tabularData.get(cleanHeaderName)
|
||||
.getTextBlock()
|
||||
.getText()
|
||||
.equals(value);
|
||||
}
|
||||
|
||||
|
||||
@ -172,25 +173,46 @@ public class Section {
|
||||
|
||||
public void highlightCell(String cellHeader, int ruleNumber, String type) {
|
||||
|
||||
String cleanHeaderName = cellHeader.replaceAll("\n", "")
|
||||
.replaceAll(" ", "")
|
||||
.replaceAll("-", "");
|
||||
annotateCell(cellHeader, ruleNumber, type, false, null);
|
||||
}
|
||||
|
||||
|
||||
/**
 * Annotates the cell under the given header as a redaction.
 * Delegates to annotateCell with the redact flag set to true.
 *
 * @param cellHeader column header of the cell to annotate
 * @param ruleNumber number of the rule that triggered the annotation
 * @param type entity type to assign
 * @param reason redaction reason to store on the created entity
 */
public void redactCell(String cellHeader, int ruleNumber, String type, String reason) {
|
||||
|
||||
annotateCell(cellHeader, ruleNumber, type, true, reason);
|
||||
}
|
||||
|
||||
|
||||
/**
 * Annotates the cell under the given header without flagging it as a redaction, while still
 * recording a reason. Delegates to annotateCell with the redact flag set to false.
 *
 * @param cellHeader column header of the cell to annotate
 * @param ruleNumber number of the rule that triggered the annotation
 * @param type entity type to assign
 * @param reason reason to store on the created entity
 */
public void redactNotCell(String cellHeader, int ruleNumber, String type, String reason) {
|
||||
|
||||
annotateCell(cellHeader, ruleNumber, type, false, reason);
|
||||
}
|
||||
|
||||
|
||||
private void annotateCell(String cellHeader, int ruleNumber, String type, boolean redact, String reason) {
|
||||
|
||||
String cleanHeaderName = cellHeader.replaceAll("\n", "").replaceAll(" ", "").replaceAll("-", "");
|
||||
|
||||
CellValue value = tabularData.get(cleanHeaderName);
|
||||
if (value == null) {
|
||||
log.warn("Could not find any data for {}.", cellHeader);
|
||||
} else {
|
||||
Entity entity = new Entity(value.getTextBlock()
|
||||
.getText(), type, value.getRowSpanStart(), value.getRowSpanStart() + value.getTextBlock()
|
||||
.getText()
|
||||
Entity entity = new Entity(value.toString(), type, value.getRowSpanStart(), value.getRowSpanStart() + value.toString()
|
||||
.length(), headline, sectionNumber);
|
||||
entity.setRedaction(false);
|
||||
entity.setRedaction(redact);
|
||||
entity.setMatchedRule(ruleNumber);
|
||||
entity.setRedactionReason(cellHeader);
|
||||
entity.setTargetSequences(value.getTextBlock().getSequences()); // Make sure no other cells with same content are highlighted
|
||||
entities.add(entity);
|
||||
}
|
||||
entity.setRedactionReason(reason);
|
||||
entity.setTargetSequences(value.getTextBlock()
|
||||
.getSequences()); // Make sure no other cells with same content are highlighted
|
||||
|
||||
// HashSet keeps the older value, but we want the new only.
|
||||
if(entities.contains(entity)){
|
||||
entities.remove(entity);
|
||||
}
|
||||
entities.add(entity);
|
||||
|
||||
entities = removeEntitiesContainedInLarger(entities);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -72,7 +72,7 @@ public class EntityRedactionService {
|
||||
.replaceAll("-", "");
|
||||
tabularData.put(headerName, new CellValue(cell.getTextBlocks().get(0), cellStart));
|
||||
});
|
||||
start = start + cell.getTextBlocks().get(0).toString().length();
|
||||
start = start + cell.toString().length();
|
||||
for (TextBlock textBlock : cell.getTextBlocks()) {
|
||||
searchableRow.addAll(textBlock.getSequences());
|
||||
}
|
||||
|
||||
@ -42,7 +42,9 @@ public class SectionsBuilderService {
|
||||
if (prev != null && current.getClassification().startsWith("H ") || !document.isHeadlines()) {
|
||||
Paragraph chunkBlock = buildTextBlock(chunkWords, lastHeadline);
|
||||
chunkBlock.setHeadline(lastHeadline);
|
||||
lastHeadline = current.getText();
|
||||
if(document.isHeadlines()) {
|
||||
lastHeadline = current.getText();
|
||||
}
|
||||
chunkBlockList.add(chunkBlock);
|
||||
chunkWords = new ArrayList<>();
|
||||
if (CollectionUtils.isNotEmpty(chunkBlock.getTables())) {
|
||||
|
||||
@ -5,6 +5,8 @@ import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
@ -20,10 +22,10 @@ public class Cell extends Rectangle {
|
||||
|
||||
private boolean isHeaderCell;
|
||||
|
||||
|
||||
/**
 * Creates a cell from its top-left and bottom-right corner points.
 *
 * The superclass constructor receives, in this order, the y and x coordinate of the top-left
 * corner, the horizontal distance between the corners and the vertical distance between them.
 *
 * @param topLeft top-left corner of the cell
 * @param bottomRight bottom-right corner of the cell
 */
public Cell(Point2D topLeft, Point2D bottomRight) {

    super((float) topLeft.getY(),
            (float) topLeft.getX(),
            (float) (bottomRight.getX() - topLeft.getX()),
            (float) (bottomRight.getY() - topLeft.getY()));
}
|
||||
|
||||
@ -33,4 +35,34 @@ public class Cell extends Rectangle {
|
||||
textBlocks.add(textBlock);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
TextPositionSequence previous = null;
|
||||
for (TextBlock textBlock : textBlocks) {
|
||||
|
||||
|
||||
for (TextPositionSequence word : textBlock.getSequences()) {
|
||||
|
||||
if (previous != null) {
|
||||
if (Math.abs(previous.getY1() - word.getY1()) > word.getTextHeight()) {
|
||||
sb.append('\n');
|
||||
} else {
|
||||
sb.append(' ');
|
||||
}
|
||||
}
|
||||
sb.append(word.toString());
|
||||
previous = word;
|
||||
}
|
||||
}
|
||||
|
||||
return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString())
|
||||
.replaceAll("\n", " ")
|
||||
.replaceAll(" {2}", " ");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -101,7 +101,7 @@ rule "8: Not redacted because Vertebrate Study = N"
|
||||
when
|
||||
Section(rowEquals("Vertebrate study Y/N", "N") || rowEquals("Vertebrate study Y/N", "No"))
|
||||
then
|
||||
section.redactNot("name", 8, "Not redacted because row is not a vertebrate study");
|
||||
section.redactNotCell("Author(s)", 8, "name", "Not redacted because row is not a vertebrate study");
|
||||
section.redactNot("address", 8, "Not redacted because row is not a vertebrate study");
|
||||
section.highlightCell("Vertebrate study Y/N", 8, "hint_only");
|
||||
end
|
||||
@ -120,7 +120,7 @@ rule "10: Redact Authors and Addresses in Reference Table if it is a Vertebrate
|
||||
when
|
||||
Section(rowEquals("Vertebrate study Y/N", "Y") || rowEquals("Vertebrate study Y/N", "Yes"))
|
||||
then
|
||||
section.redact("name", 10, "Redacted because row is a vertebrate study");
|
||||
section.redactCell("Author(s)", 10, "name", "Redacted because row is a vertebrate study");
|
||||
section.redact("address", 10, "Redacted because row is a vertebrate study");
|
||||
section.highlightCell("Vertebrate study Y/N", 10, "must_redact");
|
||||
end
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user