From 664b9b420605cb1adf1113b34c52a45e8024dbbc Mon Sep 17 00:00:00 2001 From: deiflaender Date: Wed, 23 Sep 2020 15:20:42 +0200 Subject: [PATCH] RED-299: Redact complete Author(s) Field in Vertebrate study tables --- .../v1/server/redaction/model/CellValue.java | 27 +++++++++ .../v1/server/redaction/model/Section.java | 56 +++++++++++++------ .../service/EntityRedactionService.java | 2 +- .../segmentation/SectionsBuilderService.java | 4 +- .../v1/server/tableextraction/model/Cell.java | 38 ++++++++++++- .../src/test/resources/drools/rules.drl | 4 +- 6 files changed, 107 insertions(+), 24 deletions(-) diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/CellValue.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/CellValue.java index e646cbef..632de0fc 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/CellValue.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/CellValue.java @@ -1,6 +1,8 @@ package com.iqser.red.service.redaction.v1.server.redaction.model; import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; +import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; +import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities; import lombok.RequiredArgsConstructor; import lombok.Value; @@ -13,4 +15,29 @@ public class CellValue { int rowSpanStart; + @Override + public String toString() { + + StringBuilder sb = new StringBuilder(); + + + TextPositionSequence previous = null; + for (TextPositionSequence word : textBlock.getSequences()) { + + if (previous != null) { + if (Math.abs(previous.getY1() - word.getY1()) > word.getTextHeight()) { + sb.append('\n'); + } else { + sb.append(' '); + } + } + sb.append(word.toString()); + previous = word; + } + + return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString()) + .replaceAll("\n", " ") + .replaceAll(" {2}", " "); + } + } \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java index feee9f99..dc14e5a8 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java @@ -33,13 +33,14 @@ public class Section { private Map tabularData; - public boolean rowEquals(String headerName, String value){ - String cleanHeaderName = headerName.replaceAll("\n", "") - .replaceAll(" ", "") - .replaceAll("-", ""); + public boolean rowEquals(String headerName, String value) { - return tabularData != null && tabularData.containsKey(cleanHeaderName) - && tabularData.get(cleanHeaderName).getTextBlock().getText().equals(value); + String cleanHeaderName = headerName.replaceAll("\n", "").replaceAll(" ", "").replaceAll("-", ""); + + return tabularData != null && tabularData.containsKey(cleanHeaderName) && tabularData.get(cleanHeaderName) + .getTextBlock() + .getText() + .equals(value); } @@ -172,25 +173,46 @@ public class Section { public void highlightCell(String cellHeader, int ruleNumber, String type) { - String cleanHeaderName = cellHeader.replaceAll("\n", "") - .replaceAll(" ", "") - .replaceAll("-", ""); + annotateCell(cellHeader, ruleNumber, type, false, null); + } + + + public void redactCell(String cellHeader, int ruleNumber, String type, String reason) { + + annotateCell(cellHeader, ruleNumber, type, true, reason); + } + + + public void redactNotCell(String cellHeader, int ruleNumber, String type, String reason) { + + annotateCell(cellHeader, ruleNumber, type, false, reason); + } + + + private void annotateCell(String cellHeader, int ruleNumber, String type, boolean redact, String reason) { + + String cleanHeaderName = cellHeader.replaceAll("\n", "").replaceAll(" ", "").replaceAll("-", ""); CellValue value = tabularData.get(cleanHeaderName); if (value == null) { log.warn("Could not find any data for {}.", cellHeader); } else { - Entity entity = new Entity(value.getTextBlock() - .getText(), type, value.getRowSpanStart(), value.getRowSpanStart() + value.getTextBlock() - .getText() + Entity entity = new Entity(value.toString(), type, value.getRowSpanStart(), value.getRowSpanStart() + value.toString() .length(), headline, sectionNumber); - entity.setRedaction(false); + entity.setRedaction(redact); entity.setMatchedRule(ruleNumber); - entity.setRedactionReason(cellHeader); - entity.setTargetSequences(value.getTextBlock().getSequences()); // Make sure no other cells with same content are highlighted - entities.add(entity); - } + entity.setRedactionReason(reason); + entity.setTargetSequences(value.getTextBlock() + .getSequences()); // Make sure no other cells with same content are highlighted + // HashSet keeps the older value, but we want the new only. + if(entities.contains(entity)){ + entities.remove(entity); + } + entities.add(entity); + + entities = removeEntitiesContainedInLarger(entities); + } } } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java index b41f1123..165786fe 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java @@ -72,7 +72,7 @@ public class EntityRedactionService { .replaceAll("-", ""); tabularData.put(headerName, new CellValue(cell.getTextBlocks().get(0), cellStart)); }); - start = start + cell.getTextBlocks().get(0).toString().length(); + start = start + cell.toString().length(); for (TextBlock textBlock : cell.getTextBlocks()) { searchableRow.addAll(textBlock.getSequences()); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java index 4862ac0d..3b591643 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java @@ -42,7 +42,9 @@ public class SectionsBuilderService { if (prev != null && current.getClassification().startsWith("H ") || !document.isHeadlines()) { Paragraph chunkBlock = buildTextBlock(chunkWords, lastHeadline); chunkBlock.setHeadline(lastHeadline); - lastHeadline = current.getText(); + if(document.isHeadlines()) { + lastHeadline = current.getText(); + } chunkBlockList.add(chunkBlock); chunkWords = new ArrayList<>(); if (CollectionUtils.isNotEmpty(chunkBlock.getTables())) { diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Cell.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Cell.java index 6b884f69..e6989939 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Cell.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Cell.java @@ -5,6 +5,8 @@ import java.util.ArrayList; import java.util.List; import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; +import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; +import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities; import lombok.Data; import lombok.EqualsAndHashCode; @@ -20,10 +22,10 @@ public class Cell extends Rectangle { private boolean isHeaderCell; + public Cell(Point2D topLeft, Point2D bottomRight) { - super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()), - (float) (bottomRight + super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()), (float) (bottomRight .getY() - topLeft.getY())); } @@ -33,4 +35,34 @@ public class Cell extends Rectangle { textBlocks.add(textBlock); } -} \ No newline at end of file + + @Override + public String toString() { + + StringBuilder sb = new StringBuilder(); + + TextPositionSequence previous = null; + for (TextBlock textBlock : textBlocks) { + + + for (TextPositionSequence word : textBlock.getSequences()) { + + if (previous != null) { + if (Math.abs(previous.getY1() - word.getY1()) > word.getTextHeight()) { + sb.append('\n'); + } else { + sb.append(' '); + } + } + sb.append(word.toString()); + previous = word; + } + } + + return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString()) + .replaceAll("\n", " ") + .replaceAll(" {2}", " "); + } + +} + diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl index 2e9cab7b..2490aad2 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl @@ -101,7 +101,7 @@ rule "8: Not redacted because Vertebrate Study = N" when Section(rowEquals("Vertebrate study Y/N", "N") || rowEquals("Vertebrate study Y/N", "No")) then - section.redactNot("name", 8, "Not redacted because row is not a vertebrate study"); + section.redactNotCell("Author(s)", 8, "name", "Not redacted because row is not a vertebrate study"); section.redactNot("address", 8, "Not redacted because row is not a vertebrate study"); section.highlightCell("Vertebrate study Y/N", 8, "hint_only"); end @@ -120,7 +120,7 @@ rule "10: Redact Authors and Addresses in Reference Table if it is a Vertebrate when Section(rowEquals("Vertebrate study Y/N", "Y") || rowEquals("Vertebrate study Y/N", "Yes")) then - section.redact("name", 10, "Redacted because row is a vertebrate study"); + section.redactCell("Author(s)", 10, "name", "Redacted because row is a vertebrate study"); section.redact("address", 10, "Redacted because row is a vertebrate study"); section.highlightCell("Vertebrate study Y/N", 10, "must_redact"); end