diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Entity.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Entity.java index 776ae1b0..7f7493b8 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Entity.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Entity.java @@ -3,6 +3,8 @@ package com.iqser.red.service.redaction.v1.server.redaction.model; import java.util.ArrayList; import java.util.List; +import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; + import lombok.Data; import lombok.EqualsAndHashCode; @@ -16,6 +18,7 @@ public class Entity { private boolean redaction; private String redactionReason; private List positionSequences = new ArrayList<>(); + private List targetSequences; private Integer start; private Integer end; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java index 41ee4548..8652bf57 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java @@ -1,6 +1,7 @@ package com.iqser.red.service.redaction.v1.server.redaction.model; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import java.util.regex.Pattern; @@ -25,9 +26,16 @@ public class SearchableText { } - @SuppressWarnings("checkstyle:ModifiedControlVariable") public List getSequences(String searchString, boolean caseInsensitive) { + return getSequences(searchString, caseInsensitive, null); + + } + + @SuppressWarnings("checkstyle:ModifiedControlVariable") + public List getSequences(String searchString, boolean caseInsensitive, + List sequencesSubList) { + String normalizedSearchString; if (caseInsensitive) { normalizedSearchString = searchString.toLowerCase(); @@ -40,37 +48,50 @@ public class SearchableText { List crossSequenceParts = new ArrayList<>(); List finalMatches = new ArrayList<>(); - for (int i = 0; i < sequences.size(); i++) { - TextPositionSequence partMatch = new TextPositionSequence(sequences.get(i).getPage()); - for (int j = 0; j < sequences.get(i).length(); j++) { - if (i > 0 && j == 0 && sequences.get(i).charAt(0, caseInsensitive) == ' ' && sequences.get(i - 1) - .charAt(sequences.get(i - 1).length() - 1, caseInsensitive) == ' ' || j > 0 && sequences.get(i) - .charAt(j, caseInsensitive) == ' ' && sequences.get(i).charAt(j - 1, caseInsensitive) == ' ') { - if (j == sequences.get(i).length() - 1 && counter != 0 && !partMatch.getTextPositions().isEmpty()) { + List searchSpace; + if (sequencesSubList != null) { + int subListIndex = Collections.indexOfSubList(sequences, sequencesSubList); + if (subListIndex != -1) { + searchSpace = sequences.subList(subListIndex, subListIndex + sequencesSubList.size()); + } else { + searchSpace = sequences; + } + } else { + searchSpace = sequences; + } + + for (int i = 0; i < searchSpace.size(); i++) { + TextPositionSequence partMatch = new TextPositionSequence(searchSpace.get(i).getPage()); + for (int j = 0; j < searchSpace.get(i).length(); j++) { + + if (i > 0 && j == 0 && searchSpace.get(i).charAt(0, caseInsensitive) == ' ' && searchSpace.get(i - 1) + .charAt(searchSpace.get(i - 1).length() - 1, caseInsensitive) == ' ' || j > 0 && searchSpace.get(i) + .charAt(j, caseInsensitive) == ' ' && searchSpace.get(i).charAt(j - 1, caseInsensitive) == ' ') { + if (j == searchSpace.get(i).length() - 1 && counter != 0 && !partMatch.getTextPositions().isEmpty()) { crossSequenceParts.add(partMatch); } continue; } - if (j == 0 && sequences.get(i).charAt(j, caseInsensitive) != ' ' && i != 0 && sequences.get(i - 1) - .charAt(sequences.get(i - 1) + if (j == 0 && searchSpace.get(i).charAt(j, caseInsensitive) != ' ' && i != 0 && searchSpace.get(i - 1) + .charAt(searchSpace.get(i - 1) .length() - 1, caseInsensitive) != ' ' && searchChars[counter] == ' ') { counter++; } - if (sequences.get(i) - .charAt(j, caseInsensitive) == searchChars[counter] || counter != 0 && sequences.get(i) + if (searchSpace.get(i) + .charAt(j, caseInsensitive) == searchChars[counter] || counter != 0 && searchSpace.get(i) .charAt(j, caseInsensitive) == '-') { - if (counter != 0 || i == 0 && j == 0 || j != 0 && isSeparator(sequences.get(i) - .charAt(j - 1, caseInsensitive)) || j == 0 && i != 0 && isSeparator(sequences.get(i - 1) - .charAt(sequences.get(i - 1) - .length() - 1, caseInsensitive)) || j == 0 && i != 0 && sequences.get(i - 1) - .charAt(sequences.get(i - 1).length() - 1, caseInsensitive) != ' ' && sequences.get(i) + if (counter != 0 || i == 0 && j == 0 || j != 0 && isSeparator(searchSpace.get(i) + .charAt(j - 1, caseInsensitive)) || j == 0 && i != 0 && isSeparator(searchSpace.get(i - 1) + .charAt(searchSpace.get(i - 1) + .length() - 1, caseInsensitive)) || j == 0 && i != 0 && searchSpace.get(i - 1) + .charAt(searchSpace.get(i - 1).length() - 1, caseInsensitive) != ' ' && searchSpace.get(i) .charAt(j, caseInsensitive) != ' ') { - partMatch.add(sequences.get(i).textPositionAt(j)); - if (!(j == sequences.get(i).length() - 1 && sequences.get(i) + partMatch.add(searchSpace.get(i).textPositionAt(j)); + if (!(j == searchSpace.get(i).length() - 1 && searchSpace.get(i) .charAt(j, caseInsensitive) == '-' && searchChars[counter] != '-')) { counter++; } @@ -79,19 +100,19 @@ public class SearchableText { if (counter == searchString.length()) { crossSequenceParts.add(partMatch); - if (i == sequences.size() - 1 && j == sequences.get(i).length() - 1 || j != sequences.get(i) - .length() - 1 && isSeparator(sequences.get(i) - .charAt(j + 1, caseInsensitive)) || j == sequences.get(i) - .length() - 1 && isSeparator(sequences.get(i + 1) - .charAt(0, caseInsensitive)) || j == sequences.get(i).length() - 1 && sequences.get(i) - .charAt(j, caseInsensitive) != ' ' && sequences.get(i + 1) + if (i == searchSpace.size() - 1 && j == searchSpace.get(i).length() - 1 || j != searchSpace.get(i) + .length() - 1 && isSeparator(searchSpace.get(i) + .charAt(j + 1, caseInsensitive)) || j == searchSpace.get(i) + .length() - 1 && isSeparator(searchSpace.get(i + 1) + .charAt(0, caseInsensitive)) || j == searchSpace.get(i).length() - 1 && searchSpace.get(i) + .charAt(j, caseInsensitive) != ' ' && searchSpace.get(i + 1) .charAt(0, caseInsensitive) != ' ') { finalMatches.addAll(buildEntityPositionSequence(crossSequenceParts)); } counter = 0; crossSequenceParts = new ArrayList<>(); - partMatch = new TextPositionSequence(sequences.get(i).getPage()); + partMatch = new TextPositionSequence(searchSpace.get(i).getPage()); } } else { counter = 0; @@ -99,16 +120,17 @@ public class SearchableText { j--; } crossSequenceParts = new ArrayList<>(); - partMatch = new TextPositionSequence(sequences.get(i).getPage()); + partMatch = new TextPositionSequence(searchSpace.get(i).getPage()); } - if (j == sequences.get(i).length() - 1 && counter != 0) { + if (j == searchSpace.get(i).length() - 1 && counter != 0) { crossSequenceParts.add(partMatch); } } } return finalMatches; + } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java index dd6f9419..ca571a84 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java @@ -7,9 +7,10 @@ import java.util.Map; import java.util.Set; import java.util.regex.Pattern; -import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.lang3.StringUtils; +import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; + import lombok.Builder; import lombok.Data; import lombok.extern.slf4j.Slf4j; @@ -31,7 +32,21 @@ public class Section { private int sectionNumber; - private Map tabularData; + private Map tabularData; + + + public boolean isVertebrateStudy() { + return tabularData != null + && tabularData.containsKey("Vertebrate study Y/N") + && tabularData.get("Vertebrate study Y/N").getText().equals("Y"); + } + + + public boolean isNotVertebrateStudy() { + return tabularData != null + && tabularData.containsKey("Vertebrate study Y/N") + && tabularData.get("Vertebrate study Y/N").getText().equals("N"); + } public boolean contains(String type) { @@ -163,20 +178,16 @@ public class Section { public void highlightCell(String cellHeader, int ruleNumber) { - String value = tabularData.get(cellHeader); + TextBlock value = tabularData.get(cellHeader); if (value == null) { log.warn("Could not find any data for {}.", cellHeader); } else { - Set found = findEntities(value, "must_redact"); - if (CollectionUtils.isEmpty(found)) { - log.warn("Could not identify value {} in row.", value); - } else { - Entity entity = found.iterator().next(); - entity.setRedaction(false); - entity.setMatchedRule(ruleNumber); - entity.setRedactionReason(cellHeader); - entities.add(entity); - } + Entity entity = new Entity(value.getText(), "must_redact", 0, value.getText().length(), headline, sectionNumber); + entity.setRedaction(false); + entity.setMatchedRule(ruleNumber); + entity.setRedactionReason(cellHeader); + entity.setTargetSequences(value.getSequences()); // Make sure no other cells with same content are highlighted + entities.add(entity); } } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java index a250b4a6..9bfd27cd 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java @@ -53,7 +53,7 @@ public class EntityRedactionService { for (Table table : tables) { for (List row : table.getRows()) { SearchableText searchableRow = new SearchableText(); - Map tabularData = new HashMap<>(); + Map tabularData = new HashMap<>(); for (Cell cell : row) { if (cell.isHeaderCell() || CollectionUtils.isEmpty(cell.getTextBlocks())) { continue; @@ -63,11 +63,12 @@ public class EntityRedactionService { String headerName = headerCell.getTextBlocks().get(0).getText() .replaceAll("\n", " ") .replaceAll(" ", " "); - tabularData.put(headerName, cell.getTextBlocks().get(0).getText()); + tabularData.put(headerName, cell.getTextBlocks().get(0)); }); for (TextBlock textBlock : cell.getTextBlocks()) { searchableRow.addAll(textBlock.getSequences()); } + } Set rowEntities = findEntities(searchableRow, table.getHeadline(), sectionNumber); @@ -124,9 +125,9 @@ public class EntityRedactionService { for (Entity entity : entities) { if (dictionaryService.getCaseInsensitiveTypes().contains(entity.getType())) { - entity.setPositionSequences(text.getSequences(entity.getWord(), true)); + entity.setPositionSequences(text.getSequences(entity.getWord(), true, entity.getTargetSequences())); } else { - entity.setPositionSequences(text.getSequences(entity.getWord(), false)); + entity.setPositionSequences(text.getSequences(entity.getWord(), false, entity.getTargetSequences())); } } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java index 4e5628de..c118d0e0 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java @@ -247,40 +247,29 @@ public class Table extends AbstractTextContainer { List row = rowsOfCells.get(i); Iterator rowCells = row.iterator(); int startColumn = 0; -// int jumpToColumn = 0; + int jumpToColumn = 0; while (rowCells.hasNext()) { Cell cell = rowCells.next(); if (i > 0) { -// Rectangle rectangle = new Rectangle(cell.getBottom(), -// si.getBounds().getLeft(), -// cell.getLeft() - si.getBounds().getLeft() + 1, -// si.getBounds().getBottom() - cell.getBottom()); -// List> others = rowsOfCells(si.contains(rectangle)); -// -// for (List r : others) { -// jumpToColumn = Math.max(jumpToColumn, r.size()); -// } -// -// while (startColumn != jumpToColumn) { -// add(previousNonNullCellForColumnIndex.get(startColumn), i, startColumn); -// startColumn++; -// } + Rectangle rectangle = new Rectangle(cell.getBottom(), + si.getBounds().getLeft(), + cell.getLeft() - si.getBounds().getLeft() + 1, + si.getBounds().getBottom() - cell.getBottom()); + List> others = rowsOfCells(si.contains(rectangle)); - // Handle cells spanning several rows - while (previousNonNullCellForColumnIndex.get(startColumn) != null) { - Cell previouslyAddedCellForSameColumn = previousNonNullCellForColumnIndex.get(startColumn); - float previousRight = previouslyAddedCellForSameColumn.getRight(); - float thisLeft = cell.getLeft(); - if (previousRight > thisLeft) { - break; - } + for (List r : others) { + jumpToColumn = Math.max(jumpToColumn, r.size()); + } + + while (startColumn != jumpToColumn) { + add(previousNonNullCellForColumnIndex.get(startColumn), i, startColumn); startColumn++; } } add(cell, i, startColumn); previousNonNullCellForColumnIndex.put(startColumn, cell); startColumn++; -// jumpToColumn = startColumn; + jumpToColumn = startColumn; } } } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java index 04003f7f..e771bb49 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java @@ -227,6 +227,7 @@ public class RedactionIntegrationTest { @Test public void noExceptionShouldBeThrownForAnyFiles() throws IOException { + System.out.println("noExceptionShouldBeThrownForAnyFiles"); ClassLoader loader = getClass().getClassLoader(); URL url = loader.getResource("files"); File[] files = new File(url.getPath()).listFiles(); @@ -266,6 +267,7 @@ public class RedactionIntegrationTest { @Test public void redactionTest() throws IOException { + System.out.println("redactionTest"); long start = System.currentTimeMillis(); ClassPathResource pdfFileResource = new ClassPathResource("files/Trinexapac/96 Trinexapac-ethyl_RAR_09_Volume_3CA_B-7_2018-02-23.pdf"); @@ -289,8 +291,9 @@ public class RedactionIntegrationTest { @Test public void testTableRedaction() throws IOException { + System.out.println("testTableRedaction"); long start = System.currentTimeMillis(); - ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Single Table.pdf"); + ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf"); RedactionRequest request = RedactionRequest.builder() .document(IOUtils.toByteArray(pdfFileResource.getInputStream())) @@ -311,6 +314,7 @@ public class RedactionIntegrationTest { @Test public void testManualRedaction() throws IOException { + System.out.println("testManualRedaction"); long start = System.currentTimeMillis(); ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Single Table.pdf"); @@ -345,6 +349,7 @@ public class RedactionIntegrationTest { @Test public void classificationTest() throws IOException { + System.out.println("classificationTest"); ClassPathResource pdfFileResource = new ClassPathResource("files/Fludioxonil/51 " + "Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf"); @@ -363,6 +368,7 @@ public class RedactionIntegrationTest { @Test public void sectionsTest() throws IOException { + System.out.println("sectionsTest"); ClassPathResource pdfFileResource = new ClassPathResource("files/Fludioxonil/51 " + "Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf"); @@ -381,6 +387,7 @@ public class RedactionIntegrationTest { @Test public void htmlTablesTest() throws IOException { + System.out.println("htmlTablesTest"); ClassPathResource pdfFileResource = new ClassPathResource("files/Fludioxonil/51 " + "Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf"); @@ -399,6 +406,7 @@ public class RedactionIntegrationTest { @Test public void htmlTableRotationTest() throws IOException { + System.out.println("htmlTableRotationTest"); ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S" + "-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf"); diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java index 4f046a38..f5d9071e 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java @@ -15,6 +15,7 @@ import java.util.Arrays; import java.util.Collections; import java.util.HashSet; import java.util.Set; +import java.util.concurrent.atomic.AtomicLong; import org.apache.commons.io.IOUtils; import org.apache.pdfbox.pdmodel.PDDocument; @@ -47,14 +48,15 @@ import com.iqser.red.service.redaction.v1.server.redaction.model.Entity; import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader; import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService; -@RunWith(SpringRunner.class) @SpringBootTest +@RunWith(SpringRunner.class) public class EntityRedactionServiceTest { private static final String DEFAULT_RULES = loadFromClassPath("drools/rules.drl"); private static final String NAME_CODE = "name"; private static final String ADDRESS_CODE = "address"; + private static final AtomicLong DICTIONARY_VERSION = new AtomicLong(); @MockBean private DictionaryClient dictionaryClient; @@ -117,6 +119,7 @@ public class EntityRedactionServiceTest { DictionaryResponse dictionaryResponse = DictionaryResponse.builder() .entries(Arrays.asList("Casey, H.W.", "O’Loughlin, C.K.", "Salamon, C.M.", "Smith, S.H.")) .build(); + when(dictionaryClient.getVersion()).thenReturn(DICTIONARY_VERSION.incrementAndGet()); when(dictionaryClient.getDictionaryForType(NAME_CODE)).thenReturn(dictionaryResponse); DictionaryResponse addressResponse = DictionaryResponse.builder() .entries(Collections.singletonList("Toxigenics, Inc., Decatur, IL 62526, USA")) @@ -133,45 +136,80 @@ public class EntityRedactionServiceTest { @Test - public void complexTable() throws IOException { + public void testTrueNegativesInTable() throws IOException { - ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Complex Table.pdf"); - - RedactionRequest redactionRequest = RedactionRequest.builder() - .document(IOUtils.toByteArray(pdfFileResource.getInputStream())) + ClassPathResource pdfFileResource = new ClassPathResource("files/Cyprodinil/40 Cyprodinil - EU AIR3 - LCA Section 1" + + " Supplement - Identity of the active substance - Reference list.pdf"); + when(dictionaryClient.getVersion()).thenReturn(DICTIONARY_VERSION.incrementAndGet()); + DictionaryResponse dictionaryResponse = DictionaryResponse.builder() + .entries(new ArrayList<>(ResourceLoader.load("dictionaries/names.txt"))) .build(); - - when(dictionaryClient.getDictionaryForType(ADDRESS_CODE)) - .thenReturn(DictionaryResponse.builder().entries(new ArrayList<>()).build()); - when(dictionaryClient.getDictionaryForType(NAME_CODE)) - .thenReturn(DictionaryResponse.builder().entries(new ArrayList<>()).build()); - - try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(redactionRequest.getDocument()))) { + when(dictionaryClient.getDictionaryForType(NAME_CODE)).thenReturn(dictionaryResponse); + DictionaryResponse addressResponse = DictionaryResponse.builder() + .entries(new ArrayList<>(ResourceLoader.load("dictionaries/addresses.txt"))) + .build(); + when(dictionaryClient.getDictionaryForType(ADDRESS_CODE)).thenReturn(addressResponse); + try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); entityRedactionService.processDocument(classifiedDoc, null); + assertThat(classifiedDoc.getEntities() + .entrySet() + .stream() + .noneMatch(entry -> entry.getValue().stream().anyMatch(e -> e.getMatchedRule() == 9))).isTrue(); + } + pdfFileResource = new ClassPathResource("files/Compounds/27 A8637C - EU AIR3 - MCP Section 1 - Identity of " + + "the plant protection product.pdf"); + try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { + Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); + entityRedactionService.processDocument(classifiedDoc, null); + assertThat(classifiedDoc.getEntities() + .entrySet() + .stream() + .noneMatch(entry -> entry.getValue().stream().anyMatch(e -> e.getMatchedRule() == 9))).isTrue(); } } + @Test + public void testFalsePositiveInWrongCell() throws IOException { + + ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Row With Ambiguous Redaction.pdf"); + when(dictionaryClient.getVersion()).thenReturn(DICTIONARY_VERSION.incrementAndGet()); + DictionaryResponse dictionaryResponse = DictionaryResponse.builder() + .entries(new ArrayList<>(ResourceLoader.load("dictionaries/names.txt"))) + .build(); + when(dictionaryClient.getDictionaryForType(NAME_CODE)).thenReturn(dictionaryResponse); + DictionaryResponse addressResponse = DictionaryResponse.builder() + .entries(new ArrayList<>(ResourceLoader.load("dictionaries/addresses.txt"))) + .build(); + when(dictionaryClient.getDictionaryForType(ADDRESS_CODE)).thenReturn(addressResponse); + try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { + Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); + entityRedactionService.processDocument(classifiedDoc, null); + assertThat(classifiedDoc.getEntities()).hasSize(1); // two pages + assertThat(classifiedDoc.getEntities().get(1).stream() + .filter(entity -> entity.getMatchedRule() == 9) + .count()).isEqualTo(10); + } + + } @Test public void headerPropagation() throws IOException { ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Header Propagation.pdf"); - RedactionRequest redactionRequest = RedactionRequest.builder() - .document(IOUtils.toByteArray(pdfFileResource.getInputStream())) - .build(); - DictionaryResponse dictionaryResponse = DictionaryResponse.builder() .entries(Arrays.asList("Bissig R.", "Thanei P.")) .build(); + + when(dictionaryClient.getVersion()).thenReturn(DICTIONARY_VERSION.incrementAndGet()); when(dictionaryClient.getDictionaryForType(NAME_CODE)).thenReturn(dictionaryResponse); DictionaryResponse addressResponse = DictionaryResponse.builder() .entries(Collections.singletonList("Novartis Crop Protection AG, Basel, Switzerland")) .build(); when(dictionaryClient.getDictionaryForType(ADDRESS_CODE)).thenReturn(addressResponse); - try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(redactionRequest.getDocument()))) { + try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); entityRedactionService.processDocument(classifiedDoc, null); assertThat(classifiedDoc.getEntities()).hasSize(2); // two pages @@ -190,10 +228,7 @@ public class EntityRedactionServiceTest { "global Section section\n" + "rule \"9: Redact Authors and Addresses in Reference Table, if it is a Vertebrate study\"\n" + " when\n" + - " Section(tabularData != null\n" + - " && tabularData.containsKey(\"Vertebrate study Y/N\")\n" + - " && tabularData.get(\"Vertebrate study Y/N\").equals(\"Y\")\n" + - " )\n" + + " Section(isVertebrateStudy())\n" + " then\n" + " section.redact(\"name\", 9, \"Redacted because row is a vertebrate study\");\n" + " section.redact(\"address\", 9, \"Redacted because rows is a vertebrate study\");\n" + @@ -206,10 +241,12 @@ public class EntityRedactionServiceTest { TypeResult.builder().type(NAME_CODE).color(new float[]{1, 1, 0}).build(), TypeResult.builder().type(ADDRESS_CODE).color(new float[]{0, 1, 1}).build())) .build(); + when(dictionaryClient.getVersion()).thenReturn(DICTIONARY_VERSION.incrementAndGet()); when(dictionaryClient.getAllTypes()).thenReturn(typeResponse); when(dictionaryClient.getDefaultColor()).thenReturn(new DefaultColor()); } + private static String loadFromClassPath(String path) { URL resource = ResourceLoader.class.getClassLoader().getResource(path); diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl index e89e8eb7..faaeda84 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl @@ -49,64 +49,69 @@ rule "5: Do not redact in guideline sections" section.redactNot("address", 5, "Section is a guideline section."); end -rule "6: Redact if must redact entry is found" - when - eval(section.contains("must_redact")==true); - then - section.redact("name", 6, "must_redact entry was found."); - section.redact("address", 6, "must_redact entry was found."); - end - - -rule "7: Redact contact information, if applicant is found" +rule "6: Redact contact information, if applicant is found" when eval(section.headlineContainsWord("applicant") || section.getText().contains("Applicant")); then - section.redactLineAfter("Name:", "address", 7, "Applicant information was found"); - section.redactBetween("Address:", "Contact", "address", 7, "Applicant information was found"); - section.redactLineAfter("Contact point:", "address", 7, "Applicant information was found"); - section.redactLineAfter("Phone:", "address", 7, "Applicant information was found"); - section.redactLineAfter("Fax:", "address", 7, "Applicant information was found"); - section.redactLineAfter("Tel.:", "address", 7, "Applicant information was found"); - section.redactLineAfter("Tel:", "address", 7, "Applicant information was found"); - section.redactLineAfter("E-mail:", "address", 7, "Applicant information was found"); - section.redactLineAfter("Email:", "address", 7, "Applicant information was found"); - section.redactLineAfter("Contact:", "address", 7, "Applicant information was found"); - section.redactLineAfter("Telephone number:", "address", 7, "Applicant information was found"); - section.redactLineAfter("Fax number:", "address", 7, "Applicant information was found"); - section.redactLineAfter("Telephone:", "address", 7, "Applicant information was found"); - section.redactBetween("No:", "Fax", "address", 7, "Applicant information was found"); - section.redactBetween("Contact:", "Tel.:", "address", 7, "Applicant information was found"); + section.redactLineAfter("Name:", "address", 6, "Applicant information was found"); + section.redactBetween("Address:", "Contact", "address", 6, "Applicant information was found"); + section.redactLineAfter("Contact point:", "address", 6, "Applicant information was found"); + section.redactLineAfter("Phone:", "address", 6, "Applicant information was found"); + section.redactLineAfter("Fax:", "address", 6, "Applicant information was found"); + section.redactLineAfter("Tel.:", "address", 6, "Applicant information was found"); + section.redactLineAfter("Tel:", "address", 6, "Applicant information was found"); + section.redactLineAfter("E-mail:", "address", 6, "Applicant information was found"); + section.redactLineAfter("Email:", "address", 6, "Applicant information was found"); + section.redactLineAfter("Contact:", "address", 6, "Applicant information was found"); + section.redactLineAfter("Telephone number:", "address", 6, "Applicant information was found"); + section.redactLineAfter("Fax number:", "address", 6, "Applicant information was found"); + section.redactLineAfter("Telephone:", "address", 6, "Applicant information was found"); + section.redactBetween("No:", "Fax", "address", 6, "Applicant information was found"); + section.redactBetween("Contact:", "Tel.:", "address", 6, "Applicant information was found"); end -rule "8: Redact contact information, if Producer is found" +rule "7: Redact contact information, if Producer is found" when eval(section.getText().toLowerCase().contains("producer of the plant protection") || section.getText().toLowerCase().contains("producer of the active substance") || section.getText().contains("Manufacturer of the active substance") || section.getText().contains("Manufacturer:") || section.getText().contains("Producer or producers of the active substance")); then - section.redactLineAfter("Name:", "address", 8, "Producer was found"); - section.redactBetween("Address:", "Contact", "address", 8, "Producer was found"); - section.redactBetween("Contact:", "Phone", "address", 8, "Producer was found"); - section.redactBetween("Contact:", "Telephone number:", "address", 8, "Producer was found"); - section.redactBetween("Address:", "Manufacturing", "address", 8, "Producer was found"); - section.redactLineAfter("Telephone:", "address", 8, "Producer was found"); - section.redactLineAfter("Phone:", "address", 8, "Producer was found"); - section.redactLineAfter("Fax:", "address", 8, "Producer was found"); - section.redactLineAfter("E-mail:", "address", 8, "Producer was found"); - section.redactLineAfter("Contact:", "address", 8, "Producer was found"); - section.redactLineAfter("Fax number:", "address", 8, "Producer was found"); - section.redactLineAfter("Telephone number:", "address", 8, "Producer was found"); - section.redactLineAfter("Tel:", "address", 8, "Producer was found"); - section.redactBetween("No:", "Fax", "address", 8, "Producer was found"); + section.redactLineAfter("Name:", "address", 7, "Producer was found"); + section.redactBetween("Address:", "Contact", "address", 7, "Producer was found"); + section.redactBetween("Contact:", "Phone", "address", 7, "Producer was found"); + section.redactBetween("Contact:", "Telephone number:", "address", 7, "Producer was found"); + section.redactBetween("Address:", "Manufacturing", "address", 7, "Producer was found"); + section.redactLineAfter("Telephone:", "address", 7, "Producer was found"); + section.redactLineAfter("Phone:", "address", 7, "Producer was found"); + section.redactLineAfter("Fax:", "address", 7, "Producer was found"); + section.redactLineAfter("E-mail:", "address", 7, "Producer was found"); + section.redactLineAfter("Contact:", "address", 7, "Producer was found"); + section.redactLineAfter("Fax number:", "address", 7, "Producer was found"); + section.redactLineAfter("Telephone number:", "address", 7, "Producer was found"); + section.redactLineAfter("Tel:", "address", 7, "Producer was found"); + section.redactBetween("No:", "Fax", "address", 7, "Producer was found"); end -rule "9: Redact Authors and Addresses in Reference Table, if it is a Vertebrate study" +rule "8: Redact Authors and Addresses in Reference Table, if it is a Vertebrate study" when - Section(tabularData != null - && tabularData.containsKey("Vertebrate study Y/N") - && tabularData.get("Vertebrate study Y/N").equals("Y") - ) + Section(isVertebrateStudy()) then - section.redact("name", 9, "Redacted because row is a vertebrate study"); - section.redact("address", 9, "Redacted because rows is a vertebrate study"); + section.redact("name", 8, "Redacted because row is a vertebrate study"); + section.redact("address", 8, "Redacted because row is a vertebrate study"); section.highlightCell("Vertebrate study Y/N", 9); - end \ No newline at end of file + end + +rule "9: Not redacted because Vertebrate Study = N" + when + Section(isNotVertebrateStudy()) + then + section.redactNot("name", 9, "Not redacted because row is not a vertebrate study"); + section.redactNot("address", 9, "Not redacted because row is not a vertebrate study"); + end + + +rule "10: Redact if must redact entry is found" + when + eval(section.contains("must_redact")==true); + then + section.redact("name", 10, "must_redact entry was found."); + section.redact("address", 10, "must_redact entry was found."); + end \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Row With Ambiguous Redaction.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Row With Ambiguous Redaction.pdf new file mode 100644 index 00000000..4943f1a0 Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Row With Ambiguous Redaction.pdf differ diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Spanning Cells.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Spanning Cells.pdf new file mode 100644 index 00000000..db5abbcd Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Spanning Cells.pdf differ