diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtils.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtils.java index 253fe54e..c3f110a9 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtils.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtils.java @@ -43,7 +43,8 @@ public class EntitySearchUtils { startIndex = inputString.indexOf(cleanValue, stopIndex); stopIndex = startIndex + cleanValue.length(); - if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) { + if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString + .charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) { return true; } } while (startIndex > -1); @@ -71,7 +72,8 @@ public class EntitySearchUtils { startIndex = inputString.indexOf(cleanValue, stopIndex); stopIndex = startIndex + cleanValue.length(); - if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) { + if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString + .charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) { found.add(new Entity(inputString.substring(startIndex, stopIndex), type, startIndex, stopIndex, headline, sectionNumber, !local, isDossierDictionary)); } } while (startIndex > -1); @@ -125,7 +127,8 @@ public class EntitySearchUtils { for (Entity word : entities) { for (Entity inner : entities) { if (inner.getWord().length() < word.getWord() - .length() && inner.getStart() >= word.getStart() && inner.getEnd() <= word.getEnd() && word != inner && word.getSectionNumber() == inner.getSectionNumber()) { + .length() && inner.getStart() >= word.getStart() && inner.getEnd() <= word.getEnd() && word != inner && word + .getSectionNumber() == inner.getSectionNumber()) { wordsToRemove.add(inner); } } @@ -163,15 +166,35 @@ public class EntitySearchUtils { Set result = new HashSet<>(); if (existingEntities != null && foundEntities != null) { - for (Entity existingEntity : existingEntities) { - for (Entity foundEntity : foundEntities) { - if (existingEntity.getEnd() < foundEntity.getStart() || foundEntity.getEnd() < existingEntity.getStart()) { - result.add(foundEntity); - } + for (Entity foundEntity : foundEntities) { + + if (!overlaps(existingEntities, foundEntity)) { + result.add(foundEntity); } + } } return result; } + + private boolean overlaps(Set existingEntities, Entity found) { + + for (Entity existing : existingEntities) { + + if(existing.getStart().equals(found.getStart())){ + continue; + } + + for (int i = existing.getStart(); i <= existing.getEnd(); i++) { + for (int j = found.getStart(); j <= found.getEnd(); j++) { + if (i == j) { + return true; + } + } + } + } + return false; + } + } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java index bb0f19bb..46db8067 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java @@ -652,11 +652,28 @@ public class RedactionIntegrationTest { } + @Test + public void redactionExpansionOverlap() throws IOException { + + ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/ExpansionTest.pdf"); + AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream()); + + + AnalyzeResult result = reanalyzeService.analyze(request); + + var redactionLog = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID); + + var values = redactionLog.getRedactionLogEntry().stream().map(RedactionLogEntry::getValue).collect(Collectors.toList()); + + assertThat(values).containsExactlyInAnyOrder("Lastname M.", "Doe", "Doe J.", "M. Mustermann", "Mustermann M.", "F. Lastname"); + } + + @Test public void redactionTest() throws IOException { long start = System.currentTimeMillis(); - ClassPathResource pdfFileResource = new ClassPathResource("files/new/VV-919901.pdf"); + ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"); AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream()); request.setExcludedPages(Set.of(1)); @@ -1121,10 +1138,10 @@ public class RedactionIntegrationTest { private static String getTemporaryDirectory() { - String tmpdir = System.getProperty("java.io.tmpdir"); - if (StringUtils.isNotBlank(tmpdir)) { - return tmpdir; - } +// String tmpdir = System.getProperty("java.io.tmpdir"); +// if (StringUtils.isNotBlank(tmpdir)) { +// return tmpdir; +// } return "/tmp"; } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtilsTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtilsTest.java index 55c279df..59b2fff3 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtilsTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtilsTest.java @@ -134,8 +134,7 @@ public class EntitySearchUtilsTest { Set result = EntitySearchUtils.findNonOverlappingMatchEntities(existingEntities, foundEntities); // Assert - assertThat(result.size()).isEqualTo(1); - assertThat(result).contains(foundEntities2); + assertThat(result.size()).isEqualTo(0); } @@ -165,8 +164,7 @@ public class EntitySearchUtilsTest { Set result = EntitySearchUtils.findNonOverlappingMatchEntities(existingEntities, foundEntities); // Assert - assertThat(result.size()).isEqualTo(1); - assertThat(result).contains(foundEntities2); + assertThat(result.size()).isEqualTo(0); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/CBI_author.txt b/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/CBI_author.txt index 3d53e6d2..c8efcf01 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/CBI_author.txt +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/CBI_author.txt @@ -8585,4 +8585,9 @@ Zoriki Hosomi R. Zoriki Hosomi Rosana Zuberer D Zubrod J -Zwicker R.E. \ No newline at end of file +Zwicker R.E. +Doe +M. Mustermann +F. Lastname +Mustermann +Lastname \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/ExpansionTest.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/ExpansionTest.pdf new file mode 100644 index 00000000..5d55d76a Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/ExpansionTest.pdf differ