diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java index 774b3b6c..3d89445a 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java @@ -3,6 +3,7 @@ package com.iqser.red.service.redaction.v1.server.redaction.model; import com.fasterxml.jackson.annotation.JsonIgnore; import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder; +import com.iqser.red.service.redaction.v1.server.redaction.utils.SeparatorUtils; import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities; import java.util.ArrayList; @@ -92,8 +93,8 @@ public class SearchableText { .get(i) .charAt(j, caseInsensitive) == '-') { - if (counter != 0 || i == 0 && j == 0 || j != 0 && isSeparator(searchSpace.get(i) - .charAt(j - 1, caseInsensitive)) || j == 0 && i != 0 && isSeparator(searchSpace.get(i - 1) + if (counter != 0 || i == 0 && j == 0 || j != 0 && SeparatorUtils.isSeparator(searchSpace.get(i) + .charAt(j - 1, caseInsensitive)) || j == 0 && i != 0 && SeparatorUtils.isSeparator(searchSpace.get(i - 1) .charAt(searchSpace.get(i - 1) .length() - 1, caseInsensitive)) || j == 0 && i != 0 && searchSpace.get(i - 1) .charAt(searchSpace.get(i - 1).length() - 1, caseInsensitive) != ' ' && searchSpace.get(i) @@ -109,9 +110,9 @@ public class SearchableText { crossSequenceParts.add(partMatch); if (i == searchSpace.size() - 1 && j == searchSpace.get(i) - .length() - 1 || j != searchSpace.get(i).length() - 1 && isSeparator(searchSpace.get(i) + .length() - 1 || j != searchSpace.get(i).length() - 1 && SeparatorUtils.isSeparator(searchSpace.get(i) .charAt(j + 1, caseInsensitive)) || j == searchSpace.get(i) - .length() - 1 && isSeparator(searchSpace.get(i + 1) + .length() - 1 && SeparatorUtils.isSeparator(searchSpace.get(i + 1) .charAt(0, caseInsensitive)) || j == searchSpace.get(i) .length() - 1 && searchSpace.get(i) .charAt(j, caseInsensitive) != ' ' && searchSpace.get(i + 1) @@ -179,12 +180,6 @@ public class SearchableText { } - private boolean isSeparator(char c) { - - return Character.isWhitespace(c) || Pattern.matches("\\p{Punct}", String.valueOf(c)) || c == '\"' || c == '‘' || c == '’' || c == '”'; - } - - @Override public String toString() { return buildString(sequences); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtils.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtils.java index 0fc10144..267bd156 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtils.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtils.java @@ -4,7 +4,6 @@ import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary; import com.iqser.red.service.redaction.v1.server.redaction.model.*; import lombok.experimental.UtilityClass; import lombok.extern.slf4j.Slf4j; -import org.ahocorasick.trie.Trie; import java.util.*; import java.util.regex.Pattern; @@ -47,18 +46,14 @@ public class EntitySearchUtils { private void validateAndAddEntity(Set entities, FindEntityDetails findEntityDetails, String inputString, int startIndex, int stopIndex) { - if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) { + if ((startIndex == 0 || SeparatorUtils.isSeparator(inputString.charAt(startIndex - 1))) + && (stopIndex == inputString.length() || SeparatorUtils.isSeparator(inputString.charAt(stopIndex)))) { entities.add(new Entity(inputString.substring(startIndex, stopIndex), findEntityDetails.getType(), startIndex, stopIndex, findEntityDetails.getHeadline(), findEntityDetails.getSectionNumber(), findEntityDetails.isDictionaryEntry(), findEntityDetails.isDossierDictionary(), findEntityDetails.getEngine(), findEntityDetails.getEntityType())); } } - private boolean isSeparator(char c) { - - return Character.isWhitespace(c) || Pattern.matches("\\p{Punct}", String.valueOf(c)) || c == '\"' || c == '‘' || c == '’'; - } - public Set clearAndFindPositions(Set entities, SearchableText text, Dictionary dictionary) { diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/SeparatorUtils.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/SeparatorUtils.java new file mode 100644 index 00000000..5e4766ec --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/SeparatorUtils.java @@ -0,0 +1,15 @@ +package com.iqser.red.service.redaction.v1.server.redaction.utils; + +import java.util.Set; +import java.util.regex.Pattern; + +public class SeparatorUtils { + + private final static Set quotes = Set.of('\'', '\u0022', '\u00AB', '\u00BB', '\u2018', '\u2019', '\u201A', '\u201C', '\u201D', '\u201E' + , '\u2039', '\u203A'); + + + public static boolean isSeparator(char c) { + return Character.isWhitespace(c) || Pattern.matches("\\p{Punct}", String.valueOf(c)) || quotes.contains(c); + } +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/realdata/AnalyseFileRealDataIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/realdata/AnalyseFileRealDataIntegrationTest.java index 760c1993..42101901 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/realdata/AnalyseFileRealDataIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/realdata/AnalyseFileRealDataIntegrationTest.java @@ -7,7 +7,6 @@ import com.iqser.red.service.redaction.v1.model.AnalyzeRequest; import com.iqser.red.service.redaction.v1.model.MessageType; import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; -import org.junit.Ignore; import org.junit.Test; import org.springframework.core.io.ClassPathResource; @@ -18,7 +17,7 @@ import java.util.Set; @Slf4j public class AnalyseFileRealDataIntegrationTest extends LiveDataIntegrationTest { - + public static final String FILE_NAME = "test-file"; @Test @SneakyThrows @@ -27,12 +26,12 @@ public class AnalyseFileRealDataIntegrationTest extends LiveDataIntegrationTest ObjectMapper om = new ObjectMapper(); om.registerModule(new JavaTimeModule()); - var file = new ClassPathResource(BASE_DIR + "data/test-file.pdf").getInputStream(); + var file = new ClassPathResource(BASE_DIR + "data/" + FILE_NAME + ".pdf").getInputStream(); redactionStorageService.storeObject("dossierId", "fileId", FileType.ORIGIN, file); try { - var nerData = new ClassPathResource(BASE_DIR + "data/test-file.ner.json").getInputStream(); + var nerData = new ClassPathResource(BASE_DIR + "data/" + FILE_NAME + ".ner.json").getInputStream(); redactionStorageService.storeObject("dossierId", "fileId", FileType.NER_ENTITIES, nerData); } catch (Exception e) { log.warn("No NER File Provided"); @@ -52,8 +51,8 @@ public class AnalyseFileRealDataIntegrationTest extends LiveDataIntegrationTest try { - var text = new ClassPathResource(BASE_DIR + "data/test-file.text.json").getInputStream(); - var sectionText = new ClassPathResource(BASE_DIR + "data/test-file.section-grid.json").getInputStream(); + var text = new ClassPathResource(BASE_DIR + "data/" + FILE_NAME + ".text.json").getInputStream(); + var sectionText = new ClassPathResource(BASE_DIR + "data/" + FILE_NAME + ".section-grid.json").getInputStream(); redactionStorageService.storeObject("dossierId", "fileId", FileType.TEXT, text); redactionStorageService.storeObject("dossierId", "fileId", FileType.SECTION_GRID, sectionText); } catch (Exception e) { @@ -66,7 +65,7 @@ public class AnalyseFileRealDataIntegrationTest extends LiveDataIntegrationTest try { - var redactionLog = new ClassPathResource(BASE_DIR + "data/test-file.redaction-log.json").getInputStream(); + var redactionLog = new ClassPathResource(BASE_DIR + "data/" + FILE_NAME + ".redaction-log.json").getInputStream(); } catch (Exception e) { log.info("No redaction log provided, Performing full analysis"); @@ -76,10 +75,13 @@ public class AnalyseFileRealDataIntegrationTest extends LiveDataIntegrationTest } - simulateIncrement(List.of("type"), "PII", 3L); + simulateIncrement(List.of("Desiree"), "PII", 3L); ar.setMessageType(MessageType.REANALYSE); String in = om.writeValueAsString(ar); redactionMessageReceiver.receiveAnalyzeRequest(in, false); + + + // var log = redactionStorageService.getRedactionLog("dossierId","fileId"); } } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/performance/data/test-quote.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/performance/data/test-quote.pdf new file mode 100644 index 00000000..2b0fb6e9 Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/performance/data/test-quote.pdf differ