RED-3946 - quotation-mark fix
This commit is contained in:
parent
01e7115d30
commit
df2fcd2e5e
@ -3,6 +3,7 @@ package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.SeparatorUtils;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
||||
|
||||
import java.util.ArrayList;
|
||||
@ -92,8 +93,8 @@ public class SearchableText {
|
||||
.get(i)
|
||||
.charAt(j, caseInsensitive) == '-') {
|
||||
|
||||
if (counter != 0 || i == 0 && j == 0 || j != 0 && isSeparator(searchSpace.get(i)
|
||||
.charAt(j - 1, caseInsensitive)) || j == 0 && i != 0 && isSeparator(searchSpace.get(i - 1)
|
||||
if (counter != 0 || i == 0 && j == 0 || j != 0 && SeparatorUtils.isSeparator(searchSpace.get(i)
|
||||
.charAt(j - 1, caseInsensitive)) || j == 0 && i != 0 && SeparatorUtils.isSeparator(searchSpace.get(i - 1)
|
||||
.charAt(searchSpace.get(i - 1)
|
||||
.length() - 1, caseInsensitive)) || j == 0 && i != 0 && searchSpace.get(i - 1)
|
||||
.charAt(searchSpace.get(i - 1).length() - 1, caseInsensitive) != ' ' && searchSpace.get(i)
|
||||
@ -109,9 +110,9 @@ public class SearchableText {
|
||||
crossSequenceParts.add(partMatch);
|
||||
|
||||
if (i == searchSpace.size() - 1 && j == searchSpace.get(i)
|
||||
.length() - 1 || j != searchSpace.get(i).length() - 1 && isSeparator(searchSpace.get(i)
|
||||
.length() - 1 || j != searchSpace.get(i).length() - 1 && SeparatorUtils.isSeparator(searchSpace.get(i)
|
||||
.charAt(j + 1, caseInsensitive)) || j == searchSpace.get(i)
|
||||
.length() - 1 && isSeparator(searchSpace.get(i + 1)
|
||||
.length() - 1 && SeparatorUtils.isSeparator(searchSpace.get(i + 1)
|
||||
.charAt(0, caseInsensitive)) || j == searchSpace.get(i)
|
||||
.length() - 1 && searchSpace.get(i)
|
||||
.charAt(j, caseInsensitive) != ' ' && searchSpace.get(i + 1)
|
||||
@ -179,12 +180,6 @@ public class SearchableText {
|
||||
}
|
||||
|
||||
|
||||
private boolean isSeparator(char c) {
|
||||
|
||||
return Character.isWhitespace(c) || Pattern.matches("\\p{Punct}", String.valueOf(c)) || c == '\"' || c == '‘' || c == '’' || c == '”';
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return buildString(sequences);
|
||||
|
||||
@ -4,7 +4,6 @@ import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.*;
|
||||
import lombok.experimental.UtilityClass;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.ahocorasick.trie.Trie;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.regex.Pattern;
|
||||
@ -47,18 +46,14 @@ public class EntitySearchUtils {
|
||||
|
||||
private void validateAndAddEntity(Set<Entity> entities, FindEntityDetails findEntityDetails, String inputString, int startIndex, int stopIndex) {
|
||||
|
||||
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
|
||||
if ((startIndex == 0 || SeparatorUtils.isSeparator(inputString.charAt(startIndex - 1)))
|
||||
&& (stopIndex == inputString.length() || SeparatorUtils.isSeparator(inputString.charAt(stopIndex)))) {
|
||||
entities.add(new Entity(inputString.substring(startIndex, stopIndex), findEntityDetails.getType(), startIndex, stopIndex,
|
||||
findEntityDetails.getHeadline(), findEntityDetails.getSectionNumber(), findEntityDetails.isDictionaryEntry(),
|
||||
findEntityDetails.isDossierDictionary(), findEntityDetails.getEngine(), findEntityDetails.getEntityType()));
|
||||
}
|
||||
}
|
||||
|
||||
private boolean isSeparator(char c) {
|
||||
|
||||
return Character.isWhitespace(c) || Pattern.matches("\\p{Punct}", String.valueOf(c)) || c == '\"' || c == '‘' || c == '’';
|
||||
}
|
||||
|
||||
|
||||
public Set<Entity> clearAndFindPositions(Set<Entity> entities, SearchableText text, Dictionary dictionary) {
|
||||
|
||||
|
||||
@ -0,0 +1,15 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.utils;
|
||||
|
||||
import java.util.Set;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
 * Classifies single characters as token separators.
 *
 * <p>A character is a separator when it is whitespace, an ASCII punctuation character,
 * or one of the quotation marks listed in {@link #QUOTES}.
 */
public class SeparatorUtils {

    /**
     * Quotation marks treated as separators: apostrophe, straight double quote,
     * double and single angle quotes (guillemets), and the curly / low-9 single
     * and double quotes.
     */
    private static final Set<Character> QUOTES = Set.of('\'', '\u0022', '\u00AB', '\u00BB', '\u2018', '\u2019', '\u201A', '\u201C', '\u201D', '\u201E',
            '\u2039', '\u203A');

    /** POSIX punctuation class, compiled once instead of on every call. */
    private static final Pattern PUNCTUATION = Pattern.compile("\\p{Punct}");


    /**
     * Tells whether {@code c} is a separator.
     *
     * @param c the character to classify
     * @return {@code true} for whitespace, ASCII punctuation and the quotation marks
     *         in {@link #QUOTES}; {@code false} otherwise
     */
    public static boolean isSeparator(char c) {

        // The cheap checks run first; the regex matcher is consulted only when they fail.
        return Character.isWhitespace(c) || QUOTES.contains(c) || PUNCTUATION.matcher(String.valueOf(c)).matches();
    }

}
|
||||
@ -7,7 +7,6 @@ import com.iqser.red.service.redaction.v1.model.AnalyzeRequest;
|
||||
import com.iqser.red.service.redaction.v1.model.MessageType;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
@ -18,7 +17,7 @@ import java.util.Set;
|
||||
@Slf4j
|
||||
public class AnalyseFileRealDataIntegrationTest extends LiveDataIntegrationTest {
|
||||
|
||||
|
||||
public static final String FILE_NAME = "test-file";
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
@ -27,12 +26,12 @@ public class AnalyseFileRealDataIntegrationTest extends LiveDataIntegrationTest
|
||||
ObjectMapper om = new ObjectMapper();
|
||||
om.registerModule(new JavaTimeModule());
|
||||
|
||||
var file = new ClassPathResource(BASE_DIR + "data/test-file.pdf").getInputStream();
|
||||
var file = new ClassPathResource(BASE_DIR + "data/" + FILE_NAME + ".pdf").getInputStream();
|
||||
redactionStorageService.storeObject("dossierId", "fileId", FileType.ORIGIN, file);
|
||||
|
||||
|
||||
try {
|
||||
var nerData = new ClassPathResource(BASE_DIR + "data/test-file.ner.json").getInputStream();
|
||||
var nerData = new ClassPathResource(BASE_DIR + "data/" + FILE_NAME + ".ner.json").getInputStream();
|
||||
redactionStorageService.storeObject("dossierId", "fileId", FileType.NER_ENTITIES, nerData);
|
||||
} catch (Exception e) {
|
||||
log.warn("No NER File Provided");
|
||||
@ -52,8 +51,8 @@ public class AnalyseFileRealDataIntegrationTest extends LiveDataIntegrationTest
|
||||
|
||||
|
||||
try {
|
||||
var text = new ClassPathResource(BASE_DIR + "data/test-file.text.json").getInputStream();
|
||||
var sectionText = new ClassPathResource(BASE_DIR + "data/test-file.section-grid.json").getInputStream();
|
||||
var text = new ClassPathResource(BASE_DIR + "data/" + FILE_NAME + ".text.json").getInputStream();
|
||||
var sectionText = new ClassPathResource(BASE_DIR + "data/" + FILE_NAME + ".section-grid.json").getInputStream();
|
||||
redactionStorageService.storeObject("dossierId", "fileId", FileType.TEXT, text);
|
||||
redactionStorageService.storeObject("dossierId", "fileId", FileType.SECTION_GRID, sectionText);
|
||||
} catch (Exception e) {
|
||||
@ -66,7 +65,7 @@ public class AnalyseFileRealDataIntegrationTest extends LiveDataIntegrationTest
|
||||
|
||||
|
||||
try {
|
||||
var redactionLog = new ClassPathResource(BASE_DIR + "data/test-file.redaction-log.json").getInputStream();
|
||||
var redactionLog = new ClassPathResource(BASE_DIR + "data/" + FILE_NAME + ".redaction-log.json").getInputStream();
|
||||
} catch (Exception e) {
|
||||
log.info("No redaction log provided, Performing full analysis");
|
||||
|
||||
@ -76,10 +75,13 @@ public class AnalyseFileRealDataIntegrationTest extends LiveDataIntegrationTest
|
||||
}
|
||||
|
||||
|
||||
simulateIncrement(List.of("type"), "PII", 3L);
|
||||
simulateIncrement(List.of("Desiree"), "PII", 3L);
|
||||
ar.setMessageType(MessageType.REANALYSE);
|
||||
String in = om.writeValueAsString(ar);
|
||||
redactionMessageReceiver.receiveAnalyzeRequest(in, false);
|
||||
|
||||
|
||||
// var log = redactionStorageService.getRedactionLog("dossierId","fileId");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user