RED-3946 - quotation-mark fix
This commit is contained in:
parent
01e7115d30
commit
df2fcd2e5e
@ -3,6 +3,7 @@ package com.iqser.red.service.redaction.v1.server.redaction.model;
|
|||||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
|
import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.redaction.utils.SeparatorUtils;
|
||||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
@ -92,8 +93,8 @@ public class SearchableText {
|
|||||||
.get(i)
|
.get(i)
|
||||||
.charAt(j, caseInsensitive) == '-') {
|
.charAt(j, caseInsensitive) == '-') {
|
||||||
|
|
||||||
if (counter != 0 || i == 0 && j == 0 || j != 0 && isSeparator(searchSpace.get(i)
|
if (counter != 0 || i == 0 && j == 0 || j != 0 && SeparatorUtils.isSeparator(searchSpace.get(i)
|
||||||
.charAt(j - 1, caseInsensitive)) || j == 0 && i != 0 && isSeparator(searchSpace.get(i - 1)
|
.charAt(j - 1, caseInsensitive)) || j == 0 && i != 0 && SeparatorUtils.isSeparator(searchSpace.get(i - 1)
|
||||||
.charAt(searchSpace.get(i - 1)
|
.charAt(searchSpace.get(i - 1)
|
||||||
.length() - 1, caseInsensitive)) || j == 0 && i != 0 && searchSpace.get(i - 1)
|
.length() - 1, caseInsensitive)) || j == 0 && i != 0 && searchSpace.get(i - 1)
|
||||||
.charAt(searchSpace.get(i - 1).length() - 1, caseInsensitive) != ' ' && searchSpace.get(i)
|
.charAt(searchSpace.get(i - 1).length() - 1, caseInsensitive) != ' ' && searchSpace.get(i)
|
||||||
@ -109,9 +110,9 @@ public class SearchableText {
|
|||||||
crossSequenceParts.add(partMatch);
|
crossSequenceParts.add(partMatch);
|
||||||
|
|
||||||
if (i == searchSpace.size() - 1 && j == searchSpace.get(i)
|
if (i == searchSpace.size() - 1 && j == searchSpace.get(i)
|
||||||
.length() - 1 || j != searchSpace.get(i).length() - 1 && isSeparator(searchSpace.get(i)
|
.length() - 1 || j != searchSpace.get(i).length() - 1 && SeparatorUtils.isSeparator(searchSpace.get(i)
|
||||||
.charAt(j + 1, caseInsensitive)) || j == searchSpace.get(i)
|
.charAt(j + 1, caseInsensitive)) || j == searchSpace.get(i)
|
||||||
.length() - 1 && isSeparator(searchSpace.get(i + 1)
|
.length() - 1 && SeparatorUtils.isSeparator(searchSpace.get(i + 1)
|
||||||
.charAt(0, caseInsensitive)) || j == searchSpace.get(i)
|
.charAt(0, caseInsensitive)) || j == searchSpace.get(i)
|
||||||
.length() - 1 && searchSpace.get(i)
|
.length() - 1 && searchSpace.get(i)
|
||||||
.charAt(j, caseInsensitive) != ' ' && searchSpace.get(i + 1)
|
.charAt(j, caseInsensitive) != ' ' && searchSpace.get(i + 1)
|
||||||
@ -179,12 +180,6 @@ public class SearchableText {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private boolean isSeparator(char c) {
|
|
||||||
|
|
||||||
return Character.isWhitespace(c) || Pattern.matches("\\p{Punct}", String.valueOf(c)) || c == '\"' || c == '‘' || c == '’' || c == '”';
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return buildString(sequences);
|
return buildString(sequences);
|
||||||
|
|||||||
@ -4,7 +4,6 @@ import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
|
|||||||
import com.iqser.red.service.redaction.v1.server.redaction.model.*;
|
import com.iqser.red.service.redaction.v1.server.redaction.model.*;
|
||||||
import lombok.experimental.UtilityClass;
|
import lombok.experimental.UtilityClass;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
import org.ahocorasick.trie.Trie;
|
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
@ -47,18 +46,14 @@ public class EntitySearchUtils {
|
|||||||
|
|
||||||
private void validateAndAddEntity(Set<Entity> entities, FindEntityDetails findEntityDetails, String inputString, int startIndex, int stopIndex) {
|
private void validateAndAddEntity(Set<Entity> entities, FindEntityDetails findEntityDetails, String inputString, int startIndex, int stopIndex) {
|
||||||
|
|
||||||
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
|
if ((startIndex == 0 || SeparatorUtils.isSeparator(inputString.charAt(startIndex - 1)))
|
||||||
|
&& (stopIndex == inputString.length() || SeparatorUtils.isSeparator(inputString.charAt(stopIndex)))) {
|
||||||
entities.add(new Entity(inputString.substring(startIndex, stopIndex), findEntityDetails.getType(), startIndex, stopIndex,
|
entities.add(new Entity(inputString.substring(startIndex, stopIndex), findEntityDetails.getType(), startIndex, stopIndex,
|
||||||
findEntityDetails.getHeadline(), findEntityDetails.getSectionNumber(), findEntityDetails.isDictionaryEntry(),
|
findEntityDetails.getHeadline(), findEntityDetails.getSectionNumber(), findEntityDetails.isDictionaryEntry(),
|
||||||
findEntityDetails.isDossierDictionary(), findEntityDetails.getEngine(), findEntityDetails.getEntityType()));
|
findEntityDetails.isDossierDictionary(), findEntityDetails.getEngine(), findEntityDetails.getEntityType()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean isSeparator(char c) {
|
|
||||||
|
|
||||||
return Character.isWhitespace(c) || Pattern.matches("\\p{Punct}", String.valueOf(c)) || c == '\"' || c == '‘' || c == '’';
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public Set<Entity> clearAndFindPositions(Set<Entity> entities, SearchableText text, Dictionary dictionary) {
|
public Set<Entity> clearAndFindPositions(Set<Entity> entities, SearchableText text, Dictionary dictionary) {
|
||||||
|
|
||||||
|
|||||||
@ -0,0 +1,15 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.redaction.utils;
|
||||||
|
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
/**
 * Static helper that decides whether a character separates words.
 *
 * <p>A character counts as a separator when it is whitespace, ASCII punctuation
 * (the {@code \p{Punct}} class of {@link Pattern}) or one of the quotation marks in {@link #QUOTES}.
 */
public final class SeparatorUtils {

    /**
     * Quotation marks treated as separators: ASCII apostrophe and double quote, the angle quotes
     * U+00AB / U+00BB, the curly and low quotes U+2018, U+2019, U+201A, U+201C, U+201D, U+201E
     * and the single angle quotes U+2039 / U+203A.
     */
    private static final Set<Character> QUOTES = Set.of('\'', '"', '\u00AB', '\u00BB', '\u2018', '\u2019', '\u201A', '\u201C',
            '\u201D', '\u201E', '\u2039', '\u203A');

    /** Compiled once; the check runs per character, so recompiling on every call is wasteful. */
    private static final Pattern PUNCTUATION = Pattern.compile("\\p{Punct}");


    private SeparatorUtils() {
        // static helper, not meant to be instantiated
    }


    /**
     * Tells whether {@code c} is a word separator.
     *
     * @param c the character to classify
     * @return {@code true} if {@code c} is whitespace, ASCII punctuation or a known quotation mark
     */
    public static boolean isSeparator(char c) {

        return Character.isWhitespace(c) || QUOTES.contains(c) || PUNCTUATION.matcher(String.valueOf(c)).matches();
    }

}
|
||||||
@ -7,7 +7,6 @@ import com.iqser.red.service.redaction.v1.model.AnalyzeRequest;
|
|||||||
import com.iqser.red.service.redaction.v1.model.MessageType;
|
import com.iqser.red.service.redaction.v1.model.MessageType;
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
import org.junit.Ignore;
|
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
import org.springframework.core.io.ClassPathResource;
|
import org.springframework.core.io.ClassPathResource;
|
||||||
|
|
||||||
@ -18,7 +17,7 @@ import java.util.Set;
|
|||||||
@Slf4j
|
@Slf4j
|
||||||
public class AnalyseFileRealDataIntegrationTest extends LiveDataIntegrationTest {
|
public class AnalyseFileRealDataIntegrationTest extends LiveDataIntegrationTest {
|
||||||
|
|
||||||
|
public static final String FILE_NAME = "test-file";
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
@ -27,12 +26,12 @@ public class AnalyseFileRealDataIntegrationTest extends LiveDataIntegrationTest
|
|||||||
ObjectMapper om = new ObjectMapper();
|
ObjectMapper om = new ObjectMapper();
|
||||||
om.registerModule(new JavaTimeModule());
|
om.registerModule(new JavaTimeModule());
|
||||||
|
|
||||||
var file = new ClassPathResource(BASE_DIR + "data/test-file.pdf").getInputStream();
|
var file = new ClassPathResource(BASE_DIR + "data/" + FILE_NAME + ".pdf").getInputStream();
|
||||||
redactionStorageService.storeObject("dossierId", "fileId", FileType.ORIGIN, file);
|
redactionStorageService.storeObject("dossierId", "fileId", FileType.ORIGIN, file);
|
||||||
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
var nerData = new ClassPathResource(BASE_DIR + "data/test-file.ner.json").getInputStream();
|
var nerData = new ClassPathResource(BASE_DIR + "data/" + FILE_NAME + ".ner.json").getInputStream();
|
||||||
redactionStorageService.storeObject("dossierId", "fileId", FileType.NER_ENTITIES, nerData);
|
redactionStorageService.storeObject("dossierId", "fileId", FileType.NER_ENTITIES, nerData);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
log.warn("No NER File Provided");
|
log.warn("No NER File Provided");
|
||||||
@ -52,8 +51,8 @@ public class AnalyseFileRealDataIntegrationTest extends LiveDataIntegrationTest
|
|||||||
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
var text = new ClassPathResource(BASE_DIR + "data/test-file.text.json").getInputStream();
|
var text = new ClassPathResource(BASE_DIR + "data/" + FILE_NAME + ".text.json").getInputStream();
|
||||||
var sectionText = new ClassPathResource(BASE_DIR + "data/test-file.section-grid.json").getInputStream();
|
var sectionText = new ClassPathResource(BASE_DIR + "data/" + FILE_NAME + ".section-grid.json").getInputStream();
|
||||||
redactionStorageService.storeObject("dossierId", "fileId", FileType.TEXT, text);
|
redactionStorageService.storeObject("dossierId", "fileId", FileType.TEXT, text);
|
||||||
redactionStorageService.storeObject("dossierId", "fileId", FileType.SECTION_GRID, sectionText);
|
redactionStorageService.storeObject("dossierId", "fileId", FileType.SECTION_GRID, sectionText);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
@ -66,7 +65,7 @@ public class AnalyseFileRealDataIntegrationTest extends LiveDataIntegrationTest
|
|||||||
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
var redactionLog = new ClassPathResource(BASE_DIR + "data/test-file.redaction-log.json").getInputStream();
|
var redactionLog = new ClassPathResource(BASE_DIR + "data/" + FILE_NAME + ".redaction-log.json").getInputStream();
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
log.info("No redaction log provided, Performing full analysis");
|
log.info("No redaction log provided, Performing full analysis");
|
||||||
|
|
||||||
@ -76,10 +75,13 @@ public class AnalyseFileRealDataIntegrationTest extends LiveDataIntegrationTest
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
simulateIncrement(List.of("type"), "PII", 3L);
|
simulateIncrement(List.of("Desiree"), "PII", 3L);
|
||||||
ar.setMessageType(MessageType.REANALYSE);
|
ar.setMessageType(MessageType.REANALYSE);
|
||||||
String in = om.writeValueAsString(ar);
|
String in = om.writeValueAsString(ar);
|
||||||
redactionMessageReceiver.receiveAnalyzeRequest(in, false);
|
redactionMessageReceiver.receiveAnalyzeRequest(in, false);
|
||||||
|
|
||||||
|
|
||||||
|
// var log = redactionStorageService.getRedactionLog("dossierId","fileId");
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user