RED-3946 - quotation-mark fix

This commit is contained in:
Timo Bejan 2022-05-09 12:14:09 +03:00
parent 01e7115d30
commit df2fcd2e5e
5 changed files with 32 additions and 25 deletions

View File

@ -3,6 +3,7 @@ package com.iqser.red.service.redaction.v1.server.redaction.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
import com.iqser.red.service.redaction.v1.server.redaction.utils.SeparatorUtils;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
import java.util.ArrayList;
@ -92,8 +93,8 @@ public class SearchableText {
.get(i)
.charAt(j, caseInsensitive) == '-') {
if (counter != 0 || i == 0 && j == 0 || j != 0 && isSeparator(searchSpace.get(i)
.charAt(j - 1, caseInsensitive)) || j == 0 && i != 0 && isSeparator(searchSpace.get(i - 1)
if (counter != 0 || i == 0 && j == 0 || j != 0 && SeparatorUtils.isSeparator(searchSpace.get(i)
.charAt(j - 1, caseInsensitive)) || j == 0 && i != 0 && SeparatorUtils.isSeparator(searchSpace.get(i - 1)
.charAt(searchSpace.get(i - 1)
.length() - 1, caseInsensitive)) || j == 0 && i != 0 && searchSpace.get(i - 1)
.charAt(searchSpace.get(i - 1).length() - 1, caseInsensitive) != ' ' && searchSpace.get(i)
@ -109,9 +110,9 @@ public class SearchableText {
crossSequenceParts.add(partMatch);
if (i == searchSpace.size() - 1 && j == searchSpace.get(i)
.length() - 1 || j != searchSpace.get(i).length() - 1 && isSeparator(searchSpace.get(i)
.length() - 1 || j != searchSpace.get(i).length() - 1 && SeparatorUtils.isSeparator(searchSpace.get(i)
.charAt(j + 1, caseInsensitive)) || j == searchSpace.get(i)
.length() - 1 && isSeparator(searchSpace.get(i + 1)
.length() - 1 && SeparatorUtils.isSeparator(searchSpace.get(i + 1)
.charAt(0, caseInsensitive)) || j == searchSpace.get(i)
.length() - 1 && searchSpace.get(i)
.charAt(j, caseInsensitive) != ' ' && searchSpace.get(i + 1)
@ -179,12 +180,6 @@ public class SearchableText {
}
private boolean isSeparator(char c) {
return Character.isWhitespace(c) || Pattern.matches("\\p{Punct}", String.valueOf(c)) || c == '\"' || c == '' || c == '' || c == '”';
}
@Override
public String toString() {
return buildString(sequences);

View File

@ -4,7 +4,6 @@ import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
import com.iqser.red.service.redaction.v1.server.redaction.model.*;
import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j;
import org.ahocorasick.trie.Trie;
import java.util.*;
import java.util.regex.Pattern;
@ -47,18 +46,14 @@ public class EntitySearchUtils {
private void validateAndAddEntity(Set<Entity> entities, FindEntityDetails findEntityDetails, String inputString, int startIndex, int stopIndex) {
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
if ((startIndex == 0 || SeparatorUtils.isSeparator(inputString.charAt(startIndex - 1)))
&& (stopIndex == inputString.length() || SeparatorUtils.isSeparator(inputString.charAt(stopIndex)))) {
entities.add(new Entity(inputString.substring(startIndex, stopIndex), findEntityDetails.getType(), startIndex, stopIndex,
findEntityDetails.getHeadline(), findEntityDetails.getSectionNumber(), findEntityDetails.isDictionaryEntry(),
findEntityDetails.isDossierDictionary(), findEntityDetails.getEngine(), findEntityDetails.getEntityType()));
}
}
private boolean isSeparator(char c) {
return Character.isWhitespace(c) || Pattern.matches("\\p{Punct}", String.valueOf(c)) || c == '\"' || c == '' || c == '';
}
public Set<Entity> clearAndFindPositions(Set<Entity> entities, SearchableText text, Dictionary dictionary) {

View File

@ -0,0 +1,15 @@
package com.iqser.red.service.redaction.v1.server.redaction.utils;
import java.util.Set;
import java.util.regex.Pattern;
/**
 * Static helper for deciding whether a single character counts as a separator,
 * i.e. a character that may delimit a term from its surrounding text.
 *
 * <p>A character is a separator when it is whitespace, ASCII punctuation as
 * defined by the POSIX {@code Punct} character class, or one of the quotation
 * marks listed in {@link #QUOTES}.
 */
public final class SeparatorUtils {

    /**
     * Matches exactly one ASCII punctuation character. Compiled once here because
     * {@link #isSeparator(char)} is invoked per character; recompiling the
     * expression on every call is wasted work.
     */
    private static final Pattern PUNCTUATION = Pattern.compile("\\p{Punct}");

    /**
     * Quotation marks treated as separators: ASCII apostrophe and double quote,
     * the angle quotes U+00AB / U+00BB, the curly and low-9 quotes U+2018, U+2019,
     * U+201A, U+201C, U+201D, U+201E, and the single angle quotes U+2039 / U+203A.
     */
    private static final Set<Character> QUOTES = Set.of(
            '\'', '\u0022', '\u00AB', '\u00BB',
            '\u2018', '\u2019', '\u201A',
            '\u201C', '\u201D', '\u201E',
            '\u2039', '\u203A');

    private SeparatorUtils() {
        // Static utility holder; not meant to be instantiated.
    }

    /**
     * Tells whether the given character is a separator.
     *
     * @param c the character to classify
     * @return {@code true} if {@code c} is whitespace, ASCII punctuation, or one of
     *         the quotation marks in {@link #QUOTES}; {@code false} otherwise
     */
    public static boolean isSeparator(char c) {
        // The cheap checks run first; the regex matcher is only consulted when needed.
        return Character.isWhitespace(c)
                || QUOTES.contains(c)
                || PUNCTUATION.matcher(String.valueOf(c)).matches();
    }
}

View File

@ -7,7 +7,6 @@ import com.iqser.red.service.redaction.v1.model.AnalyzeRequest;
import com.iqser.red.service.redaction.v1.model.MessageType;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.junit.Ignore;
import org.junit.Test;
import org.springframework.core.io.ClassPathResource;
@ -18,7 +17,7 @@ import java.util.Set;
@Slf4j
public class AnalyseFileRealDataIntegrationTest extends LiveDataIntegrationTest {
public static final String FILE_NAME = "test-file";
@Test
@SneakyThrows
@ -27,12 +26,12 @@ public class AnalyseFileRealDataIntegrationTest extends LiveDataIntegrationTest
ObjectMapper om = new ObjectMapper();
om.registerModule(new JavaTimeModule());
var file = new ClassPathResource(BASE_DIR + "data/test-file.pdf").getInputStream();
var file = new ClassPathResource(BASE_DIR + "data/" + FILE_NAME + ".pdf").getInputStream();
redactionStorageService.storeObject("dossierId", "fileId", FileType.ORIGIN, file);
try {
var nerData = new ClassPathResource(BASE_DIR + "data/test-file.ner.json").getInputStream();
var nerData = new ClassPathResource(BASE_DIR + "data/" + FILE_NAME + ".ner.json").getInputStream();
redactionStorageService.storeObject("dossierId", "fileId", FileType.NER_ENTITIES, nerData);
} catch (Exception e) {
log.warn("No NER File Provided");
@ -52,8 +51,8 @@ public class AnalyseFileRealDataIntegrationTest extends LiveDataIntegrationTest
try {
var text = new ClassPathResource(BASE_DIR + "data/test-file.text.json").getInputStream();
var sectionText = new ClassPathResource(BASE_DIR + "data/test-file.section-grid.json").getInputStream();
var text = new ClassPathResource(BASE_DIR + "data/" + FILE_NAME + ".text.json").getInputStream();
var sectionText = new ClassPathResource(BASE_DIR + "data/" + FILE_NAME + ".section-grid.json").getInputStream();
redactionStorageService.storeObject("dossierId", "fileId", FileType.TEXT, text);
redactionStorageService.storeObject("dossierId", "fileId", FileType.SECTION_GRID, sectionText);
} catch (Exception e) {
@ -66,7 +65,7 @@ public class AnalyseFileRealDataIntegrationTest extends LiveDataIntegrationTest
try {
var redactionLog = new ClassPathResource(BASE_DIR + "data/test-file.redaction-log.json").getInputStream();
var redactionLog = new ClassPathResource(BASE_DIR + "data/" + FILE_NAME + ".redaction-log.json").getInputStream();
} catch (Exception e) {
log.info("No redaction log provided, Performing full analysis");
@ -76,10 +75,13 @@ public class AnalyseFileRealDataIntegrationTest extends LiveDataIntegrationTest
}
simulateIncrement(List.of("type"), "PII", 3L);
simulateIncrement(List.of("Desiree"), "PII", 3L);
ar.setMessageType(MessageType.REANALYSE);
String in = om.writeValueAsString(ar);
redactionMessageReceiver.receiveAnalyzeRequest(in, false);
// var log = redactionStorageService.getRedactionLog("dossierId","fileId");
}
}