Fixed reanalysis for case-insensitive dictionary entries

This commit is contained in:
Dominique Eifländer 2021-03-11 13:53:16 +01:00
parent 511092b9e7
commit f4ea236fc5
6 changed files with 60 additions and 9 deletions

View File

@ -9,7 +9,7 @@ import lombok.Data;
@AllArgsConstructor
public class DictionaryIncrement {
private Set<String> values;
private Set<DictionaryIncrementValue> values;
private long dictionaryVersion;
}

View File

@ -0,0 +1,13 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import lombok.AllArgsConstructor;
import lombok.Data;
/**
 * A single dictionary entry value together with the flag that tells whether it
 * should be matched without regard to letter case.
 *
 * <p>Accessors, {@code equals}/{@code hashCode}/{@code toString} and the
 * two-argument constructor {@code (value, caseinsensitive)} are generated by
 * the Lombok annotations below.
 */
@Data
@AllArgsConstructor
public class DictionaryIncrementValue {
/** The text of the dictionary entry. */
private String value;
/** {@code true} if the entry is to be matched regardless of letter case. */
private boolean caseinsensitive;
}

View File

@ -22,6 +22,7 @@ import com.iqser.red.service.configuration.v1.api.model.TypeResult;
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncrement;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncrementValue;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryModel;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryRepresentation;
@ -57,12 +58,12 @@ public class DictionaryService {
long version = updateDictionary(ruleSetId);
Set<String> newValues = new HashSet<>();
Set<DictionaryIncrementValue> newValues = new HashSet<>();
List<DictionaryModel> dictionaryModels = dictionariesByRuleSets.get(ruleSetId).getDictionary();
dictionaryModels.forEach(dictionaryModel -> {
dictionaryModel.getEntries().forEach(dictionaryEntry -> {
if (dictionaryEntry.getVersion() > fromVersion) {
newValues.add(dictionaryEntry.getValue());
newValues.add(new DictionaryIncrementValue(dictionaryEntry.getValue(), dictionaryModel.isCaseInsensitive()));
}
});
});

View File

@ -82,9 +82,8 @@ public class ReanalyzeService {
}
for (SectionText sectionText : renalyzeRequest.getText().getSectionTexts()) {
Set<Entity> entities = EntitySearchUtils.find(sectionText.getText(), dictionaryIncrement.getValues(), "find", sectionText
.getHeadline(), sectionText.getSectionNumber(), false);
if (!entities.isEmpty()) {
if (EntitySearchUtils.sectionContainsAny(sectionText.getText(), dictionaryIncrement.getValues())) {
sectionsToReanaylse.add(sectionText.getSectionNumber());
}

View File

@ -5,12 +5,14 @@ import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncrementValue;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
@ -20,8 +22,40 @@ import lombok.extern.slf4j.Slf4j;
@Slf4j
@UtilityClass
@SuppressWarnings("PMD")
public class EntitySearchUtils {
/**
 * Checks whether the section text contains at least one of the given dictionary values as a
 * stand-alone term.
 *
 * <p>Candidates are located on lower-cased copies of the text and of the trimmed value. An
 * occurrence counts only if it starts at the beginning of the text or directly after a
 * whitespace/separator character, and ends at the end of the text or directly before a
 * character accepted by {@code isSeparator} (a helper defined outside this method). For
 * values that are not case-insensitive, the original text must additionally match the trimmed
 * value exactly. Values whose trimmed length is two characters or less are ignored.
 *
 * @param sectionText the text of the section to inspect
 * @param values the dictionary values to look for
 * @return {@code true} as soon as one value is found, {@code false} if none matches
 * @throws NullPointerException if {@code sectionText}, {@code values}, or the text of any
 *         value is {@code null}
 */
public boolean sectionContainsAny(String sectionText, Set<DictionaryIncrementValue> values) {
    String inputString = sectionText.toLowerCase(Locale.ROOT);

    for (DictionaryIncrementValue value : values) {
        // Trim once so that the lookup and the case-sensitive comparison use the same text;
        // comparing against the untrimmed value could never match a padded entry.
        String trimmedValue = value.getValue().trim();
        String cleanValue = trimmedValue.toLowerCase(Locale.ROOT);
        if (cleanValue.length() <= 2) {
            continue;
        }

        int startIndex = inputString.indexOf(cleanValue);
        while (startIndex > -1) {
            int stopIndex = startIndex + cleanValue.length();

            boolean startsAtBoundary = startIndex == 0
                    || Character.isWhitespace(inputString.charAt(startIndex - 1))
                    || isSeparator(inputString.charAt(startIndex - 1));
            boolean endsAtBoundary = stopIndex == inputString.length()
                    || isSeparator(inputString.charAt(stopIndex));

            if (startsAtBoundary && endsAtBoundary) {
                // regionMatches compares in place (no temporary substring) and returns false
                // instead of throwing when the offsets do not fit the original text.
                // NOTE(review): offsets come from the lower-cased copy; this assumes
                // lower-casing does not change the text length - confirm for exotic input.
                if (value.isCaseinsensitive()
                        || sectionText.regionMatches(startIndex, trimmedValue, 0, trimmedValue.length())) {
                    return true;
                }
            }

            // Advance by a single character so that an occurrence overlapping a rejected
            // candidate is still examined.
            startIndex = inputString.indexOf(cleanValue, startIndex + 1);
        }
    }
    return false;
}
public Set<Entity> find(String inputString, Set<String> values, String type, String headline, int sectionNumber,
boolean local) {

View File

@ -498,7 +498,7 @@ public class RedactionIntegrationTest {
System.out.println("redactionTest");
long start = System.currentTimeMillis();
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_13_Volume_3CP_A9396G_B-1_2018-09-06.pdf");
AnalyzeRequest request = AnalyzeRequest.builder()
.ruleSetId(TEST_RULESET_ID)
@ -543,8 +543,12 @@ public class RedactionIntegrationTest {
dictionary.get(AUTHOR).add("physical");
reanlysisVersions.put("physical", 2L);
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(2L);
when(dictionaryClient.getDictionaryForType(AUTHOR, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(AUTHOR));
dictionary.get(VERTEBRATE).add("s-metolachlor");
reanlysisVersions.put("s-metolachlor", 3L);
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(3L);
when(dictionaryClient.getDictionaryForType(VERTEBRATE, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(VERTEBRATE));
start = System.currentTimeMillis();
ReanalyzeResult reanalyzeResult = redactionController.reanalyze(RenalyzeRequest.builder()