RED-1970: Ignore NER entities that are found over multiple table columns

This commit is contained in:
Dominique Eifländer 2021-09-07 14:56:19 +02:00
parent 8de655d884
commit 6a75fc74d6
2 changed files with 31 additions and 12 deletions

View File

@ -1,6 +1,5 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
@ -20,9 +19,8 @@ import com.iqser.red.service.redaction.v1.model.ManualImageRecategorization;
import com.iqser.red.service.redaction.v1.model.Status;
import com.iqser.red.service.redaction.v1.server.classification.model.SectionText;
import com.iqser.red.service.redaction.v1.server.client.EntityRecognitionClient;
import com.iqser.red.service.redaction.v1.server.client.model.EntityRecognitionRequest;
import com.iqser.red.service.redaction.v1.server.client.model.EntityRecogintionEntity;
import com.iqser.red.service.redaction.v1.server.client.model.NerEntities;
import com.iqser.red.service.redaction.v1.server.client.model.EntityRecognitionSection;
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryModel;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
@ -77,7 +75,8 @@ public class EntityRedactionService {
for (SectionText reanalysisSection : reanalysisSections) {
Set<Entity> entities = findEntities(reanalysisSection.getSearchableText(), reanalysisSection.getHeadline(), reanalysisSection
.getSectionNumber(), dictionary, local, nerEntities);
.getSectionNumber(), dictionary, local, nerEntities, reanalysisSection.getCellStarts());
if (reanalysisSection.getCellStarts() != null && !reanalysisSection.getCellStarts().isEmpty()) {
surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary, reanalysisSection
.getCellStarts());
@ -199,7 +198,8 @@ public class EntityRedactionService {
private Set<Entity> findEntities(SearchableText searchableText, String headline, int sectionNumber,
Dictionary dictionary, boolean local, NerEntities nerEntities) {
Dictionary dictionary, boolean local, NerEntities nerEntities,
List<Integer> cellstarts) {
Set<Entity> found = new HashSet<>();
String searchableString = searchableText.toString();
@ -210,30 +210,49 @@ public class EntityRedactionService {
String lowercaseInputString = searchableString.toLowerCase();
for (DictionaryModel model : dictionary.getDictionaryModels()) {
if (model.isCaseInsensitive()) {
found.addAll(EntitySearchUtils.find(lowercaseInputString, model.getValues(local), model.getType(), headline, sectionNumber, local, model
found.addAll(EntitySearchUtils.find(lowercaseInputString, model.getValues(local), model.getType(), headline, sectionNumber, !local, model
.isDossierDictionary()));
} else {
found.addAll(EntitySearchUtils.find(searchableString, model.getValues(local), model.getType(), headline, sectionNumber, local, model
found.addAll(EntitySearchUtils.find(searchableString, model.getValues(local), model.getType(), headline, sectionNumber, !local, model
.isDossierDictionary()));
}
}
if (!local) {
addNerEntities(found, sectionNumber, headline, nerEntities);
Map<String, Set<String>> nerValuesPerType = getNerValues(sectionNumber, nerEntities, cellstarts);
nerValuesPerType.entrySet().forEach(entry -> {
found.addAll(EntitySearchUtils.find(searchableString, entry.getValue(), entry.getKey(), headline, sectionNumber, false, false));
});
}
return EntitySearchUtils.clearAndFindPositions(found, searchableText, dictionary);
}
private void addNerEntities(Set<Entity> found, int sectionNumber, String headline, NerEntities nerEntities) {
/**
 * Collects the NER (named-entity-recognition) values for one section, grouped by entity type.
 *
 * Entities whose character span crosses a table-cell boundary are dropped: an entity that was
 * recognized across two table columns is almost certainly a false positive (RED-1970).
 *
 * @param sectionNumber index of the section whose NER results are looked up in {@code nerEntities}
 * @param nerEntities   recognition results keyed by section number
 * @param cellstarts    start offsets of the table cells inside the section's searchable text;
 *                      {@code null} or empty when the section is not a table
 * @return map from entity type to the set of decoded entity values found for this section;
 *         empty when entity recognition is disabled or the section has no results
 */
private Map<String, Set<String>> getNerValues(int sectionNumber, NerEntities nerEntities,
		List<Integer> cellstarts) {
	Map<String, Set<String>> nerValuesPerType = new HashMap<>();
	// Only consult NER output when the feature is switched on and this section was analyzed.
	if (redactionServiceSettings.isEnableEntityRecognition() && nerEntities.getResult()
			.containsKey(sectionNumber)) {
		nerEntities.getResult().get(sectionNumber).forEach(res -> {
			// NOTE(review): the following line is the pre-change code removed by this commit
			// (diff markers were stripped); it is superseded by the branch below and does not
			// compile in the new method scope ('found'/'headline' no longer exist here).
			found.add(new Entity(new String(Base64.decodeBase64(res.getValue().getBytes())), res.getType(), res.getStartOffset(), res.getEndOffset(), headline, sectionNumber, false, false));
			if (cellstarts == null || cellstarts.isEmpty()) {
				// Non-table section: keep every recognized value. The value arrives Base64-encoded.
				// NOTE(review): getBytes()/new String(byte[]) use the platform default charset —
				// presumably UTF-8 is intended (StandardCharsets import is visible in this commit's
				// diff); confirm and pass the charset explicitly.
				nerValuesPerType.computeIfAbsent(res.getType(), (a) -> new HashSet<>()).add(new String(Base64.decodeBase64(res.getValue().getBytes())));
			} else {
				// Table section: discard any entity whose span strictly contains a cell start,
				// i.e. the entity text runs across a column boundary (RED-1970).
				boolean intersectsCellStart = false;
				for (Integer cellStart : cellstarts) {
					// Strict inequalities: an entity that begins or ends exactly at a cell
					// boundary is NOT treated as spanning two columns.
					if (res.getStartOffset() < cellStart && cellStart < res.getEndOffset()) {
						intersectsCellStart = true;
						// NOTE(review): could 'break' here — the remaining cell starts cannot
						// change the outcome.
					}
				}
				if (!intersectsCellStart) {
					nerValuesPerType.computeIfAbsent(res.getType(), (a) -> new HashSet<>()).add(new String(Base64.decodeBase64(res.getValue().getBytes())));
				}
			}
		});
	}
	return nerValuesPerType;
}
}

View File

@ -47,7 +47,7 @@ public class EntitySearchUtils {
public Set<Entity> find(String inputString, Set<String> values, String type, String headline, int sectionNumber,
boolean local, boolean isDossierDictionary) {
boolean isDictionaryEntry, boolean isDossierDictionary) {
Set<Entity> found = new HashSet<>();
@ -67,7 +67,7 @@ public class EntitySearchUtils {
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString
.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
found.add(new Entity(inputString.substring(startIndex, stopIndex), type, startIndex, stopIndex, headline, sectionNumber, !local, isDossierDictionary));
found.add(new Entity(inputString.substring(startIndex, stopIndex), type, startIndex, stopIndex, headline, sectionNumber, isDictionaryEntry, isDossierDictionary));
}
} while (startIndex > -1);
}