RED-6204: Moved code to its own class for metrics.

Moved the private method that finds entities into its own class, EntityFinder, so that it can produce a separate metric value.
This commit is contained in:
Viktor Seifert 2023-02-22 17:45:30 +01:00
parent f9b7ad4e3e
commit 00ef0eb677
3 changed files with 264 additions and 184 deletions

View File

@ -50,6 +50,7 @@ import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
import com.iqser.red.service.redaction.v1.server.redaction.model.PageEntities;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import com.iqser.red.service.redaction.v1.server.redaction.model.RedRectangle2D;
import com.iqser.red.service.redaction.v1.server.redaction.service.entityredaction.EntityRedactionService;
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
import com.iqser.red.service.redaction.v1.server.redaction.utils.SearchImplementation;
import com.iqser.red.service.redaction.v1.server.segmentation.ImageService;

View File

@ -0,0 +1,127 @@
package com.iqser.red.service.redaction.v1.server.redaction.service.entityredaction;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.springframework.stereotype.Component;
import com.iqser.red.service.persistence.service.v1.api.model.annotations.ManualRedactions;
import com.iqser.red.service.redaction.v1.model.Engine;
import com.iqser.red.service.redaction.v1.server.client.model.NerEntities;
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryModel;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entities;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityType;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
import com.iqser.red.service.redaction.v1.server.redaction.utils.FindEntityDetails;
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
import io.micrometer.core.annotation.Timed;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
/**
 * Finds dictionary and NER entities within the text of a single section.
 *
 * <p>The search lives in its own Spring component so that the {@code @Timed}
 * annotation on {@link #findEntities} is reported as a metric value of its own.
 */
@Slf4j
@Component
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
class EntityFinder {

    RedactionServiceSettings redactionServiceSettings;


    /**
     * Searches the text of one section with every model of the given dictionary and,
     * for non-local searches, additionally collects the NER results of that section.
     *
     * <p>Dictionary matches are merged with {@code EntitySearchUtils.addOrAddEngine},
     * passed through {@code EntitySearchUtils.clearAndFindPositions}, and entities
     * flagged as false positives are dropped from the result.
     *
     * @param searchableText   text of the section to search
     * @param headline         headline that is attached to every entity created here
     * @param sectionNumber    number of the section; also the key used to look up NER results
     * @param dictionary       dictionary whose models are searched
     * @param local            {@code true} to use each model's local search (engine {@code RULE},
     *                         type {@code RECOMMENDATION}); {@code false} to use the entries search
     *                         (engine {@code DICTIONARY}, type {@code ENTITY}) and to include NER results
     * @param nerEntities      NER results keyed by section number; only read when {@code local} is
     *                         {@code false} and the NER service is enabled
     * @param cellStarts       offsets of table cell starts in the section text; may be {@code null} or empty
     * @param manualRedactions manual redactions, forwarded to {@code clearAndFindPositions}
     * @return the remaining dictionary entities together with the NER entities; both sets are
     *         empty when the section text is {@code null} or empty
     */
    @Timed("redactmanager_findEntities")
    public Entities findEntities(SearchableText searchableText,
                                 String headline,
                                 int sectionNumber,
                                 Dictionary dictionary,
                                 boolean local,
                                 NerEntities nerEntities,
                                 List<Integer> cellStarts,
                                 ManualRedactions manualRedactions) {

        String searchableString = searchableText.asString();
        if (StringUtils.isEmpty(searchableString)) {
            return new Entities(new HashSet<>(), new HashSet<>());
        }

        // NOTE(review): toLowerCase() without a Locale uses the JVM default locale;
        // confirm that the dictionary entries are lower-cased the same way.
        String lowercaseInputString = searchableString.toLowerCase();

        // Engine and entity type depend only on the search mode, not on the individual model.
        Engine engine = local ? Engine.RULE : Engine.DICTIONARY;
        EntityType entityType = local ? EntityType.RECOMMENDATION : EntityType.ENTITY;

        Set<Entity> found = new HashSet<>();
        for (DictionaryModel model : dictionary.getDictionaryModels()) {
            var searchImplementation = local ? model.getLocalSearch() : model.getEntriesSearch();
            String textToSearch = model.isCaseInsensitive() ? lowercaseInputString : searchableString;
            var entities = EntitySearchUtils.findEntities(textToSearch,
                    searchImplementation,
                    model,
                    new FindEntityDetails(model.getType(), headline, sectionNumber, !local, model.isDossierDictionary(), engine, entityType));
            EntitySearchUtils.addOrAddEngine(found, entities);
        }

        // NER results are only collected for the non-local (dictionary) search.
        Set<Entity> nerFound = new HashSet<>();
        if (!local) {
            nerFound.addAll(getNerValues(sectionNumber, nerEntities, cellStarts, headline));
        }

        var cleared = EntitySearchUtils.clearAndFindPositions(found, searchableText, dictionary, manualRedactions);

        return new Entities(cleared.stream().filter(e -> !e.isFalsePositive()).collect(Collectors.toSet()), nerFound);
    }


    /**
     * Converts the NER results of one section into {@code RECOMMENDATION} entities with engine {@code NER}.
     *
     * <p>When cell starts are given, a result is skipped if any cell start lies strictly
     * between its start and end offset. Returns an empty set when the NER service is
     * disabled or there are no results for the section.
     */
    private Set<Entity> getNerValues(int sectionNumber, NerEntities nerEntities, List<Integer> cellStarts, String headline) {

        Set<Entity> entities = new HashSet<>();
        if (!redactionServiceSettings.isNerServiceEnabled() || !nerEntities.getData().containsKey(sectionNumber)) {
            return entities;
        }

        boolean hasCellStarts = cellStarts != null && !cellStarts.isEmpty();

        nerEntities.getData().get(sectionNumber).forEach(res -> {
            boolean intersectsCellStart = hasCellStarts
                    && cellStarts.stream().anyMatch(cellStart -> res.getStartOffset() < cellStart && cellStart < res.getEndOffset());
            if (!intersectsCellStart) {
                entities.add(new Entity(res.getValue(),
                        res.getType(),
                        res.getStartOffset(),
                        res.getEndOffset(),
                        headline,
                        sectionNumber,
                        false,
                        false,
                        Engine.NER,
                        EntityType.RECOMMENDATION));
            }
        });

        return entities;
    }

}

View File

@ -1,41 +1,52 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
package com.iqser.red.service.redaction.v1.server.redaction.service.entityredaction;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.kie.api.runtime.KieContainer;
import org.springframework.stereotype.Service;
import com.iqser.red.service.persistence.service.v1.api.model.annotations.AnnotationStatus;
import com.iqser.red.service.persistence.service.v1.api.model.annotations.ManualRedactions;
import com.iqser.red.service.persistence.service.v1.api.model.annotations.entitymapped.IdRemoval;
import com.iqser.red.service.persistence.service.v1.api.model.annotations.entitymapped.ManualImageRecategorization;
import com.iqser.red.service.redaction.v1.model.AnalyzeRequest;
import com.iqser.red.service.redaction.v1.model.Engine;
import com.iqser.red.service.redaction.v1.model.FileAttribute;
import com.iqser.red.service.redaction.v1.server.classification.model.SectionText;
import com.iqser.red.service.redaction.v1.server.client.model.NerEntities;
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
import com.iqser.red.service.redaction.v1.server.redaction.model.*;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entities;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.FindEntitiesResult;
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
import com.iqser.red.service.redaction.v1.server.redaction.model.PageEntities;
import com.iqser.red.service.redaction.v1.server.redaction.model.Section;
import com.iqser.red.service.redaction.v1.server.redaction.model.SectionSearchableTextPair;
import com.iqser.red.service.redaction.v1.server.redaction.service.DroolsExecutionService;
import com.iqser.red.service.redaction.v1.server.redaction.service.SurroundingWordsService;
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
import com.iqser.red.service.redaction.v1.server.redaction.utils.FindEntityDetails;
import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
import io.micrometer.core.annotation.Timed;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.kie.api.runtime.KieContainer;
import org.springframework.stereotype.Service;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
@Slf4j
@Service
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class EntityRedactionService {
private final RedactionServiceSettings redactionServiceSettings;
private final DroolsExecutionService droolsExecutionService;
private final SurroundingWordsService surroundingWordsService;
DroolsExecutionService droolsExecutionService;
SurroundingWordsService surroundingWordsService;
EntityFinder entityFinder;
public PageEntities findEntities(Dictionary dictionary, List<SectionText> sectionTexts, KieContainer kieContainer, AnalyzeRequest analyzeRequest, NerEntities nerEntities) {
@ -45,7 +56,7 @@ public class EntityRedactionService {
if (dictionary.hasLocalEntries() || !findEntitiesResult.getAddedFileAttributes().isEmpty()) {
if(!findEntitiesResult.getAddedFileAttributes().isEmpty()) {
if (!findEntitiesResult.getAddedFileAttributes().isEmpty()) {
//AnalyzeRequest provides immutable list.
List<FileAttribute> mergedFileAttributes = new ArrayList<>();
mergedFileAttributes.addAll(analyzeRequest.getFileAttributes());
@ -54,7 +65,14 @@ public class EntityRedactionService {
}
Map<Integer, Set<Entity>> hintsPerSectionNumber = getHintsPerSection(findEntitiesResult.getEntities(), dictionary);
FindEntitiesResult foundByLocalEntitiesResult = findEntities(sectionTexts, dictionary, kieContainer, analyzeRequest, true, hintsPerSectionNumber, imagesPerPage, nerEntities);
FindEntitiesResult foundByLocalEntitiesResult = findEntities(sectionTexts,
dictionary,
kieContainer,
analyzeRequest,
true,
hintsPerSectionNumber,
imagesPerPage,
nerEntities);
EntitySearchUtils.addEntitiesWithHigherRank(findEntitiesResult.getEntities(), foundByLocalEntitiesResult.getEntities(), dictionary);
EntitySearchUtils.removeEntitiesContainedInLarger(findEntitiesResult.getEntities());
}
@ -67,18 +85,78 @@ public class EntityRedactionService {
public FindEntitiesResult findEntities(List<SectionText> reanalysisSections,
Dictionary dictionary,
KieContainer kieContainer,
AnalyzeRequest analyzeRequest,
boolean local,
Map<Integer, Set<Entity>> hintsPerSectionNumber,
Map<Integer, Set<Image>> imagesPerPage,
NerEntities nerEntities) {
Dictionary dictionary,
KieContainer kieContainer,
AnalyzeRequest analyzeRequest,
boolean local,
Map<Integer, Set<Entity>> hintsPerSectionNumber,
Map<Integer, Set<Image>> imagesPerPage,
NerEntities nerEntities) {
List<SectionSearchableTextPair> sectionSearchableTextPairs = new ArrayList<>();
for (SectionText reanalysisSection : reanalysisSections) {
List<SectionSearchableTextPair> sectionSearchableTextPairs = extractSearchableTextPairs(reanalysisSections,
dictionary,
analyzeRequest,
local,
hintsPerSectionNumber,
nerEntities);
Entities entities = findEntities(reanalysisSection.getSearchableText(),
Set<FileAttribute> addedFileAttributes = new HashSet<>();
Set<Entity> entities = new HashSet<>();
sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> {
if (!addedFileAttributes.isEmpty()) {
//Section.Builder provides immutable list.
List<FileAttribute> mergedFileAttributes = new ArrayList<>();
mergedFileAttributes.addAll(sectionSearchableTextPair.getSection().getAddedFileAttributes());
mergedFileAttributes.addAll(addedFileAttributes);
sectionSearchableTextPair.getSection().setFileAttributes(mergedFileAttributes);
}
Section analysedSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair.getSection());
addedFileAttributes.addAll(analysedSection.getAddedFileAttributes());
EntitySearchUtils.removeEntitiesContainedInLarger(analysedSection.getEntities());
var entriesWithoutSurroundingText = analysedSection.getEntities()
.stream()
.filter(e -> e.getTextAfter() == null && e.getTextBefore() == null)
.collect(Collectors.toSet());
if (sectionSearchableTextPair.getCellStarts() != null && !sectionSearchableTextPair.getCellStarts().isEmpty()) {
surroundingWordsService.addSurroundingText(entriesWithoutSurroundingText,
sectionSearchableTextPair.getSearchableText(),
dictionary,
sectionSearchableTextPair.getCellStarts());
} else {
surroundingWordsService.addSurroundingText(entriesWithoutSurroundingText, sectionSearchableTextPair.getSearchableText(), dictionary);
}
entities.addAll(analysedSection.getEntities());
if (!local) {
for (Image image : analysedSection.getImages()) {
imagesPerPage.computeIfAbsent(image.getPage(), (a) -> new HashSet<>()).add(image);
}
addLocalValuesToDictionary(analysedSection, dictionary);
}
});
return FindEntitiesResult.builder().entities(entities).addedFileAttributes(addedFileAttributes).build();
}
private List<SectionSearchableTextPair> extractSearchableTextPairs(List<SectionText> reanalysisSections,
Dictionary dictionary,
AnalyzeRequest analyzeRequest,
boolean local,
Map<Integer, Set<Entity>> hintsPerSectionNumber,
NerEntities nerEntities) {
return reanalysisSections.stream().map(reanalysisSection -> {
Entities entities = entityFinder.findEntities(reanalysisSection.getSearchableText(),
reanalysisSection.getHeadline(),
reanalysisSection.getSectionNumber(),
dictionary,
@ -136,73 +214,36 @@ public class EntityRedactionService {
log.debug("Section {}, Images: {}", reanalysisSection.getSectionNumber(), reanalysisSection.getImages());
sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder()
.isLocal(false)
.dictionaryTypes(dictionary.getTypes())
.entities(hintsPerSectionNumber != null && hintsPerSectionNumber.containsKey(reanalysisSection.getSectionNumber()) ? Stream.concat(entities.getEntities()
.stream(), hintsPerSectionNumber.get(reanalysisSection.getSectionNumber()).stream()).collect(Collectors.toSet()) : entities.getEntities())
.nerEntities(entities.getNerEntities())
.text(reanalysisSection.getSearchableText().getAsStringWithLinebreaks())
.searchText(reanalysisSection.getSearchableText().toString())
.headline(reanalysisSection.getHeadline())
.sectionNumber(reanalysisSection.getSectionNumber())
.tabularData(reanalysisSection.getTabularData())
.searchableText(reanalysisSection.getSearchableText())
.dictionary(dictionary)
.images(reanalysisSection.getImages())
.sectionAreas(reanalysisSection.getSectionAreas())
.fileAttributes(analyzeRequest.getFileAttributes())
.manualRedactions(analyzeRequest.getManualRedactions())
.isInTable(reanalysisSection.isTable())
.build(), reanalysisSection.getSearchableText(), reanalysisSection.getCellStarts()));
}
return toSectionSearchableTextPair(dictionary, analyzeRequest, hintsPerSectionNumber, reanalysisSection, entities);
}).collect(Collectors.toList());
}
Set<FileAttribute> addedFileAttributes = new HashSet<>();
Set<Entity> entities = new HashSet<>();
sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> {
private SectionSearchableTextPair toSectionSearchableTextPair(Dictionary dictionary,
AnalyzeRequest analyzeRequest,
Map<Integer, Set<Entity>> hintsPerSectionNumber,
SectionText reanalysisSection,
Entities entities) {
if(!addedFileAttributes.isEmpty()) {
//Section.Builder provides immutable list.
List<FileAttribute> mergedFileAttributes = new ArrayList<>();
mergedFileAttributes.addAll(sectionSearchableTextPair.getSection().getAddedFileAttributes());
mergedFileAttributes.addAll(addedFileAttributes);
sectionSearchableTextPair.getSection().setFileAttributes(mergedFileAttributes);
}
Section analysedSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair.getSection());
addedFileAttributes.addAll(analysedSection.getAddedFileAttributes());
EntitySearchUtils.removeEntitiesContainedInLarger(analysedSection.getEntities());
var entriesWithoutSurroundingText = analysedSection.getEntities()
.stream()
.filter(e -> e.getTextAfter() == null && e.getTextBefore() == null)
.collect(Collectors.toSet());
if (sectionSearchableTextPair.getCellStarts() != null && !sectionSearchableTextPair.getCellStarts().isEmpty()) {
surroundingWordsService.addSurroundingText(entriesWithoutSurroundingText,
sectionSearchableTextPair.getSearchableText(),
dictionary,
sectionSearchableTextPair.getCellStarts());
} else {
surroundingWordsService.addSurroundingText(entriesWithoutSurroundingText, sectionSearchableTextPair.getSearchableText(), dictionary);
}
entities.addAll(analysedSection.getEntities());
if (!local) {
for (Image image : analysedSection.getImages()) {
imagesPerPage.computeIfAbsent(image.getPage(), (a) -> new HashSet<>()).add(image);
}
addLocalValuesToDictionary(analysedSection, dictionary);
}
});
return FindEntitiesResult.builder().entities(entities).addedFileAttributes(addedFileAttributes).build();
return new SectionSearchableTextPair(Section.builder()
.isLocal(false)
.dictionaryTypes(dictionary.getTypes())
.entities(hintsPerSectionNumber != null && hintsPerSectionNumber.containsKey(reanalysisSection.getSectionNumber()) ? Stream.concat(entities.getEntities().stream(),
hintsPerSectionNumber.get(reanalysisSection.getSectionNumber()).stream()).collect(Collectors.toSet()) : entities.getEntities())
.nerEntities(entities.getNerEntities())
.text(reanalysisSection.getSearchableText().getAsStringWithLinebreaks())
.searchText(reanalysisSection.getSearchableText().toString())
.headline(reanalysisSection.getHeadline())
.sectionNumber(reanalysisSection.getSectionNumber())
.tabularData(reanalysisSection.getTabularData())
.searchableText(reanalysisSection.getSearchableText())
.dictionary(dictionary)
.images(reanalysisSection.getImages())
.sectionAreas(reanalysisSection.getSectionAreas())
.fileAttributes(analyzeRequest.getFileAttributes())
.manualRedactions(analyzeRequest.getManualRedactions())
.isInTable(reanalysisSection.isTable())
.build(), reanalysisSection.getSearchableText(), reanalysisSection.getCellStarts());
}
@ -244,7 +285,7 @@ public class EntityRedactionService {
private Map<Integer, Set<Entity>> getHintsPerSection(Set<Entity> entities, Dictionary dictionary) {
Map<Integer, Set<Entity>> hintsPerSectionNumber = new HashMap<>();
entities.stream().forEach(entity -> {
entities.forEach(entity -> {
if (dictionary.isHint(entity.getType()) && entity.isDictionaryEntry()) {
hintsPerSectionNumber.computeIfAbsent(entity.getSectionNumber(), (x) -> new HashSet<>()).add(entity);
}
@ -269,93 +310,4 @@ public class EntityRedactionService {
}));
}
@Timed("redactmanager_findEntities")
private Entities findEntities(SearchableText searchableText,
String headline,
int sectionNumber,
Dictionary dictionary,
boolean local,
NerEntities nerEntities,
List<Integer> cellStarts,
ManualRedactions manualRedactions) {
Set<Entity> found = new HashSet<>();
String searchableString = searchableText.asString();
if (StringUtils.isEmpty(searchableString)) {
return new Entities(new HashSet<>(), new HashSet<>());
}
String lowercaseInputString = searchableString.toLowerCase();
for (DictionaryModel model : dictionary.getDictionaryModels()) {
var searchImplementation = local ? model.getLocalSearch() : model.getEntriesSearch();
var entities = EntitySearchUtils.findEntities(model.isCaseInsensitive() ? lowercaseInputString : searchableString,
searchImplementation,
model,
new FindEntityDetails(model.getType(),
headline,
sectionNumber,
!local,
model.isDossierDictionary(),
local ? Engine.RULE : Engine.DICTIONARY,
local ? EntityType.RECOMMENDATION : EntityType.ENTITY));
EntitySearchUtils.addOrAddEngine(found, entities);
}
Set<Entity> nerFound = new HashSet<>();
if (!local) {
nerFound.addAll(getNerValues(sectionNumber, nerEntities, cellStarts, headline));
}
var cleared = EntitySearchUtils.clearAndFindPositions(found, searchableText, dictionary, manualRedactions);
return new Entities(cleared.stream().filter(e -> !e.isFalsePositive()).collect(Collectors.toSet()), nerFound);
}
private Set<Entity> getNerValues(int sectionNumber, NerEntities nerEntities, List<Integer> cellStarts, String headline) {
Set<Entity> entities = new HashSet<>();
if (redactionServiceSettings.isNerServiceEnabled() && nerEntities.getData().containsKey(sectionNumber)) {
nerEntities.getData().get(sectionNumber).forEach(res -> {
if (cellStarts == null || cellStarts.isEmpty()) {
entities.add(new Entity(res.getValue(),
res.getType(),
res.getStartOffset(),
res.getEndOffset(),
headline,
sectionNumber,
false,
false,
Engine.NER,
EntityType.RECOMMENDATION));
} else {
boolean intersectsCellStart = false;
for (Integer cellStart : cellStarts) {
if (res.getStartOffset() < cellStart && cellStart < res.getEndOffset()) {
intersectsCellStart = true;
break;
}
}
if (!intersectsCellStart) {
entities.add(new Entity(res.getValue(),
res.getType(),
res.getStartOffset(),
res.getEndOffset(),
headline,
sectionNumber,
false,
false,
Engine.NER,
EntityType.RECOMMENDATION));
}
}
});
}
return entities;
}
}