RED-3992 - Replaced naive approach with Aho-Corasick string search. Cleaned up code. Reduced amount of unnecessary conversions/invocations
This commit is contained in:
parent
729a7334d9
commit
747323f882
@ -2,9 +2,9 @@ package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.type.DictionaryEntry;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.SearchImplementation;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import org.ahocorasick.trie.Trie;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.HashSet;
|
||||
@ -26,11 +26,10 @@ public class DictionaryModel implements Serializable {
|
||||
private final Set<DictionaryEntry> falsePositives;
|
||||
private final Set<DictionaryEntry> falseRecommendations;
|
||||
|
||||
private transient Trie entriesTrie;
|
||||
private transient Trie falsePositivesTrie;
|
||||
private transient Trie falseRecommendationsTrie;
|
||||
|
||||
private transient Trie localEntriesTrie;
|
||||
private transient SearchImplementation entriesSearch;
|
||||
private transient SearchImplementation falsePositiveSearch;
|
||||
private transient SearchImplementation falseRecommendationsSearch;
|
||||
private transient SearchImplementation localSearch;
|
||||
|
||||
private final Set<String> localEntries = new HashSet<>();
|
||||
|
||||
@ -54,62 +53,38 @@ public class DictionaryModel implements Serializable {
|
||||
this.falsePositives = falsePositives;
|
||||
this.falseRecommendations = falseRecommendations;
|
||||
|
||||
this.entriesTrie = buildTrie(entries);
|
||||
this.falsePositivesTrie = buildTrie(falsePositives);
|
||||
this.falseRecommendationsTrie = buildTrie(falseRecommendations);
|
||||
this.entriesSearch = new SearchImplementation(this.entries.stream().filter(e -> !e.isDeleted()).map(DictionaryEntry::getValue).collect(Collectors.toList()), caseInsensitive);
|
||||
this.falsePositiveSearch = new SearchImplementation(this.falsePositives.stream().filter(e -> !e.isDeleted()).map(DictionaryEntry::getValue).collect(Collectors.toList()), caseInsensitive);
|
||||
this.falseRecommendationsSearch = new SearchImplementation(this.falseRecommendations.stream().filter(e -> !e.isDeleted()).map(DictionaryEntry::getValue).collect(Collectors.toList()), caseInsensitive);
|
||||
|
||||
}
|
||||
|
||||
|
||||
public Trie getLocalEntriesTrie() {
|
||||
if (localEntriesTrie == null) {
|
||||
this.localEntriesTrie = buildTrieFromStrings(this.localEntries);
|
||||
public SearchImplementation getLocalSearch() {
|
||||
if (this.localSearch == null) {
|
||||
this.localSearch = new SearchImplementation(this.localEntries, caseInsensitive);
|
||||
}
|
||||
return localEntriesTrie;
|
||||
return this.localSearch;
|
||||
}
|
||||
|
||||
|
||||
public Trie getEntriesTrie() {
|
||||
if (entriesTrie == null) {
|
||||
this.entriesTrie = buildTrie(this.entries);
|
||||
public SearchImplementation getEntriesSearch() {
|
||||
if (entriesSearch == null) {
|
||||
this.entriesSearch = new SearchImplementation(this.entries.stream().filter(e -> !e.isDeleted()).map(DictionaryEntry::getValue).collect(Collectors.toList()), caseInsensitive);
|
||||
}
|
||||
return entriesTrie;
|
||||
return entriesSearch;
|
||||
}
|
||||
|
||||
public Trie getFalsePositivesTrie() {
|
||||
if (falsePositivesTrie == null) {
|
||||
this.falsePositivesTrie = buildTrie(this.falsePositives);
|
||||
public SearchImplementation getFalsePositiveSearch() {
|
||||
if (falsePositiveSearch == null) {
|
||||
this.falsePositiveSearch = new SearchImplementation(this.falsePositives.stream().filter(e -> !e.isDeleted()).map(DictionaryEntry::getValue).collect(Collectors.toList()), caseInsensitive);
|
||||
}
|
||||
return falsePositivesTrie;
|
||||
return falsePositiveSearch;
|
||||
}
|
||||
|
||||
public Trie getFalseRecommendationsTrie() {
|
||||
if (falsePositivesTrie == null) {
|
||||
this.falsePositivesTrie = buildTrie(this.falseRecommendations);
|
||||
public SearchImplementation getFalseRecommendationsSearch() {
|
||||
if (falseRecommendationsSearch == null) {
|
||||
this.falseRecommendationsSearch = new SearchImplementation(this.falseRecommendations.stream().filter(e -> !e.isDeleted()).map(DictionaryEntry::getValue).collect(Collectors.toList()), caseInsensitive);
|
||||
}
|
||||
return falsePositivesTrie;
|
||||
}
|
||||
|
||||
private Trie buildTrieFromStrings(Set<String> entries) {
|
||||
var builder = Trie.builder()
|
||||
.addKeywords(entries);
|
||||
|
||||
if (this.isCaseInsensitive()) {
|
||||
builder.ignoreCase();
|
||||
}
|
||||
|
||||
return builder.build();
|
||||
}
|
||||
|
||||
private Trie buildTrie(Set<DictionaryEntry> values) {
|
||||
var builder = Trie.builder()
|
||||
.addKeywords(values.stream().filter(e -> !e.isDeleted()).map(DictionaryEntry::getValue).collect(Collectors.toList()));
|
||||
|
||||
if (this.isCaseInsensitive()) {
|
||||
builder.ignoreCase();
|
||||
}
|
||||
|
||||
return builder.build();
|
||||
return falseRecommendationsSearch;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -8,6 +8,7 @@ import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.FindEntityDetails;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.Patterns;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.SearchImplementation;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
@ -63,11 +64,11 @@ public class Section {
|
||||
|
||||
@SuppressWarnings("unused")
|
||||
@WhenCondition
|
||||
public void addAiEntities(String type, String asType) {
|
||||
public void addAiEntities(@Argument(ArgumentType.TYPE) String type, @Argument(ArgumentType.TYPE) String asType) {
|
||||
|
||||
Set<Entity> entitiesOfType = nerEntities.stream().filter(nerEntity -> nerEntity.getType().equals(type)).collect(Collectors.toSet());
|
||||
List<String> values = entitiesOfType.stream().map(Entity::getWord).collect(Collectors.toList());
|
||||
Set<Entity> found = EntitySearchUtils.findEntities(searchText, values, dictionary.getType(asType), new FindEntityDetails(asType, headline, sectionNumber, false, false, Engine.NER, EntityType.RECOMMENDATION));
|
||||
Set<Entity> found = EntitySearchUtils.findEntities(searchText, new SearchImplementation(values, dictionary.isCaseInsensitiveDictionary(asType)), dictionary.getType(asType), new FindEntityDetails(asType, headline, sectionNumber, false, false, Engine.NER, EntityType.RECOMMENDATION));
|
||||
EntitySearchUtils.clearAndFindPositions(found, searchableText, dictionary);
|
||||
|
||||
Set<Entity> finalResult = new HashSet<>();
|
||||
@ -94,7 +95,9 @@ public class Section {
|
||||
|
||||
@SuppressWarnings("unused")
|
||||
@WhenCondition
|
||||
public void combineAiTypes(String startType, String combineTypes, int maxDistanceBetween, String asType, int minPartMatches, boolean allowDuplicateTypes) {
|
||||
public void combineAiTypes(@Argument(ArgumentType.TYPE) String startType, @Argument(ArgumentType.TYPE) String combineTypes,
|
||||
@Argument(ArgumentType.INTEGER) int maxDistanceBetween, @Argument(ArgumentType.TYPE) String asType,
|
||||
@Argument(ArgumentType.INTEGER) int minPartMatches, @Argument(ArgumentType.BOOLEAN) boolean allowDuplicateTypes) {
|
||||
|
||||
Set<String> combineSet = Set.of(combineTypes.split(","));
|
||||
|
||||
@ -276,6 +279,7 @@ public class Section {
|
||||
|
||||
Set<Entity> expanded = new HashSet<>();
|
||||
for (var entity : entities) {
|
||||
System.out.println(entity.getWord());
|
||||
|
||||
if (!entity.getType().equals(type) || entity.getTextBefore() == null) {
|
||||
continue;
|
||||
@ -585,14 +589,14 @@ public class Section {
|
||||
|
||||
@ThenAction
|
||||
@SuppressWarnings("unused")
|
||||
public void ignore(String type) {
|
||||
public void ignore(@Argument(ArgumentType.TYPE) String type) {
|
||||
|
||||
entities.removeIf(entity -> entity.getType().equals(type) && entity.getEntityType().equals(EntityType.ENTITY));
|
||||
}
|
||||
|
||||
@ThenAction
|
||||
@SuppressWarnings("unused")
|
||||
public void ignoreRecommendations(String type) {
|
||||
public void ignoreRecommendations(@Argument(ArgumentType.TYPE) String type) {
|
||||
|
||||
entities.removeIf(entity -> entity.getType().equals(type) && entity.getEntityType().equals(EntityType.RECOMMENDATION));
|
||||
}
|
||||
@ -708,9 +712,7 @@ public class Section {
|
||||
private Set<Entity> findEntities(String value, String asType, boolean caseInsensitive, boolean redacted, int ruleNumber, String reason, String legalBasis, Engine engine, boolean asRecommendation) {
|
||||
|
||||
String text = caseInsensitive ? searchText.toLowerCase() : searchText;
|
||||
String searchValue = caseInsensitive ? value.toLowerCase() : value;
|
||||
|
||||
Set<Entity> found = EntitySearchUtils.findEntities(text, List.of(searchValue), dictionary.getType(asType),
|
||||
Set<Entity> found = EntitySearchUtils.findEntities(text, new SearchImplementation(value, caseInsensitive), dictionary.getType(asType),
|
||||
new FindEntityDetails(asType, headline, sectionNumber, false, false, engine, asRecommendation ? EntityType.RECOMMENDATION : EntityType.ENTITY));
|
||||
found.forEach(entity -> {
|
||||
if (redacted) {
|
||||
|
||||
@ -1,33 +1,12 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.service;
|
||||
|
||||
import static com.iqser.red.service.redaction.v1.server.redaction.service.ImportedRedactionService.IMPORTED_REDACTION_TYPE;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.kie.api.runtime.KieContainer;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.web.bind.annotation.RequestBody;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.model.annotations.ManualRedactions;
|
||||
import com.iqser.red.service.persistence.service.v1.api.model.annotations.entitymapped.IdRemoval;
|
||||
import com.iqser.red.service.persistence.service.v1.api.model.annotations.entitymapped.ManualForceRedaction;
|
||||
import com.iqser.red.service.persistence.service.v1.api.model.annotations.entitymapped.ManualImageRecategorization;
|
||||
import com.iqser.red.service.persistence.service.v1.api.model.annotations.entitymapped.ManualLegalBasisChange;
|
||||
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType;
|
||||
import com.iqser.red.service.redaction.v1.model.AnalyzeRequest;
|
||||
import com.iqser.red.service.redaction.v1.model.AnalyzeResult;
|
||||
import com.iqser.red.service.redaction.v1.model.Rectangle;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionLog;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionArea;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionGrid;
|
||||
import com.iqser.red.service.redaction.v1.model.StructureAnalyzeRequest;
|
||||
import com.iqser.red.service.redaction.v1.model.*;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.SectionText;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Text;
|
||||
@ -35,21 +14,25 @@ import com.iqser.red.service.redaction.v1.server.client.LegalBasisClient;
|
||||
import com.iqser.red.service.redaction.v1.server.client.model.NerEntities;
|
||||
import com.iqser.red.service.redaction.v1.server.exception.RedactionException;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncrement;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryVersion;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PageEntities;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.RedRectangle2D;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.*;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.SearchImplementation;
|
||||
import com.iqser.red.service.redaction.v1.server.segmentation.ImageService;
|
||||
import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService;
|
||||
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
|
||||
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.kie.api.runtime.KieContainer;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.web.bind.annotation.RequestBody;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import static com.iqser.red.service.redaction.v1.server.redaction.service.ImportedRedactionService.IMPORTED_REDACTION_TYPE;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@ -131,10 +114,14 @@ public class AnalyzeService {
|
||||
return analyze(analyzeRequest);
|
||||
}
|
||||
|
||||
var dis = System.currentTimeMillis();
|
||||
DictionaryIncrement dictionaryIncrement = dictionaryService.getDictionaryIncrements(analyzeRequest.getDossierTemplateId(), new DictionaryVersion(redactionLog.getDictionaryVersion(), redactionLog.getDossierDictionaryVersion()), analyzeRequest.getDossierId());
|
||||
log.info("Dictionary Increment time time: {} ms", (System.currentTimeMillis() - dis));
|
||||
|
||||
var fis = System.currentTimeMillis();
|
||||
Set<Integer> sectionsToReanalyse = !analyzeRequest.getSectionsToReanalyse()
|
||||
.isEmpty() ? analyzeRequest.getSectionsToReanalyse() : findSectionsToReanalyse(dictionaryIncrement, redactionLog, text, analyzeRequest);
|
||||
log.info("Find sections time: {} ms", (System.currentTimeMillis() - fis));
|
||||
|
||||
if (sectionsToReanalyse.isEmpty()) {
|
||||
return finalizeAnalysis(analyzeRequest, startTime, redactionLog, text, dictionaryIncrement.getDictionaryVersion(), true);
|
||||
@ -152,21 +139,33 @@ public class AnalyzeService {
|
||||
.filter(sectionText -> sectionsToReanalyse.contains(sectionText.getSectionNumber()))
|
||||
.collect(Collectors.toList());
|
||||
|
||||
long kis = System.currentTimeMillis();
|
||||
KieContainer kieContainer = droolsExecutionService.updateRules(analyzeRequest.getDossierTemplateId());
|
||||
log.info("Kie time: {} ms", (System.currentTimeMillis() - kis));
|
||||
|
||||
long dds = System.currentTimeMillis();
|
||||
Dictionary dictionary = dictionaryService.getDeepCopyDictionary(analyzeRequest.getDossierTemplateId(), analyzeRequest.getDossierId());
|
||||
log.info("Dict Time time: {} ms", (System.currentTimeMillis() - dds));
|
||||
|
||||
long pis = System.currentTimeMillis();
|
||||
PageEntities pageEntities = entityRedactionService.findEntities(dictionary, reanalysisSections, kieContainer, analyzeRequest, nerEntities);
|
||||
log.info("Find Entities time: {}", (System.currentTimeMillis() - pis));
|
||||
|
||||
long crs = System.currentTimeMillis();
|
||||
var newRedactionLogEntries = redactionLogCreatorService.createRedactionLog(pageEntities, text.getNumberOfPages(), analyzeRequest.getDossierTemplateId());
|
||||
|
||||
log.info("Create Redaction-log time: {} ms", (System.currentTimeMillis() - crs));
|
||||
long prs = System.currentTimeMillis();
|
||||
var importedRedactionFilteredEntries = importedRedactionService.processImportedRedactions(analyzeRequest.getDossierTemplateId(), analyzeRequest.getDossierId(), analyzeRequest.getFileId(), newRedactionLogEntries, false);
|
||||
|
||||
log.info("Process imports time: {} ms", (System.currentTimeMillis() - prs));
|
||||
redactionLog.getRedactionLogEntry()
|
||||
.removeIf(entry -> sectionsToReanalyse.contains(entry.getSectionNumber()) && !entry.getType()
|
||||
.equals(IMPORTED_REDACTION_TYPE));
|
||||
redactionLog.getRedactionLogEntry().addAll(importedRedactionFilteredEntries);
|
||||
|
||||
return finalizeAnalysis(analyzeRequest, startTime, redactionLog, text, dictionaryIncrement.getDictionaryVersion(), true);
|
||||
var fls = System.currentTimeMillis();
|
||||
var x = finalizeAnalysis(analyzeRequest, startTime, redactionLog, text, dictionaryIncrement.getDictionaryVersion(), true);
|
||||
log.info("Finalize time: {} ms", (System.currentTimeMillis() - fls));
|
||||
return x;
|
||||
}
|
||||
|
||||
|
||||
@ -219,13 +218,19 @@ public class AnalyzeService {
|
||||
}
|
||||
}
|
||||
|
||||
long ss = System.currentTimeMillis();
|
||||
|
||||
var dictionaryIncrementsSearch = new SearchImplementation(dictionaryIncrement.getValues().stream()
|
||||
.map(DictionaryIncrementValue::getValue).collect(Collectors.toList()), true);
|
||||
|
||||
for (SectionText sectionText : text.getSectionTexts()) {
|
||||
|
||||
if (EntitySearchUtils.sectionContainsAny(sectionText.getText(), dictionaryIncrement.getValues())) {
|
||||
if (EntitySearchUtils.sectionContainsAny(sectionText.getText(), dictionaryIncrementsSearch)) {
|
||||
sectionsToReanalyse.add(sectionText.getSectionNumber());
|
||||
}
|
||||
|
||||
}
|
||||
log.info("Section Find time: {}", (System.currentTimeMillis() - ss));
|
||||
|
||||
log.info("Should reanalyze {} sections for request: {}, took: {}", sectionsToReanalyse.size(), analyzeRequest, System.currentTimeMillis() - start);
|
||||
|
||||
|
||||
@ -218,9 +218,9 @@ public class EntityRedactionService {
|
||||
String lowercaseInputString = searchableString.toLowerCase();
|
||||
for (DictionaryModel model : dictionary.getDictionaryModels()) {
|
||||
|
||||
var trie = local ? model.getLocalEntriesTrie() : model.getEntriesTrie();
|
||||
var searchImplementation = local ? model.getLocalSearch() : model.getEntriesSearch();
|
||||
var entities = EntitySearchUtils.findEntities(model.isCaseInsensitive() ? lowercaseInputString : searchableString,
|
||||
trie, model, new FindEntityDetails(model.getType(),headline, sectionNumber, !local, model.isDossierDictionary(), local ? Engine.RULE : Engine.DICTIONARY, local? EntityType.RECOMMENDATION: EntityType.ENTITY));
|
||||
searchImplementation, model, new FindEntityDetails(model.getType(),headline, sectionNumber, !local, model.isDossierDictionary(), local ? Engine.RULE : Engine.DICTIONARY, local? EntityType.RECOMMENDATION: EntityType.ENTITY));
|
||||
|
||||
EntitySearchUtils.addOrAddEngine(found, entities);
|
||||
}
|
||||
|
||||
@ -1,14 +1,5 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.FindEntityDetails;
|
||||
import org.apache.commons.lang3.tuple.Pair;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.model.annotations.ManualRedactions;
|
||||
import com.iqser.red.service.persistence.service.v1.api.model.annotations.Rectangle;
|
||||
import com.iqser.red.service.persistence.service.v1.api.model.annotations.entitymapped.ManualRedactionEntry;
|
||||
@ -23,10 +14,17 @@ import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityType;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.FindEntityDetails;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.SearchImplementation;
|
||||
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.commons.lang3.tuple.Pair;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@ -90,10 +88,10 @@ public class ManualRedactionSurroundingTextService {
|
||||
}
|
||||
|
||||
|
||||
private Pair<String, String> findSurroundingText(SectionText sectionText, String value,
|
||||
List<Rectangle> toFindPositions) {
|
||||
private Pair<String, String> findSurroundingText(SectionText sectionText, String value, List<Rectangle> toFindPositions) {
|
||||
|
||||
Set<Entity> entities = EntitySearchUtils.find(sectionText.getText(), value,new FindEntityDetails( "dummy", sectionText.getHeadline(), sectionText.getSectionNumber(), false, false, Engine.DICTIONARY, EntityType.ENTITY));
|
||||
Set<Entity> entities = EntitySearchUtils.find(sectionText.getText(), new SearchImplementation(value, false),
|
||||
new FindEntityDetails("dummy", sectionText.getHeadline(), sectionText.getSectionNumber(), false, false, Engine.DICTIONARY, EntityType.ENTITY));
|
||||
Set<Entity> entitiesWithPositions = EntitySearchUtils.clearAndFindPositions(entities, sectionText.getSearchableText(), null);
|
||||
|
||||
Entity correctEntity = getEntityOnCorrectPosition(entitiesWithPositions, toFindPositions);
|
||||
|
||||
@ -43,8 +43,6 @@ public class RedactionLogMergeService {
|
||||
throw new NotFoundException("RedactionLog not present");
|
||||
}
|
||||
|
||||
log.info("Loaded redaction log with computationalVersion: {}", redactionLog.getAnalysisVersion());
|
||||
|
||||
SectionGrid sectionGrid = redactionStorageService.getSectionGrid(redactionRequest.getDossierId(), redactionRequest.getFileId());
|
||||
if (sectionGrid.getSections().isEmpty()) {
|
||||
|
||||
@ -62,7 +60,7 @@ public class RedactionLogMergeService {
|
||||
redactionStorageService.storeObject(redactionRequest.getDossierId(), redactionRequest.getFileId(), FileType.SECTION_GRID, sectionGrid);
|
||||
}
|
||||
|
||||
log.info("Loaded redaction log with computationalVersion: {}", redactionLog.getAnalysisVersion());
|
||||
log.debug("Loaded redaction log with computationalVersion: {}", redactionLog.getAnalysisVersion());
|
||||
var merged = mergeRedactionLogData(redactionLog, sectionGrid, redactionRequest.getManualRedactions(), redactionRequest.getExcludedPages(), redactionRequest.getTypes(), redactionRequest.getColors());
|
||||
|
||||
merged.getRedactionLogEntry().removeIf(e -> e.isFalsePositive() && !redactionRequest.isIncludeFalsePositives());
|
||||
|
||||
@ -16,34 +16,20 @@ import java.util.stream.Collectors;
|
||||
public class EntitySearchUtils {
|
||||
|
||||
|
||||
public boolean sectionContainsAny(String sectionText, Set<DictionaryIncrementValue> values) {
|
||||
var trie = Trie.builder().ignoreCase().addKeywords(values.stream().map(DictionaryIncrementValue::getValue).collect(Collectors.toList())).build();
|
||||
return trie.containsMatch(sectionText.toLowerCase(Locale.ROOT));
|
||||
public boolean sectionContainsAny(String sectionText, SearchImplementation searchImplementation) {
|
||||
return searchImplementation.atLeastOneMatches(sectionText);
|
||||
}
|
||||
|
||||
public Set<Entity> findEntities(String inputString, List<String> values, DictionaryModel type, FindEntityDetails details) {
|
||||
public Set<Entity> findEntities(String inputString, SearchImplementation searchImplementation, DictionaryModel type, FindEntityDetails details) {
|
||||
|
||||
var builder = Trie.builder()
|
||||
.addKeywords(values);
|
||||
|
||||
if (type.isCaseInsensitive()) {
|
||||
builder.ignoreCase();
|
||||
}
|
||||
|
||||
return findEntities(inputString, builder.build(), type, details);
|
||||
}
|
||||
|
||||
|
||||
public Set<Entity> findEntities(String inputString, Trie trie, DictionaryModel type, FindEntityDetails details) {
|
||||
|
||||
Set<Entity> found = find(inputString, trie, details);
|
||||
Set<Entity> found = find(inputString, searchImplementation, details);
|
||||
|
||||
if (details.getEntityType() == EntityType.RECOMMENDATION) {
|
||||
Set<Entity> falseRecommendations = find(inputString, type.getFalseRecommendationsTrie(), details.withEntityType(EntityType.FALSE_RECOMMENDATION));
|
||||
Set<Entity> falseRecommendations = find(inputString, type.getFalseRecommendationsSearch(), details.withEntityType(EntityType.FALSE_RECOMMENDATION));
|
||||
removeFalsePositives(found, falseRecommendations);
|
||||
found.addAll(falseRecommendations);
|
||||
} else {
|
||||
Set<Entity> falsePositives = find(inputString, type.getFalsePositivesTrie(), details.withEntityType(EntityType.FALSE_POSITIVE));
|
||||
Set<Entity> falsePositives = find(inputString, type.getFalsePositiveSearch(), details.withEntityType(EntityType.FALSE_POSITIVE));
|
||||
removeFalsePositives(found, falsePositives);
|
||||
found.addAll(falsePositives);
|
||||
}
|
||||
@ -51,33 +37,10 @@ public class EntitySearchUtils {
|
||||
return found;
|
||||
}
|
||||
|
||||
public Set<Entity> find(String inputString, String value, FindEntityDetails findEntityDetails) {
|
||||
|
||||
var trie = Trie.builder()
|
||||
.addKeywords(value).build();
|
||||
|
||||
public Set<Entity> find(String inputString, SearchImplementation searchImplementation, FindEntityDetails findEntityDetails) {
|
||||
Set<Entity> entities = new HashSet<>();
|
||||
|
||||
trie.parseText(inputString).forEach(found -> {
|
||||
var startIndex = found.getStart();
|
||||
var stopIndex = found.getEnd() + 1;
|
||||
validateAndAddEntity(entities, findEntityDetails, inputString, startIndex, stopIndex);
|
||||
|
||||
});
|
||||
return entities;
|
||||
}
|
||||
|
||||
|
||||
public Set<Entity> find(String inputString, Trie trie, FindEntityDetails findEntityDetails) {
|
||||
Set<Entity> entities = new HashSet<>();
|
||||
|
||||
var matches = trie.parseText(inputString);
|
||||
|
||||
matches.forEach(match -> {
|
||||
var startIndex = match.getStart();
|
||||
var stopIndex = match.getEnd() + 1;
|
||||
validateAndAddEntity(entities, findEntityDetails, inputString, startIndex, stopIndex);
|
||||
});
|
||||
searchImplementation.getMatches(inputString).forEach(match -> validateAndAddEntity(entities, findEntityDetails, inputString, match.getStartIndex(), match.getEndIndex()));
|
||||
|
||||
return entities;
|
||||
}
|
||||
|
||||
@ -0,0 +1,92 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.utils;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import org.ahocorasick.trie.Trie;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Data
|
||||
public class SearchImplementation {
|
||||
|
||||
private boolean ignoreCase;
|
||||
private List<String> values;
|
||||
|
||||
private Pattern pattern;
|
||||
private Trie trie;
|
||||
|
||||
public SearchImplementation(String value, boolean ignoreCase) {
|
||||
this.values = List.of(value);
|
||||
this.ignoreCase = ignoreCase;
|
||||
this.createSearchImplementation();
|
||||
}
|
||||
|
||||
public SearchImplementation(Collection<String> values, boolean ignoreCase) {
|
||||
this.values = new ArrayList<>(values);
|
||||
this.ignoreCase = ignoreCase;
|
||||
this.createSearchImplementation();
|
||||
}
|
||||
|
||||
private void createSearchImplementation() {
|
||||
if (this.values.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (this.values.size() == 1) {
|
||||
var text = this.values.iterator().next();
|
||||
this.pattern = Pattern.compile(Pattern.quote(ignoreCase ? text.toLowerCase(Locale.ROOT) : text));
|
||||
} else {
|
||||
var builder = Trie.builder();
|
||||
if (this.ignoreCase) {
|
||||
builder.ignoreCase();
|
||||
}
|
||||
|
||||
builder.addKeywords(this.values);
|
||||
|
||||
this.trie = builder.build();
|
||||
}
|
||||
}
|
||||
|
||||
public boolean atLeastOneMatches(String text) {
|
||||
String textToCheck = text;
|
||||
if (this.values.isEmpty()) {
|
||||
return false;
|
||||
}
|
||||
if (this.pattern != null) {
|
||||
if (ignoreCase) {
|
||||
textToCheck = textToCheck.toLowerCase(Locale.ROOT);
|
||||
}
|
||||
return this.pattern.matcher(textToCheck).results().findAny().isPresent();
|
||||
} else {
|
||||
return this.trie.containsMatch(textToCheck);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public List<MatchPosition> getMatches(String text) {
|
||||
String textToCheck = text;
|
||||
if (this.values.isEmpty()) {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
if (this.pattern != null) {
|
||||
if (ignoreCase) {
|
||||
textToCheck = textToCheck.toLowerCase(Locale.ROOT);
|
||||
}
|
||||
return this.pattern.matcher(textToCheck).results().map(r -> new MatchPosition(r.start(), r.end())).collect(Collectors.toList());
|
||||
} else {
|
||||
return this.trie.parseText(textToCheck).stream().map(r -> new MatchPosition(r.getStart(), r.getEnd() + 1)).collect(Collectors.toList());
|
||||
}
|
||||
}
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
public static class MatchPosition {
|
||||
private int startIndex;
|
||||
private int endIndex;
|
||||
}
|
||||
}
|
||||
@ -5,21 +5,21 @@ import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule;
|
||||
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType;
|
||||
import com.iqser.red.service.redaction.v1.model.AnalyzeRequest;
|
||||
import com.iqser.red.service.redaction.v1.model.MessageType;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.SectionText;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import java.io.FileOutputStream;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Slf4j
|
||||
public class AnalyseFileRealDataIntegrationTest extends LiveDataIntegrationTest {
|
||||
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testFile() {
|
||||
@ -28,16 +28,17 @@ public class AnalyseFileRealDataIntegrationTest extends LiveDataIntegrationTest
|
||||
om.registerModule(new JavaTimeModule());
|
||||
|
||||
var file = new ClassPathResource(BASE_DIR + "data/test-file.pdf").getInputStream();
|
||||
var nerData = new ClassPathResource(BASE_DIR + "data/test-file.ner.json").getInputStream();
|
||||
var text = new ClassPathResource(BASE_DIR + "data/test-file.text.json").getInputStream();
|
||||
var sectionText = new ClassPathResource(BASE_DIR + "data/test-file.section-grid.json").getInputStream();
|
||||
var redactionLog = new ClassPathResource(BASE_DIR + "data/test-file.redaction-log.json").getInputStream();
|
||||
|
||||
redactionStorageService.storeObject("dossierId", "fileId", FileType.ORIGIN, file);
|
||||
redactionStorageService.storeObject("dossierId", "fileId", FileType.NER_ENTITIES, nerData);
|
||||
redactionStorageService.storeObject("dossierId", "fileId", FileType.TEXT, text);
|
||||
redactionStorageService.storeObject("dossierId", "fileId", FileType.SECTION_GRID, sectionText);
|
||||
redactionStorageService.storeObject("dossierId", "fileId", FileType.REDACTION_LOG, redactionLog);
|
||||
|
||||
|
||||
try {
|
||||
var nerData = new ClassPathResource(BASE_DIR + "data/test-file.ner.json").getInputStream();
|
||||
redactionStorageService.storeObject("dossierId", "fileId", FileType.NER_ENTITIES, nerData);
|
||||
} catch (Exception e) {
|
||||
log.warn("No NER File Provided");
|
||||
redactionServiceSettings.setNerServiceEnabled(false);
|
||||
}
|
||||
|
||||
|
||||
AnalyzeRequest ar = AnalyzeRequest.builder()
|
||||
.fileId("fileId")
|
||||
@ -47,49 +48,38 @@ public class AnalyseFileRealDataIntegrationTest extends LiveDataIntegrationTest
|
||||
.lastProcessed(OffsetDateTime.now())
|
||||
.excludedPages(Set.of())
|
||||
.fileAttributes(List.of())
|
||||
.messageType(MessageType.STRUCTURE_ANALYSE)
|
||||
.build();
|
||||
|
||||
String in = om.writeValueAsString(ar);
|
||||
// redactionMessageReceiver.receiveAnalyzeRequest(in, false);
|
||||
// log.warn("done structure");
|
||||
|
||||
var txt = redactionStorageService.getText("dossierId", "fileId");
|
||||
try {
|
||||
var text = new ClassPathResource(BASE_DIR + "data/test-file.text.json").getInputStream();
|
||||
var sectionText = new ClassPathResource(BASE_DIR + "data/test-file.section-grid.json").getInputStream();
|
||||
redactionStorageService.storeObject("dossierId", "fileId", FileType.TEXT, text);
|
||||
redactionStorageService.storeObject("dossierId", "fileId", FileType.SECTION_GRID, sectionText);
|
||||
} catch (Exception e) {
|
||||
log.info("No text file provided, Performing Structure analysis");
|
||||
|
||||
var totalText = txt.getSectionTexts().stream().map(SectionText::getText).collect(Collectors.joining("\n"));
|
||||
System.out.println(totalText.length());
|
||||
ar.setMessageType(MessageType.STRUCTURE_ANALYSE);
|
||||
String in = om.writeValueAsString(ar);
|
||||
redactionMessageReceiver.receiveAnalyzeRequest(in, false);
|
||||
}
|
||||
|
||||
|
||||
try {
|
||||
var redactionLog = new ClassPathResource(BASE_DIR + "data/test-file.redaction-log.json").getInputStream();
|
||||
} catch (Exception e) {
|
||||
log.info("No redaction log provided, Performing full analysis");
|
||||
|
||||
ar.setMessageType(MessageType.ANALYSE);
|
||||
String in = om.writeValueAsString(ar);
|
||||
redactionMessageReceiver.receiveAnalyzeRequest(in, false);
|
||||
}
|
||||
|
||||
|
||||
// ar.setMessageType(MessageType.ANALYSE);
|
||||
// in = om.writeValueAsString(ar);
|
||||
// redactionMessageReceiver.receiveAnalyzeRequest(in, false);
|
||||
// log.warn("done analyze");
|
||||
|
||||
|
||||
simulateIncrement(List.of("study"),"PII",3L);
|
||||
|
||||
simulateIncrement(List.of("type"), "PII", 3L);
|
||||
ar.setMessageType(MessageType.REANALYSE);
|
||||
in = om.writeValueAsString(ar);
|
||||
String in = om.writeValueAsString(ar);
|
||||
redactionMessageReceiver.receiveAnalyzeRequest(in, false);
|
||||
log.warn("done analyze");
|
||||
|
||||
var log = redactionStorageService.getRedactionLog("dossierId", "fileId");
|
||||
om.writeValue(new FileOutputStream("/tmp/test-file.redaction-log.json"), log);
|
||||
|
||||
System.out.println(log.getRedactionLogEntry().size());
|
||||
}
|
||||
|
||||
|
||||
// public static long hash(char[]){
|
||||
// return ((value % prime) + prime) % prime;
|
||||
// }
|
||||
// public static long getBiggerPrime() {
|
||||
// BigInteger prime = BigInteger.probablePrime(getNumberOfBits(10) + 1, new Random());
|
||||
// return prime.longValue();
|
||||
// }
|
||||
// private static int getNumberOfBits(int number) {
|
||||
// return Integer.SIZE - Integer.numberOfLeadingZeros(number);
|
||||
// }
|
||||
}
|
||||
|
||||
@ -12,6 +12,7 @@ import com.iqser.red.service.redaction.v1.server.FileSystemBackedStorageService;
|
||||
import com.iqser.red.service.redaction.v1.server.client.*;
|
||||
import com.iqser.red.service.redaction.v1.server.queue.RedactionMessageReceiver;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.service.DictionaryService;
|
||||
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
|
||||
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
|
||||
import com.iqser.red.storage.commons.StorageAutoConfiguration;
|
||||
import com.iqser.red.storage.commons.service.StorageService;
|
||||
@ -81,6 +82,9 @@ public class LiveDataIntegrationTest {
|
||||
@Autowired
|
||||
protected FileSystemBackedStorageService fileSystemBackedStorageService;
|
||||
|
||||
@Autowired
|
||||
protected RedactionServiceSettings redactionServiceSettings;
|
||||
|
||||
private List<Type> types;
|
||||
|
||||
@Configuration
|
||||
|
||||
File diff suppressed because one or more lines are too long
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user