RED-3992 - Replaced naive approach with Aho-Corasick string search. Cleaned up code. Reduced the amount of unnecessary conversions/invocations.

This commit is contained in:
Timo Bejan 2022-05-09 08:06:04 +03:00
parent 729a7334d9
commit 747323f882
12 changed files with 225 additions and 199 deletions

View File

@ -2,9 +2,9 @@ package com.iqser.red.service.redaction.v1.server.redaction.model;
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.type.DictionaryEntry;
import com.iqser.red.service.redaction.v1.server.redaction.utils.SearchImplementation;
import lombok.AllArgsConstructor;
import lombok.Data;
import org.ahocorasick.trie.Trie;
import java.io.Serializable;
import java.util.HashSet;
@ -26,11 +26,10 @@ public class DictionaryModel implements Serializable {
private final Set<DictionaryEntry> falsePositives;
private final Set<DictionaryEntry> falseRecommendations;
private transient Trie entriesTrie;
private transient Trie falsePositivesTrie;
private transient Trie falseRecommendationsTrie;
private transient Trie localEntriesTrie;
private transient SearchImplementation entriesSearch;
private transient SearchImplementation falsePositiveSearch;
private transient SearchImplementation falseRecommendationsSearch;
private transient SearchImplementation localSearch;
private final Set<String> localEntries = new HashSet<>();
@ -54,62 +53,38 @@ public class DictionaryModel implements Serializable {
this.falsePositives = falsePositives;
this.falseRecommendations = falseRecommendations;
this.entriesTrie = buildTrie(entries);
this.falsePositivesTrie = buildTrie(falsePositives);
this.falseRecommendationsTrie = buildTrie(falseRecommendations);
this.entriesSearch = new SearchImplementation(this.entries.stream().filter(e -> !e.isDeleted()).map(DictionaryEntry::getValue).collect(Collectors.toList()), caseInsensitive);
this.falsePositiveSearch = new SearchImplementation(this.falsePositives.stream().filter(e -> !e.isDeleted()).map(DictionaryEntry::getValue).collect(Collectors.toList()), caseInsensitive);
this.falseRecommendationsSearch = new SearchImplementation(this.falseRecommendations.stream().filter(e -> !e.isDeleted()).map(DictionaryEntry::getValue).collect(Collectors.toList()), caseInsensitive);
}
public Trie getLocalEntriesTrie() {
if (localEntriesTrie == null) {
this.localEntriesTrie = buildTrieFromStrings(this.localEntries);
public SearchImplementation getLocalSearch() {
if (this.localSearch == null) {
this.localSearch = new SearchImplementation(this.localEntries, caseInsensitive);
}
return localEntriesTrie;
return this.localSearch;
}
public Trie getEntriesTrie() {
if (entriesTrie == null) {
this.entriesTrie = buildTrie(this.entries);
public SearchImplementation getEntriesSearch() {
if (entriesSearch == null) {
this.entriesSearch = new SearchImplementation(this.entries.stream().filter(e -> !e.isDeleted()).map(DictionaryEntry::getValue).collect(Collectors.toList()), caseInsensitive);
}
return entriesTrie;
return entriesSearch;
}
public Trie getFalsePositivesTrie() {
if (falsePositivesTrie == null) {
this.falsePositivesTrie = buildTrie(this.falsePositives);
public SearchImplementation getFalsePositiveSearch() {
if (falsePositiveSearch == null) {
this.falsePositiveSearch = new SearchImplementation(this.falsePositives.stream().filter(e -> !e.isDeleted()).map(DictionaryEntry::getValue).collect(Collectors.toList()), caseInsensitive);
}
return falsePositivesTrie;
return falsePositiveSearch;
}
public Trie getFalseRecommendationsTrie() {
if (falsePositivesTrie == null) {
this.falsePositivesTrie = buildTrie(this.falseRecommendations);
public SearchImplementation getFalseRecommendationsSearch() {
if (falseRecommendationsSearch == null) {
this.falseRecommendationsSearch = new SearchImplementation(this.falseRecommendations.stream().filter(e -> !e.isDeleted()).map(DictionaryEntry::getValue).collect(Collectors.toList()), caseInsensitive);
}
return falsePositivesTrie;
}
private Trie buildTrieFromStrings(Set<String> entries) {
var builder = Trie.builder()
.addKeywords(entries);
if (this.isCaseInsensitive()) {
builder.ignoreCase();
}
return builder.build();
}
private Trie buildTrie(Set<DictionaryEntry> values) {
var builder = Trie.builder()
.addKeywords(values.stream().filter(e -> !e.isDeleted()).map(DictionaryEntry::getValue).collect(Collectors.toList()));
if (this.isCaseInsensitive()) {
builder.ignoreCase();
}
return builder.build();
return falseRecommendationsSearch;
}

View File

@ -8,6 +8,7 @@ import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
import com.iqser.red.service.redaction.v1.server.redaction.utils.FindEntityDetails;
import com.iqser.red.service.redaction.v1.server.redaction.utils.Patterns;
import com.iqser.red.service.redaction.v1.server.redaction.utils.SearchImplementation;
import lombok.Builder;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;
@ -63,11 +64,11 @@ public class Section {
@SuppressWarnings("unused")
@WhenCondition
public void addAiEntities(String type, String asType) {
public void addAiEntities(@Argument(ArgumentType.TYPE) String type, @Argument(ArgumentType.TYPE) String asType) {
Set<Entity> entitiesOfType = nerEntities.stream().filter(nerEntity -> nerEntity.getType().equals(type)).collect(Collectors.toSet());
List<String> values = entitiesOfType.stream().map(Entity::getWord).collect(Collectors.toList());
Set<Entity> found = EntitySearchUtils.findEntities(searchText, values, dictionary.getType(asType), new FindEntityDetails(asType, headline, sectionNumber, false, false, Engine.NER, EntityType.RECOMMENDATION));
Set<Entity> found = EntitySearchUtils.findEntities(searchText, new SearchImplementation(values, dictionary.isCaseInsensitiveDictionary(asType)), dictionary.getType(asType), new FindEntityDetails(asType, headline, sectionNumber, false, false, Engine.NER, EntityType.RECOMMENDATION));
EntitySearchUtils.clearAndFindPositions(found, searchableText, dictionary);
Set<Entity> finalResult = new HashSet<>();
@ -94,7 +95,9 @@ public class Section {
@SuppressWarnings("unused")
@WhenCondition
public void combineAiTypes(String startType, String combineTypes, int maxDistanceBetween, String asType, int minPartMatches, boolean allowDuplicateTypes) {
public void combineAiTypes(@Argument(ArgumentType.TYPE) String startType, @Argument(ArgumentType.TYPE) String combineTypes,
@Argument(ArgumentType.INTEGER) int maxDistanceBetween, @Argument(ArgumentType.TYPE) String asType,
@Argument(ArgumentType.INTEGER) int minPartMatches, @Argument(ArgumentType.BOOLEAN) boolean allowDuplicateTypes) {
Set<String> combineSet = Set.of(combineTypes.split(","));
@ -276,6 +279,7 @@ public class Section {
Set<Entity> expanded = new HashSet<>();
for (var entity : entities) {
System.out.println(entity.getWord());
if (!entity.getType().equals(type) || entity.getTextBefore() == null) {
continue;
@ -585,14 +589,14 @@ public class Section {
@ThenAction
@SuppressWarnings("unused")
public void ignore(String type) {
public void ignore(@Argument(ArgumentType.TYPE) String type) {
entities.removeIf(entity -> entity.getType().equals(type) && entity.getEntityType().equals(EntityType.ENTITY));
}
@ThenAction
@SuppressWarnings("unused")
public void ignoreRecommendations(String type) {
public void ignoreRecommendations(@Argument(ArgumentType.TYPE) String type) {
entities.removeIf(entity -> entity.getType().equals(type) && entity.getEntityType().equals(EntityType.RECOMMENDATION));
}
@ -708,9 +712,7 @@ public class Section {
private Set<Entity> findEntities(String value, String asType, boolean caseInsensitive, boolean redacted, int ruleNumber, String reason, String legalBasis, Engine engine, boolean asRecommendation) {
String text = caseInsensitive ? searchText.toLowerCase() : searchText;
String searchValue = caseInsensitive ? value.toLowerCase() : value;
Set<Entity> found = EntitySearchUtils.findEntities(text, List.of(searchValue), dictionary.getType(asType),
Set<Entity> found = EntitySearchUtils.findEntities(text, new SearchImplementation(value, caseInsensitive), dictionary.getType(asType),
new FindEntityDetails(asType, headline, sectionNumber, false, false, engine, asRecommendation ? EntityType.RECOMMENDATION : EntityType.ENTITY));
found.forEach(entity -> {
if (redacted) {

View File

@ -1,33 +1,12 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import static com.iqser.red.service.redaction.v1.server.redaction.service.ImportedRedactionService.IMPORTED_REDACTION_TYPE;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.kie.api.runtime.KieContainer;
import org.springframework.stereotype.Service;
import org.springframework.web.bind.annotation.RequestBody;
import com.iqser.red.service.persistence.service.v1.api.model.annotations.ManualRedactions;
import com.iqser.red.service.persistence.service.v1.api.model.annotations.entitymapped.IdRemoval;
import com.iqser.red.service.persistence.service.v1.api.model.annotations.entitymapped.ManualForceRedaction;
import com.iqser.red.service.persistence.service.v1.api.model.annotations.entitymapped.ManualImageRecategorization;
import com.iqser.red.service.persistence.service.v1.api.model.annotations.entitymapped.ManualLegalBasisChange;
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType;
import com.iqser.red.service.redaction.v1.model.AnalyzeRequest;
import com.iqser.red.service.redaction.v1.model.AnalyzeResult;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.iqser.red.service.redaction.v1.model.RedactionLog;
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
import com.iqser.red.service.redaction.v1.model.SectionArea;
import com.iqser.red.service.redaction.v1.model.SectionGrid;
import com.iqser.red.service.redaction.v1.model.StructureAnalyzeRequest;
import com.iqser.red.service.redaction.v1.model.*;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.SectionText;
import com.iqser.red.service.redaction.v1.server.classification.model.Text;
@ -35,21 +14,25 @@ import com.iqser.red.service.redaction.v1.server.client.LegalBasisClient;
import com.iqser.red.service.redaction.v1.server.client.model.NerEntities;
import com.iqser.red.service.redaction.v1.server.exception.RedactionException;
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncrement;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryVersion;
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
import com.iqser.red.service.redaction.v1.server.redaction.model.PageEntities;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import com.iqser.red.service.redaction.v1.server.redaction.model.RedRectangle2D;
import com.iqser.red.service.redaction.v1.server.redaction.model.*;
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
import com.iqser.red.service.redaction.v1.server.redaction.utils.SearchImplementation;
import com.iqser.red.service.redaction.v1.server.segmentation.ImageService;
import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService;
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.kie.api.runtime.KieContainer;
import org.springframework.stereotype.Service;
import org.springframework.web.bind.annotation.RequestBody;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import static com.iqser.red.service.redaction.v1.server.redaction.service.ImportedRedactionService.IMPORTED_REDACTION_TYPE;
@Slf4j
@Service
@ -131,10 +114,14 @@ public class AnalyzeService {
return analyze(analyzeRequest);
}
var dis = System.currentTimeMillis();
DictionaryIncrement dictionaryIncrement = dictionaryService.getDictionaryIncrements(analyzeRequest.getDossierTemplateId(), new DictionaryVersion(redactionLog.getDictionaryVersion(), redactionLog.getDossierDictionaryVersion()), analyzeRequest.getDossierId());
log.info("Dictionary Increment time time: {} ms", (System.currentTimeMillis() - dis));
var fis = System.currentTimeMillis();
Set<Integer> sectionsToReanalyse = !analyzeRequest.getSectionsToReanalyse()
.isEmpty() ? analyzeRequest.getSectionsToReanalyse() : findSectionsToReanalyse(dictionaryIncrement, redactionLog, text, analyzeRequest);
log.info("Find sections time: {} ms", (System.currentTimeMillis() - fis));
if (sectionsToReanalyse.isEmpty()) {
return finalizeAnalysis(analyzeRequest, startTime, redactionLog, text, dictionaryIncrement.getDictionaryVersion(), true);
@ -152,21 +139,33 @@ public class AnalyzeService {
.filter(sectionText -> sectionsToReanalyse.contains(sectionText.getSectionNumber()))
.collect(Collectors.toList());
long kis = System.currentTimeMillis();
KieContainer kieContainer = droolsExecutionService.updateRules(analyzeRequest.getDossierTemplateId());
log.info("Kie time: {} ms", (System.currentTimeMillis() - kis));
long dds = System.currentTimeMillis();
Dictionary dictionary = dictionaryService.getDeepCopyDictionary(analyzeRequest.getDossierTemplateId(), analyzeRequest.getDossierId());
log.info("Dict Time time: {} ms", (System.currentTimeMillis() - dds));
long pis = System.currentTimeMillis();
PageEntities pageEntities = entityRedactionService.findEntities(dictionary, reanalysisSections, kieContainer, analyzeRequest, nerEntities);
log.info("Find Entities time: {}", (System.currentTimeMillis() - pis));
long crs = System.currentTimeMillis();
var newRedactionLogEntries = redactionLogCreatorService.createRedactionLog(pageEntities, text.getNumberOfPages(), analyzeRequest.getDossierTemplateId());
log.info("Create Redaction-log time: {} ms", (System.currentTimeMillis() - crs));
long prs = System.currentTimeMillis();
var importedRedactionFilteredEntries = importedRedactionService.processImportedRedactions(analyzeRequest.getDossierTemplateId(), analyzeRequest.getDossierId(), analyzeRequest.getFileId(), newRedactionLogEntries, false);
log.info("Process imports time: {} ms", (System.currentTimeMillis() - prs));
redactionLog.getRedactionLogEntry()
.removeIf(entry -> sectionsToReanalyse.contains(entry.getSectionNumber()) && !entry.getType()
.equals(IMPORTED_REDACTION_TYPE));
redactionLog.getRedactionLogEntry().addAll(importedRedactionFilteredEntries);
return finalizeAnalysis(analyzeRequest, startTime, redactionLog, text, dictionaryIncrement.getDictionaryVersion(), true);
var fls = System.currentTimeMillis();
var x = finalizeAnalysis(analyzeRequest, startTime, redactionLog, text, dictionaryIncrement.getDictionaryVersion(), true);
log.info("Finalize time: {} ms", (System.currentTimeMillis() - fls));
return x;
}
@ -219,13 +218,19 @@ public class AnalyzeService {
}
}
long ss = System.currentTimeMillis();
var dictionaryIncrementsSearch = new SearchImplementation(dictionaryIncrement.getValues().stream()
.map(DictionaryIncrementValue::getValue).collect(Collectors.toList()), true);
for (SectionText sectionText : text.getSectionTexts()) {
if (EntitySearchUtils.sectionContainsAny(sectionText.getText(), dictionaryIncrement.getValues())) {
if (EntitySearchUtils.sectionContainsAny(sectionText.getText(), dictionaryIncrementsSearch)) {
sectionsToReanalyse.add(sectionText.getSectionNumber());
}
}
log.info("Section Find time: {}", (System.currentTimeMillis() - ss));
log.info("Should reanalyze {} sections for request: {}, took: {}", sectionsToReanalyse.size(), analyzeRequest, System.currentTimeMillis() - start);

View File

@ -218,9 +218,9 @@ public class EntityRedactionService {
String lowercaseInputString = searchableString.toLowerCase();
for (DictionaryModel model : dictionary.getDictionaryModels()) {
var trie = local ? model.getLocalEntriesTrie() : model.getEntriesTrie();
var searchImplementation = local ? model.getLocalSearch() : model.getEntriesSearch();
var entities = EntitySearchUtils.findEntities(model.isCaseInsensitive() ? lowercaseInputString : searchableString,
trie, model, new FindEntityDetails(model.getType(),headline, sectionNumber, !local, model.isDossierDictionary(), local ? Engine.RULE : Engine.DICTIONARY, local? EntityType.RECOMMENDATION: EntityType.ENTITY));
searchImplementation, model, new FindEntityDetails(model.getType(),headline, sectionNumber, !local, model.isDossierDictionary(), local ? Engine.RULE : Engine.DICTIONARY, local? EntityType.RECOMMENDATION: EntityType.ENTITY));
EntitySearchUtils.addOrAddEngine(found, entities);
}

View File

@ -1,14 +1,5 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import com.iqser.red.service.redaction.v1.server.redaction.utils.FindEntityDetails;
import org.apache.commons.lang3.tuple.Pair;
import org.springframework.stereotype.Service;
import com.iqser.red.service.persistence.service.v1.api.model.annotations.ManualRedactions;
import com.iqser.red.service.persistence.service.v1.api.model.annotations.Rectangle;
import com.iqser.red.service.persistence.service.v1.api.model.annotations.entitymapped.ManualRedactionEntry;
@ -23,10 +14,17 @@ import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityType;
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
import com.iqser.red.service.redaction.v1.server.redaction.utils.FindEntityDetails;
import com.iqser.red.service.redaction.v1.server.redaction.utils.SearchImplementation;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.tuple.Pair;
import org.springframework.stereotype.Service;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
@Slf4j
@Service
@ -90,10 +88,10 @@ public class ManualRedactionSurroundingTextService {
}
private Pair<String, String> findSurroundingText(SectionText sectionText, String value,
List<Rectangle> toFindPositions) {
private Pair<String, String> findSurroundingText(SectionText sectionText, String value, List<Rectangle> toFindPositions) {
Set<Entity> entities = EntitySearchUtils.find(sectionText.getText(), value,new FindEntityDetails( "dummy", sectionText.getHeadline(), sectionText.getSectionNumber(), false, false, Engine.DICTIONARY, EntityType.ENTITY));
Set<Entity> entities = EntitySearchUtils.find(sectionText.getText(), new SearchImplementation(value, false),
new FindEntityDetails("dummy", sectionText.getHeadline(), sectionText.getSectionNumber(), false, false, Engine.DICTIONARY, EntityType.ENTITY));
Set<Entity> entitiesWithPositions = EntitySearchUtils.clearAndFindPositions(entities, sectionText.getSearchableText(), null);
Entity correctEntity = getEntityOnCorrectPosition(entitiesWithPositions, toFindPositions);

View File

@ -43,8 +43,6 @@ public class RedactionLogMergeService {
throw new NotFoundException("RedactionLog not present");
}
log.info("Loaded redaction log with computationalVersion: {}", redactionLog.getAnalysisVersion());
SectionGrid sectionGrid = redactionStorageService.getSectionGrid(redactionRequest.getDossierId(), redactionRequest.getFileId());
if (sectionGrid.getSections().isEmpty()) {
@ -62,7 +60,7 @@ public class RedactionLogMergeService {
redactionStorageService.storeObject(redactionRequest.getDossierId(), redactionRequest.getFileId(), FileType.SECTION_GRID, sectionGrid);
}
log.info("Loaded redaction log with computationalVersion: {}", redactionLog.getAnalysisVersion());
log.debug("Loaded redaction log with computationalVersion: {}", redactionLog.getAnalysisVersion());
var merged = mergeRedactionLogData(redactionLog, sectionGrid, redactionRequest.getManualRedactions(), redactionRequest.getExcludedPages(), redactionRequest.getTypes(), redactionRequest.getColors());
merged.getRedactionLogEntry().removeIf(e -> e.isFalsePositive() && !redactionRequest.isIncludeFalsePositives());

View File

@ -16,34 +16,20 @@ import java.util.stream.Collectors;
public class EntitySearchUtils {
public boolean sectionContainsAny(String sectionText, Set<DictionaryIncrementValue> values) {
var trie = Trie.builder().ignoreCase().addKeywords(values.stream().map(DictionaryIncrementValue::getValue).collect(Collectors.toList())).build();
return trie.containsMatch(sectionText.toLowerCase(Locale.ROOT));
public boolean sectionContainsAny(String sectionText, SearchImplementation searchImplementation) {
return searchImplementation.atLeastOneMatches(sectionText);
}
public Set<Entity> findEntities(String inputString, List<String> values, DictionaryModel type, FindEntityDetails details) {
public Set<Entity> findEntities(String inputString, SearchImplementation searchImplementation, DictionaryModel type, FindEntityDetails details) {
var builder = Trie.builder()
.addKeywords(values);
if (type.isCaseInsensitive()) {
builder.ignoreCase();
}
return findEntities(inputString, builder.build(), type, details);
}
public Set<Entity> findEntities(String inputString, Trie trie, DictionaryModel type, FindEntityDetails details) {
Set<Entity> found = find(inputString, trie, details);
Set<Entity> found = find(inputString, searchImplementation, details);
if (details.getEntityType() == EntityType.RECOMMENDATION) {
Set<Entity> falseRecommendations = find(inputString, type.getFalseRecommendationsTrie(), details.withEntityType(EntityType.FALSE_RECOMMENDATION));
Set<Entity> falseRecommendations = find(inputString, type.getFalseRecommendationsSearch(), details.withEntityType(EntityType.FALSE_RECOMMENDATION));
removeFalsePositives(found, falseRecommendations);
found.addAll(falseRecommendations);
} else {
Set<Entity> falsePositives = find(inputString, type.getFalsePositivesTrie(), details.withEntityType(EntityType.FALSE_POSITIVE));
Set<Entity> falsePositives = find(inputString, type.getFalsePositiveSearch(), details.withEntityType(EntityType.FALSE_POSITIVE));
removeFalsePositives(found, falsePositives);
found.addAll(falsePositives);
}
@ -51,33 +37,10 @@ public class EntitySearchUtils {
return found;
}
public Set<Entity> find(String inputString, String value, FindEntityDetails findEntityDetails) {
var trie = Trie.builder()
.addKeywords(value).build();
public Set<Entity> find(String inputString, SearchImplementation searchImplementation, FindEntityDetails findEntityDetails) {
Set<Entity> entities = new HashSet<>();
trie.parseText(inputString).forEach(found -> {
var startIndex = found.getStart();
var stopIndex = found.getEnd() + 1;
validateAndAddEntity(entities, findEntityDetails, inputString, startIndex, stopIndex);
});
return entities;
}
public Set<Entity> find(String inputString, Trie trie, FindEntityDetails findEntityDetails) {
Set<Entity> entities = new HashSet<>();
var matches = trie.parseText(inputString);
matches.forEach(match -> {
var startIndex = match.getStart();
var stopIndex = match.getEnd() + 1;
validateAndAddEntity(entities, findEntityDetails, inputString, startIndex, stopIndex);
});
searchImplementation.getMatches(inputString).forEach(match -> validateAndAddEntity(entities, findEntityDetails, inputString, match.getStartIndex(), match.getEndIndex()));
return entities;
}

View File

@ -0,0 +1,92 @@
package com.iqser.red.service.redaction.v1.server.redaction.utils;
import lombok.AllArgsConstructor;
import lombok.Data;
import org.ahocorasick.trie.Trie;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
 * Pre-built substring search over a fixed set of values.
 *
 * <p>The search strategy is chosen once, at construction time:
 * <ul>
 *   <li>no usable values: nothing is built and every query reports no match,</li>
 *   <li>exactly one value: a compiled literal {@link Pattern},</li>
 *   <li>several values: an Aho-Corasick {@link Trie}.</li>
 * </ul>
 *
 * <p>{@code null} and empty values are dropped up front. An empty literal would otherwise
 * compile to a pattern that matches at every position of every text, which is never a
 * meaningful dictionary hit.
 *
 * <p>NOTE(review): Lombok's {@code @Data} also generates setters for the fields below.
 * Calling them after construction does not rebuild {@code pattern}/{@code trie}, so
 * instances should be treated as immutable once created.
 */
@Data
public class SearchImplementation {

    private boolean ignoreCase;
    private List<String> values;
    private Pattern pattern;
    private Trie trie;


    /**
     * Creates a search for a single value.
     *
     * @param value      the literal text to look for; {@code null} or empty yields a search that never matches
     * @param ignoreCase whether matching should ignore letter case
     */
    public SearchImplementation(String value, boolean ignoreCase) {

        this(value == null ? List.<String>of() : List.of(value), ignoreCase);
    }


    /**
     * Creates a search for a collection of values.
     *
     * @param values     the literal texts to look for; {@code null} and empty elements are ignored
     * @param ignoreCase whether matching should ignore letter case
     */
    public SearchImplementation(Collection<String> values, boolean ignoreCase) {

        // Filter here so that both strategies (pattern and trie) see the same, sane input.
        this.values = values.stream()
                .filter(value -> value != null && !value.isEmpty())
                .collect(Collectors.toList());
        this.ignoreCase = ignoreCase;
        this.createSearchImplementation();
    }


    /** Builds either the single-value pattern or the multi-value trie, depending on how many values remain. */
    private void createSearchImplementation() {

        if (this.values.isEmpty()) {
            return;
        }

        if (this.values.size() == 1) {
            // LITERAL replaces Pattern.quote(...). The case flags make the matcher fold case itself,
            // so the searched text does not have to be lower-cased (and copied) on every call and
            // the reported indices always refer to the text exactly as it was passed in.
            int flags = Pattern.LITERAL;
            if (this.ignoreCase) {
                flags |= Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;
            }
            this.pattern = Pattern.compile(this.values.get(0), flags);
        } else {
            var builder = Trie.builder();
            if (this.ignoreCase) {
                builder.ignoreCase();
            }
            builder.addKeywords(this.values);
            this.trie = builder.build();
        }
    }


    /**
     * Checks whether at least one of the configured values occurs in the given text.
     *
     * @param text the text to search in
     * @return {@code true} if any value occurs in {@code text}; {@code false} if none does
     *         or if this search holds no usable values
     */
    public boolean atLeastOneMatches(String text) {

        if (this.values.isEmpty()) {
            return false;
        }

        if (this.pattern != null) {
            return this.pattern.matcher(text).find();
        }
        return this.trie.containsMatch(text);
    }


    /**
     * Finds all occurrences of the configured values in the given text.
     *
     * @param text the text to search in
     * @return the match positions, with an inclusive start index and an exclusive end index
     *         relative to {@code text}; an empty list if nothing matches
     */
    public List<MatchPosition> getMatches(String text) {

        if (this.values.isEmpty()) {
            return new ArrayList<>();
        }

        if (this.pattern != null) {
            return this.pattern.matcher(text)
                    .results()
                    .map(result -> new MatchPosition(result.start(), result.end()))
                    .collect(Collectors.toList());
        }

        // The trie result is shifted by one so that both strategies report an exclusive end index.
        return this.trie.parseText(text)
                .stream()
                .map(emit -> new MatchPosition(emit.getStart(), emit.getEnd() + 1))
                .collect(Collectors.toList());
    }


    /** Position of a single match: {@code startIndex} is inclusive, {@code endIndex} is exclusive. */
    @Data
    @AllArgsConstructor
    public static class MatchPosition {

        private int startIndex;
        private int endIndex;

    }

}

View File

@ -5,21 +5,21 @@ import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule;
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType;
import com.iqser.red.service.redaction.v1.model.AnalyzeRequest;
import com.iqser.red.service.redaction.v1.model.MessageType;
import com.iqser.red.service.redaction.v1.server.classification.model.SectionText;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.junit.Ignore;
import org.junit.Test;
import org.springframework.core.io.ClassPathResource;
import java.io.FileOutputStream;
import java.time.OffsetDateTime;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
@Slf4j
public class AnalyseFileRealDataIntegrationTest extends LiveDataIntegrationTest {
@Test
@SneakyThrows
public void testFile() {
@ -28,16 +28,17 @@ public class AnalyseFileRealDataIntegrationTest extends LiveDataIntegrationTest
om.registerModule(new JavaTimeModule());
var file = new ClassPathResource(BASE_DIR + "data/test-file.pdf").getInputStream();
var nerData = new ClassPathResource(BASE_DIR + "data/test-file.ner.json").getInputStream();
var text = new ClassPathResource(BASE_DIR + "data/test-file.text.json").getInputStream();
var sectionText = new ClassPathResource(BASE_DIR + "data/test-file.section-grid.json").getInputStream();
var redactionLog = new ClassPathResource(BASE_DIR + "data/test-file.redaction-log.json").getInputStream();
redactionStorageService.storeObject("dossierId", "fileId", FileType.ORIGIN, file);
redactionStorageService.storeObject("dossierId", "fileId", FileType.NER_ENTITIES, nerData);
redactionStorageService.storeObject("dossierId", "fileId", FileType.TEXT, text);
redactionStorageService.storeObject("dossierId", "fileId", FileType.SECTION_GRID, sectionText);
redactionStorageService.storeObject("dossierId", "fileId", FileType.REDACTION_LOG, redactionLog);
try {
var nerData = new ClassPathResource(BASE_DIR + "data/test-file.ner.json").getInputStream();
redactionStorageService.storeObject("dossierId", "fileId", FileType.NER_ENTITIES, nerData);
} catch (Exception e) {
log.warn("No NER File Provided");
redactionServiceSettings.setNerServiceEnabled(false);
}
AnalyzeRequest ar = AnalyzeRequest.builder()
.fileId("fileId")
@ -47,49 +48,38 @@ public class AnalyseFileRealDataIntegrationTest extends LiveDataIntegrationTest
.lastProcessed(OffsetDateTime.now())
.excludedPages(Set.of())
.fileAttributes(List.of())
.messageType(MessageType.STRUCTURE_ANALYSE)
.build();
String in = om.writeValueAsString(ar);
// redactionMessageReceiver.receiveAnalyzeRequest(in, false);
// log.warn("done structure");
var txt = redactionStorageService.getText("dossierId", "fileId");
try {
var text = new ClassPathResource(BASE_DIR + "data/test-file.text.json").getInputStream();
var sectionText = new ClassPathResource(BASE_DIR + "data/test-file.section-grid.json").getInputStream();
redactionStorageService.storeObject("dossierId", "fileId", FileType.TEXT, text);
redactionStorageService.storeObject("dossierId", "fileId", FileType.SECTION_GRID, sectionText);
} catch (Exception e) {
log.info("No text file provided, Performing Structure analysis");
var totalText = txt.getSectionTexts().stream().map(SectionText::getText).collect(Collectors.joining("\n"));
System.out.println(totalText.length());
ar.setMessageType(MessageType.STRUCTURE_ANALYSE);
String in = om.writeValueAsString(ar);
redactionMessageReceiver.receiveAnalyzeRequest(in, false);
}
try {
var redactionLog = new ClassPathResource(BASE_DIR + "data/test-file.redaction-log.json").getInputStream();
} catch (Exception e) {
log.info("No redaction log provided, Performing full analysis");
ar.setMessageType(MessageType.ANALYSE);
String in = om.writeValueAsString(ar);
redactionMessageReceiver.receiveAnalyzeRequest(in, false);
}
// ar.setMessageType(MessageType.ANALYSE);
// in = om.writeValueAsString(ar);
// redactionMessageReceiver.receiveAnalyzeRequest(in, false);
// log.warn("done analyze");
simulateIncrement(List.of("study"),"PII",3L);
simulateIncrement(List.of("type"), "PII", 3L);
ar.setMessageType(MessageType.REANALYSE);
in = om.writeValueAsString(ar);
String in = om.writeValueAsString(ar);
redactionMessageReceiver.receiveAnalyzeRequest(in, false);
log.warn("done analyze");
var log = redactionStorageService.getRedactionLog("dossierId", "fileId");
om.writeValue(new FileOutputStream("/tmp/test-file.redaction-log.json"), log);
System.out.println(log.getRedactionLogEntry().size());
}
// public static long hash(char[]){
// return ((value % prime) + prime) % prime;
// }
// public static long getBiggerPrime() {
// BigInteger prime = BigInteger.probablePrime(getNumberOfBits(10) + 1, new Random());
// return prime.longValue();
// }
// private static int getNumberOfBits(int number) {
// return Integer.SIZE - Integer.numberOfLeadingZeros(number);
// }
}

View File

@ -12,6 +12,7 @@ import com.iqser.red.service.redaction.v1.server.FileSystemBackedStorageService;
import com.iqser.red.service.redaction.v1.server.client.*;
import com.iqser.red.service.redaction.v1.server.queue.RedactionMessageReceiver;
import com.iqser.red.service.redaction.v1.server.redaction.service.DictionaryService;
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
import com.iqser.red.storage.commons.StorageAutoConfiguration;
import com.iqser.red.storage.commons.service.StorageService;
@ -81,6 +82,9 @@ public class LiveDataIntegrationTest {
@Autowired
protected FileSystemBackedStorageService fileSystemBackedStorageService;
@Autowired
protected RedactionServiceSettings redactionServiceSettings;
private List<Type> types;
@Configuration