diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/AnalyzeService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/AnalyzeService.java index f8cfe19a..8ea760c9 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/AnalyzeService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/AnalyzeService.java @@ -50,6 +50,7 @@ import com.iqser.red.service.redaction.v1.server.redaction.model.Image; import com.iqser.red.service.redaction.v1.server.redaction.model.PageEntities; import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage; import com.iqser.red.service.redaction.v1.server.redaction.model.RedRectangle2D; +import com.iqser.red.service.redaction.v1.server.redaction.service.entityredaction.EntityRedactionService; import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils; import com.iqser.red.service.redaction.v1.server.redaction.utils.SearchImplementation; import com.iqser.red.service.redaction.v1.server.segmentation.ImageService; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/entityredaction/EntityFinder.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/entityredaction/EntityFinder.java new file mode 100644 index 00000000..869aff6d --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/entityredaction/EntityFinder.java @@ -0,0 +1,127 @@ +package com.iqser.red.service.redaction.v1.server.redaction.service.entityredaction; + +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +import org.apache.commons.lang3.StringUtils; +import org.springframework.stereotype.Component; + +import com.iqser.red.service.persistence.service.v1.api.model.annotations.ManualRedactions; +import com.iqser.red.service.redaction.v1.model.Engine; +import com.iqser.red.service.redaction.v1.server.client.model.NerEntities; +import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary; +import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryModel; +import com.iqser.red.service.redaction.v1.server.redaction.model.Entities; +import com.iqser.red.service.redaction.v1.server.redaction.model.Entity; +import com.iqser.red.service.redaction.v1.server.redaction.model.EntityType; +import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText; +import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils; +import com.iqser.red.service.redaction.v1.server.redaction.utils.FindEntityDetails; +import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings; + +import io.micrometer.core.annotation.Timed; +import lombok.AccessLevel; +import lombok.RequiredArgsConstructor; +import lombok.experimental.FieldDefaults; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +@Component +@RequiredArgsConstructor +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) +class EntityFinder { + + RedactionServiceSettings redactionServiceSettings; + + + @Timed("redactmanager_findEntities") + public Entities findEntities(SearchableText searchableText, + String headline, + int sectionNumber, + Dictionary dictionary, + boolean local, + NerEntities nerEntities, + List cellStarts, + ManualRedactions manualRedactions) { + + Set found = new HashSet<>(); + String searchableString = searchableText.asString(); + + if (StringUtils.isEmpty(searchableString)) { + return new Entities(new HashSet<>(), new HashSet<>()); + } + + String lowercaseInputString = searchableString.toLowerCase(); + for (DictionaryModel model : dictionary.getDictionaryModels()) { + + var searchImplementation = local ? model.getLocalSearch() : model.getEntriesSearch(); + var entities = EntitySearchUtils.findEntities(model.isCaseInsensitive() ? lowercaseInputString : searchableString, + searchImplementation, + model, + new FindEntityDetails(model.getType(), + headline, + sectionNumber, + !local, + model.isDossierDictionary(), + local ? Engine.RULE : Engine.DICTIONARY, + local ? EntityType.RECOMMENDATION : EntityType.ENTITY)); + + EntitySearchUtils.addOrAddEngine(found, entities); + } + + Set nerFound = new HashSet<>(); + if (!local) { + nerFound.addAll(getNerValues(sectionNumber, nerEntities, cellStarts, headline)); + } + + var cleared = EntitySearchUtils.clearAndFindPositions(found, searchableText, dictionary, manualRedactions); + return new Entities(cleared.stream().filter(e -> !e.isFalsePositive()).collect(Collectors.toSet()), nerFound); + } + + + private Set getNerValues(int sectionNumber, NerEntities nerEntities, List cellStarts, String headline) { + + Set entities = new HashSet<>(); + + if (redactionServiceSettings.isNerServiceEnabled() && nerEntities.getData().containsKey(sectionNumber)) { + nerEntities.getData().get(sectionNumber).forEach(res -> { + if (cellStarts == null || cellStarts.isEmpty()) { + entities.add(new Entity(res.getValue(), + res.getType(), + res.getStartOffset(), + res.getEndOffset(), + headline, + sectionNumber, + false, + false, + Engine.NER, + EntityType.RECOMMENDATION)); + } else { + boolean intersectsCellStart = false; + for (Integer cellStart : cellStarts) { + if (res.getStartOffset() < cellStart && cellStart < res.getEndOffset()) { + intersectsCellStart = true; + break; + } + } + if (!intersectsCellStart) { + entities.add(new Entity(res.getValue(), + res.getType(), + res.getStartOffset(), + res.getEndOffset(), + headline, + sectionNumber, + false, + false, + Engine.NER, + EntityType.RECOMMENDATION)); + } + } + }); + } + return entities; + } + +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/entityredaction/EntityRedactionService.java similarity index 62% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/entityredaction/EntityRedactionService.java index 55a6f1a2..e169909b 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/entityredaction/EntityRedactionService.java @@ -1,41 +1,52 @@ -package com.iqser.red.service.redaction.v1.server.redaction.service; +package com.iqser.red.service.redaction.v1.server.redaction.service.entityredaction; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import org.kie.api.runtime.KieContainer; +import org.springframework.stereotype.Service; import com.iqser.red.service.persistence.service.v1.api.model.annotations.AnnotationStatus; -import com.iqser.red.service.persistence.service.v1.api.model.annotations.ManualRedactions; import com.iqser.red.service.persistence.service.v1.api.model.annotations.entitymapped.IdRemoval; import com.iqser.red.service.persistence.service.v1.api.model.annotations.entitymapped.ManualImageRecategorization; import com.iqser.red.service.redaction.v1.model.AnalyzeRequest; -import com.iqser.red.service.redaction.v1.model.Engine; import com.iqser.red.service.redaction.v1.model.FileAttribute; import com.iqser.red.service.redaction.v1.server.classification.model.SectionText; import com.iqser.red.service.redaction.v1.server.client.model.NerEntities; import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary; -import com.iqser.red.service.redaction.v1.server.redaction.model.*; +import com.iqser.red.service.redaction.v1.server.redaction.model.Entities; +import com.iqser.red.service.redaction.v1.server.redaction.model.Entity; +import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence; +import com.iqser.red.service.redaction.v1.server.redaction.model.FindEntitiesResult; +import com.iqser.red.service.redaction.v1.server.redaction.model.Image; +import com.iqser.red.service.redaction.v1.server.redaction.model.PageEntities; +import com.iqser.red.service.redaction.v1.server.redaction.model.Section; +import com.iqser.red.service.redaction.v1.server.redaction.model.SectionSearchableTextPair; +import com.iqser.red.service.redaction.v1.server.redaction.service.DroolsExecutionService; +import com.iqser.red.service.redaction.v1.server.redaction.service.SurroundingWordsService; import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils; -import com.iqser.red.service.redaction.v1.server.redaction.utils.FindEntityDetails; import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder; -import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings; -import io.micrometer.core.annotation.Timed; +import lombok.AccessLevel; import lombok.RequiredArgsConstructor; +import lombok.experimental.FieldDefaults; import lombok.extern.slf4j.Slf4j; -import org.apache.commons.lang3.StringUtils; -import org.kie.api.runtime.KieContainer; -import org.springframework.stereotype.Service; - -import java.util.*; -import java.util.stream.Collectors; -import java.util.stream.Stream; - @Slf4j @Service @RequiredArgsConstructor +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) public class EntityRedactionService { - private final RedactionServiceSettings redactionServiceSettings; - private final DroolsExecutionService droolsExecutionService; - private final SurroundingWordsService surroundingWordsService; + DroolsExecutionService droolsExecutionService; + SurroundingWordsService surroundingWordsService; + EntityFinder entityFinder; public PageEntities findEntities(Dictionary dictionary, List sectionTexts, KieContainer kieContainer, AnalyzeRequest analyzeRequest, NerEntities nerEntities) { @@ -45,7 +56,7 @@ public class EntityRedactionService { if (dictionary.hasLocalEntries() || !findEntitiesResult.getAddedFileAttributes().isEmpty()) { - if(!findEntitiesResult.getAddedFileAttributes().isEmpty()) { + if (!findEntitiesResult.getAddedFileAttributes().isEmpty()) { //AnalyzeRequest provides immutable list. List mergedFileAttributes = new ArrayList<>(); mergedFileAttributes.addAll(analyzeRequest.getFileAttributes()); @@ -54,7 +65,14 @@ public class EntityRedactionService { } Map> hintsPerSectionNumber = getHintsPerSection(findEntitiesResult.getEntities(), dictionary); - FindEntitiesResult foundByLocalEntitiesResult = findEntities(sectionTexts, dictionary, kieContainer, analyzeRequest, true, hintsPerSectionNumber, imagesPerPage, nerEntities); + FindEntitiesResult foundByLocalEntitiesResult = findEntities(sectionTexts, + dictionary, + kieContainer, + analyzeRequest, + true, + hintsPerSectionNumber, + imagesPerPage, + nerEntities); EntitySearchUtils.addEntitiesWithHigherRank(findEntitiesResult.getEntities(), foundByLocalEntitiesResult.getEntities(), dictionary); EntitySearchUtils.removeEntitiesContainedInLarger(findEntitiesResult.getEntities()); } @@ -67,18 +85,78 @@ public class EntityRedactionService { public FindEntitiesResult findEntities(List reanalysisSections, - Dictionary dictionary, - KieContainer kieContainer, - AnalyzeRequest analyzeRequest, - boolean local, - Map> hintsPerSectionNumber, - Map> imagesPerPage, - NerEntities nerEntities) { + Dictionary dictionary, + KieContainer kieContainer, + AnalyzeRequest analyzeRequest, + boolean local, + Map> hintsPerSectionNumber, + Map> imagesPerPage, + NerEntities nerEntities) { - List sectionSearchableTextPairs = new ArrayList<>(); - for (SectionText reanalysisSection : reanalysisSections) { + List sectionSearchableTextPairs = extractSearchableTextPairs(reanalysisSections, + dictionary, + analyzeRequest, + local, + hintsPerSectionNumber, + nerEntities); - Entities entities = findEntities(reanalysisSection.getSearchableText(), + Set addedFileAttributes = new HashSet<>(); + Set entities = new HashSet<>(); + sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> { + + if (!addedFileAttributes.isEmpty()) { + //Section.Builder provides immutable list. + List mergedFileAttributes = new ArrayList<>(); + mergedFileAttributes.addAll(sectionSearchableTextPair.getSection().getAddedFileAttributes()); + mergedFileAttributes.addAll(addedFileAttributes); + sectionSearchableTextPair.getSection().setFileAttributes(mergedFileAttributes); + } + + Section analysedSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair.getSection()); + + addedFileAttributes.addAll(analysedSection.getAddedFileAttributes()); + + EntitySearchUtils.removeEntitiesContainedInLarger(analysedSection.getEntities()); + + var entriesWithoutSurroundingText = analysedSection.getEntities() + .stream() + .filter(e -> e.getTextAfter() == null && e.getTextBefore() == null) + .collect(Collectors.toSet()); + + if (sectionSearchableTextPair.getCellStarts() != null && !sectionSearchableTextPair.getCellStarts().isEmpty()) { + surroundingWordsService.addSurroundingText(entriesWithoutSurroundingText, + sectionSearchableTextPair.getSearchableText(), + dictionary, + sectionSearchableTextPair.getCellStarts()); + } else { + surroundingWordsService.addSurroundingText(entriesWithoutSurroundingText, sectionSearchableTextPair.getSearchableText(), dictionary); + } + + entities.addAll(analysedSection.getEntities()); + + if (!local) { + for (Image image : analysedSection.getImages()) { + imagesPerPage.computeIfAbsent(image.getPage(), (a) -> new HashSet<>()).add(image); + } + addLocalValuesToDictionary(analysedSection, dictionary); + } + + }); + + return FindEntitiesResult.builder().entities(entities).addedFileAttributes(addedFileAttributes).build(); + } + + + private List extractSearchableTextPairs(List reanalysisSections, + Dictionary dictionary, + AnalyzeRequest analyzeRequest, + boolean local, + Map> hintsPerSectionNumber, + NerEntities nerEntities) { + + return reanalysisSections.stream().map(reanalysisSection -> { + + Entities entities = entityFinder.findEntities(reanalysisSection.getSearchableText(), reanalysisSection.getHeadline(), reanalysisSection.getSectionNumber(), dictionary, @@ -136,73 +214,36 @@ public class EntityRedactionService { log.debug("Section {}, Images: {}", reanalysisSection.getSectionNumber(), reanalysisSection.getImages()); - sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder() - .isLocal(false) - .dictionaryTypes(dictionary.getTypes()) - .entities(hintsPerSectionNumber != null && hintsPerSectionNumber.containsKey(reanalysisSection.getSectionNumber()) ? Stream.concat(entities.getEntities() - .stream(), hintsPerSectionNumber.get(reanalysisSection.getSectionNumber()).stream()).collect(Collectors.toSet()) : entities.getEntities()) - .nerEntities(entities.getNerEntities()) - .text(reanalysisSection.getSearchableText().getAsStringWithLinebreaks()) - .searchText(reanalysisSection.getSearchableText().toString()) - .headline(reanalysisSection.getHeadline()) - .sectionNumber(reanalysisSection.getSectionNumber()) - .tabularData(reanalysisSection.getTabularData()) - .searchableText(reanalysisSection.getSearchableText()) - .dictionary(dictionary) - .images(reanalysisSection.getImages()) - .sectionAreas(reanalysisSection.getSectionAreas()) - .fileAttributes(analyzeRequest.getFileAttributes()) - .manualRedactions(analyzeRequest.getManualRedactions()) - .isInTable(reanalysisSection.isTable()) - .build(), reanalysisSection.getSearchableText(), reanalysisSection.getCellStarts())); - - } + return toSectionSearchableTextPair(dictionary, analyzeRequest, hintsPerSectionNumber, reanalysisSection, entities); + }).collect(Collectors.toList()); + } - Set addedFileAttributes = new HashSet<>(); - Set entities = new HashSet<>(); - sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> { + private SectionSearchableTextPair toSectionSearchableTextPair(Dictionary dictionary, + AnalyzeRequest analyzeRequest, + Map> hintsPerSectionNumber, + SectionText reanalysisSection, + Entities entities) { - if(!addedFileAttributes.isEmpty()) { - //Section.Builder provides immutable list. - List mergedFileAttributes = new ArrayList<>(); - mergedFileAttributes.addAll(sectionSearchableTextPair.getSection().getAddedFileAttributes()); - mergedFileAttributes.addAll(addedFileAttributes); - sectionSearchableTextPair.getSection().setFileAttributes(mergedFileAttributes); - } - - Section analysedSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair.getSection()); - - addedFileAttributes.addAll(analysedSection.getAddedFileAttributes()); - - EntitySearchUtils.removeEntitiesContainedInLarger(analysedSection.getEntities()); - - var entriesWithoutSurroundingText = analysedSection.getEntities() - .stream() - .filter(e -> e.getTextAfter() == null && e.getTextBefore() == null) - .collect(Collectors.toSet()); - - if (sectionSearchableTextPair.getCellStarts() != null && !sectionSearchableTextPair.getCellStarts().isEmpty()) { - surroundingWordsService.addSurroundingText(entriesWithoutSurroundingText, - sectionSearchableTextPair.getSearchableText(), - dictionary, - sectionSearchableTextPair.getCellStarts()); - } else { - surroundingWordsService.addSurroundingText(entriesWithoutSurroundingText, sectionSearchableTextPair.getSearchableText(), dictionary); - } - - entities.addAll(analysedSection.getEntities()); - - if (!local) { - for (Image image : analysedSection.getImages()) { - imagesPerPage.computeIfAbsent(image.getPage(), (a) -> new HashSet<>()).add(image); - } - addLocalValuesToDictionary(analysedSection, dictionary); - } - - }); - - return FindEntitiesResult.builder().entities(entities).addedFileAttributes(addedFileAttributes).build(); + return new SectionSearchableTextPair(Section.builder() + .isLocal(false) + .dictionaryTypes(dictionary.getTypes()) + .entities(hintsPerSectionNumber != null && hintsPerSectionNumber.containsKey(reanalysisSection.getSectionNumber()) ? Stream.concat(entities.getEntities().stream(), + hintsPerSectionNumber.get(reanalysisSection.getSectionNumber()).stream()).collect(Collectors.toSet()) : entities.getEntities()) + .nerEntities(entities.getNerEntities()) + .text(reanalysisSection.getSearchableText().getAsStringWithLinebreaks()) + .searchText(reanalysisSection.getSearchableText().toString()) + .headline(reanalysisSection.getHeadline()) + .sectionNumber(reanalysisSection.getSectionNumber()) + .tabularData(reanalysisSection.getTabularData()) + .searchableText(reanalysisSection.getSearchableText()) + .dictionary(dictionary) + .images(reanalysisSection.getImages()) + .sectionAreas(reanalysisSection.getSectionAreas()) + .fileAttributes(analyzeRequest.getFileAttributes()) + .manualRedactions(analyzeRequest.getManualRedactions()) + .isInTable(reanalysisSection.isTable()) + .build(), reanalysisSection.getSearchableText(), reanalysisSection.getCellStarts()); } @@ -244,7 +285,7 @@ public class EntityRedactionService { private Map> getHintsPerSection(Set entities, Dictionary dictionary) { Map> hintsPerSectionNumber = new HashMap<>(); - entities.stream().forEach(entity -> { + entities.forEach(entity -> { if (dictionary.isHint(entity.getType()) && entity.isDictionaryEntry()) { hintsPerSectionNumber.computeIfAbsent(entity.getSectionNumber(), (x) -> new HashSet<>()).add(entity); } @@ -269,93 +310,4 @@ public class EntityRedactionService { })); } - - @Timed("redactmanager_findEntities") - private Entities findEntities(SearchableText searchableText, - String headline, - int sectionNumber, - Dictionary dictionary, - boolean local, - NerEntities nerEntities, - List cellStarts, - ManualRedactions manualRedactions) { - - Set found = new HashSet<>(); - String searchableString = searchableText.asString(); - - if (StringUtils.isEmpty(searchableString)) { - return new Entities(new HashSet<>(), new HashSet<>()); - } - - String lowercaseInputString = searchableString.toLowerCase(); - for (DictionaryModel model : dictionary.getDictionaryModels()) { - - var searchImplementation = local ? model.getLocalSearch() : model.getEntriesSearch(); - var entities = EntitySearchUtils.findEntities(model.isCaseInsensitive() ? lowercaseInputString : searchableString, - searchImplementation, - model, - new FindEntityDetails(model.getType(), - headline, - sectionNumber, - !local, - model.isDossierDictionary(), - local ? Engine.RULE : Engine.DICTIONARY, - local ? EntityType.RECOMMENDATION : EntityType.ENTITY)); - - EntitySearchUtils.addOrAddEngine(found, entities); - } - - Set nerFound = new HashSet<>(); - if (!local) { - nerFound.addAll(getNerValues(sectionNumber, nerEntities, cellStarts, headline)); - } - - var cleared = EntitySearchUtils.clearAndFindPositions(found, searchableText, dictionary, manualRedactions); - return new Entities(cleared.stream().filter(e -> !e.isFalsePositive()).collect(Collectors.toSet()), nerFound); - } - - - private Set getNerValues(int sectionNumber, NerEntities nerEntities, List cellStarts, String headline) { - - Set entities = new HashSet<>(); - - if (redactionServiceSettings.isNerServiceEnabled() && nerEntities.getData().containsKey(sectionNumber)) { - nerEntities.getData().get(sectionNumber).forEach(res -> { - if (cellStarts == null || cellStarts.isEmpty()) { - entities.add(new Entity(res.getValue(), - res.getType(), - res.getStartOffset(), - res.getEndOffset(), - headline, - sectionNumber, - false, - false, - Engine.NER, - EntityType.RECOMMENDATION)); - } else { - boolean intersectsCellStart = false; - for (Integer cellStart : cellStarts) { - if (res.getStartOffset() < cellStart && cellStart < res.getEndOffset()) { - intersectsCellStart = true; - break; - } - } - if (!intersectsCellStart) { - entities.add(new Entity(res.getValue(), - res.getType(), - res.getStartOffset(), - res.getEndOffset(), - headline, - sectionNumber, - false, - false, - Engine.NER, - EntityType.RECOMMENDATION)); - } - } - }); - } - return entities; - } - }