Pull request #219: RED-1970: Call entity-redaction-service once per document

Merge in RED/redaction-service from RED-1970-2 to master

* commit 'f1b3d129ee32d40a78fad038a69027a24d8ccdd8':
  RED-1970: Call entity-redaction-service once per document
This commit is contained in:
Dominique Eiflaender 2021-09-07 09:46:41 +02:00
commit a388a28cec
9 changed files with 111 additions and 36 deletions

View File

@ -24,7 +24,7 @@
<dependency>
<groupId>com.iqser.red.service</groupId>
<artifactId>file-management-service-api-v1</artifactId>
<version>2.25.0</version>
<version>2.96.0</version>
<exclusions>
<exclusion>
<groupId>com.iqser.red.service</groupId>

View File

@ -9,10 +9,11 @@ import org.springframework.web.bind.annotation.PostMapping;
import com.iqser.red.service.redaction.v1.server.client.model.EntityRecogintionEntity;
import com.iqser.red.service.redaction.v1.server.client.model.EntityRecognitionRequest;
import com.iqser.red.service.redaction.v1.server.client.model.NerEntities;
@FeignClient(name = "EntityRecognitionClient", url = "${entity-recognition-service.url}")
public interface EntityRecognitionClient {
@PostMapping(value = "/find_authors", produces = MediaType.APPLICATION_JSON_VALUE, consumes = MediaType.APPLICATION_JSON_VALUE)
Map<String, Map<String, List<EntityRecogintionEntity>>> findAuthors(EntityRecognitionRequest entityRecognitionRequest);
NerEntities findAuthors(EntityRecognitionRequest entityRecognitionRequest);
}

View File

@ -13,9 +13,9 @@ import lombok.NoArgsConstructor;
@Builder
@AllArgsConstructor
@NoArgsConstructor
public class EntityRecognitionResponse {
public class NerEntities {
@Builder.Default
private Map<String, List<EntityRecogintionEntity>> result = new HashMap<>();
private Map<Integer, List<EntityRecogintionEntity>> result = new HashMap<>();
}

View File

@ -7,6 +7,7 @@ import com.iqser.red.service.redaction.v1.model.AnalyzeResult;
import com.iqser.red.service.redaction.v1.model.StructureAnalyzeRequest;
import com.iqser.red.service.redaction.v1.server.client.FileStatusProcessingUpdateClient;
import com.iqser.red.service.redaction.v1.server.redaction.service.AnalyzeService;
import com.iqser.red.service.redaction.v1.server.redaction.service.NerAnalyserService;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@ -26,6 +27,7 @@ public class RedactionMessageReceiver {
private final ObjectMapper objectMapper;
private final AnalyzeService analyzeService;
private final FileStatusProcessingUpdateClient fileStatusProcessingUpdateClient;
private final NerAnalyserService nerAnalyserService;
@RabbitHandler
@ -43,6 +45,9 @@ public class RedactionMessageReceiver {
// TODO Separate structure analysis by other queue
analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(analyzeRequest.getDossierId(), analyzeRequest.getFileId()));
// TODO NerEntities should be computed and stored in entity-recognition-service, should be triggered by a separate queue after structure analysis
nerAnalyserService.computeNerEntities(analyzeRequest.getDossierId(), analyzeRequest.getFileId());
result = analyzeService.analyze(analyzeRequest);
log.info("Successfully analyzed dossier {} file {} took: {}", analyzeRequest.getDossierId(), analyzeRequest.getFileId(), result
.getDuration());

View File

@ -60,6 +60,7 @@ public class AnalyzeService {
private final RedactionServiceSettings redactionServiceSettings;
private final SectionTextBuilderService sectionTextBuilderService;
private final SectionGridCreatorService sectionGridCreatorService;
private final NerAnalyserService nerAnalyserService;
public void analyzeDocumentStructure(StructureAnalyzeRequest analyzeRequest) {
@ -81,7 +82,8 @@ public class AnalyzeService {
List<SectionText> sectionTexts = sectionTextBuilderService.buildSectionText(classifiedDoc);
sectionGridCreatorService.createSectionGrid(classifiedDoc, pageCount);
redactionStorageService.storeObject(analyzeRequest.getDossierId(), analyzeRequest.getFileId(), FileType.TEXT, new Text(pageCount, sectionTexts));
Text text = new Text(pageCount, sectionTexts);
redactionStorageService.storeObject(analyzeRequest.getDossierId(), analyzeRequest.getFileId(), FileType.TEXT, text);
redactionStorageService.storeObject(analyzeRequest.getDossierId(), analyzeRequest.getFileId(), FileType.SECTION_GRID, classifiedDoc
.getSectionGrid());
@ -94,6 +96,11 @@ public class AnalyzeService {
long startTime = System.currentTimeMillis();
var text = redactionStorageService.getText(analyzeRequest.getDossierId(), analyzeRequest.getFileId());
var nerEntities = redactionStorageService.getNerEntities(analyzeRequest.getDossierId(), analyzeRequest.getFileId());
if(redactionServiceSettings.isEnableEntityRecognition() && nerEntities == null){
nerAnalyserService.computeNerEntities(analyzeRequest.getDossierId(), analyzeRequest.getFileId());
nerEntities = redactionStorageService.getNerEntities(analyzeRequest.getDossierId(), analyzeRequest.getFileId());
}
dictionaryService.updateDictionary(analyzeRequest.getDossierTemplateId(), analyzeRequest.getDossierId());
KieContainer kieContainer = droolsExecutionService.updateRules(analyzeRequest.getDossierTemplateId());
@ -101,7 +108,7 @@ public class AnalyzeService {
Dictionary dictionary = dictionaryService.getDeepCopyDictionary(analyzeRequest.getDossierTemplateId(), analyzeRequest
.getDossierId());
PageEntities pageEntities = entityRedactionService.findEntities(dictionary, text.getSectionTexts(), kieContainer, analyzeRequest);
PageEntities pageEntities = entityRedactionService.findEntities(dictionary, text.getSectionTexts(), kieContainer, analyzeRequest, nerEntities);
dictionaryService.updateExternalDictionary(dictionary, analyzeRequest.getDossierTemplateId());
@ -141,6 +148,12 @@ public class AnalyzeService {
return finalizeAnalysis(analyzeRequest, startTime, redactionLog, text, dictionaryIncrement.getDictionaryVersion(), true);
}
var nerEntities = redactionStorageService.getNerEntities(analyzeRequest.getDossierId(), analyzeRequest.getFileId());
if(redactionServiceSettings.isEnableEntityRecognition() && nerEntities == null){
nerAnalyserService.computeNerEntities(analyzeRequest.getDossierId(), analyzeRequest.getFileId());
nerEntities = redactionStorageService.getNerEntities(analyzeRequest.getDossierId(), analyzeRequest.getFileId());
}
List<SectionText> reanalysisSections = text.getSectionTexts()
.stream()
.filter(sectionText -> sectionsToReanalyse.contains(sectionText.getSectionNumber()))
@ -151,7 +164,7 @@ public class AnalyzeService {
Dictionary dictionary = dictionaryService.getDeepCopyDictionary(analyzeRequest.getDossierTemplateId(), analyzeRequest
.getDossierId());
PageEntities pageEntities = entityRedactionService.findEntities(dictionary, reanalysisSections, kieContainer, analyzeRequest);
PageEntities pageEntities = entityRedactionService.findEntities(dictionary, reanalysisSections, kieContainer, analyzeRequest, nerEntities);
var newRedactionLogEntries = redactionLogCreatorService.createRedactionLog(pageEntities, text.getNumberOfPages(), analyzeRequest
.getDossierTemplateId());

View File

@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
@ -18,9 +19,8 @@ import com.iqser.red.service.redaction.v1.model.ManualImageRecategorization;
import com.iqser.red.service.redaction.v1.model.Status;
import com.iqser.red.service.redaction.v1.server.classification.model.SectionText;
import com.iqser.red.service.redaction.v1.server.client.EntityRecognitionClient;
import com.iqser.red.service.redaction.v1.server.client.model.EntityRecogintionEntity;
import com.iqser.red.service.redaction.v1.server.client.model.EntityRecognitionRequest;
import com.iqser.red.service.redaction.v1.server.client.model.EntityRecognitionResponse;
import com.iqser.red.service.redaction.v1.server.client.model.NerEntities;
import com.iqser.red.service.redaction.v1.server.client.model.EntityRecognitionSection;
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryModel;
@ -50,14 +50,14 @@ public class EntityRedactionService {
public PageEntities findEntities(Dictionary dictionary, List<SectionText> sectionTexts, KieContainer kieContainer,
AnalyzeRequest analyzeRequest) {
AnalyzeRequest analyzeRequest, NerEntities nerEntities) {
Map<Integer, Set<Image>> imagesPerPage = new HashMap<>();
Set<Entity> entities = findEntities(sectionTexts, dictionary, kieContainer, analyzeRequest, false, null, imagesPerPage);
Set<Entity> entities = findEntities(sectionTexts, dictionary, kieContainer, analyzeRequest, false, null, imagesPerPage, nerEntities);
if (dictionary.hasLocalEntries()) {
Map<Integer, Set<Entity>> hintsPerSectionNumber = getHintsPerSection(entities, dictionary);
Set<Entity> foundByLocal = findEntities(sectionTexts, dictionary, kieContainer, analyzeRequest, true, hintsPerSectionNumber, imagesPerPage);
Set<Entity> foundByLocal = findEntities(sectionTexts, dictionary, kieContainer, analyzeRequest, true, hintsPerSectionNumber, imagesPerPage, nerEntities);
EntitySearchUtils.addEntitiesWithHigherRank(entities, foundByLocal, dictionary);
EntitySearchUtils.removeEntitiesContainedInLarger(entities);
}
@ -70,13 +70,13 @@ public class EntityRedactionService {
public Set<Entity> findEntities(List<SectionText> reanalysisSections, Dictionary dictionary,
KieContainer kieContainer, AnalyzeRequest analyzeRequest, boolean local,
Map<Integer, Set<Entity>> hintsPerSectionNumber,
Map<Integer, Set<Image>> imagesPerPage) {
Map<Integer, Set<Image>> imagesPerPage, NerEntities nerEntities) {
List<SectionSearchableTextPair> sectionSearchableTextPairs = new ArrayList<>();
for (SectionText reanalysisSection : reanalysisSections) {
Set<Entity> entities = findEntities(reanalysisSection.getSearchableText(), reanalysisSection.getHeadline(), reanalysisSection
.getSectionNumber(), dictionary, local);
.getSectionNumber(), dictionary, local, nerEntities);
if (reanalysisSection.getCellStarts() != null && !reanalysisSection.getCellStarts().isEmpty()) {
surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary, reanalysisSection
.getCellStarts());
@ -124,7 +124,7 @@ public class EntityRedactionService {
EntitySearchUtils.removeEntitiesContainedInLarger(analysedSection.getEntities());
entities.addAll(analysedSection.getEntities());
if(!local) {
if (!local) {
for (Image image : analysedSection.getImages()) {
imagesPerPage.computeIfAbsent(image.getPage(), (a) -> new HashSet<>()).add(image);
}
@ -198,7 +198,7 @@ public class EntityRedactionService {
private Set<Entity> findEntities(SearchableText searchableText, String headline, int sectionNumber,
Dictionary dictionary, boolean local) {
Dictionary dictionary, boolean local, NerEntities nerEntities) {
Set<Entity> found = new HashSet<>();
String searchableString = searchableText.toString();
@ -217,35 +217,22 @@ public class EntityRedactionService {
}
}
if (redactionServiceSettings.isEnableEntityRecognition() && !local) {
found.addAll(getAiEntities(sectionNumber, searchableString, headline));
if (!local) {
addNerEntities(found, sectionNumber, headline, nerEntities);
}
return EntitySearchUtils.clearAndFindPositions(found, searchableText, dictionary);
}
private Set<Entity> getAiEntities(int sectionNumber, String searchableString, String headline) {
private void addNerEntities(Set<Entity> found, int sectionNumber, String headline, NerEntities nerEntities) {
Set<Entity> found = new HashSet<>();
Map<String, Map<String, List<EntityRecogintionEntity>>> response = entityRecognitionClient.findAuthors(EntityRecognitionRequest
.builder()
.data(List.of(EntityRecognitionSection.builder()
.sectionNumber(sectionNumber)
.text(searchableString)
.build()))
.build());
EntityRecognitionResponse entityRecognitionResponse = new EntityRecognitionResponse(response.get("result:"));
if (entityRecognitionResponse.getResult() != null && entityRecognitionResponse.getResult()
.containsKey(String.valueOf(sectionNumber))) {
entityRecognitionResponse.getResult().get(String.valueOf(sectionNumber)).forEach(res -> {
if (redactionServiceSettings.isEnableEntityRecognition() && nerEntities.getResult()
.containsKey(sectionNumber)) {
nerEntities.getResult().get(sectionNumber).forEach(res -> {
found.add(new Entity(res.getValue(), res.getType(), res.getStartOffset(), res.getEndOffset(), headline, sectionNumber, false, false));
});
}
return found;
}
}

View File

@ -0,0 +1,49 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.iqser.red.service.file.management.v1.api.model.FileType;
import com.iqser.red.service.redaction.v1.server.client.EntityRecognitionClient;
import com.iqser.red.service.redaction.v1.server.client.model.EntityRecognitionRequest;
import com.iqser.red.service.redaction.v1.server.client.model.EntityRecognitionSection;
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
// Computes NER (named entity recognition) entities for a whole document in a
// single call to the entity-recognition-service — instead of one call per
// section (RED-1970) — and persists the result for later analysis passes.
@Slf4j
@Service
@RequiredArgsConstructor
public class NerAnalyserService {
private final RedactionStorageService redactionStorageService;
private final EntityRecognitionClient entityRecognitionClient;
private final RedactionServiceSettings redactionServiceSettings;
// Fetches the stored document text, sends all sections to the
// entity-recognition-service in one request, and stores the response under
// FileType.NER_ENTITIES. No-op when entity recognition is disabled.
// NOTE(review): this sends sectionText.getText(), while EntityRedactionService
// resolves NER offsets against the section's SearchableText — confirm the two
// representations produce matching character offsets.
public void computeNerEntities(String dossierId, String fileId) {
if (redactionServiceSettings.isEnableEntityRecognition()) {
var text = redactionStorageService.getText(dossierId, fileId);
long start = System.currentTimeMillis();
// Build one request covering every section of the document.
var nerRequest = EntityRecognitionRequest.builder()
.data(text.getSectionTexts()
.stream()
.map(sectionText -> new EntityRecognitionSection(sectionText.getSectionNumber(), sectionText
.getText()))
.collect(Collectors.toList()))
.build();
var nerResponse = entityRecognitionClient.findAuthors(nerRequest);
log.info("Computing NER entities took: {} ms for dossierId {} and fileId {}", System.currentTimeMillis() - start, dossierId, fileId);
// Persist so subsequent (re-)analysis runs can reuse the result instead of
// calling the entity-recognition-service again.
redactionStorageService.storeObject(dossierId, fileId, FileType.NER_ENTITIES, nerResponse);
}
}
}

View File

@ -5,6 +5,7 @@ import com.iqser.red.service.file.management.v1.api.model.FileType;
import com.iqser.red.service.redaction.v1.model.RedactionLog;
import com.iqser.red.service.redaction.v1.model.SectionGrid;
import com.iqser.red.service.redaction.v1.server.classification.model.Text;
import com.iqser.red.service.redaction.v1.server.client.model.NerEntities;
import com.iqser.red.storage.commons.exception.StorageObjectDoesNotExist;
import com.iqser.red.storage.commons.service.StorageService;
import lombok.Getter;
@ -73,6 +74,25 @@ public class RedactionStorageService {
}
// Loads previously computed NER entities for the given file from storage.
// Returns null when no NER_ENTITIES object exists yet — callers (AnalyzeService)
// use the null to trigger on-demand computation via NerAnalyserService.
// Throws RuntimeException (wrapping the IOException) if the stored object
// cannot be deserialized into NerEntities.
public NerEntities getNerEntities(String dossierId, String fileId) {
InputStreamResource inputStreamResource;
try {
inputStreamResource = storageService.getObject(StorageIdUtils.getStorageId(dossierId, fileId, FileType.NER_ENTITIES));
} catch (StorageObjectDoesNotExist e) {
// Absence is an expected state, not an error — log at debug and signal via null.
log.debug("NER Entities not available.");
return null;
}
try {
return objectMapper.readValue(inputStreamResource.getInputStream(), NerEntities.class);
} catch (IOException e) {
throw new RuntimeException("Could not convert NerEntities", e);
}
}
public SectionGrid getSectionGrid(String dossierId, String fileId) {
var sectionGrid = storageService.getObject(StorageIdUtils.getStorageId(dossierId, fileId, FileType.SECTION_GRID));

View File

@ -1,7 +1,7 @@
configuration-service.url: "http://configuration-service-v1:8080"
image-service.url: "http://image-service-v1:8080"
file-management-service.url: "http://file-management-service-v1:8080"
entity-recognition-service.url: "http://entity-recognition-service-v1:8080"
entity-recognition-service.url: "http://entity-recognition-service-v1:8080"
ribbon:
ConnectTimeout: 600000