Pull request #218: RED-1970: Separated structure analysis from entity analysis

Merge in RED/redaction-service from RED-1970 to master

* commit '5939c3d460dde4585280c1372306acfaae89dd61':
  RED-1970: Separated structure analysis from entity analysis
This commit is contained in:
Dominique Eiflaender 2021-09-03 12:43:42 +02:00
commit 89acbb1001
16 changed files with 816 additions and 1312 deletions

View File

@ -0,0 +1,18 @@
package com.iqser.red.service.redaction.v1.model;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Message payload requesting the document structure analysis (PDF
 * segmentation) for a single file of a dossier. Carries only the
 * identifiers needed to locate the stored original document.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class StructureAnalyzeRequest {

	// Identifier of the dossier the file belongs to.
	private String dossierId;

	// Identifier of the file whose structure should be analyzed.
	private String fileId;
}

View File

@ -1,19 +1,14 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
import java.util.ArrayList;
import java.util.List;
import com.iqser.red.service.redaction.v1.model.SectionGrid;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryVersion;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
@Data
@NoArgsConstructor
public class Document {
@ -23,20 +18,14 @@ public class Document {
private List<Header> headers = new ArrayList<>();
private List<Footer> footers = new ArrayList<>();
private List<UnclassifiedText> unclassifiedTexts = new ArrayList<>();
private Map<Integer, List<Entity>> entities = new HashMap<>();
private FloatFrequencyCounter textHeightCounter = new FloatFrequencyCounter();
private FloatFrequencyCounter fontSizeCounter = new FloatFrequencyCounter();
private StringFrequencyCounter fontCounter = new StringFrequencyCounter();
private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter();
private boolean headlines;
private List<RedactionLogEntry> redactionLogEntities = new ArrayList<>();
private SectionGrid sectionGrid = new SectionGrid();
private DictionaryVersion dictionaryVersion;
private long rulesVersion;
private List<SectionText> sectionText = new ArrayList<>();
private Map<Integer, Set<Image>> images = new HashMap<>();
}

View File

@ -4,10 +4,13 @@ import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.redaction.v1.model.AnalyzeRequest;
import com.iqser.red.service.redaction.v1.model.AnalyzeResult;
import com.iqser.red.service.redaction.v1.model.StructureAnalyzeRequest;
import com.iqser.red.service.redaction.v1.server.client.FileStatusProcessingUpdateClient;
import com.iqser.red.service.redaction.v1.server.redaction.service.ReanalyzeService;
import com.iqser.red.service.redaction.v1.server.redaction.service.AnalyzeService;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.amqp.rabbit.annotation.RabbitHandler;
import org.springframework.amqp.rabbit.annotation.RabbitListener;
import org.springframework.stereotype.Service;
@ -21,9 +24,10 @@ import static com.iqser.red.service.redaction.v1.server.queue.MessagingConfigura
public class RedactionMessageReceiver {
private final ObjectMapper objectMapper;
private final ReanalyzeService reanalyzeService;
private final AnalyzeService analyzeService;
private final FileStatusProcessingUpdateClient fileStatusProcessingUpdateClient;
@RabbitHandler
@RabbitListener(queues = REDACTION_QUEUE)
public void receiveAnalyzeRequest(String in) throws JsonProcessingException {
@ -32,15 +36,22 @@ public class RedactionMessageReceiver {
log.info("Processing analyze request: {}", analyzeRequest);
AnalyzeResult result;
if (analyzeRequest.isReanalyseOnlyIfPossible()) {
result = reanalyzeService.reanalyze(analyzeRequest);
result = analyzeService.reanalyze(analyzeRequest);
log.info("Successfully reanalyzed dossier {} file {} took: {}", analyzeRequest.getDossierId(), analyzeRequest
.getFileId(), result.getDuration());
} else {
result = reanalyzeService.analyze(analyzeRequest);
// TODO Separate structure analysis via a dedicated queue
analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(analyzeRequest.getDossierId(), analyzeRequest.getFileId()));
result = analyzeService.analyze(analyzeRequest);
log.info("Successfully analyzed dossier {} file {} took: {}", analyzeRequest.getDossierId(), analyzeRequest.getFileId(), result
.getDuration());
}
log.info("Successfully analyzed {}", analyzeRequest);
fileStatusProcessingUpdateClient.analysisSuccessful(analyzeRequest.getDossierId(), analyzeRequest.getFileId(), result);
}
@RabbitHandler
@RabbitListener(queues = REDACTION_DQL)
public void receiveAnalyzeRequestDQL(String in) throws JsonProcessingException {

View File

@ -0,0 +1,23 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
/**
 * Result of the entity analysis phase: the redaction entities and the
 * images that were detected, each grouped by page number.
 */
@Data
// @Builder added: both fields carry @Builder.Default, which is inert (and
// triggers a Lombok warning) unless the class itself is annotated with
// @Builder; lombok.Builder was already imported, indicating the intent.
@Builder
@AllArgsConstructor
public class PageEntities {

	// Detected redaction entities, keyed by page number.
	@Builder.Default
	private Map<Integer, List<Entity>> entitiesPerPage = new HashMap<>();

	// Detected images, keyed by page number.
	@Builder.Default
	private Map<Integer, Set<Image>> imagesPerPage = new HashMap<>();
}

View File

@ -1,35 +0,0 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import com.iqser.red.service.redaction.v1.model.AnalyzeResult;
import com.iqser.red.service.redaction.v1.model.RedactionLog;
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Service;
/**
 * Assembles {@link AnalyzeResult} responses for finished analysis runs.
 */
@Service
@RequiredArgsConstructor
public class AnalyzeResponseService {

	private final RedactionServiceSettings redactionServiceSettings;

	/**
	 * Builds the analysis response, combining the request identifiers,
	 * timing and page count with the version information recorded in the
	 * redaction log and the service-wide analysis version.
	 *
	 * @param dossierId    dossier the analyzed file belongs to
	 * @param fileId       analyzed file
	 * @param duration     elapsed analysis time in milliseconds
	 * @param pageCount    number of pages of the analyzed document
	 * @param redactionLog log providing rules/dictionary/legal-basis versions
	 * @param hasUpdates   whether the analysis changed any redactions
	 * @return the fully populated result object
	 */
	public AnalyzeResult createAnalyzeResponse(String dossierId, String fileId, long duration, int pageCount,
			RedactionLog redactionLog, boolean hasUpdates) {
		var response = AnalyzeResult.builder();
		response.dossierId(dossierId);
		response.fileId(fileId);
		response.duration(duration);
		response.numberOfPages(pageCount);
		response.hasUpdates(hasUpdates);
		response.analysisVersion(redactionServiceSettings.getAnalysisVersion());
		response.rulesVersion(redactionLog.getRulesVersion());
		response.dictionaryVersion(redactionLog.getDictionaryVersion());
		response.legalBasisVersion(redactionLog.getLegalBasisVersion());
		response.dossierDictionaryVersion(redactionLog.getDossierDictionaryVersion());
		return response.build();
	}
}

View File

@ -0,0 +1,270 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.kie.api.runtime.KieContainer;
import org.springframework.stereotype.Service;
import org.springframework.web.bind.annotation.RequestBody;
import com.iqser.red.service.file.management.v1.api.model.FileType;
import com.iqser.red.service.redaction.v1.model.AnalyzeRequest;
import com.iqser.red.service.redaction.v1.model.AnalyzeResult;
import com.iqser.red.service.redaction.v1.model.IdRemoval;
import com.iqser.red.service.redaction.v1.model.ManualForceRedact;
import com.iqser.red.service.redaction.v1.model.ManualImageRecategorization;
import com.iqser.red.service.redaction.v1.model.ManualLegalBasisChange;
import com.iqser.red.service.redaction.v1.model.ManualRedactions;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.iqser.red.service.redaction.v1.model.RedactionLog;
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
import com.iqser.red.service.redaction.v1.model.StructureAnalyzeRequest;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.SectionText;
import com.iqser.red.service.redaction.v1.server.classification.model.Text;
import com.iqser.red.service.redaction.v1.server.client.LegalBasisClient;
import com.iqser.red.service.redaction.v1.server.exception.RedactionException;
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncrement;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryVersion;
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
import com.iqser.red.service.redaction.v1.server.redaction.model.PageEntities;
import com.iqser.red.service.redaction.v1.server.redaction.model.RedRectangle2D;
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService;
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
/**
 * Coordinates the two analysis phases for a dossier file: structure
 * analysis (PDF segmentation into section texts and a section grid) and
 * entity analysis (dictionary/rule based redaction-entity detection).
 * Also supports incremental reanalysis that re-processes only the
 * sections affected by dictionary updates or manual redactions.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class AnalyzeService {

	private final DictionaryService dictionaryService;
	private final DroolsExecutionService droolsExecutionService;
	private final EntityRedactionService entityRedactionService;
	private final RedactionLogCreatorService redactionLogCreatorService;
	private final RedactionStorageService redactionStorageService;
	private final PdfSegmentationService pdfSegmentationService;
	private final RedactionChangeLogService redactionChangeLogService;
	private final LegalBasisClient legalBasisClient;
	private final RedactionServiceSettings redactionServiceSettings;
	private final SectionTextBuilderService sectionTextBuilderService;
	private final SectionGridCreatorService sectionGridCreatorService;

	/**
	 * Parses the stored original PDF of the requested file, builds the
	 * section texts and the section grid, and persists both (as TEXT and
	 * SECTION_GRID objects) so the entity analysis can later run from the
	 * stored text alone.
	 *
	 * @param analyzeRequest identifies the dossier and file to segment
	 * @throws RedactionException if the stored PDF cannot be read or parsed
	 */
	public void analyzeDocumentStructure(StructureAnalyzeRequest analyzeRequest) {
		long startTime = System.currentTimeMillis();
		var pageCount = 0;
		Document classifiedDoc;
		try {
			var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(analyzeRequest
					.getDossierId(), analyzeRequest.getFileId(), FileType.ORIGIN));
			classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream);
			pageCount = classifiedDoc.getPages().size();
		} catch (Exception e) {
			// Wrap parsing/storage failures so callers handle one exception type.
			throw new RedactionException(e);
		}
		List<SectionText> sectionTexts = sectionTextBuilderService.buildSectionText(classifiedDoc);
		sectionGridCreatorService.createSectionGrid(classifiedDoc, pageCount);
		redactionStorageService.storeObject(analyzeRequest.getDossierId(), analyzeRequest.getFileId(), FileType.TEXT, new Text(pageCount, sectionTexts));
		redactionStorageService.storeObject(analyzeRequest.getDossierId(), analyzeRequest.getFileId(), FileType.SECTION_GRID, classifiedDoc
				.getSectionGrid());
		log.info("Document structure analysis successful, took: {}", System.currentTimeMillis() - startTime);
	}

	/**
	 * Runs a full entity analysis over the stored section texts of the file:
	 * refreshes dictionary and rules, finds all entities, creates a fresh
	 * redaction log and persists it.
	 *
	 * @param analyzeRequest the analysis request (dossier, file, template, …)
	 * @return the analysis result including timing and version information
	 */
	public AnalyzeResult analyze(AnalyzeRequest analyzeRequest) {
		long startTime = System.currentTimeMillis();
		var text = redactionStorageService.getText(analyzeRequest.getDossierId(), analyzeRequest.getFileId());
		dictionaryService.updateDictionary(analyzeRequest.getDossierTemplateId(), analyzeRequest.getDossierId());
		KieContainer kieContainer = droolsExecutionService.updateRules(analyzeRequest.getDossierTemplateId());
		long rulesVersion = droolsExecutionService.getRulesVersion(analyzeRequest.getDossierTemplateId());
		// Deep copy: entity search mutates the dictionary (local additions).
		Dictionary dictionary = dictionaryService.getDeepCopyDictionary(analyzeRequest.getDossierTemplateId(), analyzeRequest
				.getDossierId());
		PageEntities pageEntities = entityRedactionService.findEntities(dictionary, text.getSectionTexts(), kieContainer, analyzeRequest);
		dictionaryService.updateExternalDictionary(dictionary, analyzeRequest.getDossierTemplateId());
		List<RedactionLogEntry> redactionLogEntries = redactionLogCreatorService.createRedactionLog(pageEntities, text.getNumberOfPages(), analyzeRequest
				.getDossierTemplateId());
		var legalBasis = legalBasisClient.getLegalBasisMapping(analyzeRequest.getDossierTemplateId());
		var redactionLog = new RedactionLog(redactionServiceSettings.getAnalysisVersion(), redactionLogEntries, legalBasis, dictionary
				.getVersion()
				.getDossierTemplateVersion(), dictionary.getVersion()
				.getDossierVersion(), rulesVersion, legalBasisClient.getVersion(analyzeRequest.getDossierTemplateId()));
		return finalizeAnalysis(analyzeRequest, startTime, redactionLog, text, dictionary.getVersion(), false);
	}

	/**
	 * Incrementally reanalyzes a file: only sections touched by dictionary
	 * increments or manual redactions are re-processed and merged into the
	 * existing redaction log. Falls back to a full {@link #analyze} when no
	 * previous log or text exists yet.
	 *
	 * @param analyzeRequest the analysis request (dossier, file, template, …)
	 * @return the analysis result including timing and version information
	 */
	@SneakyThrows
	public AnalyzeResult reanalyze(AnalyzeRequest analyzeRequest) {
		long startTime = System.currentTimeMillis();
		var redactionLog = redactionStorageService.getRedactionLog(analyzeRequest.getDossierId(), analyzeRequest.getFileId());
		var text = redactionStorageService.getText(analyzeRequest.getDossierId(), analyzeRequest.getFileId());
		// Not yet ready for reanalysis: run a full analysis instead.
		if (redactionLog == null || text == null || text.getNumberOfPages() == 0) {
			return analyze(analyzeRequest);
		}
		DictionaryIncrement dictionaryIncrement = dictionaryService.getDictionaryIncrements(analyzeRequest.getDossierTemplateId(), new DictionaryVersion(redactionLog
				.getDictionaryVersion(), redactionLog.getDossierDictionaryVersion()), analyzeRequest.getDossierId());
		// Explicitly requested sections win; otherwise derive them from changes.
		Set<Integer> sectionsToReanalyse = !analyzeRequest.getSectionsToReanalyse()
				.isEmpty() ? analyzeRequest.getSectionsToReanalyse() : findSectionsToReanalyse(dictionaryIncrement, redactionLog, text, analyzeRequest);
		if (sectionsToReanalyse.isEmpty()) {
			return finalizeAnalysis(analyzeRequest, startTime, redactionLog, text, dictionaryIncrement.getDictionaryVersion(), true);
		}
		List<SectionText> reanalysisSections = text.getSectionTexts()
				.stream()
				.filter(sectionText -> sectionsToReanalyse.contains(sectionText.getSectionNumber()))
				.collect(Collectors.toList());
		KieContainer kieContainer = droolsExecutionService.updateRules(analyzeRequest.getDossierTemplateId());
		Dictionary dictionary = dictionaryService.getDeepCopyDictionary(analyzeRequest.getDossierTemplateId(), analyzeRequest
				.getDossierId());
		PageEntities pageEntities = entityRedactionService.findEntities(dictionary, reanalysisSections, kieContainer, analyzeRequest);
		var newRedactionLogEntries = redactionLogCreatorService.createRedactionLog(pageEntities, text.getNumberOfPages(), analyzeRequest
				.getDossierTemplateId());
		// Replace the stale entries of the reanalyzed sections with fresh ones.
		redactionLog.getRedactionLogEntry().removeIf(entry -> sectionsToReanalyse.contains(entry.getSectionNumber()));
		redactionLog.getRedactionLogEntry().addAll(newRedactionLogEntries);
		return finalizeAnalysis(analyzeRequest, startTime, redactionLog, text, dictionaryIncrement.getDictionaryVersion(), true);
	}

	/**
	 * Determines which section numbers need reanalysis: sections with manual
	 * or manually modified redactions, plus sections whose text contains any
	 * value added by the dictionary increment.
	 */
	private Set<Integer> findSectionsToReanalyse(DictionaryIncrement dictionaryIncrement, RedactionLog redactionLog,
			Text text, AnalyzeRequest analyzeRequest) {
		long start = System.currentTimeMillis();
		Set<String> relevantManuallyModifiedAnnotationIds = getRelevantManuallyModifiedAnnotationIds(analyzeRequest.getManualRedactions());
		Set<Integer> sectionsToReanalyse = new HashSet<>();
		// NOTE(review): a write-only map collecting converted image entries was
		// removed here — it was populated but never read.
		for (RedactionLogEntry entry : redactionLog.getRedactionLogEntry()) {
			if (entry.isManual() || relevantManuallyModifiedAnnotationIds.contains(entry.getId())) {
				sectionsToReanalyse.add(entry.getSectionNumber());
			}
		}
		for (SectionText sectionText : text.getSectionTexts()) {
			if (EntitySearchUtils.sectionContainsAny(sectionText.getText(), dictionaryIncrement.getValues())) {
				sectionsToReanalyse.add(sectionText.getSectionNumber());
			}
		}
		log.info("Should reanalyze {} sections for request: {}, took: {}", sectionsToReanalyse.size(), analyzeRequest, System.currentTimeMillis() - start);
		return sectionsToReanalyse;
	}

	/**
	 * Shared tail of analyze/reanalyze: stamps the dictionary versions on the
	 * log, marks entries on excluded pages, computes the change log, persists
	 * the resulting redaction log and builds the {@link AnalyzeResult}.
	 * (The stray Spring MVC {@code @RequestBody} annotation was removed from
	 * this service-layer parameter — it has no effect outside a controller.)
	 */
	private AnalyzeResult finalizeAnalysis(AnalyzeRequest analyzeRequest, long startTime,
			RedactionLog redactionLog, Text text, DictionaryVersion dictionaryVersion,
			boolean isReanalysis) {
		redactionLog.setDictionaryVersion(dictionaryVersion.getDossierTemplateVersion());
		redactionLog.setDossierDictionaryVersion(dictionaryVersion.getDossierVersion());
		excludeExcludedPages(redactionLog, analyzeRequest.getExcludedPages());
		var redactionLogChange = redactionChangeLogService.computeChanges(analyzeRequest.getDossierId(), analyzeRequest.getFileId(), redactionLog);
		redactionStorageService.storeObject(analyzeRequest.getDossierId(), analyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLogChange
				.getRedactionLog());
		long duration = System.currentTimeMillis() - startTime;
		return AnalyzeResult.builder()
				.dossierId(analyzeRequest.getDossierId())
				.fileId(analyzeRequest.getFileId())
				.duration(duration)
				.numberOfPages(text.getNumberOfPages())
				.hasUpdates(redactionLogChange.isHasChanges())
				.analysisVersion(redactionServiceSettings.getAnalysisVersion())
				.rulesVersion(redactionLog.getRulesVersion())
				.dictionaryVersion(redactionLog.getDictionaryVersion())
				.legalBasisVersion(redactionLog.getLegalBasisVersion())
				.dossierDictionaryVersion(redactionLog.getDossierDictionaryVersion())
				.wasReanalyzed(isReanalysis)
				.build();
	}

	/**
	 * Collects the annotation ids of all manual modifications (legal-basis
	 * changes, image recategorizations, removals and forced redactions) so
	 * their sections can be scheduled for reanalysis.
	 *
	 * @return the union of ids, or an empty set when no manual redactions exist
	 */
	private Set<String> getRelevantManuallyModifiedAnnotationIds(ManualRedactions manualRedactions) {
		if (manualRedactions == null) {
			return new HashSet<>();
		}
		// Flattened from deeply nested Stream.concat calls for readability;
		// behavior (union of all four id streams) is unchanged.
		return Stream.of(
						manualRedactions.getManualLegalBasisChanges().stream().map(ManualLegalBasisChange::getId),
						manualRedactions.getImageRecategorizations().stream().map(ManualImageRecategorization::getId),
						manualRedactions.getIdsToRemove().stream().map(IdRemoval::getId),
						manualRedactions.getForceRedacts().stream().map(ManualForceRedact::getId))
				.flatMap(idStream -> idStream)
				.collect(Collectors.toSet());
	}

	/**
	 * Converts an image-type redaction-log entry back into an {@link Image}
	 * model, using the entry's first position for page and geometry.
	 */
	public Image convert(RedactionLogEntry entry) {
		Rectangle position = entry.getPositions().get(0);
		return Image.builder()
				.type(entry.getType())
				.position(new RedRectangle2D(position.getTopLeft().getX(), position.getTopLeft()
						.getY(), position.getWidth(), position.getHeight()))
				.sectionNumber(entry.getSectionNumber())
				.section(entry.getSection())
				.page(position.getPage())
				.hasTransparency(entry.isImageHasTransparency())
				.build();
	}

	/**
	 * Marks every log entry that has at least one position on an excluded
	 * page as excluded. No-op when no pages are excluded.
	 */
	private void excludeExcludedPages(RedactionLog redactionLog, Set<Integer> excludedPages) {
		if (excludedPages != null && !excludedPages.isEmpty()) {
			redactionLog.getRedactionLogEntry().forEach(entry -> entry.getPositions().forEach(pos -> {
				if (excludedPages.contains(pos.getPage())) {
					entry.setExcluded(true);
				}
			}));
		}
	}
}

View File

@ -33,7 +33,7 @@ public class DictionaryService {
public DictionaryVersion updateDictionary(String dossierTemplateId, String dossierId) {
log.info("Updating dictionary data for: {} / {}", dossierTemplateId, dossierId);
log.info("Updating dictionary data for dossierTemplate {} and dossier {}", dossierTemplateId, dossierId);
long dossierTemplateDictionaryVersion = dictionaryClient.getVersion(dossierTemplateId, GLOBAL_DOSSIER);
var dossierTemplateDictionary = dictionariesByDossierTemplate.get(dossierTemplateId);
if (dossierTemplateDictionary == null || dossierTemplateDictionaryVersion > dossierTemplateDictionary.getDictionaryVersion()) {
@ -164,7 +164,6 @@ public class DictionaryService {
public float[] getColor(String type, String dossierTemplateId) {
log.info("requested : {} / {}",type,dossierTemplateId);
DictionaryModel model = dictionariesByDossierTemplate.get(dossierTemplateId).getLocalAccessMap().get(type);
if (model != null) {
return model.getColor();

View File

@ -1,74 +1,142 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import com.iqser.red.service.redaction.v1.model.*;
import com.iqser.red.service.redaction.v1.server.classification.model.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils;
import org.kie.api.runtime.KieContainer;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.model.AnalyzeRequest;
import com.iqser.red.service.redaction.v1.model.ManualImageRecategorization;
import com.iqser.red.service.redaction.v1.model.Status;
import com.iqser.red.service.redaction.v1.server.classification.model.SectionText;
import com.iqser.red.service.redaction.v1.server.client.EntityRecognitionClient;
import com.iqser.red.service.redaction.v1.server.client.model.EntityRecogintionEntity;
import com.iqser.red.service.redaction.v1.server.client.model.EntityRecognitionRequest;
import com.iqser.red.service.redaction.v1.server.client.model.EntityRecognitionResponse;
import com.iqser.red.service.redaction.v1.server.client.model.EntityRecognitionSection;
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
import com.iqser.red.service.redaction.v1.server.redaction.model.*;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryModel;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
import com.iqser.red.service.redaction.v1.server.redaction.model.PageEntities;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import com.iqser.red.service.redaction.v1.server.redaction.model.Section;
import com.iqser.red.service.redaction.v1.server.redaction.model.SectionSearchableTextPair;
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.kie.api.runtime.KieContainer;
import org.springframework.stereotype.Service;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import java.util.stream.Stream;
@Slf4j
@Service
@RequiredArgsConstructor
public class EntityRedactionService {
private final DictionaryService dictionaryService;
private final DroolsExecutionService droolsExecutionService;
private final SurroundingWordsService surroundingWordsService;
private final EntityRecognitionClient entityRecognitionClient;
private final RedactionServiceSettings redactionServiceSettings;
private final DroolsExecutionService droolsExecutionService;
private final SurroundingWordsService surroundingWordsService;
public void processDocument(Document classifiedDoc, String dossierTemplateId, ManualRedactions manualRedactions,
String dossierId, List<FileAttribute> fileAttributes) {
public PageEntities findEntities(Dictionary dictionary, List<SectionText> sectionTexts, KieContainer kieContainer,
AnalyzeRequest analyzeRequest) {
dictionaryService.updateDictionary(dossierTemplateId, dossierId);
KieContainer container = droolsExecutionService.updateRules(dossierTemplateId);
long rulesVersion = droolsExecutionService.getRulesVersion(dossierTemplateId);
Dictionary dictionary = dictionaryService.getDeepCopyDictionary(dossierTemplateId, dossierId);
Set<Entity> documentEntities = new HashSet<>(findEntities(classifiedDoc, container, manualRedactions, dictionary, false, null, fileAttributes));
Map<Integer, Set<Image>> imagesPerPage = new HashMap<>();
Set<Entity> entities = findEntities(sectionTexts, dictionary, kieContainer, analyzeRequest, false, null, imagesPerPage);
if (dictionary.hasLocalEntries()) {
Map<Integer, Set<Entity>> hintsPerSectionNumber = getHintsPerSection(documentEntities, dictionary);
Set<Entity> foundByLocal = findEntities(classifiedDoc, container, manualRedactions, dictionary, true, hintsPerSectionNumber, fileAttributes);
EntitySearchUtils.addEntitiesWithHigherRank(documentEntities, foundByLocal, dictionary);
EntitySearchUtils.removeEntitiesContainedInLarger(documentEntities);
Map<Integer, Set<Entity>> hintsPerSectionNumber = getHintsPerSection(entities, dictionary);
Set<Entity> foundByLocal = findEntities(sectionTexts, dictionary, kieContainer, analyzeRequest, true, hintsPerSectionNumber, imagesPerPage);
EntitySearchUtils.addEntitiesWithHigherRank(entities, foundByLocal, dictionary);
EntitySearchUtils.removeEntitiesContainedInLarger(entities);
}
classifiedDoc.setEntities(convertToEnititesPerPage(documentEntities));
dictionaryService.updateExternalDictionary(dictionary, dossierTemplateId);
classifiedDoc.setDictionaryVersion(dictionary.getVersion());
classifiedDoc.setRulesVersion(rulesVersion);
Map<Integer, List<Entity>> entitiesPerPage = convertToEnititesPerPage(entities);
return new PageEntities(entitiesPerPage, imagesPerPage);
}
public Map<Integer, List<Entity>> convertToEnititesPerPage(Set<Entity> entities) {
public Set<Entity> findEntities(List<SectionText> reanalysisSections, Dictionary dictionary,
KieContainer kieContainer, AnalyzeRequest analyzeRequest, boolean local,
Map<Integer, Set<Entity>> hintsPerSectionNumber,
Map<Integer, Set<Image>> imagesPerPage) {
List<SectionSearchableTextPair> sectionSearchableTextPairs = new ArrayList<>();
for (SectionText reanalysisSection : reanalysisSections) {
Set<Entity> entities = findEntities(reanalysisSection.getSearchableText(), reanalysisSection.getHeadline(), reanalysisSection
.getSectionNumber(), dictionary, local);
if (reanalysisSection.getCellStarts() != null && !reanalysisSection.getCellStarts().isEmpty()) {
surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary, reanalysisSection
.getCellStarts());
} else {
surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary);
}
if (!local && reanalysisSection.getImages() != null && !reanalysisSection.getImages()
.isEmpty() && analyzeRequest.getManualRedactions() != null && analyzeRequest.getManualRedactions()
.getImageRecategorizations() != null) {
for (Image image : reanalysisSection.getImages()) {
String imageId = IdBuilder.buildId(image.getPosition(), image.getPage());
for (ManualImageRecategorization imageRecategorization : analyzeRequest.getManualRedactions()
.getImageRecategorizations()) {
if (imageRecategorization.getStatus().equals(Status.APPROVED) && imageRecategorization.getId()
.equals(imageId)) {
image.setType(imageRecategorization.getType());
}
}
}
}
sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder()
.isLocal(false)
.dictionaryTypes(dictionary.getTypes())
.entities(hintsPerSectionNumber != null && hintsPerSectionNumber.containsKey(reanalysisSection.getSectionNumber()) ? Stream
.concat(entities.stream(), hintsPerSectionNumber.get(reanalysisSection.getSectionNumber())
.stream())
.collect(Collectors.toSet()) : entities)
.text(reanalysisSection.getSearchableText().getAsStringWithLinebreaks())
.searchText(reanalysisSection.getSearchableText().toString())
.headline(reanalysisSection.getHeadline())
.sectionNumber(reanalysisSection.getSectionNumber())
.tabularData(reanalysisSection.getTabularData())
.searchableText(reanalysisSection.getSearchableText())
.dictionary(dictionary)
.images(reanalysisSection.getImages())
.fileAttributes(analyzeRequest.getFileAttributes())
.build(), reanalysisSection.getSearchableText()));
}
Set<Entity> entities = new HashSet<>();
sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> {
Section analysedSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair.getSection());
EntitySearchUtils.removeEntitiesContainedInLarger(analysedSection.getEntities());
entities.addAll(analysedSection.getEntities());
if(!local) {
for (Image image : analysedSection.getImages()) {
imagesPerPage.computeIfAbsent(image.getPage(), (a) -> new HashSet<>()).add(image);
}
addLocalValuesToDictionary(analysedSection, dictionary);
}
});
return entities;
}
private Map<Integer, List<Entity>> convertToEnititesPerPage(Set<Entity> entities) {
Map<Integer, List<Entity>> entitiesPerPage = new HashMap<>();
for (Entity entity : entities) {
@ -90,7 +158,7 @@ public class EntityRedactionService {
}
public Map<Integer, Set<Entity>> getHintsPerSection(Set<Entity> entities, Dictionary dictionary) {
private Map<Integer, Set<Entity>> getHintsPerSection(Set<Entity> entities, Dictionary dictionary) {
Map<Integer, Set<Entity>> hintsPerSectionNumber = new HashMap<>();
entities.stream().forEach(entity -> {
@ -102,64 +170,7 @@ public class EntityRedactionService {
}
private Set<Entity> findEntities(Document classifiedDoc, KieContainer kieContainer,
ManualRedactions manualRedactions, Dictionary dictionary, boolean local,
Map<Integer, Set<Entity>> hintsPerSectionNumber,
List<FileAttribute> fileAttributes) {
Set<Entity> documentEntities = new HashSet<>();
AtomicInteger sectionNumber = new AtomicInteger(1);
List<SectionSearchableTextPair> sectionSearchableTextPairs = new ArrayList<>();
for (Paragraph paragraph : classifiedDoc.getParagraphs()) {
List<Table> tables = paragraph.getTables();
for (Table table : tables) {
if (table.getColCount() == 2) {
sectionSearchableTextPairs.addAll(processTableAsOneText(classifiedDoc, table, sectionNumber, dictionary, local, hintsPerSectionNumber, fileAttributes));
} else {
sectionSearchableTextPairs.addAll(processTablePerRow(classifiedDoc, table, sectionNumber, dictionary, local, hintsPerSectionNumber, fileAttributes));
}
sectionNumber.incrementAndGet();
}
sectionSearchableTextPairs.add(processText(classifiedDoc, paragraph.getSearchableText(), paragraph.getTextBlocks(), paragraph
.getHeadline(), manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber, paragraph
.getImages(), fileAttributes));
sectionNumber.incrementAndGet();
}
for (Header header : classifiedDoc.getHeaders()) {
sectionSearchableTextPairs.add(processText(classifiedDoc, header.getSearchableText(), header.getTextBlocks(), "Header", manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber, new ArrayList<>(), fileAttributes));
sectionNumber.incrementAndGet();
}
for (Footer footer : classifiedDoc.getFooters()) {
sectionSearchableTextPairs.add(processText(classifiedDoc, footer.getSearchableText(), footer.getTextBlocks(), "Footer", manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber, new ArrayList<>(), fileAttributes));
sectionNumber.incrementAndGet();
}
for (UnclassifiedText unclassifiedText : classifiedDoc.getUnclassifiedTexts()) {
sectionSearchableTextPairs.add(processText(classifiedDoc, unclassifiedText.getSearchableText(), unclassifiedText
.getTextBlocks(), "", manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber, new ArrayList<>(), fileAttributes));
sectionNumber.incrementAndGet();
}
sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> {
Section analysedSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair.getSection());
documentEntities.addAll(analysedSection.getEntities());
for (Image image : analysedSection.getImages()) {
classifiedDoc.getImages().computeIfAbsent(image.getPage(), (a) -> new HashSet<>()).add(image);
}
addLocalValuesToDictionary(analysedSection, dictionary);
});
return documentEntities;
}
public void addLocalValuesToDictionary(Section analysedSection, Dictionary dictionary) {
private void addLocalValuesToDictionary(Section analysedSection, Dictionary dictionary) {
analysedSection.getLocalDictionaryAdds().keySet().forEach(key -> {
if (dictionary.isRecommendation(key)) {
@ -186,206 +197,7 @@ public class EntityRedactionService {
}
/**
 * Splits a table into one analyzable {@code Section} per row so the rule engine can
 * evaluate each row independently.
 *
 * <p>For every row this concatenates the cell texts into a single {@code SearchableText},
 * records per-cell character offsets ({@code cellStarts}), maps data cells to their
 * (normalized) header names in {@code tabularData}, finds dictionary entities, and emits a
 * {@code SectionSearchableTextPair}. When {@code local} is {@code false} the row's layout
 * is additionally persisted as a {@code SectionText} on {@code classifiedDoc} — presumably
 * the local pass reuses the text stored by the global pass; TODO confirm.
 *
 * @param classifiedDoc         document being analyzed; receives {@code SectionText} entries (non-local pass only)
 * @param table                 table whose rows are processed
 * @param sectionNumber         running section counter; incremented once per row
 * @param dictionary            dictionary used for entity lookup
 * @param local                 whether this is the local-dictionary pass
 * @param hintsPerSectionNumber optional extra entities (hints) keyed by section number; may be null
 * @param fileAttributes        file attributes forwarded into each built {@code Section}
 * @return one section/searchable-text pair per table row
 */
private List<SectionSearchableTextPair> processTablePerRow(Document classifiedDoc, Table table,
                                                           AtomicInteger sectionNumber, Dictionary dictionary,
                                                           boolean local,
                                                           Map<Integer, Set<Entity>> hintsPerSectionNumber,
                                                           List<FileAttribute> fileAttributes) {
    List<SectionSearchableTextPair> sectionSearchableTextPairs = new ArrayList<>();
    for (List<Cell> row : table.getRows()) {
        SearchableText searchableRow = new SearchableText();
        // Normalized header name -> value of the data cell under that header.
        Map<String, CellValue> tabularData = new HashMap<>();
        // Character offset of the current cell within the concatenated row text.
        int start = 0;
        List<Integer> cellStarts = new ArrayList<>();
        SectionText sectionText = new SectionText();
        for (Cell cell : row) {
            if (CollectionUtils.isEmpty(cell.getTextBlocks())) {
                // Empty cells contribute neither text nor an offset entry.
                continue;
            }
            // Page is taken from the first sequence of the first text block of the cell.
            SectionArea sectionArea = new SectionArea(new Point((float) cell.getX(), (float) cell.getY()), (float) cell
                    .getWidth(), (float) cell.getHeight(), cell.getTextBlocks()
                    .get(0)
                    .getSequences()
                    .get(0)
                    .getPage());
            sectionText.getSectionAreas().add(sectionArea);
            sectionText.getTextBlocks().addAll(cell.getTextBlocks());
            // Effectively-final copy of the offset so it can be captured by the lambda below.
            int cellStart = start;
            if (!cell.isHeaderCell()) {
                // Data cell: associate it with each of its header cells, keyed by the
                // header text with line breaks, spaces and hyphens stripped.
                cell.getHeaderCells().forEach(headerCell -> {
                    StringBuilder headerBuilder = new StringBuilder();
                    headerCell.getTextBlocks().forEach(textBlock -> headerBuilder.append(textBlock.getText()));
                    String headerName = headerBuilder.toString()
                            .replaceAll("\n", "")
                            .replaceAll(" ", "")
                            .replaceAll("-", "");
                    sectionArea.setHeader(headerName);
                    tabularData.put(headerName, new CellValue(cell.getTextBlocks(), cellStart));
                });
            }
            for (TextBlock textBlock : cell.getTextBlocks()) {
                // TODO avoid cell overlap merging.
                searchableRow.addAll(textBlock.getSequences());
            }
            cellStarts.add(cellStart);
            // Advance past this cell's text; +1 accounts for the separator between cells.
            start = start + cell.toString().trim().length() + 1;
        }
        Set<Entity> rowEntities = findEntities(searchableRow, table.getHeadline(), sectionNumber.intValue(), dictionary, local);
        surroundingWordsService.addSurroundingText(rowEntities, searchableRow, dictionary, cellStarts);
        sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder()
                .isLocal(local)
                .dictionaryTypes(dictionary.getTypes())
                // Merge pre-computed hints for this section number into the found entities, if any.
                .entities(hintsPerSectionNumber != null && hintsPerSectionNumber.containsKey(sectionNumber.intValue()) ? Stream
                        .concat(rowEntities.stream(), hintsPerSectionNumber.get(sectionNumber.intValue()).stream())
                        .collect(Collectors.toSet()) : rowEntities)
                .text(searchableRow.getAsStringWithLinebreaks())
                .searchText(searchableRow.toString())
                .headline(table.getHeadline())
                .sectionNumber(sectionNumber.intValue())
                .tabularData(tabularData)
                .searchableText(searchableRow)
                .dictionary(dictionary)
                .fileAttributes(fileAttributes)
                .build(), searchableRow));
        if (!local) {
            // Persist row layout/text only during the non-local pass.
            sectionText.setText(searchableRow.toString());
            sectionText.setHeadline(table.getHeadline());
            sectionText.setSectionNumber(sectionNumber.intValue());
            sectionText.setTable(true);
            sectionText.setTabularData(tabularData);
            sectionText.setCellStarts(cellStarts);
            classifiedDoc.getSectionText().add(sectionText);
        }
        sectionNumber.incrementAndGet();
    }
    return sectionSearchableTextPairs;
}
/**
 * Treats an entire table as a single text section (in contrast to
 * {@code processTablePerRow}, which emits one section per row).
 *
 * <p>All cell texts are concatenated into one {@code SearchableText}, entities are found
 * once for the whole table, and exactly one {@code SectionSearchableTextPair} is returned.
 * When {@code local} is {@code false} the table's layout and text are also persisted as a
 * {@code SectionText} on {@code classifiedDoc}. Note that unlike the per-row variant,
 * {@code sectionNumber} is NOT incremented here — presumably the caller advances it;
 * TODO confirm.
 *
 * @param classifiedDoc         document being analyzed; receives a {@code SectionText} (non-local pass only)
 * @param table                 table processed as one block of text
 * @param sectionNumber         current section counter (read, not incremented)
 * @param dictionary            dictionary used for entity lookup
 * @param local                 whether this is the local-dictionary pass
 * @param hintsPerSectionNumber optional extra entities (hints) keyed by section number; may be null
 * @param fileAttributes        file attributes forwarded into the built {@code Section}
 * @return a single-element list with the section built from the whole table
 */
private List<SectionSearchableTextPair> processTableAsOneText(Document classifiedDoc, Table table,
                                                              AtomicInteger sectionNumber, Dictionary dictionary,
                                                              boolean local,
                                                              Map<Integer, Set<Entity>> hintsPerSectionNumber,
                                                              List<FileAttribute> fileAttributes) {
    List<SectionSearchableTextPair> sectionSearchableTextPairs = new ArrayList<>();
    SearchableText entireTableText = new SearchableText();
    SectionText sectionText = new SectionText();
    for (List<Cell> row : table.getRows()) {
        for (Cell cell : row) {
            if (CollectionUtils.isEmpty(cell.getTextBlocks())) {
                continue;
            }
            if (!local) {
                // Record the cell's bounding box; the page comes from the first
                // sequence of the cell's first text block.
                SectionArea sectionArea = new SectionArea(new Point((float) cell.getX(), (float) cell.getY()), (float) cell
                        .getWidth(), (float) cell.getHeight(), cell.getTextBlocks()
                        .get(0)
                        .getSequences()
                        .get(0)
                        .getPage());
                sectionText.getTextBlocks().addAll(cell.getTextBlocks());
                sectionText.getSectionAreas().add(sectionArea);
            }
            for (TextBlock textBlock : cell.getTextBlocks()) {
                entireTableText.addAll(textBlock.getSequences());
            }
        }
    }
    Set<Entity> rowEntities = findEntities(entireTableText, table.getHeadline(), sectionNumber.intValue(), dictionary, local);
    surroundingWordsService.addSurroundingText(rowEntities, entireTableText, dictionary);
    sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder()
            .isLocal(local)
            .dictionaryTypes(dictionary.getTypes())
            // Merge pre-computed hints for this section number into the found entities, if any.
            .entities(hintsPerSectionNumber != null && hintsPerSectionNumber.containsKey(sectionNumber.intValue()) ? Stream
                    .concat(rowEntities.stream(), hintsPerSectionNumber.get(sectionNumber.intValue()).stream())
                    .collect(Collectors.toSet()) : rowEntities)
            .text(entireTableText.getAsStringWithLinebreaks())
            .searchText(entireTableText.toString())
            .headline(table.getHeadline())
            .sectionNumber(sectionNumber.intValue())
            .searchableText(entireTableText)
            .dictionary(dictionary)
            .fileAttributes(fileAttributes)
            .build(), entireTableText));
    if (!local) {
        // Persist table layout/text only during the non-local pass.
        sectionText.setText(entireTableText.toString());
        sectionText.setHeadline(table.getHeadline());
        sectionText.setSectionNumber(sectionNumber.intValue());
        sectionText.setTable(true);
        classifiedDoc.getSectionText().add(sectionText);
    }
    return sectionSearchableTextPairs;
}
/**
 * Builds one analyzable {@code Section} from a free-text region (paragraph, header,
 * footer, or unclassified text).
 *
 * <p>Finds dictionary entities in the text, attaches surrounding-word context, converts
 * the region's images (applying approved manual recategorizations), and optionally merges
 * pre-computed hint entities for this section number. When {@code local} is {@code false}
 * the region's layout, text and images are also persisted as a {@code SectionText} on
 * {@code classifiedDoc}.
 *
 * @param classifiedDoc         document being analyzed; receives a {@code SectionText} (non-local pass only)
 * @param searchableText        the region's text, position-aware
 * @param paragraphTextBlocks   text blocks that make up the region (used for layout areas)
 * @param headline              section headline ("Header"/"Footer"/"" for non-paragraph regions)
 * @param manualRedactions      user decisions; used here for image recategorizations, may be null
 * @param sectionNumber         current section counter (read, not incremented here)
 * @param dictionary            dictionary used for entity lookup
 * @param local                 whether this is the local-dictionary pass
 * @param hintsPerSectionNumber optional extra entities (hints) keyed by section number; may be null
 * @param images                images located in this region
 * @param fileAttributes        file attributes forwarded into the built {@code Section}
 * @return the section paired with its searchable text
 */
private SectionSearchableTextPair processText(Document classifiedDoc, SearchableText searchableText,
                                              List<TextBlock> paragraphTextBlocks, String headline,
                                              ManualRedactions manualRedactions, AtomicInteger sectionNumber,
                                              Dictionary dictionary, boolean local,
                                              Map<Integer, Set<Entity>> hintsPerSectionNumber,
                                              List<PdfImage> images, List<FileAttribute> fileAttributes) {
    if (!local) {
        // Persist the region's layout/text/images only during the non-local pass.
        SectionText sectionText = new SectionText();
        for (TextBlock paragraphTextBlock : paragraphTextBlocks) {
            SectionArea sectionArea = new SectionArea(new Point(paragraphTextBlock.getMinX(), paragraphTextBlock.getMinY()), paragraphTextBlock
                    .getWidth(), paragraphTextBlock.getHeight(), paragraphTextBlock.getPage());
            sectionText.getSectionAreas().add(sectionArea);
        }
        sectionText.setText(searchableText.toString());
        sectionText.setHeadline(headline);
        sectionText.setSectionNumber(sectionNumber.intValue());
        sectionText.setTable(false);
        sectionText.setImages(images.stream()
                .map(image -> convertAndRecategorize(image, sectionNumber.intValue(), headline, manualRedactions))
                .collect(Collectors.toSet()));
        sectionText.setTextBlocks(paragraphTextBlocks);
        classifiedDoc.getSectionText().add(sectionText);
    }
    Set<Entity> entities = findEntities(searchableText, headline, sectionNumber.intValue(), dictionary, local);
    surroundingWordsService.addSurroundingText(entities, searchableText, dictionary);
    return new SectionSearchableTextPair(Section.builder()
            .isLocal(local)
            .dictionaryTypes(dictionary.getTypes())
            // Merge pre-computed hints for this section number into the found entities, if any.
            .entities(hintsPerSectionNumber != null && hintsPerSectionNumber.containsKey(sectionNumber.intValue()) ? Stream
                    .concat(entities.stream(), hintsPerSectionNumber.get(sectionNumber.intValue()).stream())
                    .collect(Collectors.toSet()) : entities)
            .text(searchableText.getAsStringWithLinebreaks())
            .searchText(searchableText.toString())
            .headline(headline)
            .sectionNumber(sectionNumber.intValue())
            .searchableText(searchableText)
            .dictionary(dictionary)
            // NOTE: images are converted a second time here (also above for SectionText) —
            // presumably intentional so Section and SectionText hold independent instances.
            .images(images.stream()
                    .map(image -> convertAndRecategorize(image, sectionNumber.intValue(), headline, manualRedactions))
                    .collect(Collectors.toSet()))
            .fileAttributes(fileAttributes)
            .build(), searchableText);
}
public Set<Entity> findEntities(SearchableText searchableText, String headline, int sectionNumber,
private Set<Entity> findEntities(SearchableText searchableText, String headline, int sectionNumber,
Dictionary dictionary, boolean local) {
Set<Entity> found = new HashSet<>();
@ -413,34 +225,6 @@ public class EntityRedactionService {
}
/**
 * Converts a parsed {@code PdfImage} into the redaction {@code Image} model and applies
 * any APPROVED manual recategorization that matches the image's position-derived id.
 * If several approved recategorizations match, the last one in the list wins.
 *
 * @param pdfImage         parsed image to convert
 * @param sectionNumber    section the image belongs to
 * @param headline         headline of that section
 * @param manualRedactions user decisions; may be null (no recategorization applied)
 * @return the converted (and possibly recategorized) image
 */
private Image convertAndRecategorize(PdfImage pdfImage, int sectionNumber, String headline,
                                     ManualRedactions manualRedactions) {
    // Generic images are labeled "image"; specific types use their lower-cased enum name.
    String initialType;
    if (pdfImage.getImageType().equals(ImageType.OTHER)) {
        initialType = "image";
    } else {
        initialType = pdfImage.getImageType().name().toLowerCase(Locale.ROOT);
    }
    Image converted = Image.builder()
            .type(initialType)
            .position(pdfImage.getPosition())
            .sectionNumber(sectionNumber)
            .section(headline)
            .page(pdfImage.getPage())
            .hasTransparency(pdfImage.isHasTransparency())
            .build();
    if (manualRedactions == null || manualRedactions.getImageRecategorizations() == null) {
        return converted;
    }
    // Manual recategorizations are matched by the id derived from position + page.
    String imageId = IdBuilder.buildId(converted.getPosition(), converted.getPage());
    for (ManualImageRecategorization recategorization : manualRedactions.getImageRecategorizations()) {
        boolean approvedMatch = recategorization.getStatus().equals(Status.APPROVED)
                && recategorization.getId().equals(imageId);
        if (approvedMatch) {
            converted.setType(recategorization.getType());
        }
    }
    return converted;
}
private Set<Entity> getAiEntities(int sectionNumber, String searchableString, String headline) {
Set<Entity> found = new HashSet<>();

View File

@ -1,321 +0,0 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import com.iqser.red.service.file.management.v1.api.model.FileType;
import com.iqser.red.service.redaction.v1.model.*;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.SectionText;
import com.iqser.red.service.redaction.v1.server.classification.model.Text;
import com.iqser.red.service.redaction.v1.server.client.LegalBasisClient;
import com.iqser.red.service.redaction.v1.server.exception.RedactionException;
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
import com.iqser.red.service.redaction.v1.server.redaction.model.*;
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService;
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.kie.api.runtime.KieContainer;
import org.springframework.stereotype.Service;
import org.springframework.web.bind.annotation.RequestBody;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
 * Orchestrates full analysis and incremental reanalysis of a document.
 *
 * <p>{@code analyze} parses the PDF, runs entity redaction, builds and stores the
 * redaction log, section texts and section grid. {@code reanalyze} re-runs entity
 * detection only for sections affected by dictionary increments or manual changes,
 * splicing the new entries into the existing redaction log.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class ReanalyzeService {

    private final DictionaryService dictionaryService;
    private final DroolsExecutionService droolsExecutionService;
    private final SurroundingWordsService surroundingWordsService;
    private final EntityRedactionService entityRedactionService;
    private final RedactionLogCreatorService redactionLogCreatorService;
    private final RedactionStorageService redactionStorageService;
    private final PdfSegmentationService pdfSegmentationService;
    private final RedactionChangeLogService redactionChangeLogService;
    private final AnalyzeResponseService analyzeResponseService;
    private final LegalBasisClient legalBasisClient;
    private final RedactionServiceSettings redactionServiceSettings;

    /**
     * Runs a full structure + entity analysis for the file referenced by the request and
     * stores the resulting redaction log, section texts and section grid.
     *
     * @param analyzeRequest identifies dossier/file/template plus manual redactions,
     *                       excluded pages and file attributes
     * @return response describing the analysis result, duration and whether anything changed
     * @throws RedactionException if fetching or parsing the origin document fails
     */
    public AnalyzeResult analyze(AnalyzeRequest analyzeRequest) {
        long startTime = System.currentTimeMillis();
        var pageCount = 0;
        Document classifiedDoc;
        try {
            var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(analyzeRequest
                    .getDossierId(), analyzeRequest.getFileId(), FileType.ORIGIN));
            classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream);
            pageCount = classifiedDoc.getPages().size();
        } catch (Exception e) {
            // Any failure while loading/parsing the origin is surfaced as a domain exception.
            throw new RedactionException(e);
        }
        log.info("Document structure analysis successful, starting redaction analysis...");
        entityRedactionService.processDocument(classifiedDoc, analyzeRequest.getDossierTemplateId(), analyzeRequest.getManualRedactions(), analyzeRequest
                .getDossierId(), analyzeRequest.getFileAttributes());
        redactionLogCreatorService.createRedactionLog(classifiedDoc, pageCount, analyzeRequest.getDossierTemplateId());
        log.info("Redaction analysis successful...");
        var legalBasis = legalBasisClient.getLegalBasisMapping(analyzeRequest.getDossierTemplateId());
        var redactionLog = new RedactionLog(redactionServiceSettings.getAnalysisVersion(), classifiedDoc.getRedactionLogEntities(), legalBasis, classifiedDoc.getDictionaryVersion()
                .getDossierTemplateVersion(), classifiedDoc.getDictionaryVersion()
                .getDossierVersion(), classifiedDoc.getRulesVersion(), legalBasisClient.getVersion(analyzeRequest.getDossierTemplateId()));
        excludeExcludedPages(redactionLog, analyzeRequest.getExcludedPages());
        log.info("Analyzed with rules {} and dictionary {} for dossierTemplate: {}", classifiedDoc.getRulesVersion(), classifiedDoc
                .getDictionaryVersion(), analyzeRequest.getDossierTemplateId());
        // Diff against the previously stored log before persisting the new state.
        var redactionLogChange = redactionChangeLogService.computeChanges(analyzeRequest.getDossierId(), analyzeRequest.getFileId(), redactionLog);
        redactionLog = redactionLogChange.getRedactionLog();
        redactionStorageService.storeObject(analyzeRequest.getDossierId(), analyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog);
        redactionStorageService.storeObject(analyzeRequest.getDossierId(), analyzeRequest.getFileId(), FileType.TEXT, new Text(pageCount, classifiedDoc
                .getSectionText()));
        redactionStorageService.storeObject(analyzeRequest.getDossierId(), analyzeRequest.getFileId(), FileType.SECTION_GRID, classifiedDoc
                .getSectionGrid());
        long duration = System.currentTimeMillis() - startTime;
        return analyzeResponseService.createAnalyzeResponse(analyzeRequest.getDossierId(), analyzeRequest.getFileId(), duration, pageCount, redactionLog, redactionLogChange.isHasChanges());
    }

    /**
     * Incrementally reanalyzes only the sections affected by dictionary increments or
     * manual redaction changes. Falls back to a full {@link #analyze(AnalyzeRequest)} when
     * no previous log/text exists.
     *
     * <p>NOTE(review): {@code @RequestBody} on this service-layer method looks like a
     * leftover from a controller — it has no effect here; verify and consider removing.
     *
     * @param analyzeRequest request; may pin explicit sections via {@code getSectionsToReanalyse()}
     * @return analysis result, flagged with {@code wasReanalyzed} when the incremental path ran
     */
    @SneakyThrows
    public AnalyzeResult reanalyze(@RequestBody AnalyzeRequest analyzeRequest) {
        long startTime = System.currentTimeMillis();
        var redactionLog = redactionStorageService.getRedactionLog(analyzeRequest.getDossierId(), analyzeRequest.getFileId());
        var text = redactionStorageService.getText(analyzeRequest.getDossierId(), analyzeRequest.getFileId());
        // not yet ready for reanalysis
        if (redactionLog == null || text == null || text.getNumberOfPages() == 0) {
            return analyze(analyzeRequest);
        }
        // Dictionary entries added since the version the stored log was produced with.
        DictionaryIncrement dictionaryIncrement = dictionaryService.getDictionaryIncrements(analyzeRequest.getDossierTemplateId(), new DictionaryVersion(redactionLog
                .getDictionaryVersion(), redactionLog.getDossierDictionaryVersion()), analyzeRequest.getDossierId());
        // Explicitly requested sections take precedence over the computed set.
        Set<Integer> sectionsToReanalyse = !analyzeRequest.getSectionsToReanalyse().isEmpty() ? analyzeRequest.getSectionsToReanalyse() :
                findSectionsToReanalyse(dictionaryIncrement, redactionLog, text, analyzeRequest);
        if (sectionsToReanalyse.isEmpty()) {
            // Nothing to redo — just refresh versions/exclusions and persist.
            return finalizeAnalysis(analyzeRequest, startTime, redactionLog, text, dictionaryIncrement);
        }
        List<SectionText> reanalysisSections = text.getSectionTexts()
                .stream()
                .filter(sectionText -> sectionsToReanalyse.contains(sectionText.getSectionNumber()))
                .collect(Collectors.toList());
        KieContainer kieContainer = droolsExecutionService.updateRules(analyzeRequest.getDossierTemplateId());
        Dictionary dictionary = dictionaryService.getDeepCopyDictionary(analyzeRequest.getDossierTemplateId(), analyzeRequest
                .getDossierId());
        Map<Integer, Set<Image>> imagesPerPage = new HashMap<>();
        // First pass: global dictionary only (no hints).
        Set<Entity> entities = findEntities(reanalysisSections, dictionary, kieContainer, analyzeRequest, false, null, imagesPerPage);
        if (dictionary.hasLocalEntries()) {
            // Second pass with local dictionary entries, seeded with hints from the first pass.
            Map<Integer, Set<Entity>> hintsPerSectionNumber = entityRedactionService.getHintsPerSection(entities, dictionary);
            Set<Entity> foundByLocal = findEntities(reanalysisSections, dictionary, kieContainer, analyzeRequest, true, hintsPerSectionNumber, imagesPerPage);
            EntitySearchUtils.addEntitiesWithHigherRank(entities, foundByLocal, dictionary);
            EntitySearchUtils.removeEntitiesContainedInLarger(entities);
        }
        Map<Integer, List<Entity>> entitiesPerPage = entityRedactionService.convertToEnititesPerPage(entities);
        List<RedactionLogEntry> newRedactionLogEntries = new ArrayList<>();
        for (int page = 1; page <= text.getNumberOfPages(); page++) {
            if (entitiesPerPage.get(page) != null) {
                newRedactionLogEntries.addAll(redactionLogCreatorService.addEntries(entitiesPerPage, page, analyzeRequest
                        .getDossierTemplateId()));
            }
            if (imagesPerPage.get(page) != null) {
                newRedactionLogEntries.addAll(redactionLogCreatorService.addImageEntries(imagesPerPage, page, analyzeRequest
                        .getDossierTemplateId()));
            }
        }
        // Replace all entries of the reanalyzed sections with the freshly computed ones.
        redactionLog.getRedactionLogEntry().removeIf(entry -> sectionsToReanalyse.contains(entry.getSectionNumber()));
        redactionLog.getRedactionLogEntry().addAll(newRedactionLogEntries);
        AnalyzeResult analyzeResult = finalizeAnalysis(analyzeRequest, startTime, redactionLog, text, dictionaryIncrement);
        analyzeResult.setWasReanalyzed(true);
        return analyzeResult;
    }

    /**
     * Determines which sections need reanalysis: sections containing manual or manually
     * modified entries, plus sections whose text contains any value added by the
     * dictionary increment.
     *
     * <p>NOTE(review): {@code imageEntries} is populated but never read — dead
     * computation; candidate for removal after confirming no side effects are expected.
     */
    private Set<Integer> findSectionsToReanalyse(DictionaryIncrement dictionaryIncrement, RedactionLog redactionLog,
                                                 Text text, AnalyzeRequest analyzeRequest) {
        Set<String> relevantManuallyModifiedAnnotationIds = getRelevantManuallyModifiedAnnotationIds(analyzeRequest.getManualRedactions());
        Set<Integer> sectionsToReanalyse = new HashSet<>();
        Map<Integer, Set<Image>> imageEntries = new HashMap<>();
        for (RedactionLogEntry entry : redactionLog.getRedactionLogEntry()) {
            if (entry.isManual() || relevantManuallyModifiedAnnotationIds.contains(entry.getId())) {
                sectionsToReanalyse.add(entry.getSectionNumber());
            }
            if (entry.isImage() || entry.getType().equals("image")) {
                imageEntries.computeIfAbsent(entry.getSectionNumber(), x -> new HashSet<>()).add(convert(entry));
            }
        }
        for (SectionText sectionText : text.getSectionTexts()) {
            // New dictionary values can only change sections whose text mentions them.
            if (EntitySearchUtils.sectionContainsAny(sectionText.getText(), dictionaryIncrement.getValues())) {
                sectionsToReanalyse.add(sectionText.getSectionNumber());
            }
        }
        log.info("Should reanalyze {} sections for request: {}", sectionsToReanalyse.size(), analyzeRequest);
        return sectionsToReanalyse;
    }

    /**
     * Runs entity detection + rule execution over the given stored sections, mirroring
     * the section construction done during full analysis.
     *
     * <p>NOTE(review): the builder hard-codes {@code .isLocal(false)} even when
     * {@code local} is {@code true} — EntityRedactionService passes {@code local} in the
     * equivalent place; verify this is intentional.
     *
     * @param reanalysisSections    stored section texts to re-process
     * @param dictionary            deep-copied dictionary for this run
     * @param kieContainer          rules container to execute
     * @param analyzeRequest        originating request (manual redactions, file attributes)
     * @param local                 whether this is the local-dictionary pass
     * @param hintsPerSectionNumber hints from a previous pass; may be null
     * @param imagesPerPage         out-parameter collecting images per page from rule execution
     * @return all entities produced by rule execution over the sections
     */
    private Set<Entity> findEntities(List<SectionText> reanalysisSections, Dictionary dictionary,
                                     KieContainer kieContainer, AnalyzeRequest analyzeRequest, boolean local,
                                     Map<Integer, Set<Entity>> hintsPerSectionNumber,
                                     Map<Integer, Set<Image>> imagesPerPage) {
        List<SectionSearchableTextPair> sectionSearchableTextPairs = new ArrayList<>();
        for (SectionText reanalysisSection : reanalysisSections) {
            Set<Entity> entities = entityRedactionService.findEntities(reanalysisSection.getSearchableText(), reanalysisSection
                    .getHeadline(), reanalysisSection.getSectionNumber(), dictionary, local);
            // Table rows stored cell offsets; use them for surrounding-word extraction.
            if (reanalysisSection.getCellStarts() != null && !reanalysisSection.getCellStarts().isEmpty()) {
                surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary, reanalysisSection
                        .getCellStarts());
            } else {
                surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary);
            }
            // Re-apply approved manual image recategorizations (non-local pass only).
            if (!local && reanalysisSection.getImages() != null && !reanalysisSection.getImages()
                    .isEmpty() && analyzeRequest.getManualRedactions() != null && analyzeRequest.getManualRedactions()
                    .getImageRecategorizations() != null) {
                for (Image image : reanalysisSection.getImages()) {
                    String imageId = IdBuilder.buildId(image.getPosition(), image.getPage());
                    for (ManualImageRecategorization imageRecategorization : analyzeRequest.getManualRedactions()
                            .getImageRecategorizations()) {
                        if (imageRecategorization.getStatus().equals(Status.APPROVED) && imageRecategorization.getId()
                                .equals(imageId)) {
                            image.setType(imageRecategorization.getType());
                        }
                    }
                }
            }
            sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder()
                    .isLocal(false)
                    .dictionaryTypes(dictionary.getTypes())
                    .entities(hintsPerSectionNumber != null && hintsPerSectionNumber.containsKey(reanalysisSection.getSectionNumber()) ? Stream
                            .concat(entities.stream(), hintsPerSectionNumber.get(reanalysisSection.getSectionNumber())
                                    .stream())
                            .collect(Collectors.toSet()) : entities)
                    .text(reanalysisSection.getSearchableText().getAsStringWithLinebreaks())
                    .searchText(reanalysisSection.getSearchableText().toString())
                    .headline(reanalysisSection.getHeadline())
                    .sectionNumber(reanalysisSection.getSectionNumber())
                    .tabularData(reanalysisSection.getTabularData())
                    .searchableText(reanalysisSection.getSearchableText())
                    .dictionary(dictionary)
                    .images(reanalysisSection.getImages())
                    .fileAttributes(analyzeRequest.getFileAttributes())
                    .build(), reanalysisSection.getSearchableText()));
        }
        Set<Entity> entities = new HashSet<>();
        sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> {
            Section analysedSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair.getSection());
            entities.addAll(analysedSection.getEntities());
            // NOTE(review): called once per section over the growing set — could run once
            // after the loop; verify before changing, as rules may rely on pruned input.
            EntitySearchUtils.removeEntitiesContainedInLarger(entities);
            for (Image image : analysedSection.getImages()) {
                imagesPerPage.computeIfAbsent(image.getPage(), (a) -> new HashSet<>()).add(image);
            }
            entityRedactionService.addLocalValuesToDictionary(analysedSection, dictionary);
        });
        return entities;
    }

    /**
     * Stamps the current dictionary versions onto the log, applies page exclusions,
     * computes the change log, persists the log, and builds the response.
     *
     * <p>NOTE(review): {@code @RequestBody} here is also a non-controller leftover.
     */
    private AnalyzeResult finalizeAnalysis(@RequestBody AnalyzeRequest analyzeRequest, long startTime,
                                           RedactionLog redactionLog, Text text,
                                           DictionaryIncrement dictionaryIncrement) {
        redactionLog.setDictionaryVersion(dictionaryIncrement.getDictionaryVersion().getDossierTemplateVersion());
        redactionLog.setDossierDictionaryVersion(dictionaryIncrement.getDictionaryVersion().getDossierVersion());
        excludeExcludedPages(redactionLog, analyzeRequest.getExcludedPages());
        var redactionLogChange = redactionChangeLogService.computeChanges(analyzeRequest.getDossierId(), analyzeRequest.getFileId(), redactionLog);
        redactionStorageService.storeObject(analyzeRequest.getDossierId(), analyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLogChange.getRedactionLog());
        long duration = System.currentTimeMillis() - startTime;
        return analyzeResponseService.createAnalyzeResponse(analyzeRequest.getDossierId(), analyzeRequest.getFileId(), duration, text
                .getNumberOfPages(), redactionLogChange.getRedactionLog(), redactionLogChange.isHasChanges());
    }

    /**
     * Collects the ids of all annotations the user manually touched (legal-basis changes,
     * image recategorizations, removals, force-redacts). Returns an empty set when no
     * manual redactions exist.
     */
    private Set<String> getRelevantManuallyModifiedAnnotationIds(ManualRedactions manualRedactions) {
        if (manualRedactions == null) {
            return new HashSet<>();
        }
        return Stream.concat(manualRedactions.getManualLegalBasisChanges()
                .stream()
                .map(ManualLegalBasisChange::getId), Stream.concat(manualRedactions.getImageRecategorizations()
                .stream()
                .map(ManualImageRecategorization::getId), Stream.concat(manualRedactions.getIdsToRemove()
                .stream()
                .map(IdRemoval::getId), manualRedactions.getForceRedacts().stream().map(ManualForceRedact::getId))))
                .collect(Collectors.toSet());
    }

    /**
     * Rebuilds an {@code Image} model from a stored redaction log entry.
     * Uses the entry's first position as the image's bounding box.
     */
    public Image convert(RedactionLogEntry entry) {
        Rectangle position = entry.getPositions().get(0);
        return Image.builder()
                .type(entry.getType())
                .position(new RedRectangle2D(position.getTopLeft().getX(), position.getTopLeft()
                        .getY(), position.getWidth(), position.getHeight()))
                .sectionNumber(entry.getSectionNumber())
                .section(entry.getSection())
                .page(position.getPage())
                .hasTransparency(entry.isImageHasTransparency())
                .build();
    }

    /**
     * Flags every log entry that has at least one position on an excluded page.
     * No-op when the excluded-page set is null or empty.
     */
    private void excludeExcludedPages(RedactionLog redactionLog, Set<Integer> excludedPages) {
        if (excludedPages != null && !excludedPages.isEmpty()) {
            redactionLog.getRedactionLogEntry().forEach(entry -> entry.getPositions().forEach(pos -> {
                if (excludedPages.contains(pos.getPage())) {
                    entry.setExcluded(true);
                }
            }));
        }
    }
}

View File

@ -31,6 +31,8 @@ public class RedactionChangeLogService {
public RedactionLogChanges computeChanges(String dossierId, String fileId, RedactionLog currentRedactionLog) {
long start = System.currentTimeMillis();
RedactionLog previousRedactionLog = redactionStorageService.getRedactionLog(dossierId, fileId);
if (previousRedactionLog == null) {
@ -98,6 +100,7 @@ public class RedactionChangeLogService {
currentRedactionLog.setRedactionLogEntry(newRedactionLogEntries);
log.info("Change computation took: {}", System.currentTimeMillis() - start);
return new RedactionLogChanges(currentRedactionLog, !addedIds.isEmpty() || !removedIds.isEmpty());
}

View File

@ -1,26 +1,5 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import com.iqser.red.service.redaction.v1.model.CellRectangle;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
import com.iqser.red.service.redaction.v1.model.SectionRectangle;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.parsing.model.RedTextPosition;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import lombok.RequiredArgsConstructor;
import org.apache.commons.collections4.CollectionUtils;
import org.springframework.stereotype.Service;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
@ -28,6 +7,22 @@ import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.collections4.CollectionUtils;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
import com.iqser.red.service.redaction.v1.server.parsing.model.RedTextPosition;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
import com.iqser.red.service.redaction.v1.server.redaction.model.PageEntities;
import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
import lombok.RequiredArgsConstructor;
@Service
@RequiredArgsConstructor
public class RedactionLogCreatorService {
@ -35,26 +30,27 @@ public class RedactionLogCreatorService {
private final DictionaryService dictionaryService;
public void createRedactionLog(Document classifiedDoc, int numberOfPages, String dossierTemplateId) {
public List<RedactionLogEntry> createRedactionLog(PageEntities pageEntities, int numberOfPages,
String dossierTemplateId) {
List<RedactionLogEntry> entries = new ArrayList<>();
for (int page = 1; page <= numberOfPages; page++) {
addSectionGrid(classifiedDoc, page);
if (classifiedDoc.getEntities().get(page) != null) {
classifiedDoc.getRedactionLogEntities()
.addAll(addEntries(classifiedDoc.getEntities(), page, dossierTemplateId));
if (pageEntities.getEntitiesPerPage().get(page) != null) {
entries.addAll(addEntries(pageEntities.getEntitiesPerPage(), page, dossierTemplateId));
}
if (classifiedDoc.getImages().get(page) != null && !classifiedDoc.getImages().get(page).isEmpty()) {
classifiedDoc.getRedactionLogEntities()
.addAll(addImageEntries(classifiedDoc.getImages(), page, dossierTemplateId));
if (pageEntities.getImagesPerPage().get(page) != null) {
entries.addAll(addImageEntries(pageEntities.getImagesPerPage(), page, dossierTemplateId));
}
}
return entries;
}
public List<RedactionLogEntry> addImageEntries(Map<Integer, Set<Image>> images, int pageNumber, String dossierTemplateId) {
public List<RedactionLogEntry> addImageEntries(Map<Integer, Set<Image>> images, int pageNumber,
String dossierTemplateId) {
List<RedactionLogEntry> redactionLogEntities = new ArrayList<>();
@ -83,7 +79,6 @@ public class RedactionLogCreatorService {
.imageHasTransparency(image.isHasTransparency())
.build();
redactionLogEntities.add(redactionLogEntry);
}
@ -101,7 +96,6 @@ public class RedactionLogCreatorService {
entityLoop:
for (Entity entity : entities.get(page)) {
for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) {
RedactionLogEntry redactionLogEntry = createRedactionLogEntry(entity, dossierTemplateId);
@ -121,12 +115,10 @@ public class RedactionLogCreatorService {
.flatMap(seq -> seq.getTextPositions().stream())
.collect(Collectors.toList()), page);
redactionLogEntry.getPositions().addAll(rectanglesPerLine);
}
// FIXME ids should never be null. Figure out why this happens.
if (redactionLogEntry.getId() != null) {
redactionLogEntities.add(redactionLogEntry);
@ -189,50 +181,6 @@ public class RedactionLogCreatorService {
}
private void addSectionGrid(Document classifiedDoc, int page) {
for (Paragraph paragraph : classifiedDoc.getParagraphs()) {
for (int i = 0; i <= paragraph.getPageBlocks().size() - 1; i++) {
AbstractTextContainer textBlock = paragraph.getPageBlocks().get(i);
if (textBlock.getPage() != page) {
continue;
}
if (textBlock instanceof TextBlock) {
classifiedDoc.getSectionGrid()
.getRectanglesPerPage()
.computeIfAbsent(page, (x) -> new ArrayList<>())
.add(new SectionRectangle(new Point(textBlock.getMinX(), textBlock.getMinY()), textBlock.getWidth(), textBlock
.getHeight(), i + 1, paragraph.getPageBlocks().size()));
} else if (textBlock instanceof Table) {
List<CellRectangle> cellRectangles = new ArrayList<>();
for (List<Cell> row : ((Table) textBlock).getRows()) {
for (Cell cell : row) {
if (cell != null) {
cellRectangles.add(new CellRectangle(new Point((float) cell.getX(), (float) cell.getY()), (float) cell
.getWidth(), (float) cell.getHeight()));
}
}
}
classifiedDoc.getSectionGrid()
.getRectanglesPerPage()
.computeIfAbsent(page, (x) -> new ArrayList<>())
.add(new SectionRectangle(new Point(textBlock.getMinX(), textBlock.getMinY()), textBlock.getWidth(), textBlock
.getHeight(), i + 1, paragraph.getPageBlocks().size(), cellRectangles));
}
}
}
}
private float[] getColor(String type, String dossierTemplateId, boolean isRedaction) {
if (!isRedaction && !isHint(type, dossierTemplateId)) {
@ -253,5 +201,4 @@ public class RedactionLogCreatorService {
return dictionaryService.isRecommendation(type, dossierTemplateId);
}
}

View File

@ -0,0 +1,76 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import java.util.ArrayList;
import java.util.List;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.model.CellRectangle;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.SectionRectangle;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import lombok.RequiredArgsConstructor;
/**
 * Builds the per-page section grid of a classified document: one
 * {@link SectionRectangle} per page block of each paragraph, with table blocks
 * additionally carrying the rectangles of their individual cells.
 */
@Service
@RequiredArgsConstructor
public class SectionGridCreatorService {

    /**
     * Populates {@code classifiedDoc.getSectionGrid()} for every page of the document.
     *
     * @param classifiedDoc document whose paragraphs and tables are laid out into the grid
     * @param numberOfPages total page count; pages are numbered starting at 1
     */
    public void createSectionGrid(Document classifiedDoc, int numberOfPages) {
        for (int page = 1; page <= numberOfPages; page++) {
            addSectionGrid(classifiedDoc, page);
        }
    }

    /**
     * Adds a rectangle for every page block (of any paragraph) lying on the given page.
     * The rectangle records the block's bounding box plus its 1-based index and the total
     * block count of its paragraph.
     */
    private void addSectionGrid(Document classifiedDoc, int page) {
        for (Paragraph paragraph : classifiedDoc.getParagraphs()) {
            List<AbstractTextContainer> pageBlocks = paragraph.getPageBlocks();
            for (int i = 0; i < pageBlocks.size(); i++) {
                AbstractTextContainer textBlock = pageBlocks.get(i);
                if (textBlock.getPage() != page) {
                    continue;
                }
                if (textBlock instanceof TextBlock) {
                    // Plain text block: rectangle without cell information.
                    addRectangle(classifiedDoc, page, new SectionRectangle(new Point(textBlock.getMinX(), textBlock.getMinY()),
                            textBlock.getWidth(), textBlock.getHeight(), i + 1, pageBlocks.size()));
                } else if (textBlock instanceof Table) {
                    // Table block: rectangle carries the bounding boxes of its cells as well.
                    List<CellRectangle> cellRectangles = collectCellRectangles((Table) textBlock);
                    addRectangle(classifiedDoc, page, new SectionRectangle(new Point(textBlock.getMinX(), textBlock.getMinY()),
                            textBlock.getWidth(), textBlock.getHeight(), i + 1, pageBlocks.size(), cellRectangles));
                }
            }
        }
    }

    /** Collects the bounding rectangle of every non-null cell of the given table. */
    private static List<CellRectangle> collectCellRectangles(Table table) {
        List<CellRectangle> cellRectangles = new ArrayList<>();
        for (List<Cell> row : table.getRows()) {
            for (Cell cell : row) {
                if (cell != null) {
                    cellRectangles.add(new CellRectangle(new Point((float) cell.getX(), (float) cell.getY()),
                            (float) cell.getWidth(), (float) cell.getHeight()));
                }
            }
        }
        return cellRectangles;
    }

    /** Registers a rectangle in the per-page grid, creating the page's list on demand. */
    private static void addRectangle(Document classifiedDoc, int page, SectionRectangle rectangle) {
        classifiedDoc.getSectionGrid()
                .getRectanglesPerPage()
                .computeIfAbsent(page, (x) -> new ArrayList<>())
                .add(rectangle);
    }
}

View File

@ -0,0 +1,210 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import org.apache.commons.collections4.CollectionUtils;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.SectionArea;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Footer;
import com.iqser.red.service.redaction.v1.server.classification.model.Header;
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
import com.iqser.red.service.redaction.v1.server.classification.model.SectionText;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.classification.model.UnclassifiedText;
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
import com.iqser.red.service.redaction.v1.server.redaction.model.ImageType;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
/**
 * Converts a classified document into a flat list of {@link SectionText} units
 * (paragraphs, table rows or whole tables, headers, footers, unclassified text),
 * each carrying its page areas and a document-wide section number.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class SectionTextBuilderService {
/**
 * Builds all section texts for the document. Section numbers are assigned
 * sequentially, starting at 1, across tables, paragraphs, headers, footers
 * and unclassified text in that iteration order.
 */
public List<SectionText> buildSectionText(Document classifiedDoc) {
List<SectionText> sectionTexts = new ArrayList<>();
AtomicInteger sectionNumber = new AtomicInteger(1);
for (Paragraph paragraph : classifiedDoc.getParagraphs()) {
List<Table> tables = paragraph.getTables();
for (Table table : tables) {
// Two-column tables are kept as one contiguous text; wider tables are split per row.
if (table.getColCount() == 2) {
sectionTexts.add(processTableAsOneText(table, sectionNumber));
} else {
sectionTexts.addAll(processTablePerRow(table, sectionNumber));
}
sectionNumber.incrementAndGet();
}
sectionTexts.add(processText(paragraph.getSearchableText(), paragraph.getTextBlocks(), paragraph.getHeadline(), sectionNumber, paragraph
.getImages()));
sectionNumber.incrementAndGet();
}
// Headers, footers and unclassified text become their own sections with fixed headlines and no images.
for (Header header : classifiedDoc.getHeaders()) {
sectionTexts.add(processText(header.getSearchableText(), header.getTextBlocks(), "Header", sectionNumber, new ArrayList<>()));
sectionNumber.incrementAndGet();
}
for (Footer footer : classifiedDoc.getFooters()) {
sectionTexts.add(processText(footer.getSearchableText(), footer.getTextBlocks(), "Footer", sectionNumber, new ArrayList<>()));
sectionNumber.incrementAndGet();
}
for (UnclassifiedText unclassifiedText : classifiedDoc.getUnclassifiedTexts()) {
sectionTexts.add(processText(unclassifiedText.getSearchableText(), unclassifiedText.getTextBlocks(), "", sectionNumber, new ArrayList<>()));
sectionNumber.incrementAndGet();
}
return sectionTexts;
}
/**
 * Builds one SectionText per table row. Tracks, per cell, the character offset
 * of the cell's text within the concatenated row text ({@code cellStarts}) and,
 * for non-header cells, a header-name -> cell-value map ({@code tabularData}).
 * Each row consumes its own section number.
 */
private List<SectionText> processTablePerRow(Table table, AtomicInteger sectionNumber) {
List<SectionText> sectionTexts = new ArrayList<>();
for (List<Cell> row : table.getRows()) {
SearchableText searchableRow = new SearchableText();
Map<String, CellValue> tabularData = new HashMap<>();
int start = 0;
List<Integer> cellStarts = new ArrayList<>();
SectionText sectionText = new SectionText();
for (Cell cell : row) {
// Cells without text contribute nothing — no area, no offset entry.
if (CollectionUtils.isEmpty(cell.getTextBlocks())) {
continue;
}
// Page is taken from the first sequence of the cell's first text block.
SectionArea sectionArea = new SectionArea(new Point((float) cell.getX(), (float) cell.getY()), (float) cell
.getWidth(), (float) cell.getHeight(), cell.getTextBlocks()
.get(0)
.getSequences()
.get(0)
.getPage());
sectionText.getSectionAreas().add(sectionArea);
sectionText.getTextBlocks().addAll(cell.getTextBlocks());
int cellStart = start;
if (!cell.isHeaderCell()) {
cell.getHeaderCells().forEach(headerCell -> {
StringBuilder headerBuilder = new StringBuilder();
headerCell.getTextBlocks().forEach(textBlock -> headerBuilder.append(textBlock.getText()));
// Header names are normalized by stripping newlines, spaces and hyphens.
String headerName = headerBuilder.toString()
.replaceAll("\n", "")
.replaceAll(" ", "")
.replaceAll("-", "");
sectionArea.setHeader(headerName);
tabularData.put(headerName, new CellValue(cell.getTextBlocks(), cellStart));
});
}
for (TextBlock textBlock : cell.getTextBlocks()) {
// TODO avoid cell overlap merging.
searchableRow.addAll(textBlock.getSequences());
}
cellStarts.add(cellStart);
// Advance offset by the trimmed cell text plus one separator character
// (assumes SearchableText joins cells with a single char — TODO confirm).
start = start + cell.toString().trim().length() + 1;
}
sectionText.setText(searchableRow.toString());
sectionText.setHeadline(table.getHeadline());
sectionText.setSectionNumber(sectionNumber.intValue());
sectionText.setTable(true);
sectionText.setTabularData(tabularData);
sectionText.setCellStarts(cellStarts);
sectionTexts.add(sectionText);
sectionNumber.incrementAndGet();
}
return sectionTexts;
}
/**
 * Builds a single SectionText covering the whole table: all non-empty cells'
 * text is concatenated and each cell contributes one section area. No tabular
 * data or cell offsets are recorded in this mode.
 */
private SectionText processTableAsOneText(Table table, AtomicInteger sectionNumber) {
SearchableText entireTableText = new SearchableText();
SectionText sectionText = new SectionText();
for (List<Cell> row : table.getRows()) {
for (Cell cell : row) {
if (CollectionUtils.isEmpty(cell.getTextBlocks())) {
continue;
}
SectionArea sectionArea = new SectionArea(new Point((float) cell.getX(), (float) cell.getY()), (float) cell
.getWidth(), (float) cell.getHeight(), cell.getTextBlocks()
.get(0)
.getSequences()
.get(0)
.getPage());
sectionText.getTextBlocks().addAll(cell.getTextBlocks());
sectionText.getSectionAreas().add(sectionArea);
for (TextBlock textBlock : cell.getTextBlocks()) {
entireTableText.addAll(textBlock.getSequences());
}
}
}
sectionText.setText(entireTableText.toString());
sectionText.setHeadline(table.getHeadline());
sectionText.setSectionNumber(sectionNumber.intValue());
sectionText.setTable(true);
return sectionText;
}
/**
 * Builds a non-table SectionText from the given searchable text and text blocks.
 * One section area is created per text block; the images are converted to the
 * section's Image set using the current section number and headline.
 */
private SectionText processText(SearchableText searchableText, List<TextBlock> paragraphTextBlocks, String headline,
AtomicInteger sectionNumber, List<PdfImage> images) {
SectionText sectionText = new SectionText();
for (TextBlock paragraphTextBlock : paragraphTextBlocks) {
SectionArea sectionArea = new SectionArea(new Point(paragraphTextBlock.getMinX(), paragraphTextBlock.getMinY()), paragraphTextBlock
.getWidth(), paragraphTextBlock.getHeight(), paragraphTextBlock.getPage());
sectionText.getSectionAreas().add(sectionArea);
}
sectionText.setText(searchableText.toString());
sectionText.setHeadline(headline);
sectionText.setSectionNumber(sectionNumber.intValue());
sectionText.setTable(false);
sectionText.setImages(images.stream()
.map(image -> convertImage(image, sectionNumber.intValue(), headline))
.collect(Collectors.toSet()));
sectionText.setTextBlocks(paragraphTextBlocks);
return sectionText;
}
/**
 * Maps a PdfImage to the section-level Image model. ImageType.OTHER is rendered
 * as the literal type "image"; all other types use their lower-cased enum name.
 */
private Image convertImage(PdfImage pdfImage, int sectionNumber, String headline) {
return Image.builder()
.type(pdfImage.getImageType().equals(ImageType.OTHER) ? "image" : pdfImage.getImageType()
.name()
.toLowerCase(Locale.ROOT))
.position(pdfImage.getPosition())
.sectionNumber(sectionNumber)
.section(headline)
.page(pdfImage.getPage())
.hasTransparency(pdfImage.isHasTransparency())
.build();
}
}

View File

@ -13,7 +13,7 @@ import com.iqser.red.service.redaction.v1.server.client.LegalBasisClient;
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
import com.iqser.red.service.redaction.v1.server.controller.RedactionController;
import com.iqser.red.service.redaction.v1.server.memory.MemoryStats;
import com.iqser.red.service.redaction.v1.server.redaction.service.ReanalyzeService;
import com.iqser.red.service.redaction.v1.server.redaction.service.AnalyzeService;
import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
@ -90,7 +90,7 @@ public class RedactionIntegrationTest {
private RedactionController redactionController;
@Autowired
private ReanalyzeService reanalyzeService;
private AnalyzeService analyzeService;
@Autowired
private ObjectMapper objectMapper;
@ -524,7 +524,8 @@ public class RedactionIntegrationTest {
AnalyzeRequest request = prepareStorage("files/Minimal Examples/270Rotated.pdf");
MemoryStats.printMemoryStats();
AnalyzeResult result = reanalyzeService.analyze(request);
analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
AnalyzeResult result = analyzeService.analyze(request);
assertThat(result).isNotNull();
}
@ -535,7 +536,8 @@ public class RedactionIntegrationTest {
AnalyzeRequest request = prepareStorage("scanned/VV-377031.pdf");
MemoryStats.printMemoryStats();
AnalyzeResult result = reanalyzeService.analyze(request);
analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
AnalyzeResult result = analyzeService.analyze(request);
assertThat(result).isNotNull();
}
@ -547,7 +549,8 @@ public class RedactionIntegrationTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/merge_images.pdf");
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
AnalyzeResult result = reanalyzeService.analyze(request);
analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
AnalyzeResult result = analyzeService.analyze(request);
Map<String, List<RedactionLogEntry>> duplicates = new HashMap<>();
@ -573,7 +576,7 @@ public class RedactionIntegrationTest {
fileOutputStream.write(annotateResponse.getDocument());
}
long rstart = System.currentTimeMillis();
reanalyzeService.reanalyze(request);
analyzeService.reanalyze(request);
long rend = System.currentTimeMillis();
System.out.println("reanalysis analysis duration: " + (rend - rstart));
@ -602,7 +605,11 @@ public class RedactionIntegrationTest {
AnalyzeRequest request = prepareStorage(new FileInputStream((path)));
System.out.println("Redacting file : " + path.getName());
AnalyzeResult result = reanalyzeService.analyze(request);
analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
long fstart = System.currentTimeMillis();
AnalyzeResult result = analyzeService.analyze(request);
System.out.println("analysis analysis duration: " + (System.currentTimeMillis() - fstart));
Map<String, List<RedactionLogEntry>> duplicates = new HashMap<>();
@ -620,7 +627,7 @@ public class RedactionIntegrationTest {
when(dictionaryClient.getVersion(TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(1L);
long rstart = System.currentTimeMillis();
reanalyzeService.reanalyze(request);
analyzeService.reanalyze(request);
long rend = System.currentTimeMillis();
System.out.println("reanalysis analysis duration: " + (rend - rstart));
@ -667,7 +674,8 @@ public class RedactionIntegrationTest {
.value("true")
.build()));
AnalyzeResult result = reanalyzeService.analyze(request);
analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
AnalyzeResult result = analyzeService.analyze(request);
var redactionLog = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID);
var text = redactionStorageService.getText(TEST_DOSSIER_ID, TEST_FILE_ID);
@ -741,7 +749,7 @@ public class RedactionIntegrationTest {
request.setManualRedactions(manualRedactions);
AnalyzeResult reanalyzeResult = reanalyzeService.reanalyze(request);
AnalyzeResult reanalyzeResult = analyzeService.reanalyze(request);
redactionLog = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID);
@ -766,7 +774,7 @@ public class RedactionIntegrationTest {
when(dictionaryClient.getDictionaryForType(VERTEBRATE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER))
.thenReturn(getDictionaryResponse(VERTEBRATE, false));
reanalyzeService.reanalyze(request);
analyzeService.reanalyze(request);
redactionLog = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID);
@ -781,7 +789,8 @@ public class RedactionIntegrationTest {
long start = System.currentTimeMillis();
AnalyzeRequest request = prepareStorage("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");
AnalyzeResult result = reanalyzeService.analyze(request);
analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
AnalyzeResult result = analyzeService.analyze(request);
AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder()
.dossierId(TEST_DOSSIER_ID)
@ -842,7 +851,8 @@ public class RedactionIntegrationTest {
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
request.setManualRedactions(manualRedactions);
AnalyzeResult result = reanalyzeService.analyze(request);
analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
AnalyzeResult result = analyzeService.analyze(request);
manualRedactions.getEntriesToAdd().add(manualRedactionEntry);
manualRedactions.setIdsToRemove(Set.of(IdRemoval.builder()
@ -855,7 +865,7 @@ public class RedactionIntegrationTest {
.status(Status.APPROVED)
.build()));
reanalyzeService.reanalyze(request);
analyzeService.reanalyze(request);
var redactionLog = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID);
@ -969,7 +979,8 @@ public class RedactionIntegrationTest {
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
AnalyzeResult result = reanalyzeService.analyze(request);
analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
AnalyzeResult result = analyzeService.analyze(request);
var redactionLog = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID);
@ -1017,7 +1028,8 @@ public class RedactionIntegrationTest {
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
AnalyzeResult result = reanalyzeService.analyze(request);
analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
AnalyzeResult result = analyzeService.analyze(request);
AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder()
.dossierId(TEST_DOSSIER_ID)

View File

@ -1,511 +0,0 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import com.amazonaws.services.s3.AmazonS3;
import com.iqser.red.service.configuration.v1.api.model.*;
import com.iqser.red.service.configuration.v1.api.resource.DictionaryResource;
import com.iqser.red.service.redaction.v1.server.Application;
import com.iqser.red.service.redaction.v1.server.FileSystemBackedStorageService;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
import com.iqser.red.service.redaction.v1.server.client.LegalBasisClient;
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader;
import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService;
import com.iqser.red.storage.commons.service.StorageService;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.kie.api.KieServices;
import org.kie.api.builder.KieBuilder;
import org.kie.api.builder.KieFileSystem;
import org.kie.api.builder.KieModule;
import org.kie.api.runtime.KieContainer;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.Import;
import org.springframework.context.annotation.Primary;
import org.springframework.core.io.ClassPathResource;
import org.springframework.test.context.junit4.SpringRunner;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.concurrent.atomic.AtomicLong;
import static org.assertj.core.api.Assertions.assertThat;
import static org.mockito.Mockito.when;
@RunWith(SpringRunner.class)
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
@Import(EntityRedactionServiceTest.RedactionIntegrationTestConfiguration.class)
public class EntityRedactionServiceTest {
private static final String DEFAULT_RULES = loadFromClassPath("drools/rules.drl");
private static final String AUTHOR_CODE = "author";
private static final String ADDRESS_CODE = "address";
private static final String SPONSOR_CODE = "sponsor";
private static final AtomicLong DICTIONARY_VERSION = new AtomicLong();
private static final AtomicLong RULES_VERSION = new AtomicLong();
@MockBean
private DictionaryClient dictionaryClient;
@MockBean
private RulesClient rulesClient;
@Autowired
private EntityRedactionService entityRedactionService;
@Autowired
private PdfSegmentationService pdfSegmentationService;
@Autowired
private DroolsExecutionService droolsExecutionService;
@MockBean
private AmazonS3 amazonS3;
@MockBean
private LegalBasisClient legalBasisClient;
private final static String TEST_DOSSIER_TEMPLATE_ID = "123";
/**
 * Test configuration: builds a KIE container from the default Drools rules and
 * replaces the storage service with a file-system backed test double.
 * RabbitMQ auto-configuration is excluded so no broker is needed.
 */
@Configuration
@EnableAutoConfiguration(exclude = {RabbitAutoConfiguration.class})
public static class RedactionIntegrationTestConfiguration {
/** Compiles DEFAULT_RULES into an in-memory KIE module and returns its container. */
@Bean
public KieContainer kieContainer() {
KieServices kieServices = KieServices.Factory.get();
KieFileSystem kieFileSystem = kieServices.newKieFileSystem();
InputStream input = new ByteArrayInputStream(DEFAULT_RULES.getBytes(StandardCharsets.UTF_8));
kieFileSystem.write("src/test/resources/drools/rules.drl", kieServices.getResources()
.newInputStreamResource(input));
KieBuilder kieBuilder = kieServices.newKieBuilder(kieFileSystem);
kieBuilder.buildAll();
KieModule kieModule = kieBuilder.getKieModule();
return kieServices.newKieContainer(kieModule.getReleaseId());
}
/** Primary storage bean for tests: file-system backed instead of S3. */
@Bean
@Primary
public StorageService inmemoryStorage() {
return new FileSystemBackedStorageService();
}
}
@Test
public void testNestedEntitiesRemoval() {
	// An entity whose span is fully contained in a larger entity must be dropped,
	// leaving only the enclosing one.
	Entity inner = new Entity("nested", "fake type", 10, 16, "fake headline", 0, false, false);
	Entity outer = new Entity("nesting nested", "fake type", 2, 16, "fake headline", 0, false, false);
	Set<Entity> entities = new HashSet<>(Arrays.asList(inner, outer));
	EntitySearchUtils.removeEntitiesContainedInLarger(entities);
	assertThat(entities).hasSize(1);
	assertThat(entities).contains(outer);
}
/**
 * Verifies redaction of a single-page PDF table: author and address dictionary
 * hits inside table cells must be detected as entities.
 */
@Test
public void testTableRedaction() throws IOException {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Single Table.pdf");
// Stub the dictionary client: four authors, one address, no sponsors.
DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
.entries(toDictionaryEntry(Arrays.asList("Casey, H.W.", "OLoughlin, C.K.", "Salamon, C.M.", "Smith, S.H.")))
.build();
when(dictionaryClient.getVersion(TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(dictionaryResponse);
DictionaryResponse addressResponse = DictionaryResponse.builder()
.entries(toDictionaryEntry(Collections.singletonList("Toxigenics, Inc., Decatur, IL 62526, USA")))
.build();
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressResponse);
DictionaryResponse sponsorResponse = DictionaryResponse.builder()
.entries(Collections.emptyList())
.build();
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsorResponse);
// Parse the PDF and run entity redaction over it.
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
entityRedactionService.processDocument(classifiedDoc, TEST_DOSSIER_TEMPLATE_ID, null, "dossierId", null);
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1)).hasSize(7);// 3 author cells, 1 address, 1 Y and 2 N entities
}
/**
 * Same setup as testTableRedaction but on a PDF with nested matches: entities
 * contained in larger entities must not be double-counted.
 */
@Test
public void testNestedRedaction() throws IOException {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/nested_redaction.pdf");
// Stub the dictionary client: four authors, one address, no sponsors.
DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
.entries(toDictionaryEntry(Arrays.asList("Casey, H.W.", "OLoughlin, C.K.", "Salamon, C.M.", "Smith, S.H.")))
.build();
when(dictionaryClient.getVersion(TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(dictionaryResponse);
DictionaryResponse addressResponse = DictionaryResponse.builder()
.entries(toDictionaryEntry(Collections.singletonList("Toxigenics, Inc., Decatur, IL 62526, USA")))
.build();
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressResponse);
DictionaryResponse sponsorResponse = DictionaryResponse.builder()
.entries(Collections.emptyList())
.build();
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsorResponse);
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
entityRedactionService.processDocument(classifiedDoc, TEST_DOSSIER_TEMPLATE_ID, null, "dossierId", null);
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1)).hasSize(7);// 3 author cells, 1 address, 1 Y and 2 N entities
}
/**
 * Regression test for false positives: neither sample document may produce any
 * entity matched by rule 9 when using the full author/address dictionaries.
 */
@Test
public void testTrueNegativesInTable() throws IOException {
ClassPathResource pdfFileResource = new ClassPathResource("files/Cyprodinil/40 Cyprodinil - EU AIR3 - LCA Section 1" +
" Supplement - Identity of the active substance - Reference list.pdf");
when(dictionaryClient.getVersion(TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
// Load full author/address dictionaries from test resources; sponsors stay empty.
DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_author.txt"))))
.build();
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(dictionaryResponse);
DictionaryResponse addressResponse = DictionaryResponse.builder()
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_address.txt"))))
.build();
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressResponse);
DictionaryResponse sponsorResponse = DictionaryResponse.builder()
.entries(Collections.emptyList())
.build();
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsorResponse);
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
entityRedactionService.processDocument(classifiedDoc, TEST_DOSSIER_TEMPLATE_ID, null, "dossierId", null);
// No entity on any page may have been produced by rule 9.
assertThat(classifiedDoc.getEntities()
.entrySet()
.stream()
.noneMatch(entry -> entry.getValue().stream().anyMatch(e -> e.getMatchedRule() == 9))).isTrue();
// Repeat the same check on a second document with the same mocked dictionaries.
pdfFileResource = new ClassPathResource("files/Compounds/27 A8637C - EU AIR3 - MCP Section 1 - Identity of " +
"the plant protection product.pdf");
classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
entityRedactionService.processDocument(classifiedDoc, TEST_DOSSIER_TEMPLATE_ID, null, "dossierId", null);
assertThat(classifiedDoc.getEntities()
.entrySet()
.stream()
.noneMatch(entry -> entry.getValue().stream().anyMatch(e -> e.getMatchedRule() == 9))).isTrue();
}
/**
 * Verifies rule-9 matches in a table row with ambiguous content: with full
 * author/address/sponsor dictionaries exactly 10 rule-9 entities are expected
 * on the single page.
 */
@Test
public void testFalsePositiveInWrongCell() throws IOException {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Row With Ambiguous Redaction.pdf");
when(dictionaryClient.getVersion(TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
// Load full author, address and sponsor dictionaries from test resources.
DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_author.txt"))))
.build();
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(dictionaryResponse);
DictionaryResponse addressResponse = DictionaryResponse.builder()
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_address.txt"))))
.build();
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressResponse);
DictionaryResponse sponsorResponse = DictionaryResponse.builder()
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_sponsor.txt"))))
.build();
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsorResponse);
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
entityRedactionService.processDocument(classifiedDoc, TEST_DOSSIER_TEMPLATE_ID, null, "dossierId", null);
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1).stream()
.filter(entity -> entity.getMatchedRule() == 9)
.count()).isEqualTo(10);
}
/**
 * Installs a custom Drools rule 6 (redact applicant contact information) and
 * verifies it fires 13 times on the applicant/producer table sample.
 */
@Test
public void testApplicantInTableRedaction() throws IOException {
// Inline DRL: rule 6 redacts contact lines when a section mentions an applicant.
String tableRules = "package drools\n" +
"\n" +
"import com.iqser.red.service.redaction.v1.server.redaction.model.Section\n" +
"\n" +
"global Section section\n" +
"rule \"6: Redact contact information if applicant is found\"\n" +
" when\n" +
" eval(section.headlineContainsWord(\"applicant\") || section.getText().contains(\"Applicant\"));\n" +
" then\n" +
" section.redactLineAfter(\"Name:\", \"address\", 6,true, \"Applicant information was found\", \"Reg" +
" (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactBetween(\"Address:\", \"Contact\", \"address\", 6,true, \"Applicant information was found\", \"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactLineAfter(\"Contact point:\", \"address\", 6,true, \"Applicant information was found\", \"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactLineAfter(\"Phone:\", \"address\", 6,true, \"Applicant information was found\", " +
"\"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactLineAfter(\"Fax:\", \"address\", 6,true, \"Applicant information was found\", \"Reg " +
"(EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactLineAfter(\"Tel.:\", \"address\", 6,true, \"Applicant information was found\", \"Reg" +
" (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactLineAfter(\"Tel:\", \"address\", 6,true, \"Applicant information was found\", \"Reg " +
"(EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactLineAfter(\"E-mail:\", \"address\", 6,true, \"Applicant information was found\", " +
"\"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactLineAfter(\"Email:\", \"address\", 6,true, \"Applicant information was found\", " +
"\"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactLineAfter(\"Contact:\", \"address\", 6,true, \"Applicant information was found\", " +
"\"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactLineAfter(\"Telephone number:\", \"address\", 6,true, \"Applicant information was found\", \"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactLineAfter(\"Fax number:\", \"address\", 6,true, \"Applicant information was found\"," +
" \"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactLineAfter(\"Telephone:\", \"address\", 6,true, \"Applicant information was found\", " +
"\"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactBetween(\"No:\", \"Fax\", \"address\", 6,true, \"Applicant information was found\", " +
"\"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactBetween(\"Contact:\", \"Tel.:\", \"address\", 6,true, \"Applicant information was found\", \"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" end";
// Publish the rules through the mocked rules client and reload them.
when(rulesClient.getVersion(TEST_DOSSIER_TEMPLATE_ID)).thenReturn(RULES_VERSION.incrementAndGet());
when(rulesClient.getRules(TEST_DOSSIER_TEMPLATE_ID)).thenReturn(new RulesResponse(tableRules));
droolsExecutionService.updateRules(TEST_DOSSIER_TEMPLATE_ID);
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Applicant Producer Table.pdf");
when(dictionaryClient.getVersion(TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
// Full author/address dictionaries; sponsor dictionary empty.
DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_author.txt"))))
.build();
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(dictionaryResponse);
DictionaryResponse addressResponse = DictionaryResponse.builder()
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_address.txt"))))
.build();
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressResponse);
DictionaryResponse sponsorResponse = DictionaryResponse.builder()
.entries(Collections.emptyList())
.build();
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsorResponse);
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
entityRedactionService.processDocument(classifiedDoc, TEST_DOSSIER_TEMPLATE_ID, null, "dossierId", null);
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
// Rule 6 must have produced exactly 13 redaction entities.
assertThat(classifiedDoc.getEntities().get(1).stream()
.filter(entity -> entity.getMatchedRule() == 6)
.count()).isEqualTo(13);
}
@Test
public void testSponsorInCell() throws IOException {
    // Rule 11 redacts the sponsor company name that directly follows the
    // phrase "batches produced at" inside a table cell.
    String tableRules = "package drools\n" +
            "\n" +
            "import com.iqser.red.service.redaction.v1.server.redaction.model.Section\n" +
            "\n" +
            "global Section section\n" + "rule \"11: Redact sponsor company\"\n" + " when\n" + " " +
            "Section(searchText.toLowerCase().contains(\"batches produced at\"))\n" + " then\n" + " section" +
            ".redactIfPrecededBy(\"batches produced at\", \"sponsor\", 11, \"Redacted because it represents a " +
            "sponsor company\", \"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" + " end";
    when(rulesClient.getVersion(TEST_DOSSIER_TEMPLATE_ID)).thenReturn(RULES_VERSION.incrementAndGet());
    when(rulesClient.getRules(TEST_DOSSIER_TEMPLATE_ID)).thenReturn(new RulesResponse(tableRules));
    droolsExecutionService.updateRules(TEST_DOSSIER_TEMPLATE_ID);

    // Only the sponsor dictionary carries entries; author and address are empty.
    when(dictionaryClient.getVersion(TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
    DictionaryResponse emptyDictionary = DictionaryResponse.builder()
            .entries(Collections.emptyList())
            .build();
    when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(emptyDictionary);
    when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(emptyDictionary);
    DictionaryResponse sponsorDictionary = DictionaryResponse.builder()
            .entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_sponsor.txt"))))
            .build();
    when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsorDictionary);

    ClassPathResource pdfResource = new ClassPathResource("files/Minimal Examples/batches_new_line.pdf");
    Document document = pdfSegmentationService.parseDocument(pdfResource.getInputStream());
    entityRedactionService.processDocument(document, TEST_DOSSIER_TEMPLATE_ID, null, "dossierId", null);

    assertThat(document.getEntities()).hasSize(1); // one page
    long rule11Matches = document.getEntities().get(1).stream()
            .filter(entity -> entity.getMatchedRule() == 11)
            .count();
    assertThat(rule11Matches).isEqualTo(1);
}
@Test
public void headerPropagation() throws IOException {
    // First document: entities matched in a repeating header must be
    // propagated to every page on which that header appears.
    when(dictionaryClient.getVersion(TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
    DictionaryResponse authorDictionary = DictionaryResponse.builder()
            .entries(toDictionaryEntry(Arrays.asList("Bissig R.", "Thanei P.")))
            .build();
    when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(authorDictionary);
    DictionaryResponse addressDictionary = DictionaryResponse.builder()
            .entries(toDictionaryEntry(Collections.singletonList("Novartis Crop Protection AG, Basel, Switzerland")))
            .build();
    when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressDictionary);
    DictionaryResponse emptySponsors = DictionaryResponse.builder()
            .entries(Collections.emptyList())
            .build();
    when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(emptySponsors);

    Document firstDocument = pdfSegmentationService.parseDocument(
            new ClassPathResource("files/Minimal Examples/Header Propagation.pdf").getInputStream());
    entityRedactionService.processDocument(firstDocument, TEST_DOSSIER_TEMPLATE_ID, null, "dossierId", null);
    assertThat(firstDocument.getEntities()).hasSize(2); // two pages
    assertThat(firstDocument.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(8);
    assertThat(firstDocument.getEntities().get(2).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(5); // 2 names, 1 address, 2 Y

    // Second document: new dictionary version with a different author list.
    when(dictionaryClient.getVersion(TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
    DictionaryResponse secondAuthors = DictionaryResponse.builder()
            .entries(toDictionaryEntry(Arrays.asList("Tribolet, R.", "Muir, G.", "Kühne-Thu, H.", "Close, C.")))
            .build();
    when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(secondAuthors);
    DictionaryResponse secondAddresses = DictionaryResponse.builder()
            .entries(toDictionaryEntry(Collections.singletonList("Novartis Crop Protection AG, Basel, Switzerland")))
            .build();
    when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(secondAddresses);

    Document secondDocument = pdfSegmentationService.parseDocument(
            new ClassPathResource("files/Minimal Examples/Header Propagation2.pdf").getInputStream());
    entityRedactionService.processDocument(secondDocument, TEST_DOSSIER_TEMPLATE_ID, null, "dossierId", null);
    assertThat(secondDocument.getEntities()).hasSize(1); // one page
    assertThat(secondDocument.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(3);
    assertThat(secondDocument.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 8).count()).isEqualTo(9);
}
@Test
@Ignore
public void testNGuideline() throws IOException {
    // Expects six rule-8 ("not redacted, non-vertebrate") matches on a table
    // that contains empty tabular data. Currently ignored.
    when(dictionaryClient.getVersion(TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
    DictionaryResponse authorDictionary = DictionaryResponse.builder()
            .entries(toDictionaryEntry(Collections.singletonList("Aldershof S.")))
            .build();
    when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(authorDictionary);
    DictionaryResponse addressDictionary = DictionaryResponse.builder()
            .entries(toDictionaryEntry(Collections.singletonList("Novartis Crop Protection AG, Basel, Switzerland")))
            .build();
    when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressDictionary);
    DictionaryResponse emptySponsors = DictionaryResponse.builder()
            .entries(Collections.emptyList())
            .build();
    when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(emptySponsors);

    ClassPathResource pdfResource = new ClassPathResource("files/Minimal Examples/Empty Tabular Data.pdf");
    Document document = pdfSegmentationService.parseDocument(pdfResource.getInputStream());
    entityRedactionService.processDocument(document, TEST_DOSSIER_TEMPLATE_ID, null, "dossierId", null);

    assertThat(document.getEntities()).hasSize(1); // one page
    long rule8Matches = document.getEntities().get(1).stream()
            .filter(entity -> entity.getMatchedRule() == 8)
            .count();
    assertThat(rule8Matches).isEqualTo(6);
}
/**
 * Registers the default Mockito stubbing used by every test in this class:
 * the baseline vertebrate-study rule set (rules 8 and 9), the known entity
 * types with their highlight colors, empty dictionaries for each type (so a
 * test that forgets to stub one does not hit an NPE), and the color palette.
 * Individual tests override the rules/dictionary stubs as needed.
 */
@Before
public void stubRedaction() {
    // Rule 8: non-vertebrate rows are explicitly NOT redacted (hint only);
    // rule 9: vertebrate rows get author cell and address redacted.
    // Fix: the rule-9 address message read "vertebrate sgitudy" (typo).
    String tableRules = "package drools\n" +
            "\n" +
            "import com.iqser.red.service.redaction.v1.server.redaction.model.Section\n" +
            "\n" +
            "global Section section\n" +
            "rule \"8: Not redacted because Vertebrate Study = N\"\n" +
            " when\n" +
            " Section(rowEquals(\"Vertebrate study Y/N\", \"N\") || rowEquals(\"Vertebrate study Y/N\", \"No\"))\n" +
            " then\n" +
            " section.redactNotCell(\"Author(s)\", 8, \"name\", false, \"Not redacted because row is not a vertebrate study\");\n" +
            " section.redactNot(\"address\", 8, \"Not redacted because row is not a vertebrate study\");\n" +
            " section.highlightCell(\"Vertebrate study Y/N\", 8, \"hint_only\");\n" +
            " end\n" +
            "rule \"9: Redact Authors and Addresses in Reference Table, if it is a Vertebrate study\"\n" +
            " when\n" +
            " Section(rowEquals(\"Vertebrate study Y/N\", \"Y\") || rowEquals(\"Vertebrate study Y/N\", " +
            "\"Yes\"))\n" +
            " then\n" +
            " section.redactCell(\"Author(s)\", 9, \"name\", false, \"Redacted because row is a vertebrate study\", \"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" +
            " section.redact(\"address\", 9, \"Redacted because row is a vertebrate study\", \"Reg (EC) No" +
            " 1107/2009 Art. 63 (2g)\");\n" +
            " section.highlightCell(\"Vertebrate study Y/N\", 9, \"must_redact\");\n" +
            " end";
    when(rulesClient.getVersion(TEST_DOSSIER_TEMPLATE_ID)).thenReturn(RULES_VERSION.incrementAndGet());
    when(rulesClient.getRules(TEST_DOSSIER_TEMPLATE_ID)).thenReturn(new RulesResponse(tableRules));

    // All entity types known to the dictionary service, each with a color.
    TypeResponse typeResponse = TypeResponse.builder()
            .types(Arrays.asList(
                    TypeResult.builder().dossierTemplateId(TEST_DOSSIER_TEMPLATE_ID).type(AUTHOR_CODE).hexColor("#ffff00").build(),
                    TypeResult.builder().dossierTemplateId(TEST_DOSSIER_TEMPLATE_ID).type(ADDRESS_CODE).hexColor("#ff00ff").build(),
                    TypeResult.builder().dossierTemplateId(TEST_DOSSIER_TEMPLATE_ID).type(SPONSOR_CODE).hexColor("#00ffff").build()))
            .build();
    when(dictionaryClient.getVersion(TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
    when(dictionaryClient.getAllTypes(TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(typeResponse);

    // Default empty return to prevent NPEs
    DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
            .build();
    when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(dictionaryResponse);
    DictionaryResponse addressResponse = DictionaryResponse.builder()
            .build();
    when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressResponse);
    DictionaryResponse sponsorResponse = DictionaryResponse.builder()
            .build();
    when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsorResponse);

    // Highlight color palette used when rendering redactions.
    Colors colors = new Colors();
    colors.setDefaultColor("#acfc00");
    colors.setNotRedacted("#cccccc");
    colors.setRequestAdd("#04b093");
    colors.setRequestRemove("#04b093");
    when(dictionaryClient.getColors(TEST_DOSSIER_TEMPLATE_ID)).thenReturn(colors);
}
/**
 * Reads a UTF-8 text resource from the classpath and returns its content
 * with a trailing newline after every line.
 *
 * @param path classpath-relative location of the resource
 * @return the full resource content
 * @throws IllegalArgumentException if the resource does not exist or cannot be read
 */
private static String loadFromClassPath(String path) {
    URL resource = ResourceLoader.class.getClassLoader().getResource(path);
    if (resource == null) {
        // Bug fix: this message was hard-coded to "drools/rules.drl" and thus
        // reported the wrong resource whenever any other path was missing.
        throw new IllegalArgumentException("could not load classpath resource: " + path);
    }
    try (BufferedReader br = new BufferedReader(new InputStreamReader(resource.openStream(), StandardCharsets.UTF_8))) {
        StringBuilder sb = new StringBuilder();
        String line;
        while ((line = br.readLine()) != null) {
            sb.append(line).append("\n");
        }
        return sb.toString();
    } catch (IOException e) {
        throw new IllegalArgumentException("could not load classpath resource: " + path, e);
    }
}
/**
 * Wraps each raw dictionary term in a {@code DictionaryEntry} with id 1 and
 * the "deleted" flag cleared.
 */
private List<DictionaryEntry> toDictionaryEntry(List<String> entries) {
    List<DictionaryEntry> result = new ArrayList<>(entries.size());
    for (String value : entries) {
        result.add(new DictionaryEntry(value, 1L, false));
    }
    return result;
}
}

View File

@ -0,0 +1,29 @@
package com.iqser.red.service.redaction.v1.server.redaction.utils;
import static org.assertj.core.api.Assertions.assertThat;
import java.util.HashSet;
import java.util.Set;
import org.junit.Test;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
public class EntitySearchUtilsTest {

	/**
	 * An entity whose span lies completely inside a larger entity's span must
	 * be dropped, leaving only the enclosing entity in the set.
	 */
	@Test
	public void testNestedEntitiesRemoval() {
		Entity inner = new Entity("nested", "fake type", 10, 16, "fake headline", 0, false, false);
		Entity outer = new Entity("nesting nested", "fake type", 2, 16, "fake headline", 0, false, false);

		Set<Entity> entities = new HashSet<>();
		entities.add(inner);
		entities.add(outer);

		EntitySearchUtils.removeEntitiesContainedInLarger(entities);

		// Exactly one entity survives, and it is the enclosing one.
		assertThat(entities).containsExactly(outer);
	}
}