From 0832ea15c09335cfb0d9f0139764ab6c99a22423 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dominique=20Eifl=C3=A4nder?= Date: Mon, 26 Jul 2021 14:54:13 +0200 Subject: [PATCH] RED-1806: Use localDictionary also in Reanalysis to fix a problem when removing values from false_positive that affect rules that should find values in the entire document --- .../service/EntityRedactionService.java | 115 ++++++++------ .../redaction/service/ReanalyzeService.java | 145 ++++++++++-------- .../v1/server/RedactionIntegrationTest.java | 102 +++++++----- .../resources/dictionaries/CBI_author.txt | 1 - .../resources/dictionaries/false_positive.txt | 3 +- 5 files changed, 211 insertions(+), 155 deletions(-) diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java index ea1079a4..c1dc3f11 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java @@ -43,35 +43,13 @@ public class EntityRedactionService { if (dictionary.hasLocalEntries()) { - Map> hintsPerSectionNumber = new HashMap<>(); - documentEntities.stream().forEach(entity -> { - if (dictionary.isHint(entity.getType()) && entity.isDictionaryEntry()) { - hintsPerSectionNumber.computeIfAbsent(entity.getSectionNumber(), (x) -> new HashSet<>()) - .add(entity); - } - }); - + Map> hintsPerSectionNumber = getHintsPerSection(documentEntities, dictionary); Set foundByLocal = findEntities(classifiedDoc, container, manualRedactions, dictionary, true, hintsPerSectionNumber, fileAttributes); 
EntitySearchUtils.addEntitiesWithHigherRank(documentEntities, foundByLocal, dictionary); EntitySearchUtils.removeEntitiesContainedInLarger(documentEntities); } - for (Entity entity : documentEntities) { - Map> sequenceOnPage = new HashMap<>(); - for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) { - sequenceOnPage.computeIfAbsent(entityPositionSequence.getPageNumber(), (x) -> new ArrayList<>()) - .add(entityPositionSequence); - } - - for (Map.Entry> entry : sequenceOnPage.entrySet()) { - classifiedDoc.getEntities() - .computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>()) - .add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry - .getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber(), entity - .getLegalBasis(), entity.isDictionaryEntry(), entity.getTextBefore(), entity.getTextAfter(), entity - .getStart(), entity.getEnd(), entity.isDossierDictionaryEntry())); - } - } + classifiedDoc.setEntities(convertToEnititesPerPage(documentEntities)); dictionaryService.updateExternalDictionary(dictionary, dossierTemplateId); @@ -80,6 +58,39 @@ public class EntityRedactionService { } + public Map> convertToEnititesPerPage(Set entities){ + Map> entitiesPerPage = new HashMap<>(); + for (Entity entity : entities) { + Map> sequenceOnPage = new HashMap<>(); + for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) { + sequenceOnPage.computeIfAbsent(entityPositionSequence.getPageNumber(), (x) -> new ArrayList<>()) + .add(entityPositionSequence); + } + + for (Map.Entry> entry : sequenceOnPage.entrySet()) { + entitiesPerPage + .computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>()) + .add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry + .getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber(), entity + .getLegalBasis(), entity.isDictionaryEntry(), 
entity.getTextBefore(), entity.getTextAfter(), entity + .getStart(), entity.getEnd(), entity.isDossierDictionaryEntry())); + } + } + return entitiesPerPage; + } + + + public Map> getHintsPerSection(Set entities, Dictionary dictionary){ + Map> hintsPerSectionNumber = new HashMap<>(); + entities.stream().forEach(entity -> { + if (dictionary.isHint(entity.getType()) && entity.isDictionaryEntry()) { + hintsPerSectionNumber.computeIfAbsent(entity.getSectionNumber(), (x) -> new HashSet<>()) + .add(entity); + } + }); + return hintsPerSectionNumber; + } + private Set findEntities(Document classifiedDoc, KieContainer kieContainer, ManualRedactions manualRedactions, Dictionary dictionary, boolean local, Map> hintsPerSectionNumber, @@ -123,42 +134,46 @@ public class EntityRedactionService { } sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> { - Section analysedRowSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair.getSection()); - documentEntities.addAll(analysedRowSection.getEntities()); + Section analysedSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair.getSection()); + documentEntities.addAll(analysedSection.getEntities()); - for (Image image : analysedRowSection.getImages()) { + for (Image image : analysedSection.getImages()) { classifiedDoc.getImages().computeIfAbsent(image.getPage(), (a) -> new HashSet<>()).add(image); } - analysedRowSection.getLocalDictionaryAdds().keySet().forEach(key -> { - if (dictionary.isRecommendation(key)) { - analysedRowSection.getLocalDictionaryAdds().get(key).forEach(value -> { - if (!dictionary.containsValue(key, value)) { - dictionary.getLocalAccessMap().get(key).getLocalEntries().add(value); - } - }); - } else { - analysedRowSection.getLocalDictionaryAdds().get(key).forEach(value -> { - - if (dictionary.getLocalAccessMap().get(key) == null) { - log.warn("Dictionary {} is null", key); - } - - if (dictionary.getLocalAccessMap().get(key).getLocalEntries() == 
null) { - log.warn("Dictionary {} localEntries is null", key); - } - - dictionary.getLocalAccessMap().get(key).getLocalEntries().add(value); - }); - } - }); - + addLocalValuesToDictionary(analysedSection, dictionary); }); return documentEntities; } + public void addLocalValuesToDictionary(Section analysedSection, Dictionary dictionary){ + analysedSection.getLocalDictionaryAdds().keySet().forEach(key -> { + if (dictionary.isRecommendation(key)) { + analysedSection.getLocalDictionaryAdds().get(key).forEach(value -> { + if (!dictionary.containsValue(key, value)) { + dictionary.getLocalAccessMap().get(key).getLocalEntries().add(value); + } + }); + } else { + analysedSection.getLocalDictionaryAdds().get(key).forEach(value -> { + + if (dictionary.getLocalAccessMap().get(key) == null) { + log.warn("Dictionary {} is null", key); + } + + if (dictionary.getLocalAccessMap().get(key).getLocalEntries() == null) { + log.warn("Dictionary {} localEntries is null", key); + } + + dictionary.getLocalAccessMap().get(key).getLocalEntries().add(value); + }); + } + }); + } + + private List processTablePerRow(Document classifiedDoc, Table table, AtomicInteger sectionNumber, Dictionary dictionary, boolean local, diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java index 17dfdf4b..810619a1 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java @@ -13,9 +13,11 @@ import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUti import 
com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder; import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService; import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService; + import lombok.RequiredArgsConstructor; import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; + import org.kie.api.runtime.KieContainer; import org.springframework.stereotype.Service; import org.springframework.web.bind.annotation.RequestBody; @@ -60,8 +62,7 @@ public class ReanalyzeService { entityRedactionService.processDocument(classifiedDoc, analyzeRequest.getDossierTemplateId(), analyzeRequest.getManualRedactions(), analyzeRequest .getDossierId(), analyzeRequest.getFileAttributes()); - redactionLogCreatorService.createRedactionLog(classifiedDoc, pageCount, analyzeRequest - .getDossierTemplateId()); + redactionLogCreatorService.createRedactionLog(classifiedDoc, pageCount, analyzeRequest.getDossierTemplateId()); log.info("Redaction analysis successful..."); @@ -105,6 +106,59 @@ public class ReanalyzeService { DictionaryIncrement dictionaryIncrement = dictionaryService.getDictionaryIncrements(analyzeRequest.getDossierTemplateId(), new DictionaryVersion(redactionLog .getDictionaryVersion(), redactionLog.getDossierDictionaryVersion()), analyzeRequest.getDossierId()); + Set sectionsToReanalyse = findSectionsToReanalyse(dictionaryIncrement, redactionLog, text, analyzeRequest); + + if (sectionsToReanalyse.isEmpty()) { + return finalizeAnalysis(analyzeRequest, startTime, redactionLog, text, dictionaryIncrement); + } + + List reanalysisSections = text.getSectionTexts() + .stream() + .filter(sectionText -> sectionsToReanalyse.contains(sectionText.getSectionNumber())) + .collect(Collectors.toList()); + + KieContainer kieContainer = droolsExecutionService.updateRules(analyzeRequest.getDossierTemplateId()); + + Dictionary dictionary = dictionaryService.getDeepCopyDictionary(analyzeRequest.getDossierTemplateId(), analyzeRequest + 
.getDossierId()); + + Map> imagesPerPage = new HashMap<>(); + Set entities = findEntities(reanalysisSections, dictionary, kieContainer, analyzeRequest, false, null, imagesPerPage); + + if (dictionary.hasLocalEntries()) { + Map> hintsPerSectionNumber = entityRedactionService.getHintsPerSection(entities, dictionary); + Set foundByLocal = findEntities(reanalysisSections, dictionary, kieContainer, analyzeRequest, true, hintsPerSectionNumber, imagesPerPage); + EntitySearchUtils.addEntitiesWithHigherRank(entities, foundByLocal, dictionary); + EntitySearchUtils.removeEntitiesContainedInLarger(entities); + } + + Map> entitiesPerPage = entityRedactionService.convertToEnititesPerPage(entities); + + List newRedactionLogEntries = new ArrayList<>(); + for (int page = 1; page <= text.getNumberOfPages(); page++) { + if (entitiesPerPage.get(page) != null) { + newRedactionLogEntries.addAll(redactionLogCreatorService.addEntries(entitiesPerPage, page, analyzeRequest + .getDossierTemplateId())); + } + + if (imagesPerPage.get(page) != null) { + newRedactionLogEntries.addAll(redactionLogCreatorService.addImageEntries(imagesPerPage, page, analyzeRequest + .getDossierTemplateId())); + } + + } + + redactionLog.getRedactionLogEntry().removeIf(entry -> sectionsToReanalyse.contains(entry.getSectionNumber())); + redactionLog.getRedactionLogEntry().addAll(newRedactionLogEntries); + AnalyzeResult analyzeResult = finalizeAnalysis(analyzeRequest, startTime, redactionLog, text, dictionaryIncrement); + analyzeResult.setWasReanalyzed(true); + return analyzeResult; + } + + + private Set findSectionsToReanalyse(DictionaryIncrement dictionaryIncrement, RedactionLog redactionLog, + Text text, AnalyzeRequest analyzeRequest) { + Set relevantManuallyModifiedAnnotationIds = getRelevantManuallyModifiedAnnotationIds(analyzeRequest.getManualRedactions()); Set sectionsToReanalyse = new HashSet<>(); @@ -128,31 +182,20 @@ public class ReanalyzeService { log.info("Should reanalyze {} sections for request: {}", 
sectionsToReanalyse.size(), analyzeRequest); - if (sectionsToReanalyse.isEmpty()) { - return finalizeAnalysis(analyzeRequest, startTime, redactionLog, text, dictionaryIncrement); - } + return sectionsToReanalyse; + } - List reanalysisSections = new ArrayList<>(); - for (SectionText sectionText : text.getSectionTexts()) { - - if (sectionsToReanalyse.contains(sectionText.getSectionNumber())) { - reanalysisSections.add(sectionText); - } - } - - //-- - - KieContainer kieContainer = droolsExecutionService.updateRules(analyzeRequest.getDossierTemplateId()); - - Dictionary dictionary = dictionaryService.getDeepCopyDictionary(analyzeRequest.getDossierTemplateId(), analyzeRequest - .getDossierId()); + private Set findEntities(List reanalysisSections, Dictionary dictionary, + KieContainer kieContainer, AnalyzeRequest analyzeRequest, boolean local, + Map> hintsPerSectionNumber, + Map> imagesPerPage) { List sectionSearchableTextPairs = new ArrayList<>(); for (SectionText reanalysisSection : reanalysisSections) { Set entities = entityRedactionService.findEntities(reanalysisSection.getSearchableText(), reanalysisSection - .getHeadline(), reanalysisSection.getSectionNumber(), dictionary, false); + .getHeadline(), reanalysisSection.getSectionNumber(), dictionary, local); if (reanalysisSection.getCellStarts() != null && !reanalysisSection.getCellStarts().isEmpty()) { surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary, reanalysisSection .getCellStarts()); @@ -160,14 +203,15 @@ public class ReanalyzeService { surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary); } - if (reanalysisSection.getImages() != null && !reanalysisSection.getImages() + if (!local && reanalysisSection.getImages() != null && !reanalysisSection.getImages() .isEmpty() && analyzeRequest.getManualRedactions() != null && analyzeRequest.getManualRedactions() .getImageRecategorizations() != null) { for (Image image : 
reanalysisSection.getImages()) { String imageId = IdBuilder.buildId(image.getPosition(), image.getPage()); for (ManualImageRecategorization imageRecategorization : analyzeRequest.getManualRedactions() .getImageRecategorizations()) { - if (imageRecategorization.getStatus().equals(Status.APPROVED) && imageRecategorization.getId().equals(imageId)) { + if (imageRecategorization.getStatus().equals(Status.APPROVED) && imageRecategorization.getId() + .equals(imageId)) { image.setType(imageRecategorization.getType()); } } @@ -177,7 +221,10 @@ public class ReanalyzeService { sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder() .isLocal(false) .dictionaryTypes(dictionary.getTypes()) - .entities(entities) + .entities(hintsPerSectionNumber != null && hintsPerSectionNumber.containsKey(reanalysisSection.getSectionNumber()) ? Stream + .concat(entities.stream(), hintsPerSectionNumber.get(reanalysisSection.getSectionNumber()) + .stream()) + .collect(Collectors.toSet()) : entities) .text(reanalysisSection.getSearchableText().getAsStringWithLinebreaks()) .searchText(reanalysisSection.getSearchableText().toString()) .headline(reanalysisSection.getHeadline()) @@ -191,54 +238,19 @@ public class ReanalyzeService { } Set entities = new HashSet<>(); - Map> imagesPerPage = new HashMap<>(); sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> { - Section analysedRowSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair.getSection()); - entities.addAll(analysedRowSection.getEntities()); + Section analysedSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair.getSection()); + entities.addAll(analysedSection.getEntities()); EntitySearchUtils.removeEntitiesContainedInLarger(entities); - for (Image image : analysedRowSection.getImages()) { + for (Image image : analysedSection.getImages()) { imagesPerPage.computeIfAbsent(image.getPage(), (a) -> new HashSet<>()).add(image); } + 
entityRedactionService.addLocalValuesToDictionary(analysedSection, dictionary); }); - Map> entitiesPerPage = new HashMap<>(); - for (Entity entity : entities) { - Map> sequenceOnPage = new HashMap<>(); - for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) { - sequenceOnPage.computeIfAbsent(entityPositionSequence.getPageNumber(), (x) -> new ArrayList<>()) - .add(entityPositionSequence); - } - - for (Map.Entry> entry : sequenceOnPage.entrySet()) { - entitiesPerPage.computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>()) - .add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry - .getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber(), entity - .getLegalBasis(), entity.isDictionaryEntry(), entity.getTextBefore(), entity.getTextAfter(), entity - .getStart(), entity.getEnd(), entity.isDossierDictionaryEntry())); - } - } - - List newRedactionLogEntries = new ArrayList<>(); - for (int page = 1; page <= text.getNumberOfPages(); page++) { - if (entitiesPerPage.get(page) != null) { - newRedactionLogEntries.addAll(redactionLogCreatorService.addEntries(entitiesPerPage, page, analyzeRequest - .getDossierTemplateId())); - } - - if (imagesPerPage.get(page) != null) { - newRedactionLogEntries.addAll(redactionLogCreatorService.addImageEntries(imagesPerPage, page, analyzeRequest - .getDossierTemplateId())); - } - - } - - redactionLog.getRedactionLogEntry().removeIf(entry -> sectionsToReanalyse.contains(entry.getSectionNumber())); - redactionLog.getRedactionLogEntry().addAll(newRedactionLogEntries); - AnalyzeResult analyzeResult = finalizeAnalysis(analyzeRequest, startTime, redactionLog, text, dictionaryIncrement); - analyzeResult.setWasReanalyzed(true); - return analyzeResult; + return entities; } @@ -296,10 +308,9 @@ public class ReanalyzeService { private void excludeExcludedPages(RedactionLog redactionLog, Set excludedPages) { - 
redactionLog.getRedactionLogEntry().forEach(entry -> - entry.getPositions().forEach(pos -> - entry.setExcluded(excludedPages != null && excludedPages.contains(pos.getPage())) - )); + redactionLog.getRedactionLogEntry() + .forEach(entry -> entry.getPositions() + .forEach(pos -> entry.setExcluded(excludedPages != null && excludedPages.contains(pos.getPage())))); } } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java index 8904ea6c..6949ff40 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java @@ -18,7 +18,9 @@ import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader; import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities; import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService; import com.iqser.red.storage.commons.service.StorageService; + import lombok.SneakyThrows; + import org.apache.commons.io.IOUtils; import org.junit.After; import org.junit.Before; @@ -84,7 +86,6 @@ public class RedactionIntegrationTest { private static final String PII = "PII"; - @Autowired private RedactionController redactionController; @@ -127,6 +128,7 @@ public class RedactionIntegrationTest { private final Map rankTypeMap = new HashMap<>(); private final Colors colors = new Colors(); private final Map reanlysisVersions = new HashMap<>(); + private final Set deleted = new HashSet<>(); private final static String TEST_DOSSIER_TEMPLATE_ID = "123"; private final static String TEST_DOSSIER_ID = "123"; @@ -152,18 +154,20 @@ public class 
RedactionIntegrationTest { return kieServices.newKieContainer(kieModule.getReleaseId()); } + @Bean @Primary public StorageService inmemoryStorage() { + return new FileSystemBackedStorageService(); } - } @After public void cleanupStorage() { + if (this.storageService instanceof FileSystemBackedStorageService) { ((FileSystemBackedStorageService) this.storageService).clearStorage(); } @@ -179,7 +183,8 @@ public class RedactionIntegrationTest { loadDictionaryForTest(); loadTypeForTest(); when(dictionaryClient.getVersion(TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(0L); - when(dictionaryClient.getAllTypes(TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(TypeResponse.builder() + when(dictionaryClient.getAllTypes(TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(TypeResponse + .builder() .types(getTypeResponse()) .build()); @@ -188,7 +193,7 @@ public class RedactionIntegrationTest { .types(List.of(TypeResult.builder() .type(DOSSIER_REDACTIONS) .dossierTemplateId(TEST_DOSSIER_TEMPLATE_ID) - .hexColor( "#ffe187") + .hexColor("#ffe187") .isHint(hintTypeMap.get(DOSSIER_REDACTIONS)) .isCaseInsensitive(caseInSensitiveMap.get(DOSSIER_REDACTIONS)) .isRecommendation(recommendationTypeMap.get(DOSSIER_REDACTIONS)) @@ -196,26 +201,42 @@ public class RedactionIntegrationTest { .build())) .build()); - when(dictionaryClient.getDictionaryForType(VERTEBRATE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(VERTEBRATE, false)); - when(dictionaryClient.getDictionaryForType(ADDRESS, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(ADDRESS, false)); - when(dictionaryClient.getDictionaryForType(AUTHOR, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(AUTHOR, false)); - when(dictionaryClient.getDictionaryForType(SPONSOR, TEST_DOSSIER_TEMPLATE_ID, 
DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(SPONSOR, false)); - when(dictionaryClient.getDictionaryForType(NO_REDACTION_INDICATOR, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(NO_REDACTION_INDICATOR, false)); - when(dictionaryClient.getDictionaryForType(REDACTION_INDICATOR, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(REDACTION_INDICATOR, false)); - when(dictionaryClient.getDictionaryForType(HINT_ONLY, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(HINT_ONLY, false)); - when(dictionaryClient.getDictionaryForType(MUST_REDACT, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(MUST_REDACT, false)); - when(dictionaryClient.getDictionaryForType(PUBLISHED_INFORMATION, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(PUBLISHED_INFORMATION, false)); - when(dictionaryClient.getDictionaryForType(TEST_METHOD, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(TEST_METHOD, false)); + when(dictionaryClient.getDictionaryForType(VERTEBRATE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)) + .thenReturn(getDictionaryResponse(VERTEBRATE, false)); + when(dictionaryClient.getDictionaryForType(ADDRESS, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)) + .thenReturn(getDictionaryResponse(ADDRESS, false)); + when(dictionaryClient.getDictionaryForType(AUTHOR, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)) + .thenReturn(getDictionaryResponse(AUTHOR, false)); + when(dictionaryClient.getDictionaryForType(SPONSOR, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)) + .thenReturn(getDictionaryResponse(SPONSOR, false)); + when(dictionaryClient.getDictionaryForType(NO_REDACTION_INDICATOR, TEST_DOSSIER_TEMPLATE_ID, 
DictionaryResource.GLOBAL_DOSSIER)) + .thenReturn(getDictionaryResponse(NO_REDACTION_INDICATOR, false)); + when(dictionaryClient.getDictionaryForType(REDACTION_INDICATOR, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)) + .thenReturn(getDictionaryResponse(REDACTION_INDICATOR, false)); + when(dictionaryClient.getDictionaryForType(HINT_ONLY, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)) + .thenReturn(getDictionaryResponse(HINT_ONLY, false)); + when(dictionaryClient.getDictionaryForType(MUST_REDACT, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)) + .thenReturn(getDictionaryResponse(MUST_REDACT, false)); + when(dictionaryClient.getDictionaryForType(PUBLISHED_INFORMATION, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)) + .thenReturn(getDictionaryResponse(PUBLISHED_INFORMATION, false)); + when(dictionaryClient.getDictionaryForType(TEST_METHOD, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)) + .thenReturn(getDictionaryResponse(TEST_METHOD, false)); when(dictionaryClient.getDictionaryForType(PII, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(PII, false)); - when(dictionaryClient.getDictionaryForType(RECOMMENDATION_AUTHOR, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(RECOMMENDATION_AUTHOR, false)); - when(dictionaryClient.getDictionaryForType(RECOMMENDATION_ADDRESS, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(RECOMMENDATION_ADDRESS, false)); - when(dictionaryClient.getDictionaryForType(FALSE_POSITIVE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(FALSE_POSITIVE, false)); - when(dictionaryClient.getDictionaryForType(PURITY, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(PURITY, false)); + when(dictionaryClient.getDictionaryForType(RECOMMENDATION_AUTHOR, 
TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)) + .thenReturn(getDictionaryResponse(RECOMMENDATION_AUTHOR, false)); + when(dictionaryClient.getDictionaryForType(RECOMMENDATION_ADDRESS, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)) + .thenReturn(getDictionaryResponse(RECOMMENDATION_ADDRESS, false)); + when(dictionaryClient.getDictionaryForType(FALSE_POSITIVE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)) + .thenReturn(getDictionaryResponse(FALSE_POSITIVE, false)); + when(dictionaryClient.getDictionaryForType(PURITY, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)) + .thenReturn(getDictionaryResponse(PURITY, false)); when(dictionaryClient.getDictionaryForType(IMAGE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(IMAGE, false)); when(dictionaryClient.getDictionaryForType(OCR, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(OCR, false)); when(dictionaryClient.getDictionaryForType(LOGO, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(LOGO, false)); - when(dictionaryClient.getDictionaryForType(SIGNATURE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(SIGNATURE, false)); - when(dictionaryClient.getDictionaryForType(FORMULA, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(FORMULA, false)); + when(dictionaryClient.getDictionaryForType(SIGNATURE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)) + .thenReturn(getDictionaryResponse(SIGNATURE, false)); + when(dictionaryClient.getDictionaryForType(FORMULA, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)) + .thenReturn(getDictionaryResponse(FORMULA, false)); when(dictionaryClient.getDictionaryForType(DOSSIER_REDACTIONS, TEST_DOSSIER_TEMPLATE_ID, 
TEST_DOSSIER_ID)).thenReturn(getDictionaryResponse(DOSSIER_REDACTIONS, true)); when(dictionaryClient.getColors(TEST_DOSSIER_TEMPLATE_ID)).thenReturn(colors); } @@ -477,7 +498,8 @@ public class RedactionIntegrationTest { return DictionaryResponse.builder() .hexColor(typeColorMap.get(type)) - .entries(isDossierDictionary ? toDictionaryEntry(dossierDictionary.get(type)) : toDictionaryEntry(dictionary.get(type))) + .entries(isDossierDictionary ? toDictionaryEntry(dossierDictionary.get(type)) : toDictionaryEntry(dictionary + .get(type))) .isHint(hintTypeMap.get(type)) .isCaseInsensitive(caseInSensitiveMap.get(type)) .isRecommendation(recommendationTypeMap.get(type)) @@ -490,7 +512,8 @@ public class RedactionIntegrationTest { List dictionaryEntries = new ArrayList<>(); entries.forEach(entry -> { - dictionaryEntries.add(new DictionaryEntry(entry, reanlysisVersions.containsKey(entry) ? reanlysisVersions.get(entry) : 0L, false)); + dictionaryEntries.add(new DictionaryEntry(entry, reanlysisVersions.containsKey(entry) ? reanlysisVersions.get(entry) : 0L, deleted + .contains(entry) ? 
true : false)); }); return dictionaryEntries; } @@ -498,6 +521,7 @@ public class RedactionIntegrationTest { @Test public void test270Rotated() { + AnalyzeRequest request = prepareStorage("files/Minimal Examples/270Rotated.pdf"); MemoryStats.printMemoryStats(); AnalyzeResult result = reanalyzeService.analyze(request); @@ -508,12 +532,14 @@ public class RedactionIntegrationTest { @Test @Ignore public void testLargeScannedFileOOM() { + AnalyzeRequest request = prepareStorage("scanned/VV-377031.pdf"); MemoryStats.printMemoryStats(); AnalyzeResult result = reanalyzeService.analyze(request); assertThat(result).isNotNull(); } + @Test public void testMergedImages() throws IOException { @@ -552,14 +578,13 @@ public class RedactionIntegrationTest { long rend = System.currentTimeMillis(); System.out.println("reanalysis analysis duration: " + (rend - rstart)); - long end = System.currentTimeMillis(); System.out.println("duration: " + (end - start)); - } + @Test @Ignore public void noExceptionShouldBeThrownForAnyFiles() throws IOException { @@ -635,7 +660,12 @@ public class RedactionIntegrationTest { AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream()); request.setExcludedPages(Set.of(1)); - request.setFileAttributes(List.of(FileAttribute.builder().id("fileAttributeId").label("Vertebrate Study").placeholder("{fileattributes.vertebrateStudy}").value("true").build())); + request.setFileAttributes(List.of(FileAttribute.builder() + .id("fileAttributeId") + .label("Vertebrate Study") + .placeholder("{fileattributes.vertebrateStudy}") + .value("true") + .build())); AnalyzeResult result = reanalyzeService.analyze(request); @@ -683,12 +713,18 @@ public class RedactionIntegrationTest { dictionary.get(AUTHOR).add("physical"); reanlysisVersions.put("physical", 2L); -// dictionary.get(VERTEBRATE).add("s-metolachlor"); -// reanlysisVersions.put("s-metolachlor", 3L); + deleted.add("David Chubb"); + + dictionary.get(FALSE_POSITIVE).add("David Chubb"); + 
reanlysisVersions.put("David Chubb", 3L); when(dictionaryClient.getVersion(TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(3L); - when(dictionaryClient.getDictionaryForType(VERTEBRATE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(getDictionaryResponse(VERTEBRATE, false)); + when(dictionaryClient.getDictionaryForType(VERTEBRATE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)) + .thenReturn(getDictionaryResponse(VERTEBRATE, false)); + + when(dictionaryClient.getDictionaryForType(FALSE_POSITIVE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)) + .thenReturn(getDictionaryResponse(FALSE_POSITIVE, false)); start = System.currentTimeMillis(); @@ -702,10 +738,8 @@ public class RedactionIntegrationTest { request.setManualRedactions(manualRedactions); - AnalyzeResult reanalyzeResult = reanalyzeService.reanalyze(request); - redactionLog = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID); end = System.currentTimeMillis(); @@ -775,7 +809,6 @@ public class RedactionIntegrationTest { .status(Status.APPROVED) .build())); - manualRedactions.getComments().put("e5be0f1d941bbb92a068e198648d06c4", List.of(comment)); manualRedactions.getComments().put("0836727c3508a0b2ea271da69c04cc2f", List.of(comment)); manualRedactions.getComments().put(manualAddId, List.of(comment)); @@ -790,12 +823,10 @@ public class RedactionIntegrationTest { // manualRedactions.getEntriesToAdd().add(manualRedactionEntry); - AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream()); request.setManualRedactions(manualRedactions); AnalyzeResult result = reanalyzeService.analyze(request); - manualRedactions.getEntriesToAdd().add(manualRedactionEntry); manualRedactions.setIdsToRemove(Set.of(IdRemoval.builder() .id("5b940b2cb401ed9f5be6fc24f6e77bcf") @@ -816,7 +847,6 @@ public class RedactionIntegrationTest { .fileId(TEST_FILE_ID) .build()); - try (FileOutputStream fileOutputStream = new 
FileOutputStream("/tmp/Annotated.pdf")) { fileOutputStream.write(annotateResponse.getDocument()); } @@ -833,7 +863,6 @@ public class RedactionIntegrationTest { System.out.println("classificationTest"); ClassPathResource pdfFileResource = new ClassPathResource("files/new/Single Study - Oral (Gavage) Mouse.pdf"); - AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream()); RedactionRequest redactionRequest = RedactionRequest.builder() @@ -934,8 +963,10 @@ public class RedactionIntegrationTest { }); } + @SneakyThrows private AnalyzeRequest prepareStorage(String file) { + ClassPathResource pdfFileResource = new ClassPathResource(file); return prepareStorage(pdfFileResource.getInputStream()); @@ -967,7 +998,6 @@ public class RedactionIntegrationTest { long start = System.currentTimeMillis(); ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/sponsor_companies.pdf"); - AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream()); AnalyzeResult result = reanalyzeService.analyze(request); diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/CBI_author.txt b/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/CBI_author.txt index 766ebd06..3d53e6d2 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/CBI_author.txt +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/CBI_author.txt @@ -1676,7 +1676,6 @@ da Silva Rejane Das R Das, R. 
Daughtry, CST -David Chubb David Chubb|Lorraine Britton David Clarke Davies diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/false_positive.txt b/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/false_positive.txt index 0bb75f29..a696cee0 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/false_positive.txt +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/false_positive.txt @@ -235,4 +235,5 @@ N/A No details reported Not available Test facility -TBD \ No newline at end of file +TBD +David Chubb \ No newline at end of file