Pull request #178: RED-1445: Enabled to use file attributes in rules

Merge in RED/redaction-service from RED-1445 to master

* commit '6ae6d467fc5fec4b385cde1c70bae7a21143342f':
  RED-1445: Enabled to use file attributes in rules
This commit is contained in:
Dominique Eiflaender 2021-06-29 10:29:17 +02:00
commit 4c2eda02cc
8 changed files with 78 additions and 25 deletions

View File

@ -6,6 +6,10 @@ import lombok.Data;
import lombok.NoArgsConstructor;
import java.time.OffsetDateTime;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
@Data
@ -22,5 +26,8 @@ public class AnalyzeRequest {
private OffsetDateTime lastProcessed;
private Set<Integer> excludedPages;
@Builder.Default
private List<FileAttribute> fileAttributes = new ArrayList<>();
}

View File

@ -0,0 +1,19 @@
package com.iqser.red.service.redaction.v1.model;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * A key/value metadata attribute attached to a file, so that redaction rules
 * can match on it (by id, label, or placeholder — see the rule-side helpers
 * such as fileAttributeByPlaceholderEquals).
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class FileAttribute {
// Technical identifier of the attribute.
private String id;
// Human-readable name shown in the UI, e.g. "Vertebrate Study".
private String label;
// Placeholder token used to reference this attribute from rule text,
// e.g. "{fileattributes.vertebrateStudy}".
private String placeholder;
// The attribute's current value, stored as a string (e.g. "true").
private String value;
}

View File

@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import com.iqser.red.service.redaction.v1.model.FileAttribute;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
import com.iqser.red.service.redaction.v1.server.redaction.utils.Patterns;
@ -8,9 +9,11 @@ import lombok.Data;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
@ -52,6 +55,22 @@ public class Section {
@Builder.Default
private Set<Image> images = new HashSet<>();
@Builder.Default
private List<FileAttribute> fileAttributes = new ArrayList<>();
/**
 * Checks whether this section's file attributes contain an attribute whose
 * id and value both equal the given arguments.
 *
 * @param id    attribute id to match (must not be null)
 * @param value expected attribute value (must not be null)
 * @return true if any attribute matches both id and value; false when none
 *         match or the attribute list is null
 */
public boolean fileAttributeByIdEquals(String id, String value){
    // anyMatch short-circuits and states the intent directly
    // (replaces filter(...).findFirst().isPresent()).
    return fileAttributes != null && fileAttributes.stream()
            .anyMatch(attribute -> id.equals(attribute.getId()) && value.equals(attribute.getValue()));
}
/**
 * Checks whether this section's file attributes contain an attribute whose
 * placeholder and value both equal the given arguments. Used from Drools
 * rule conditions, e.g. fileAttributeByPlaceholderEquals("{fileattributes.vertebrateStudy}", "true").
 *
 * @param placeholder placeholder token to match (must not be null)
 * @param value       expected attribute value (must not be null)
 * @return true if any attribute matches both placeholder and value; false
 *         when none match or the attribute list is null
 */
public boolean fileAttributeByPlaceholderEquals(String placeholder, String value){
    // anyMatch short-circuits and states the intent directly
    // (replaces filter(...).findFirst().isPresent()).
    return fileAttributes != null && fileAttributes.stream()
            .anyMatch(attribute -> placeholder.equals(attribute.getPlaceholder()) && value.equals(attribute.getValue()));
}
/**
 * Checks whether this section's file attributes contain an attribute whose
 * label and value both equal the given arguments.
 *
 * @param label attribute label to match (must not be null)
 * @param value expected attribute value (must not be null)
 * @return true if any attribute matches both label and value; false when
 *         none match or the attribute list is null
 */
public boolean fileAttributeByLabelEquals(String label, String value){
    // anyMatch short-circuits and states the intent directly
    // (replaces filter(...).findFirst().isPresent()).
    return fileAttributes != null && fileAttributes.stream()
            .anyMatch(attribute -> label.equals(attribute.getLabel()) && value.equals(attribute.getValue()));
}
public boolean rowEquals(String headerName, String value) {

View File

@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import com.iqser.red.service.redaction.v1.model.FileAttribute;
import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry;
import com.iqser.red.service.redaction.v1.model.ManualRedactions;
import com.iqser.red.service.redaction.v1.model.Point;
@ -33,7 +34,7 @@ public class EntityRedactionService {
private final SurroundingWordsService surroundingWordsService;
public void processDocument(Document classifiedDoc, String dossierTemplateId, ManualRedactions manualRedactions, String dossierId) {
public void processDocument(Document classifiedDoc, String dossierTemplateId, ManualRedactions manualRedactions, String dossierId, List<FileAttribute> fileAttributes) {
dictionaryService.updateDictionary(dossierTemplateId, dossierId);
KieContainer container = droolsExecutionService.updateRules(dossierTemplateId);
@ -41,7 +42,7 @@ public class EntityRedactionService {
Dictionary dictionary = dictionaryService.getDeepCopyDictionary(dossierTemplateId, dossierId);
Set<Entity> documentEntities = new HashSet<>(findEntities(classifiedDoc, container, manualRedactions, dictionary, false, null));
Set<Entity> documentEntities = new HashSet<>(findEntities(classifiedDoc, container, manualRedactions, dictionary, false, null, fileAttributes));
if (dictionary.hasLocalEntries()) {
@ -53,7 +54,7 @@ public class EntityRedactionService {
}
});
Set<Entity> foundByLocal = findEntities(classifiedDoc, container, manualRedactions, dictionary, true, hintsPerSectionNumber);
Set<Entity> foundByLocal = findEntities(classifiedDoc, container, manualRedactions, dictionary, true, hintsPerSectionNumber, fileAttributes);
EntitySearchUtils.addEntitiesWithHigherRank(documentEntities, foundByLocal, dictionary);
EntitySearchUtils.removeEntitiesContainedInLarger(documentEntities);
}
@ -84,7 +85,7 @@ public class EntityRedactionService {
private Set<Entity> findEntities(Document classifiedDoc, KieContainer kieContainer,
ManualRedactions manualRedactions, Dictionary dictionary, boolean local,
Map<Integer, Set<Entity>> hintsPerSectionNumber) {
Map<Integer, Set<Entity>> hintsPerSectionNumber, List<FileAttribute> fileAttributes) {
Set<Entity> documentEntities = new HashSet<>();
@ -95,31 +96,31 @@ public class EntityRedactionService {
List<Table> tables = paragraph.getTables();
for (Table table : tables) {
if (table.getColCount() == 2) {
sectionSearchableTextPairs.addAll(processTableAsOneText(classifiedDoc, table, manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber));
sectionSearchableTextPairs.addAll(processTableAsOneText(classifiedDoc, table, manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber, fileAttributes));
} else {
sectionSearchableTextPairs.addAll(processTablePerRow(classifiedDoc, table, manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber));
sectionSearchableTextPairs.addAll(processTablePerRow(classifiedDoc, table, manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber, fileAttributes));
}
sectionNumber.incrementAndGet();
}
sectionSearchableTextPairs.add(processText(classifiedDoc, paragraph.getSearchableText(), paragraph.getTextBlocks(), paragraph
.getHeadline(), manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber, paragraph
.getImages()));
.getImages(), fileAttributes));
sectionNumber.incrementAndGet();
}
for (Header header : classifiedDoc.getHeaders()) {
sectionSearchableTextPairs.add(processText(classifiedDoc, header.getSearchableText(), header.getTextBlocks(), "Header", manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber, new ArrayList<>()));
sectionSearchableTextPairs.add(processText(classifiedDoc, header.getSearchableText(), header.getTextBlocks(), "Header", manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber, new ArrayList<>(), fileAttributes));
sectionNumber.incrementAndGet();
}
for (Footer footer : classifiedDoc.getFooters()) {
sectionSearchableTextPairs.add(processText(classifiedDoc, footer.getSearchableText(), footer.getTextBlocks(), "Footer", manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber, new ArrayList<>()));
sectionSearchableTextPairs.add(processText(classifiedDoc, footer.getSearchableText(), footer.getTextBlocks(), "Footer", manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber, new ArrayList<>(), fileAttributes));
sectionNumber.incrementAndGet();
}
for (UnclassifiedText unclassifiedText : classifiedDoc.getUnclassifiedTexts()) {
sectionSearchableTextPairs.add(processText(classifiedDoc, unclassifiedText.getSearchableText(), unclassifiedText
.getTextBlocks(), "", manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber, new ArrayList<>()));
.getTextBlocks(), "", manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber, new ArrayList<>(), fileAttributes));
sectionNumber.incrementAndGet();
}
@ -164,7 +165,7 @@ public class EntityRedactionService {
ManualRedactions manualRedactions,
AtomicInteger sectionNumber, Dictionary dictionary,
boolean local,
Map<Integer, Set<Entity>> hintsPerSectionNumber) {
Map<Integer, Set<Entity>> hintsPerSectionNumber, List<FileAttribute> fileAttributes) {
List<SectionSearchableTextPair> sectionSearchableTextPairs = new ArrayList<>();
@ -229,6 +230,7 @@ public class EntityRedactionService {
.tabularData(tabularData)
.searchableText(searchableRow)
.dictionary(dictionary)
.fileAttributes(fileAttributes)
.build(), searchableRow));
if (!local) {
@ -252,7 +254,8 @@ public class EntityRedactionService {
ManualRedactions manualRedactions,
AtomicInteger sectionNumber, Dictionary dictionary,
boolean local,
Map<Integer, Set<Entity>> hintsPerSectionNumber) {
Map<Integer, Set<Entity>> hintsPerSectionNumber,
List<FileAttribute> fileAttributes) {
List<SectionSearchableTextPair> sectionSearchableTextPairs = new ArrayList<>();
SearchableText entireTableText = new SearchableText();
@ -296,6 +299,7 @@ public class EntityRedactionService {
.sectionNumber(sectionNumber.intValue())
.searchableText(entireTableText)
.dictionary(dictionary)
.fileAttributes(fileAttributes)
.build(), entireTableText));
if (!local) {
@ -315,7 +319,7 @@ public class EntityRedactionService {
ManualRedactions manualRedactions, AtomicInteger sectionNumber,
Dictionary dictionary, boolean local,
Map<Integer, Set<Entity>> hintsPerSectionNumber,
List<PdfImage> images) {
List<PdfImage> images, List<FileAttribute> fileAttributes) {
if (!local) {
SectionText sectionText = new SectionText();
@ -355,6 +359,7 @@ public class EntityRedactionService {
.images(images.stream()
.map(image -> convert(image, sectionNumber.intValue(), headline))
.collect(Collectors.toSet()))
.fileAttributes(fileAttributes)
.build(), searchableText);
}

View File

@ -84,7 +84,7 @@ public class ReanalyzeService {
log.info("Document structure analysis successful, starting redaction analysis...");
entityRedactionService.processDocument(classifiedDoc, analyzeRequest.getDossierTemplateId(), analyzeRequest.getManualRedactions(), analyzeRequest
.getDossierId());
.getDossierId(), analyzeRequest.getFileAttributes());
redactionLogCreatorService.createRedactionLog(classifiedDoc, pageCount, analyzeRequest.getManualRedactions(), analyzeRequest
.getDossierTemplateId());
@ -217,6 +217,7 @@ public class ReanalyzeService {
.searchableText(reanalysisSection.getSearchableText())
.dictionary(dictionary)
.images(reanalysisSection.getImages())
.fileAttributes(analyzeRequest.getFileAttributes())
.build(), reanalysisSection.getSearchableText()));
}

View File

@ -635,6 +635,8 @@ public class RedactionIntegrationTest {
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
request.setExcludedPages(Set.of(1));
request.setFileAttributes(List.of(FileAttribute.builder().id("fileAttributeId").label("Vertebrate Study").placeholder("{fileattributes.vertebrateStudy}").value("true").build()));
AnalyzeResult result = reanalyzeService.analyze(request);
var redactionLog = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID);

View File

@ -151,7 +151,7 @@ public class EntityRedactionServiceTest {
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsorResponse);
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
entityRedactionService.processDocument(classifiedDoc, TEST_DOSSIER_TEMPLATE_ID, null, "dossierId");
entityRedactionService.processDocument(classifiedDoc, TEST_DOSSIER_TEMPLATE_ID, null, "dossierId", null);
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1)).hasSize(7);// 3 author cells, 1 address, 1 Y and 2 N entities
}
@ -177,7 +177,7 @@ public class EntityRedactionServiceTest {
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsorResponse);
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
entityRedactionService.processDocument(classifiedDoc, TEST_DOSSIER_TEMPLATE_ID, null, "dossierId");
entityRedactionService.processDocument(classifiedDoc, TEST_DOSSIER_TEMPLATE_ID, null, "dossierId", null);
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1)).hasSize(7);// 3 author cells, 1 address, 1 Y and 2 N entities
}
@ -202,7 +202,7 @@ public class EntityRedactionServiceTest {
.build();
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsorResponse);
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
entityRedactionService.processDocument(classifiedDoc, TEST_DOSSIER_TEMPLATE_ID, null, "dossierId");
entityRedactionService.processDocument(classifiedDoc, TEST_DOSSIER_TEMPLATE_ID, null, "dossierId", null);
assertThat(classifiedDoc.getEntities()
.entrySet()
.stream()
@ -210,7 +210,7 @@ public class EntityRedactionServiceTest {
pdfFileResource = new ClassPathResource("files/Compounds/27 A8637C - EU AIR3 - MCP Section 1 - Identity of " +
"the plant protection product.pdf");
classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
entityRedactionService.processDocument(classifiedDoc, TEST_DOSSIER_TEMPLATE_ID, null, "dossierId");
entityRedactionService.processDocument(classifiedDoc, TEST_DOSSIER_TEMPLATE_ID, null, "dossierId", null);
assertThat(classifiedDoc.getEntities()
.entrySet()
.stream()
@ -235,7 +235,7 @@ public class EntityRedactionServiceTest {
.build();
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsorResponse);
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
entityRedactionService.processDocument(classifiedDoc, TEST_DOSSIER_TEMPLATE_ID, null, "dossierId");
entityRedactionService.processDocument(classifiedDoc, TEST_DOSSIER_TEMPLATE_ID, null, "dossierId", null);
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1).stream()
.filter(entity -> entity.getMatchedRule() == 9)
@ -302,7 +302,7 @@ public class EntityRedactionServiceTest {
.build();
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsorResponse);
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
entityRedactionService.processDocument(classifiedDoc, TEST_DOSSIER_TEMPLATE_ID, null, "dossierId");
entityRedactionService.processDocument(classifiedDoc, TEST_DOSSIER_TEMPLATE_ID, null, "dossierId", null);
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1).stream()
.filter(entity -> entity.getMatchedRule() == 6)
@ -341,7 +341,7 @@ public class EntityRedactionServiceTest {
.build();
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(dictionaryResponse);
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
entityRedactionService.processDocument(classifiedDoc, TEST_DOSSIER_TEMPLATE_ID, null, "dossierId");
entityRedactionService.processDocument(classifiedDoc, TEST_DOSSIER_TEMPLATE_ID, null, "dossierId", null);
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1).stream()
.filter(entity -> entity.getMatchedRule() == 11)
@ -371,7 +371,7 @@ public class EntityRedactionServiceTest {
.build();
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsorResponse);
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
entityRedactionService.processDocument(classifiedDoc, TEST_DOSSIER_TEMPLATE_ID, null, "dossierId");
entityRedactionService.processDocument(classifiedDoc, TEST_DOSSIER_TEMPLATE_ID, null, "dossierId", null);
assertThat(classifiedDoc.getEntities()).hasSize(2); // two pages
assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(8);
assertThat(classifiedDoc.getEntities().get(2).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(5); // 2 names, 1 address, 2 Y
@ -390,7 +390,7 @@ public class EntityRedactionServiceTest {
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(addressResponse);
classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
entityRedactionService.processDocument(classifiedDoc, TEST_DOSSIER_TEMPLATE_ID, null, "dossierId");
entityRedactionService.processDocument(classifiedDoc, TEST_DOSSIER_TEMPLATE_ID, null, "dossierId", null);
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(3);
assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 8).count()).isEqualTo(9);
@ -419,7 +419,7 @@ public class EntityRedactionServiceTest {
.build();
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(sponsorResponse);
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
entityRedactionService.processDocument(classifiedDoc, TEST_DOSSIER_TEMPLATE_ID, null, "dossierId");
entityRedactionService.processDocument(classifiedDoc, TEST_DOSSIER_TEMPLATE_ID, null, "dossierId", null);
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 8).count()).isEqualTo(6);
}

View File

@ -268,7 +268,7 @@ rule "18: Redact contact information if Producer is found"
rule "19: Redact AUTHOR(S)"
when
Section(searchText.contains("AUTHOR(S):"))
Section(searchText.contains("AUTHOR(S):") && fileAttributeByPlaceholderEquals("{fileattributes.vertebrateStudy}", "true"))
then
section.redactLinesBetween("AUTHOR(S):", "COMPLETION DATE:", "PII", 19, true, "AUTHOR(S) was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
end