Merge branch 'RED-7902' into 'master'

RED-7902: fix author name splitting

Closes RED-7902

See merge request redactmanager/redaction-service!202
This commit is contained in:
Kilian Schüttler 2023-11-21 11:22:54 +01:00
commit 79e76d688e
9 changed files with 163 additions and 107 deletions

View File

@ -2,23 +2,22 @@ package com.iqser.red.service.redaction.v1.server.model.dictionary;
import static java.lang.String.format;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils;
import com.iqser.red.service.redaction.v1.server.utils.exception.NotFoundException;
import com.iqser.red.service.redaction.v1.server.model.document.entity.MatchedRule;
import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity;
import com.iqser.red.service.redaction.v1.server.utils.Patterns;
import com.iqser.red.service.redaction.v1.server.utils.exception.NotFoundException;
import lombok.Data;
import lombok.Getter;
@ -108,10 +107,14 @@ public class Dictionary {
throw new IllegalArgumentException(format("%s is not a valid dictionary entry", value));
}
Set<MatchedRule> matchedRulesSet = new HashSet<>(matchedRules);
localAccessMap.get(type).getLocalEntriesWithMatchedRules().merge(value.trim(), matchedRulesSet, (set1, set2) -> Stream.concat(set1.stream(), set2.stream()).collect(Collectors.toSet()));
localAccessMap.get(type)
.getLocalEntriesWithMatchedRules()
.merge(value.trim(), matchedRulesSet, (set1, set2) -> Stream.concat(set1.stream(), set2.stream()).collect(Collectors.toSet()));
if (alsoAddLastname) {
String lastname = value.split(" ")[0];
localAccessMap.get(type).getLocalEntriesWithMatchedRules().merge(lastname, matchedRulesSet, (set1, set2) -> Stream.concat(set1.stream(), set2.stream()).collect(Collectors.toSet()));
localAccessMap.get(type)
.getLocalEntriesWithMatchedRules()
.merge(lastname, matchedRulesSet, (set1, set2) -> Stream.concat(set1.stream(), set2.stream()).collect(Collectors.toSet()));
}
}
@ -130,16 +133,20 @@ public class Dictionary {
public void addMultipleAuthorsAsRecommendation(TextEntity textEntity) {
String cleanedWord = textEntity.getValue().replaceAll(",", " ").replaceAll(" ", " ").trim() + " ";
Pattern pattern = Patterns.AUTHOR_TABLE_SPLITTER;
Matcher matcher = pattern.matcher(cleanedWord);
splitIntoAuthorNames(textEntity).forEach(authorName -> addLocalDictionaryEntry(textEntity.getType(), authorName, textEntity.getMatchedRuleList(), true));
while (matcher.find()) {
String match = matcher.group().trim();
if (match.length() >= 3) {
addLocalDictionaryEntry(textEntity.getType(), match, textEntity.getMatchedRuleList(), true);
}
}
/**
 * Splits the text of an entity into the individual author names it contains.
 *
 * <p>If the entity value contains a comma, the value is split on commas; otherwise the value
 * with line breaks is split on {@code \n}. Every piece is trimmed and kept only if it fully
 * matches {@code Patterns.AUTHOR_NAME_PATTERN}.
 *
 * @param textEntity the entity whose value is split
 * @return the trimmed pieces that match the author name pattern, in encounter order
 */
public static List<String> splitIntoAuthorNames(TextEntity textEntity) {

    String entityValue = textEntity.getValue();
    // Comma-separated values take precedence; line breaks are only used as a fallback separator.
    String[] candidates = entityValue.contains(",") ? entityValue.split(",") : textEntity.getValueWithLineBreaks().split("\n");

    return Arrays.stream(candidates)
            .map(String::trim)
            .filter(candidate -> Patterns.AUTHOR_NAME_PATTERN.matcher(candidate).matches())
            .toList();
}
}

View File

@ -11,8 +11,7 @@ public final class Patterns {
public static final Map<String, Pattern> patternCache = new HashMap<>();
public static final Pattern AUTHOR_TABLE_SPLITTER = Pattern.compile(
"(((((di)|(van)) )|[A-Z])?[A-ZÄÖÜ][\\wäöüéèê]{2,500}( ?[A-ZÄÖÜ]{1,2}\\.){1,3})|(((((di)|(van)) )|[A-Z])?[A-ZÄÖÜ][\\wäöüéèê]{2,500}( ?[A-ZÄÖÜ]{1,2} ){1,3})");
public static final Pattern AUTHOR_NAME_PATTERN = Pattern.compile("^(?!\\w*[.]$)\\p{L}+[.\\p{L}\\s]*$");
public Pattern getCompiledPattern(String pattern, boolean caseInsensitive) {

View File

@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server;
import static org.assertj.core.api.Assertions.assertThat;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.mockito.Mockito.when;
@ -103,6 +104,25 @@ public class RedactionAcceptanceTest extends AbstractRedactionIntegrationTest {
}
/**
 * Analyzes the {@code tableWithAuthors.pdf} minimal example and checks that exactly the three
 * expected author names show up as recommendations in the stored entity log.
 */
@Test
public void authorNameSplittingTest() {

    AnalyzeRequest request = uploadFileToStorage("files/Minimal Examples/tableWithAuthors.pdf");
    analyzeDocumentStructure(LayoutParsingType.REDACT_MANAGER, request);
    // The returned result is not inspected; the assertion below reads the entity log from storage.
    analyzeService.analyze(request);

    var entityLog = redactionStorageService.getEntityLog(TEST_DOSSIER_ID, TEST_FILE_ID);
    var recommendations = entityLog.getEntityLogEntry()
            .stream()
            // Constant-first equals: an entry without an entry type is filtered out instead of throwing.
            .filter(entityLogEntry -> EntryType.RECOMMENDATION.equals(entityLogEntry.getEntryType()))
            .map(EntityLogEntry::getValue)
            .toList();

    assertThat(recommendations).containsExactlyInAnyOrder("Michael N.", "Funnarie B.", "Feuer A.");
}
@Test
public void acceptanceTests() throws IOException {
@ -148,7 +168,8 @@ public class RedactionAcceptanceTest extends AbstractRedactionIntegrationTest {
return redactionLog.getEntityLogEntry()
.stream()
.filter(entry -> entry.getType().equals(type))
.filter(entry -> entry.getValue().equals(value)).filter(entry -> entry.getContainingNodeId().get(0).equals(sectionNumber.get(0)));
.filter(entry -> entry.getValue().equals(value))
.filter(entry -> entry.getContainingNodeId().get(0).equals(sectionNumber.get(0)));
}

View File

@ -266,9 +266,11 @@ public class RedactionIntegrationTest extends AbstractRedactionIntegrationTest {
@Test
public void importedRedactionsTest() throws IOException {
ClassPathResource importedRedactionClasspathResource = new ClassPathResource("files/ImportedRedactions/18 Chlorothalonil RAR 08 Volume 3CA B 6a Oct 2017.IMPORTED_REDACTIONS.json");
storageService.storeObject(TenantContext.getTenantId(), RedactionStorageService.StorageIdUtils.getStorageId(TEST_DOSSIER_ID, TEST_FILE_ID, FileType.IMPORTED_REDACTIONS), importedRedactionClasspathResource.getInputStream());
ClassPathResource importedRedactionClasspathResource = new ClassPathResource(
"files/ImportedRedactions/18 Chlorothalonil RAR 08 Volume 3CA B 6a Oct 2017.IMPORTED_REDACTIONS.json");
storageService.storeObject(TenantContext.getTenantId(),
RedactionStorageService.StorageIdUtils.getStorageId(TEST_DOSSIER_ID, TEST_FILE_ID, FileType.IMPORTED_REDACTIONS),
importedRedactionClasspathResource.getInputStream());
AnalyzeRequest request = uploadFileToStorage("files/ImportedRedactions/18 Chlorothalonil RAR 08 Volume 3CA B 6a Oct 2017.pdf");
System.out.println("Start Full integration test");

View File

@ -11,81 +11,21 @@ import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.kie.api.KieServices;
import org.kie.api.builder.KieBuilder;
import org.kie.api.builder.KieFileSystem;
import org.kie.api.builder.KieModule;
import org.kie.api.runtime.KieContainer;
import org.kie.api.runtime.KieSession;
import org.kie.internal.io.ResourceFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.Import;
import com.iqser.red.service.persistence.service.v1.api.shared.model.annotations.AnnotationStatus;
import com.iqser.red.service.persistence.service.v1.api.shared.model.annotations.Rectangle;
import com.iqser.red.service.persistence.service.v1.api.shared.model.annotations.entitymapped.IdRemoval;
import com.iqser.red.service.persistence.service.v1.api.shared.model.annotations.entitymapped.ManualForceRedaction;
import com.iqser.red.service.persistence.service.v1.api.shared.model.annotations.entitymapped.ManualResizeRedaction;
import com.iqser.red.service.redaction.v1.server.document.graph.BuildDocumentIntegrationTest;
import com.iqser.red.service.redaction.v1.server.model.document.entity.EntityType;
import com.iqser.red.service.redaction.v1.server.model.document.entity.PositionOnPage;
import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Paragraph;
import com.iqser.red.service.redaction.v1.server.service.ManualChangesApplicationService;
import com.iqser.red.service.redaction.v1.server.service.document.EntityCreationService;
import com.iqser.red.service.redaction.v1.server.service.document.EntityEnrichmentService;
@Import(ManualChangesIntegrationTest.TestConfiguration.class)
public class ManualChangesIntegrationTest extends BuildDocumentIntegrationTest {
private static final String RULES = "drools/manual_redaction_rules.drl";
@Autowired
private EntityEnrichmentService entityEnrichmentService;
private EntityCreationService entityCreationService;
private KieSession kieSession;
@Qualifier("kieContainer")
@Autowired
private KieContainer kieContainer;
@Configuration
@Import(BuildDocumentIntegrationTest.TestConfiguration.class)
public static class TestConfiguration {
@Bean
public KieContainer kieContainer() {
KieServices kieServices = KieServices.Factory.get();
KieFileSystem kieFileSystem = kieServices.newKieFileSystem();
kieFileSystem.write(ResourceFactory.newClassPathResource(RULES));
KieBuilder kieBuilder = kieServices.newKieBuilder(kieFileSystem);
kieBuilder.buildAll();
KieModule kieModule = kieBuilder.getKieModule();
return kieServices.newKieContainer(kieModule.getReleaseId());
}
}
@BeforeEach
public void createServices() {
kieSession = kieContainer.newKieSession();
entityCreationService = new EntityCreationService(entityEnrichmentService, kieSession);
ManualChangesApplicationService manualChangesApplicationService = new ManualChangesApplicationService(entityCreationService);
kieSession.setGlobal("manualChangesApplicationService", manualChangesApplicationService);
kieSession.setGlobal("entityCreationService", entityCreationService);
}
import com.iqser.red.service.redaction.v1.server.rules.RulesIntegrationTest;
public class ManualChangesIntegrationTest extends RulesIntegrationTest {
@Test
public void manualResizeRedactionTest() {
@ -108,11 +48,7 @@ public class ManualChangesIntegrationTest extends BuildDocumentIntegrationTest {
.updateDictionary(false)
.build();
kieSession.insert(document);
document.streamAllSubNodes().forEach(kieSession::insert);
kieSession.insert(manualResizeRedaction);
kieSession.fireAllRules();
kieSession.dispose();
doAnalysis(document, List.of(manualResizeRedaction));
assertEquals(biggerEntity.getTextRange(), entity.getTextRange());
assertEquals(biggerEntity.getDeepestFullyContainingNode(), entity.getDeepestFullyContainingNode());
@ -141,11 +77,7 @@ public class ManualChangesIntegrationTest extends BuildDocumentIntegrationTest {
.requestDate(OffsetDateTime.now())
.build();
kieSession.insert(manualForceRedaction);
kieSession.insert(document);
document.streamAllSubNodes().forEach(kieSession::insert);
kieSession.fireAllRules();
kieSession.dispose();
doAnalysis(document, List.of(manualForceRedaction));
assertEquals(Paragraph.class, entity.getDeepestFullyContainingNode().getClass());
assertFalse(entity.getIntersectingNodes().isEmpty());
@ -170,11 +102,7 @@ public class ManualChangesIntegrationTest extends BuildDocumentIntegrationTest {
String initialId = entity.getPositionsOnPagePerPage().get(0).getId();
IdRemoval idRemoval = IdRemoval.builder().annotationId(initialId).status(AnnotationStatus.APPROVED).requestDate(OffsetDateTime.now()).build();
kieSession.insert(document);
document.streamAllSubNodes().forEach(kieSession::insert);
kieSession.insert(idRemoval);
kieSession.fireAllRules();
kieSession.dispose();
doAnalysis(document, List.of(idRemoval));
assertEquals("David Ksenia", entity.getValue());
assertEquals(initialId, entity.getPositionsOnPagePerPage().get(0).getId());
@ -199,12 +127,7 @@ public class ManualChangesIntegrationTest extends BuildDocumentIntegrationTest {
.requestDate(OffsetDateTime.now())
.build();
kieSession.insert(document);
document.streamAllSubNodes().forEach(kieSession::insert);
kieSession.insert(idRemoval);
kieSession.insert(manualForceRedaction);
kieSession.fireAllRules();
kieSession.dispose();
doAnalysis(document, List.of(manualForceRedaction));
assertEquals(Paragraph.class, entity.getDeepestFullyContainingNode().getClass());
assertFalse(entity.getIntersectingNodes().isEmpty());
@ -227,11 +150,7 @@ public class ManualChangesIntegrationTest extends BuildDocumentIntegrationTest {
String initialId = entity.getPositionsOnPagePerPage().get(0).getId();
IdRemoval idRemoval = IdRemoval.builder().annotationId(initialId).status(AnnotationStatus.REQUESTED).build();
kieSession.insert(idRemoval);
kieSession.insert(document);
document.streamAllSubNodes().forEach(kieSession::insert);
kieSession.fireAllRules();
kieSession.dispose();
doAnalysis(document, List.of(idRemoval));
assertEquals(Paragraph.class, entity.getDeepestFullyContainingNode().getClass());
assertFalse(entity.getIntersectingNodes().isEmpty());

View File

@ -0,0 +1,28 @@
package com.iqser.red.service.redaction.v1.server.rules;
import static org.assertj.core.api.Assertions.assertThat;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import org.junit.jupiter.api.Test;
import com.iqser.red.service.redaction.v1.server.model.dictionary.Dictionary;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
/**
 * Rule-level test that checks how author names found in a table are split into single names.
 */
public class Cbi11Test extends RulesIntegrationTest {

    /**
     * Builds the document graph for {@code multipleAuthorsInTable.pdf}, runs the rules without any
     * additional facts, and asserts the names that {@code Dictionary.splitIntoAuthorNames} yields
     * across all entities of the document.
     */
    @Test
    public void multiAuthorNameSplittingTest() {

        Document document = buildGraph("files/Minimal Examples/multipleAuthorsInTable.pdf");
        doAnalysis(document, Collections.emptyList());

        List<String> authorNames = document.getEntities()
                .stream()
                .flatMap(entity -> Dictionary.splitIntoAuthorNames(entity).stream())
                .toList();

        assertThat(authorNames).containsExactlyInAnyOrder("Cargile", "N.L.", "Ross", "J.A.", "Egli", "Ramsteiner");
    }

}

View File

@ -0,0 +1,80 @@
package com.iqser.red.service.redaction.v1.server.rules;
import java.util.Collection;
import org.junit.jupiter.api.BeforeEach;
import org.kie.api.KieServices;
import org.kie.api.builder.KieBuilder;
import org.kie.api.builder.KieFileSystem;
import org.kie.api.builder.KieModule;
import org.kie.api.runtime.KieContainer;
import org.kie.api.runtime.KieSession;
import org.kie.internal.io.ResourceFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.Import;
import com.iqser.red.service.redaction.v1.server.document.graph.BuildDocumentIntegrationTest;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
import com.iqser.red.service.redaction.v1.server.service.ManualChangesApplicationService;
import com.iqser.red.service.redaction.v1.server.service.document.EntityCreationService;
import com.iqser.red.service.redaction.v1.server.service.document.EntityEnrichmentService;
/**
 * Base class for tests that run the rules in {@code drools/rules.drl} against a document.
 *
 * <p>A fresh {@link KieSession} is created before every test by {@link #createServices()};
 * {@link #doAnalysis(Document, Collection)} inserts the facts, fires all rules and disposes
 * that session.
 *
 * <p>NOTE(review): no {@code @Import} of the nested {@link TestConfiguration} is visible on this
 * class; confirm that the test context actually picks the configuration up.
 */
public class RulesIntegrationTest extends BuildDocumentIntegrationTest {

    /** Classpath location of the rule file compiled into the {@link KieContainer}. */
    protected static final String RULES = "drools/rules.drl";

    @Autowired
    protected EntityEnrichmentService entityEnrichmentService;

    /** Re-created for every test in {@link #createServices()}. */
    protected EntityCreationService entityCreationService;

    /** Re-created for every test in {@link #createServices()}; disposed by {@link #doAnalysis}. */
    protected KieSession kieSession;

    @Qualifier("kieContainer")
    @Autowired
    private KieContainer kieContainer;


    /**
     * Inserts the document, all of its sub nodes and the given extra facts into the current
     * session and fires all rules.
     *
     * <p>The session is always disposed afterwards, even if inserting or firing throws, so this
     * method is meant to be called once per session.
     *
     * @param document       the document whose node tree is inserted into the session
     * @param objectToInsert additional facts to insert, may be empty
     */
    protected void doAnalysis(Document document, Collection<?> objectToInsert) {

        try {
            kieSession.insert(document);
            document.streamAllSubNodes().forEach(kieSession::insert);
            objectToInsert.forEach(kieSession::insert);
            kieSession.fireAllRules();
        } finally {
            // Release the session even when a rule fails, otherwise it leaks for the rest of the test run.
            kieSession.dispose();
        }
    }


    /**
     * Spring configuration that provides the {@link KieContainer} built from {@link #RULES}.
     */
    @Configuration
    @Import(BuildDocumentIntegrationTest.TestConfiguration.class)
    public static class TestConfiguration {

        /**
         * Builds a KIE module from the rule file on the classpath and wraps it in a container.
         *
         * @return a container for the module built from {@link #RULES}
         */
        @Bean
        public KieContainer kieContainer() {

            KieServices kieServices = KieServices.Factory.get();

            KieFileSystem kieFileSystem = kieServices.newKieFileSystem();
            kieFileSystem.write(ResourceFactory.newClassPathResource(RULES));

            KieBuilder kieBuilder = kieServices.newKieBuilder(kieFileSystem);
            kieBuilder.buildAll();
            KieModule kieModule = kieBuilder.getKieModule();

            return kieServices.newKieContainer(kieModule.getReleaseId());
        }

    }


    /**
     * Creates a new session and the services that depend on it, and registers those services as
     * the session globals {@code manualChangesApplicationService} and {@code entityCreationService}.
     */
    @BeforeEach
    public void createServices() {

        kieSession = kieContainer.newKieSession();
        entityCreationService = new EntityCreationService(entityEnrichmentService, kieSession);
        ManualChangesApplicationService manualChangesApplicationService = new ManualChangesApplicationService(entityCreationService);

        kieSession.setGlobal("manualChangesApplicationService", manualChangesApplicationService);
        kieSession.setGlobal("entityCreationService", entityCreationService);
    }

}