Pull request #278: RED-2841: INC6207970 Rule for initials expansion should be applied only to dictionary entries without whitespaces

Merge in RED/redaction-service from RED-2841-rs1 to master

* commit 'd1317c5bd4f9522eecd78775656a0f7b5a74c3d7':
  RED-2841: INC6207970 Rule for initials expansion should be applied only to dictionary entries without whitespaces
This commit is contained in:
Ali Oezyetimoglu 2021-11-29 15:40:51 +01:00 committed by Dominique Eiflaender
commit ad80d4247c
4 changed files with 67 additions and 4 deletions

View File

@ -131,11 +131,27 @@ public class Section {
return StringUtils.containsIgnoreCase(headline, word);
}
/**
 * Convenience overload of the five-argument {@code expandByRegEx} that applies no
 * exclusion filter: it forwards all arguments unchanged and passes {@code null} as the
 * {@code withoutPattern}.
 *
 * @param type                   forwarded as the {@code type} argument
 * @param pattern                regular expression, forwarded as the {@code pattern} argument
 * @param patternCaseInsensitive forwarded as the case-insensitivity flag for the pattern
 * @param group                  forwarded as the {@code group} argument
 *                               (presumably the capture group to use; confirm in the overload)
 */
@ThenAction
public void expandByRegEx(@Argument(ArgumentType.TYPE) String type,
        @Argument(ArgumentType.REGEX) String pattern,
        @Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive,
        @Argument(ArgumentType.INTEGER) int group) {

    // A null exclusion pattern makes the five-argument overload skip its "without" check.
    final String noExclusionPattern = null;
    expandByRegEx(type, pattern, patternCaseInsensitive, group, noExclusionPattern);
}
@ThenAction
public void expandByRegEx(@Argument(ArgumentType.TYPE) String type,
@Argument(ArgumentType.REGEX) String pattern,
@Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive,
@Argument(ArgumentType.INTEGER) int group,
@Argument(ArgumentType.REGEX) String withoutPattern) {
Pattern compiledWithoutPattern = null;
if (withoutPattern != null) {
compiledWithoutPattern = Patterns.getCompiledPattern(withoutPattern, patternCaseInsensitive);
}
Pattern compiledPattern = Patterns.getCompiledPattern(pattern, patternCaseInsensitive);
@ -146,6 +162,13 @@ public class Section {
continue;
}
if(withoutPattern != null) {
Matcher matcherWithout = compiledWithoutPattern.matcher(entity.getWord());
if (matcherWithout.find()) {
continue;
}
}
Matcher matcher = compiledPattern.matcher(entity.getTextAfter());
while (matcher.find()) {

View File

@ -1304,6 +1304,30 @@ public class RedactionIntegrationTest {
}
/**
 * Runs structure analysis, analysis and annotation on the S-Metolachlor sample PDF,
 * writes the annotated document to {@code Annotated.pdf} in the temporary directory and
 * prints the elapsed time and the number of pages.
 *
 * <p>The test contains no assertions; it only fails if one of the calls throws.
 *
 * @throws IOException if preparing the storage or writing the annotated PDF fails
 */
@Test
public void testExpandByRegEx() throws IOException {

    System.out.println("expandByRegex");
    final long startedAt = System.currentTimeMillis();

    // Analyse the document structure first, then run the actual analysis.
    AnalyzeRequest analyzeRequest = prepareStorage("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");
    StructureAnalyzeRequest structureRequest =
            new StructureAnalyzeRequest(analyzeRequest.getDossierId(), analyzeRequest.getFileId());
    analyzeService.analyzeDocumentStructure(structureRequest);
    AnalyzeResult analysis = analyzeService.analyze(analyzeRequest);

    AnnotateResponse annotated = redactionController.annotate(
            AnnotateRequest.builder().dossierId(TEST_DOSSIER_ID).fileId(TEST_FILE_ID).build());

    // Persist the annotated PDF so it can be inspected manually after the run.
    try (FileOutputStream pdfOut = new FileOutputStream(getTemporaryDirectory() + "/Annotated.pdf")) {
        pdfOut.write(annotated.getDocument());
    }

    final long elapsedMillis = System.currentTimeMillis() - startedAt;
    System.out.println("duration: " + elapsedMillis);
    System.out.println("numberOfPages: " + analysis.getNumberOfPages());
}
private static String loadFromClassPath(String path) {

View File

@ -7,12 +7,20 @@ global Section section
// --------------------------------------- CBI rules -------------------------------------------------------------------
//rule "0: Expand CBI Authors with firstname initials"
// when
// Section(matchesType("CBI_author") || matchesType("recommendation_CBI_author"))
// then
// section.expandByRegEx("CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1);
// section.expandByRegEx("recommendation_CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1);
// end
// Expands CBI_author / recommendation_CBI_author entities by trailing first-name initials.
rule "0: Expand CBI Authors with firstname initials"
when
Section(matchesType("CBI_author") || matchesType("recommendation_CBI_author"))
then
// NOTE(review): these four-argument calls expand every entity without any exclusion filter.
// They look like the pre-change lines of this diff - confirm whether they should remain.
section.expandByRegEx("CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1);
section.expandByRegEx("recommendation_CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1);
// The last argument is an exclusion pattern: Section.expandByRegEx skips an entity when
// Matcher.find() succeeds on entity.getWord(). "\\s" therefore skips only entries that
// contain whitespace. The previous "[^\\s]+" matched any word with a non-whitespace
// character and so skipped every entity.
section.expandByRegEx("CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1, "\\s");
section.expandByRegEx("recommendation_CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1, "\\s");
end

View File

@ -7,12 +7,20 @@ global Section section
// --------------------------------------- CBI rules -------------------------------------------------------------------
//rule "0: Expand CBI Authors with firstname initials"
// when
// Section(matchesType("CBI_author") || matchesType("recommendation_CBI_author"))
// then
// section.expandByRegEx("CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1);
// section.expandByRegEx("recommendation_CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1);
// end
// Expands CBI_author / recommendation_CBI_author entities by trailing first-name initials.
rule "0: Expand CBI Authors with firstname initials"
when
Section(matchesType("CBI_author") || matchesType("recommendation_CBI_author"))
then
// NOTE(review): these four-argument calls expand every entity without any exclusion filter.
// They look like the pre-change lines of this diff - confirm whether they should remain.
section.expandByRegEx("CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1);
section.expandByRegEx("recommendation_CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1);
// The last argument is an exclusion pattern: Section.expandByRegEx skips an entity when
// Matcher.find() succeeds on entity.getWord(). "\\s" therefore skips only entries that
// contain whitespace. The previous "[^\\s]+" matched any word with a non-whitespace
// character and so skipped every entity.
section.expandByRegEx("CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1, "\\s");
section.expandByRegEx("recommendation_CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1, "\\s");
end