From 1431727f0ecc006dfde0ea8a645b1c9829d3c442 Mon Sep 17 00:00:00 2001
From: aoezyetimoglu
Date: Mon, 29 Nov 2021 18:49:40 +0100
Subject: [PATCH] RED-2841: PORT - INC6207970 Rule for initials expansion
 should be applied only to dictionary entries without whitespaces

Add an expandByRegEx overload that takes an optional exclusion pattern
("withoutPattern"). An entity is skipped when a match of that pattern is
found anywhere in its word (Matcher.find()). The existing four-argument
overload delegates with null, so no exclusion filter is applied for
existing callers.

The "0: Expand CBI Authors with firstname initials" rules in the test
resources pass "\\s" as the exclusion pattern, so entries that contain
whitespace are skipped and only entries without whitespace are expanded.
A pattern such as "[^\\s]+" must not be used here: combined with find()
it matches every non-blank word and would skip every entity.

---
 .../v1/server/redaction/model/Section.java   | 17 +++++++++++++
 .../v1/server/RedactionIntegrationTest.java  | 24 +++++++++++++++++++
 .../src/test/resources/drools/allAuthors.drl | 12 ++++++++--
 .../src/test/resources/drools/rules.drl      | 12 ++++++++--
 4 files changed, 61 insertions(+), 4 deletions(-)

diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java
index 17209efb..9c909437 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java
@@ -121,6 +121,16 @@ public class Section {
 
 
     public void expandByRegEx(String type, String pattern, boolean patternCaseInsensitive, int group) {
+        expandByRegEx(type, pattern, patternCaseInsensitive, group, null);
+    }
+
+
+    public void expandByRegEx(String type, String pattern, boolean patternCaseInsensitive, int group, String withoutPattern) {
+
+        Pattern compiledWithoutPattern = null;
+        if (withoutPattern != null) {
+            compiledWithoutPattern = Patterns.getCompiledPattern(withoutPattern, patternCaseInsensitive);
+        }
 
         Pattern compiledPattern = Patterns.getCompiledPattern(pattern, patternCaseInsensitive);
 
@@ -131,6 +141,13 @@ public class Section {
                 continue;
             }
 
+            if (compiledWithoutPattern != null) {
+                Matcher matcherWithout = compiledWithoutPattern.matcher(entity.getWord());
+                if (matcherWithout.find()) {
+                    continue;
+                }
+            }
+
             Matcher matcher = compiledPattern.matcher(entity.getTextAfter());
 
             while (matcher.find()) {
diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java
index 8af6fa0b..b88f67f1 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java
@@ -1017,6 +1017,30 @@ public class RedactionIntegrationTest {
     }
 
 
+    @Test
+    public void testExpandByRegEx() throws IOException {
+
+        System.out.println("expandByRegex");
+        long start = System.currentTimeMillis();
+
+        AnalyzeRequest request = prepareStorage("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");
+        AnalyzeResult result = reanalyzeService.analyze(request);
+
+        AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder()
+                .dossierId(TEST_DOSSIER_ID)
+                .fileId(TEST_FILE_ID)
+                .build());
+
+        try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Annotated.pdf")) {
+            fileOutputStream.write(annotateResponse.getDocument());
+        }
+        long end = System.currentTimeMillis();
+
+        System.out.println("duration: " + (end - start));
+        System.out.println("numberOfPages: " + result.getNumberOfPages());
+    }
+
+
     private static String loadFromClassPath(String path) {
 
         URL resource = ResourceLoader.class.getClassLoader().getResource(path);
diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/allAuthors.drl b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/allAuthors.drl
index 6d05e185..32621953 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/allAuthors.drl
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/allAuthors.drl
@@ -7,12 +7,20 @@ global Section section
 
 // --------------------------------------- CBI rules -------------------------------------------------------------------
 
+//rule "0: Expand CBI Authors with firstname initials"
+//    when
+//        Section(matchesType("CBI_author") || matchesType("recommendation_CBI_author"))
+//    then
+//        section.expandByRegEx("CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1);
+//        section.expandByRegEx("recommendation_CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1);
+//    end
+
 rule "0: Expand CBI Authors with firstname initials"
     when
         Section(matchesType("CBI_author") || matchesType("recommendation_CBI_author"))
     then
-        section.expandByRegEx("CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1);
-        section.expandByRegEx("recommendation_CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1);
+        section.expandByRegEx("CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1, "\\s");
+        section.expandByRegEx("recommendation_CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1, "\\s");
     end
 
 
diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl
index 853d7fac..00a81a21 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl
@@ -7,12 +7,20 @@ global Section section
 
 // --------------------------------------- CBI rules -------------------------------------------------------------------
 
+//rule "0: Expand CBI Authors with firstname initials"
+//    when
+//        Section(matchesType("CBI_author") || matchesType("recommendation_CBI_author"))
+//    then
+//        section.expandByRegEx("CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1);
+//        section.expandByRegEx("recommendation_CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1);
+//    end
+
 rule "0: Expand CBI Authors with firstname initials"
     when
         Section(matchesType("CBI_author") || matchesType("recommendation_CBI_author"))
     then
-        section.expandByRegEx("CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1);
-        section.expandByRegEx("recommendation_CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1);
+        section.expandByRegEx("CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1, "\\s");
+        section.expandByRegEx("recommendation_CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1, "\\s");
     end
 
 