diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java index 9ba32611..e364ec34 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java @@ -131,11 +131,27 @@ public class Section { return StringUtils.containsIgnoreCase(headline, word); } + @ThenAction public void expandByRegEx(@Argument(ArgumentType.TYPE) String type, @Argument(ArgumentType.REGEX) String pattern, @Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive, @Argument(ArgumentType.INTEGER) int group) { + expandByRegEx(type, pattern, patternCaseInsensitive, group, null); + } + + @ThenAction + public void expandByRegEx(@Argument(ArgumentType.TYPE) String type, + @Argument(ArgumentType.REGEX) String pattern, + @Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive, + @Argument(ArgumentType.INTEGER) int group, + @Argument(ArgumentType.REGEX) String withoutPattern) { + + Pattern compiledWithoutPattern = null; + + if (withoutPattern != null) { + compiledWithoutPattern = Patterns.getCompiledPattern(withoutPattern, patternCaseInsensitive); + } Pattern compiledPattern = Patterns.getCompiledPattern(pattern, patternCaseInsensitive); @@ -146,6 +162,13 @@ public class Section { continue; } + if(withoutPattern != null) { + Matcher matcherWithout = compiledWithoutPattern.matcher(entity.getWord()); + if (matcherWithout.find()) { + continue; + } + } + Matcher matcher = compiledPattern.matcher(entity.getTextAfter()); while (matcher.find()) { diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java index 6ab19860..05742bbf 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java @@ -1304,6 +1304,30 @@ public class RedactionIntegrationTest { } + @Test + public void testExpandByRegEx() throws IOException { + + System.out.println("expandByRegex"); + long start = System.currentTimeMillis(); + + AnalyzeRequest request = prepareStorage("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf"); + analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId())); + AnalyzeResult result = analyzeService.analyze(request); + + AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder() + .dossierId(TEST_DOSSIER_ID) + .fileId(TEST_FILE_ID) + .build()); + + try (FileOutputStream fileOutputStream = new FileOutputStream(getTemporaryDirectory() + "/Annotated.pdf")) { + fileOutputStream.write(annotateResponse.getDocument()); + } + long end = System.currentTimeMillis(); + + System.out.println("duration: " + (end - start)); + System.out.println("numberOfPages: " + result.getNumberOfPages()); + } + private static String loadFromClassPath(String path) { diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/allAuthors.drl b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/allAuthors.drl index 6d05e185..32621953 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/allAuthors.drl +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/allAuthors.drl @@ -7,12 +7,20 @@ global Section section // --------------------------------------- CBI rules ------------------------------------------------------------------- +//rule "0: Expand CBI Authors with firstname initials" +// when +// Section(matchesType("CBI_author") || matchesType("recommendation_CBI_author")) +// then +// section.expandByRegEx("CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1); +// section.expandByRegEx("recommendation_CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1); +// end + rule "0: Expand CBI Authors with firstname initials" when Section(matchesType("CBI_author") || matchesType("recommendation_CBI_author")) then - section.expandByRegEx("CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1); - section.expandByRegEx("recommendation_CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1); + section.expandByRegEx("CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1, "[^\\s]+"); + section.expandByRegEx("recommendation_CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1, "[^\\s]+"); end diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl index bf9cb602..c0b96ff9 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl @@ -7,12 +7,20 @@ global Section section // --------------------------------------- CBI rules ------------------------------------------------------------------- +//rule "0: Expand CBI Authors with firstname initials" +// when +// Section(matchesType("CBI_author") || matchesType("recommendation_CBI_author")) +// then +// section.expandByRegEx("CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1); +// section.expandByRegEx("recommendation_CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1); +// end + rule "0: Expand CBI Authors with firstname initials" when Section(matchesType("CBI_author") || matchesType("recommendation_CBI_author")) then - section.expandByRegEx("CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1); - section.expandByRegEx("recommendation_CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1); + section.expandByRegEx("CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1, "[^\\s]+"); + section.expandByRegEx("recommendation_CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1, "[^\\s]+"); end