diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java index f96608c2..17a5fad6 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java @@ -98,7 +98,8 @@ public class Section { } } - EntitySearchUtils.addEntitiesWithHigherRank(entities, finalResult, dictionary); + var nonOverlapping = EntitySearchUtils.findNonOverlappingMatchEntities(entities, finalResult); + EntitySearchUtils.addEntitiesWithHigherRank(entities, nonOverlapping, dictionary); EntitySearchUtils.removeEntitiesContainedInLarger(entities); nerEntities.removeAll(entitiesOfType); } @@ -162,7 +163,8 @@ public class Section { } if (!found.isEmpty()) { - EntitySearchUtils.addEntitiesWithHigherRank(entities, found, dictionary); + var nonOverlapping = EntitySearchUtils.findNonOverlappingMatchEntities(entities, found); + EntitySearchUtils.addEntitiesWithHigherRank(entities, nonOverlapping, dictionary); EntitySearchUtils.removeEntitiesContainedInLarger(entities); } } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java index 8c2c5abe..159d1383 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java @@ -620,9 +620,13 @@ public class RedactionIntegrationTest { @Test public void redactionTest() throws IOException { - String fileName = "files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; + String fileName = "files/new/crafted document.pdf"; String outputFileName = OsUtils.getTemporaryDirectory() + "/Annotated.pdf"; + ClassPathResource responseJson = new ClassPathResource("files/crafted_document.NER_ENTITIES.json"); + var bytes = IOUtils.toByteArray(responseJson.getInputStream()); + storageService.storeObject(RedactionStorageService.StorageIdUtils.getStorageId(TEST_DOSSIER_ID, TEST_FILE_ID, FileType.NER_ENTITIES), bytes); + long start = System.currentTimeMillis(); ClassPathResource pdfFileResource = new ClassPathResource(fileName); AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream()); diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/application.yml b/redaction-service-v1/redaction-service-server-v1/src/test/resources/application.yml index 2c4ff414..613880d8 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/application.yml +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/application.yml @@ -17,7 +17,7 @@ platform.multi-tenancy: redaction-service: enable-image-classification: false - enable-entity-recognition: false + enable-entity-recognition: true storage: signer-type: 'AWSS3V4SignerType' diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/PII.txt b/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/PII.txt index 6c81517d..f3fd7713 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/PII.txt +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/PII.txt @@ -3,3 +3,9 @@ Report Number: 33168 Page Report Number: BFI0714 Tesh Consultants International +B. Rahim +C. J. Alfred +Naka-27 Aomachi, Nomi, Ishikawa 923-1101, Japan, JP +Özgür U. Reyhan +Sude Halide Nurullah +Xinyi Y. Tao diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/crafted_document.NER_ENTITIES.json b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/crafted_document.NER_ENTITIES.json new file mode 100644 index 00000000..1d2efd11 --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/crafted_document.NER_ENTITIES.json @@ -0,0 +1,872 @@ +{ + "result": { + "1": [ + { + "value": "Lastname, J.", + "startOffset": 54, + "endOffset": 66, + "type": "CBI_author" + }, + { + "value": "Doe, M.", + "startOffset": 67, + "endOffset": 74, + "type": "CBI_author" + }, + { + "value": "Mustermann Lastname M.", + "startOffset": 75, + "endOffset": 97, + "type": "CBI_author" + }, + { + "value": "Doe J. Mustermann M.", + "startOffset": 99, + "endOffset": 119, + "type": "CBI_author" + } + ], + "2": [ + { + "value": "Eikenboom Charalampos", + "startOffset": 148, + "endOffset": 169, + "type": "ORG" + }, + { + "value": "Schenk Tanja Schmitt ←", + "startOffset": 170, + "endOffset": 192, + "type": "ORG" + } + ], + "3": [ + { + "value": "Rue Jean Baffier", + "startOffset": 214, + "endOffset": 230, + "type": "CBI_author" + }, + { + "value": "7232", + "startOffset": 155, + "endOffset": 159, + "type": "POSTAL" + }, + { + "value": "CX", + "startOffset": 160, + "endOffset": 162, + "type": "COUNTRY" + }, + { + "value": "Warnsveld", + "startOffset": 163, + "endOffset": 172, + "type": "CITY" + }, + { + "value": "Netherlands", + "startOffset": 174, + "endOffset": 185, + "type": "COUNTRY" + }, + { + "value": "NL", + "startOffset": 187, + "endOffset": 189, + "type": "COUNTRY" + }, + { + "value": "Institut Industries", + "startOffset": 190, + "endOffset": 209, + "type": "ORG" + }, + { + "value": "33", + "startOffset": 211, + "endOffset": 213, + "type": "CARDINAL" + }, + { + "value": "Rue Jean Baffier", + "startOffset": 214, + "endOffset": 230, + "type": "STREET" + }, + { + "value": "18000", + "startOffset": 232, + "endOffset": 237, + "type": "CARDINAL" + }, + { + "value": "Bourges", + "startOffset": 238, + "endOffset": 245, + "type": "CITY" + }, + { + "value": "France", + "startOffset": 247, + "endOffset": 253, + "type": "COUNTRY" + }, + { + "value": "18300", + "startOffset": 282, + "endOffset": 287, + "type": "CARDINAL" + }, + { + "value": "Saint-Satur", + "startOffset": 288, + "endOffset": 299, + "type": "CITY" + }, + { + "value": "France", + "startOffset": 301, + "endOffset": 307, + "type": "COUNTRY" + }, + { + "value": "Lesdo Industries", + "startOffset": 312, + "endOffset": 328, + "type": "ORG" + }, + { + "value": "Chäppelisträssli", + "startOffset": 330, + "endOffset": 346, + "type": "ORG" + }, + { + "value": "6078", + "startOffset": 348, + "endOffset": 352, + "type": "POSTAL" + }, + { + "value": "Lungern", + "startOffset": 353, + "endOffset": 360, + "type": "STREET" + }, + { + "value": "Switzerland", + "startOffset": 362, + "endOffset": 373, + "type": "COUNTRY" + }, + { + "value": "Shlissel'burgskaya Ulitsa", + "startOffset": 374, + "endOffset": 399, + "type": "ORG" + }, + { + "value": "Nizhny Novgorod Oblast", + "startOffset": 401, + "endOffset": 423, + "type": "STREET" + }, + { + "value": "Russia", + "startOffset": 425, + "endOffset": 431, + "type": "CITY" + }, + { + "value": "603034", + "startOffset": 433, + "endOffset": 439, + "type": "POSTAL" + }, + { + "value": "RU", + "startOffset": 441, + "endOffset": 443, + "type": "COUNTRY" + }, + { + "value": "Karl Johans Gate", + "startOffset": 444, + "endOffset": 460, + "type": "STREET" + }, + { + "value": "11", + "startOffset": 461, + "endOffset": 463, + "type": "CARDINAL" + }, + { + "value": "0154", + "startOffset": 465, + "endOffset": 469, + "type": "POSTAL" + }, + { + "value": "Oslo", + "startOffset": 470, + "endOffset": 474, + "type": "CITY" + }, + { + "value": "Norway", + "startOffset": 476, + "endOffset": 482, + "type": "COUNTRY" + } + ], + "4": [ + { + "value": "Expand", + "startOffset": 67, + "endOffset": 73, + "type": "STATE" + }, + { + "value": "Hint Clarissa", + "startOffset": 77, + "endOffset": 90, + "type": "ORG" + }, + { + "value": "Dict", + "startOffset": 114, + "endOffset": 118, + "type": "ORG" + }, + { + "value": "Authors-Dict", + "startOffset": 171, + "endOffset": 183, + "type": "ORG" + } + ], + "6": [ + { + "value": "Michael N.", + "startOffset": 7, + "endOffset": 17, + "type": "CBI_author" + }, + { + "value": "Weyland Industries", + "startOffset": 76, + "endOffset": 94, + "type": "ORG" + } + ], + "7": [ + { + "value": "Funnarie B.", + "startOffset": 7, + "endOffset": 18, + "type": "CBI_author" + }, + { + "value": "Authentic Diagnostics", + "startOffset": 73, + "endOffset": 94, + "type": "ORG" + } + ], + "8": [ + { + "value": "Feuer A.", + "startOffset": 6, + "endOffset": 14, + "type": "CBI_author" + }, + { + "value": "Tyrell Corporation", + "startOffset": 74, + "endOffset": 92, + "type": "ORG" + } + ], + "12": [ + { + "value": "Melanie", + "startOffset": 292, + "endOffset": 299, + "type": "CBI_author" + } + ], + "13": [ + { + "value": "Stark Industries", + "startOffset": 184, + "endOffset": 200, + "type": "ORG" + } + ], + "14": [ + { + "value": "Omni Consumer Products Do", + "startOffset": 197, + "endOffset": 222, + "type": "ORG" + } + ], + "15": [ + { + "value": "Omni Consumer Products Do", + "startOffset": 122, + "endOffset": 147, + "type": "ORG" + } + ], + "16": [ + { + "value": "Asya Lyon", + "startOffset": 253, + "endOffset": 262, + "type": "CBI_author" + }, + { + "value": "Carina Madsen", + "startOffset": 264, + "endOffset": 277, + "type": "CBI_author" + }, + { + "value": "Alexandra Häusler", + "startOffset": 279, + "endOffset": 296, + "type": "CBI_author" + }, + { + "value": "Hanke Mendel", + "startOffset": 298, + "endOffset": 310, + "type": "CBI_author" + }, + { + "value": "Kwok, Jun K.", + "startOffset": 444, + "endOffset": 456, + "type": "CBI_author" + }, + { + "value": "Tu Wong", + "startOffset": 458, + "endOffset": 465, + "type": "CBI_author" + }, + { + "value": "Qiang Suen", + "startOffset": 467, + "endOffset": 477, + "type": "CBI_author" + }, + { + "value": "Zhou Mah", + "startOffset": 479, + "endOffset": 487, + "type": "CBI_author" + }, + { + "value": "Lei W. Huang", + "startOffset": 499, + "endOffset": 511, + "type": "CBI_author" + }, + { + "value": "Ru X.", + "startOffset": 513, + "endOffset": 518, + "type": "CBI_author" + }, + { + "value": "Oxford University Press", + "startOffset": 166, + "endOffset": 189, + "type": "ORG" + }, + { + "value": "Iakovos Geiger", + "startOffset": 222, + "endOffset": 236, + "type": "ORG" + }, + { + "value": "Julian Ritter", + "startOffset": 238, + "endOffset": 251, + "type": "CITY" + }, + { + "value": "Asya Lyon", + "startOffset": 253, + "endOffset": 262, + "type": "ORG" + }, + { + "value": "Carina Madsen", + "startOffset": 264, + "endOffset": 277, + "type": "CITY" + }, + { + "value": "Alexandra Häusler", + "startOffset": 279, + "endOffset": 296, + "type": "ORG" + }, + { + "value": "Hanke Mendel", + "startOffset": 298, + "endOffset": 310, + "type": "ORG" + }, + { + "value": "Ranya", + "startOffset": 312, + "endOffset": 317, + "type": "COUNTRY" + }, + { + "value": "Eikenboom", + "startOffset": 318, + "endOffset": 327, + "type": "ORG" + }, + { + "value": "Min Kwok", + "startOffset": 440, + "endOffset": 448, + "type": "ORG" + }, + { + "value": "Qiang Suen", + "startOffset": 467, + "endOffset": 477, + "type": "CITY" + }, + { + "value": "Zhou Mah", + "startOffset": 479, + "endOffset": 487, + "type": "ORG" + }, + { + "value": "Ning Liu", + "startOffset": 489, + "endOffset": 497, + "type": "STREET" + }, + { + "value": "Lei W. Huang, Ru X. Wu", + "startOffset": 499, + "endOffset": 521, + "type": "ORG" + } + ], + "17": [ + { + "value": "Nurullah Özgür", + "startOffset": 210, + "endOffset": 224, + "type": "CBI_author" + }, + { + "value": "Reyhan B.", + "startOffset": 228, + "endOffset": 237, + "type": "CBI_author" + }, + { + "value": "Alfred Xinyi Y.", + "startOffset": 250, + "endOffset": 265, + "type": "CBI_author" + }, + { + "value": "Redacted", + "startOffset": 12, + "endOffset": 20, + "type": "ORG" + }, + { + "value": "Aomachi", + "startOffset": 154, + "endOffset": 161, + "type": "ORG" + }, + { + "value": "Nomi", + "startOffset": 163, + "endOffset": 167, + "type": "ORG" + }, + { + "value": "Ishikawa", + "startOffset": 169, + "endOffset": 177, + "type": "CITY" + }, + { + "value": "923-1101", + "startOffset": 178, + "endOffset": 186, + "type": "CARDINAL" + }, + { + "value": "Japan", + "startOffset": 188, + "endOffset": 193, + "type": "COUNTRY" + }, + { + "value": "JP", + "startOffset": 195, + "endOffset": 197, + "type": "COUNTRY" + }, + { + "value": "Sude Halide Nurullah Özgür U. Reyhan B. Rahim C. J. Alfred Xinyi Y. Tao Clara Siegfried", + "startOffset": 198, + "endOffset": 285, + "type": "ORG" + }, + { + "value": "Dict", + "startOffset": 301, + "endOffset": 305, + "type": "ORG" + } + ], + "18": [ + { + "value": "Redact Emails", + "startOffset": 12, + "endOffset": 25, + "type": "ORG" + } + ], + "19": [ + { + "value": "Central Research Industry", + "startOffset": 274, + "endOffset": 299, + "type": "ORG" + }, + { + "value": "Maximiliam Schmitt", + "startOffset": 530, + "endOffset": 548, + "type": "ORG" + }, + { + "value": "European Central Institute", + "startOffset": 727, + "endOffset": 753, + "type": "ORG" + }, + { + "value": "Emilia Lockhart Alternative", + "startOffset": 775, + "endOffset": 802, + "type": "ORG" + }, + { + "value": "Cyberdyne Systems Tower", + "startOffset": 812, + "endOffset": 835, + "type": "ORG" + }, + { + "value": "121a", + "startOffset": 844, + "endOffset": 848, + "type": "CARDINAL" + }, + { + "value": "Hong Kong", + "startOffset": 849, + "endOffset": 858, + "type": "COUNTRY" + }, + { + "value": "BT", + "startOffset": 860, + "endOffset": 862, + "type": "COUNTRY" + } + ], + "21": [ + { + "value": "Central Research Industry", + "startOffset": 270, + "endOffset": 295, + "type": "ORG" + }, + { + "value": "Maximiliam Schmitt", + "startOffset": 526, + "endOffset": 544, + "type": "ORG" + }, + { + "value": "European Central Institute", + "startOffset": 723, + "endOffset": 749, + "type": "ORG" + }, + { + "value": "Emilia Lockhart Alternative", + "startOffset": 771, + "endOffset": 798, + "type": "ORG" + }, + { + "value": "Cyberdyne Systems Tower", + "startOffset": 808, + "endOffset": 831, + "type": "ORG" + }, + { + "value": "121a", + "startOffset": 840, + "endOffset": 844, + "type": "CARDINAL" + }, + { + "value": "Hong Kong", + "startOffset": 845, + "endOffset": 854, + "type": "COUNTRY" + }, + { + "value": "BT", + "startOffset": 856, + "endOffset": 858, + "type": "COUNTRY" + } + ], + "22": [ + { + "value": "Riddley", + "startOffset": 236, + "endOffset": 243, + "type": "ORG" + }, + { + "value": "359-21", + "startOffset": 259, + "endOffset": 265, + "type": "CARDINAL" + }, + { + "value": "Huam-dong", + "startOffset": 266, + "endOffset": 275, + "type": "STATE" + }, + { + "value": "Yongsan-gu", + "startOffset": 276, + "endOffset": 286, + "type": "CITY" + }, + { + "value": "Seoul", + "startOffset": 287, + "endOffset": 292, + "type": "CITY" + }, + { + "value": "South Korea Phone", + "startOffset": 294, + "endOffset": 311, + "type": "COUNTRY" + }, + { + "value": "Soylent Corporation", + "startOffset": 500, + "endOffset": 519, + "type": "ORG" + } + ], + "23": [ + { + "value": "Umbrella Corporation", + "startOffset": 208, + "endOffset": 228, + "type": "ORG" + }, + { + "value": "Jill", + "startOffset": 238, + "endOffset": 242, + "type": "ORG" + }, + { + "value": "359-21", + "startOffset": 262, + "endOffset": 268, + "type": "CARDINAL" + }, + { + "value": "Huam-dong", + "startOffset": 269, + "endOffset": 278, + "type": "STATE" + }, + { + "value": "Yongsan-gu", + "startOffset": 279, + "endOffset": 289, + "type": "CITY" + }, + { + "value": "Seoul", + "startOffset": 290, + "endOffset": 295, + "type": "CITY" + }, + { + "value": "South Korea Phone", + "startOffset": 297, + "endOffset": 314, + "type": "COUNTRY" + } + ], + "26": [ + { + "value": "Umbrella Corporation", + "startOffset": 209, + "endOffset": 229, + "type": "ORG" + } + ], + "28": [ + { + "value": "Purity Hint", + "startOffset": 9, + "endOffset": 20, + "type": "ORG" + }, + { + "value": "Hint", + "startOffset": 35, + "endOffset": 39, + "type": "ORG" + } + ], + "29": [ + { + "value": "Ignore", + "startOffset": 9, + "endOffset": 15, + "type": "STREET" + } + ], + "30": [ + { + "value": "Redact Signatures Redact", + "startOffset": 12, + "endOffset": 36, + "type": "ORG" + }, + { + "value": "Dilara", + "startOffset": 139, + "endOffset": 145, + "type": "ORG" + }, + { + "value": "Tobias Müller", + "startOffset": 197, + "endOffset": 210, + "type": "ORG" + } + ], + "31": [ + { + "value": "Redact Logo Redact", + "startOffset": 9, + "endOffset": 27, + "type": "ORG" + } + ], + "41": [ + { + "value": "Page-Footer", + "startOffset": 10, + "endOffset": 21, + "type": "STREET" + } + ], + "42": [ + { + "value": "Page-Footer", + "startOffset": 10, + "endOffset": 21, + "type": "STREET" + } + ], + "43": [ + { + "value": "Page-Footer", + "startOffset": 10, + "endOffset": 21, + "type": "STREET" + } + ], + "44": [ + { + "value": "Page-Footer", + "startOffset": 10, + "endOffset": 21, + "type": "STREET" + } + ], + "45": [ + { + "value": "Page-Footer", + "startOffset": 10, + "endOffset": 21, + "type": "STREET" + } + ], + "46": [ + { + "value": "Page-Footer", + "startOffset": 10, + "endOffset": 21, + "type": "STREET" + } + ], + "47": [ + { + "value": "Page-Footer", + "startOffset": 10, + "endOffset": 21, + "type": "STREET" + } + ], + "48": [ + { + "value": "Page-Footer", + "startOffset": 10, + "endOffset": 21, + "type": "STREET" + } + ], + "49": [ + { + "value": "Page-Footer", + "startOffset": 10, + "endOffset": 21, + "type": "STREET" + } + ] + } +} \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/new/crafted document.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/new/crafted document.pdf new file mode 100644 index 00000000..be18a143 Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/new/crafted document.pdf differ