RED-3282: Do not add overlapping AI recommendations

This commit is contained in:
deiflaender 2022-02-25 09:29:08 +01:00
parent fd027efa5c
commit 537ccb83d7
6 changed files with 888 additions and 4 deletions

View File

@ -98,7 +98,8 @@ public class Section {
}
}
EntitySearchUtils.addEntitiesWithHigherRank(entities, finalResult, dictionary);
var nonOverlapping = EntitySearchUtils.findNonOverlappingMatchEntities(entities, finalResult);
EntitySearchUtils.addEntitiesWithHigherRank(entities, nonOverlapping, dictionary);
EntitySearchUtils.removeEntitiesContainedInLarger(entities);
nerEntities.removeAll(entitiesOfType);
}
@ -162,7 +163,8 @@ public class Section {
}
if (!found.isEmpty()) {
EntitySearchUtils.addEntitiesWithHigherRank(entities, found, dictionary);
var nonOverlapping = EntitySearchUtils.findNonOverlappingMatchEntities(entities, found);
EntitySearchUtils.addEntitiesWithHigherRank(entities, nonOverlapping, dictionary);
EntitySearchUtils.removeEntitiesContainedInLarger(entities);
}
}

View File

@ -620,9 +620,13 @@ public class RedactionIntegrationTest {
@Test
public void redactionTest() throws IOException {
String fileName = "files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
String fileName = "files/new/crafted document.pdf";
String outputFileName = OsUtils.getTemporaryDirectory() + "/Annotated.pdf";
ClassPathResource responseJson = new ClassPathResource("files/crafted_document.NER_ENTITIES.json");
var bytes = IOUtils.toByteArray(responseJson.getInputStream());
storageService.storeObject(RedactionStorageService.StorageIdUtils.getStorageId(TEST_DOSSIER_ID, TEST_FILE_ID, FileType.NER_ENTITIES), bytes);
long start = System.currentTimeMillis();
ClassPathResource pdfFileResource = new ClassPathResource(fileName);
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());

View File

@ -17,7 +17,7 @@ platform.multi-tenancy:
redaction-service:
enable-image-classification: false
enable-entity-recognition: false
enable-entity-recognition: true
storage:
signer-type: 'AWSS3V4SignerType'

View File

@ -3,3 +3,9 @@ Report Number: 33168
Page
Report Number: BFI0714
Tesh Consultants International
B. Rahim
C. J. Alfred
Naka-27 Aomachi, Nomi, Ishikawa 923-1101, Japan, JP
Özgür U. Reyhan
Sude Halide Nurullah
Xinyi Y. Tao

View File

@ -0,0 +1,872 @@
{
"result": {
"1": [
{
"value": "Lastname, J.",
"startOffset": 54,
"endOffset": 66,
"type": "CBI_author"
},
{
"value": "Doe, M.",
"startOffset": 67,
"endOffset": 74,
"type": "CBI_author"
},
{
"value": "Mustermann Lastname M.",
"startOffset": 75,
"endOffset": 97,
"type": "CBI_author"
},
{
"value": "Doe J. Mustermann M.",
"startOffset": 99,
"endOffset": 119,
"type": "CBI_author"
}
],
"2": [
{
"value": "Eikenboom Charalampos",
"startOffset": 148,
"endOffset": 169,
"type": "ORG"
},
{
"value": "Schenk Tanja Schmitt ←",
"startOffset": 170,
"endOffset": 192,
"type": "ORG"
}
],
"3": [
{
"value": "Rue Jean Baffier",
"startOffset": 214,
"endOffset": 230,
"type": "CBI_author"
},
{
"value": "7232",
"startOffset": 155,
"endOffset": 159,
"type": "POSTAL"
},
{
"value": "CX",
"startOffset": 160,
"endOffset": 162,
"type": "COUNTRY"
},
{
"value": "Warnsveld",
"startOffset": 163,
"endOffset": 172,
"type": "CITY"
},
{
"value": "Netherlands",
"startOffset": 174,
"endOffset": 185,
"type": "COUNTRY"
},
{
"value": "NL",
"startOffset": 187,
"endOffset": 189,
"type": "COUNTRY"
},
{
"value": "Institut Industries",
"startOffset": 190,
"endOffset": 209,
"type": "ORG"
},
{
"value": "33",
"startOffset": 211,
"endOffset": 213,
"type": "CARDINAL"
},
{
"value": "Rue Jean Baffier",
"startOffset": 214,
"endOffset": 230,
"type": "STREET"
},
{
"value": "18000",
"startOffset": 232,
"endOffset": 237,
"type": "CARDINAL"
},
{
"value": "Bourges",
"startOffset": 238,
"endOffset": 245,
"type": "CITY"
},
{
"value": "France",
"startOffset": 247,
"endOffset": 253,
"type": "COUNTRY"
},
{
"value": "18300",
"startOffset": 282,
"endOffset": 287,
"type": "CARDINAL"
},
{
"value": "Saint-Satur",
"startOffset": 288,
"endOffset": 299,
"type": "CITY"
},
{
"value": "France",
"startOffset": 301,
"endOffset": 307,
"type": "COUNTRY"
},
{
"value": "Lesdo Industries",
"startOffset": 312,
"endOffset": 328,
"type": "ORG"
},
{
"value": "Chäppelisträssli",
"startOffset": 330,
"endOffset": 346,
"type": "ORG"
},
{
"value": "6078",
"startOffset": 348,
"endOffset": 352,
"type": "POSTAL"
},
{
"value": "Lungern",
"startOffset": 353,
"endOffset": 360,
"type": "STREET"
},
{
"value": "Switzerland",
"startOffset": 362,
"endOffset": 373,
"type": "COUNTRY"
},
{
"value": "Shlissel'burgskaya Ulitsa",
"startOffset": 374,
"endOffset": 399,
"type": "ORG"
},
{
"value": "Nizhny Novgorod Oblast",
"startOffset": 401,
"endOffset": 423,
"type": "STREET"
},
{
"value": "Russia",
"startOffset": 425,
"endOffset": 431,
"type": "CITY"
},
{
"value": "603034",
"startOffset": 433,
"endOffset": 439,
"type": "POSTAL"
},
{
"value": "RU",
"startOffset": 441,
"endOffset": 443,
"type": "COUNTRY"
},
{
"value": "Karl Johans Gate",
"startOffset": 444,
"endOffset": 460,
"type": "STREET"
},
{
"value": "11",
"startOffset": 461,
"endOffset": 463,
"type": "CARDINAL"
},
{
"value": "0154",
"startOffset": 465,
"endOffset": 469,
"type": "POSTAL"
},
{
"value": "Oslo",
"startOffset": 470,
"endOffset": 474,
"type": "CITY"
},
{
"value": "Norway",
"startOffset": 476,
"endOffset": 482,
"type": "COUNTRY"
}
],
"4": [
{
"value": "Expand",
"startOffset": 67,
"endOffset": 73,
"type": "STATE"
},
{
"value": "Hint Clarissa",
"startOffset": 77,
"endOffset": 90,
"type": "ORG"
},
{
"value": "Dict",
"startOffset": 114,
"endOffset": 118,
"type": "ORG"
},
{
"value": "Authors-Dict",
"startOffset": 171,
"endOffset": 183,
"type": "ORG"
}
],
"6": [
{
"value": "Michael N.",
"startOffset": 7,
"endOffset": 17,
"type": "CBI_author"
},
{
"value": "Weyland Industries",
"startOffset": 76,
"endOffset": 94,
"type": "ORG"
}
],
"7": [
{
"value": "Funnarie B.",
"startOffset": 7,
"endOffset": 18,
"type": "CBI_author"
},
{
"value": "Authentic Diagnostics",
"startOffset": 73,
"endOffset": 94,
"type": "ORG"
}
],
"8": [
{
"value": "Feuer A.",
"startOffset": 6,
"endOffset": 14,
"type": "CBI_author"
},
{
"value": "Tyrell Corporation",
"startOffset": 74,
"endOffset": 92,
"type": "ORG"
}
],
"12": [
{
"value": "Melanie",
"startOffset": 292,
"endOffset": 299,
"type": "CBI_author"
}
],
"13": [
{
"value": "Stark Industries",
"startOffset": 184,
"endOffset": 200,
"type": "ORG"
}
],
"14": [
{
"value": "Omni Consumer Products Do",
"startOffset": 197,
"endOffset": 222,
"type": "ORG"
}
],
"15": [
{
"value": "Omni Consumer Products Do",
"startOffset": 122,
"endOffset": 147,
"type": "ORG"
}
],
"16": [
{
"value": "Asya Lyon",
"startOffset": 253,
"endOffset": 262,
"type": "CBI_author"
},
{
"value": "Carina Madsen",
"startOffset": 264,
"endOffset": 277,
"type": "CBI_author"
},
{
"value": "Alexandra Häusler",
"startOffset": 279,
"endOffset": 296,
"type": "CBI_author"
},
{
"value": "Hanke Mendel",
"startOffset": 298,
"endOffset": 310,
"type": "CBI_author"
},
{
"value": "Kwok, Jun K.",
"startOffset": 444,
"endOffset": 456,
"type": "CBI_author"
},
{
"value": "Tu Wong",
"startOffset": 458,
"endOffset": 465,
"type": "CBI_author"
},
{
"value": "Qiang Suen",
"startOffset": 467,
"endOffset": 477,
"type": "CBI_author"
},
{
"value": "Zhou Mah",
"startOffset": 479,
"endOffset": 487,
"type": "CBI_author"
},
{
"value": "Lei W. Huang",
"startOffset": 499,
"endOffset": 511,
"type": "CBI_author"
},
{
"value": "Ru X.",
"startOffset": 513,
"endOffset": 518,
"type": "CBI_author"
},
{
"value": "Oxford University Press",
"startOffset": 166,
"endOffset": 189,
"type": "ORG"
},
{
"value": "Iakovos Geiger",
"startOffset": 222,
"endOffset": 236,
"type": "ORG"
},
{
"value": "Julian Ritter",
"startOffset": 238,
"endOffset": 251,
"type": "CITY"
},
{
"value": "Asya Lyon",
"startOffset": 253,
"endOffset": 262,
"type": "ORG"
},
{
"value": "Carina Madsen",
"startOffset": 264,
"endOffset": 277,
"type": "CITY"
},
{
"value": "Alexandra Häusler",
"startOffset": 279,
"endOffset": 296,
"type": "ORG"
},
{
"value": "Hanke Mendel",
"startOffset": 298,
"endOffset": 310,
"type": "ORG"
},
{
"value": "Ranya",
"startOffset": 312,
"endOffset": 317,
"type": "COUNTRY"
},
{
"value": "Eikenboom",
"startOffset": 318,
"endOffset": 327,
"type": "ORG"
},
{
"value": "Min Kwok",
"startOffset": 440,
"endOffset": 448,
"type": "ORG"
},
{
"value": "Qiang Suen",
"startOffset": 467,
"endOffset": 477,
"type": "CITY"
},
{
"value": "Zhou Mah",
"startOffset": 479,
"endOffset": 487,
"type": "ORG"
},
{
"value": "Ning Liu",
"startOffset": 489,
"endOffset": 497,
"type": "STREET"
},
{
"value": "Lei W. Huang, Ru X. Wu",
"startOffset": 499,
"endOffset": 521,
"type": "ORG"
}
],
"17": [
{
"value": "Nurullah Özgür",
"startOffset": 210,
"endOffset": 224,
"type": "CBI_author"
},
{
"value": "Reyhan B.",
"startOffset": 228,
"endOffset": 237,
"type": "CBI_author"
},
{
"value": "Alfred Xinyi Y.",
"startOffset": 250,
"endOffset": 265,
"type": "CBI_author"
},
{
"value": "Redacted",
"startOffset": 12,
"endOffset": 20,
"type": "ORG"
},
{
"value": "Aomachi",
"startOffset": 154,
"endOffset": 161,
"type": "ORG"
},
{
"value": "Nomi",
"startOffset": 163,
"endOffset": 167,
"type": "ORG"
},
{
"value": "Ishikawa",
"startOffset": 169,
"endOffset": 177,
"type": "CITY"
},
{
"value": "923-1101",
"startOffset": 178,
"endOffset": 186,
"type": "CARDINAL"
},
{
"value": "Japan",
"startOffset": 188,
"endOffset": 193,
"type": "COUNTRY"
},
{
"value": "JP",
"startOffset": 195,
"endOffset": 197,
"type": "COUNTRY"
},
{
"value": "Sude Halide Nurullah Özgür U. Reyhan B. Rahim C. J. Alfred Xinyi Y. Tao Clara Siegfried",
"startOffset": 198,
"endOffset": 285,
"type": "ORG"
},
{
"value": "Dict",
"startOffset": 301,
"endOffset": 305,
"type": "ORG"
}
],
"18": [
{
"value": "Redact Emails",
"startOffset": 12,
"endOffset": 25,
"type": "ORG"
}
],
"19": [
{
"value": "Central Research Industry",
"startOffset": 274,
"endOffset": 299,
"type": "ORG"
},
{
"value": "Maximiliam Schmitt",
"startOffset": 530,
"endOffset": 548,
"type": "ORG"
},
{
"value": "European Central Institute",
"startOffset": 727,
"endOffset": 753,
"type": "ORG"
},
{
"value": "Emilia Lockhart Alternative",
"startOffset": 775,
"endOffset": 802,
"type": "ORG"
},
{
"value": "Cyberdyne Systems Tower",
"startOffset": 812,
"endOffset": 835,
"type": "ORG"
},
{
"value": "121a",
"startOffset": 844,
"endOffset": 848,
"type": "CARDINAL"
},
{
"value": "Hong Kong",
"startOffset": 849,
"endOffset": 858,
"type": "COUNTRY"
},
{
"value": "BT",
"startOffset": 860,
"endOffset": 862,
"type": "COUNTRY"
}
],
"21": [
{
"value": "Central Research Industry",
"startOffset": 270,
"endOffset": 295,
"type": "ORG"
},
{
"value": "Maximiliam Schmitt",
"startOffset": 526,
"endOffset": 544,
"type": "ORG"
},
{
"value": "European Central Institute",
"startOffset": 723,
"endOffset": 749,
"type": "ORG"
},
{
"value": "Emilia Lockhart Alternative",
"startOffset": 771,
"endOffset": 798,
"type": "ORG"
},
{
"value": "Cyberdyne Systems Tower",
"startOffset": 808,
"endOffset": 831,
"type": "ORG"
},
{
"value": "121a",
"startOffset": 840,
"endOffset": 844,
"type": "CARDINAL"
},
{
"value": "Hong Kong",
"startOffset": 845,
"endOffset": 854,
"type": "COUNTRY"
},
{
"value": "BT",
"startOffset": 856,
"endOffset": 858,
"type": "COUNTRY"
}
],
"22": [
{
"value": "Riddley",
"startOffset": 236,
"endOffset": 243,
"type": "ORG"
},
{
"value": "359-21",
"startOffset": 259,
"endOffset": 265,
"type": "CARDINAL"
},
{
"value": "Huam-dong",
"startOffset": 266,
"endOffset": 275,
"type": "STATE"
},
{
"value": "Yongsan-gu",
"startOffset": 276,
"endOffset": 286,
"type": "CITY"
},
{
"value": "Seoul",
"startOffset": 287,
"endOffset": 292,
"type": "CITY"
},
{
"value": "South Korea Phone",
"startOffset": 294,
"endOffset": 311,
"type": "COUNTRY"
},
{
"value": "Soylent Corporation",
"startOffset": 500,
"endOffset": 519,
"type": "ORG"
}
],
"23": [
{
"value": "Umbrella Corporation",
"startOffset": 208,
"endOffset": 228,
"type": "ORG"
},
{
"value": "Jill",
"startOffset": 238,
"endOffset": 242,
"type": "ORG"
},
{
"value": "359-21",
"startOffset": 262,
"endOffset": 268,
"type": "CARDINAL"
},
{
"value": "Huam-dong",
"startOffset": 269,
"endOffset": 278,
"type": "STATE"
},
{
"value": "Yongsan-gu",
"startOffset": 279,
"endOffset": 289,
"type": "CITY"
},
{
"value": "Seoul",
"startOffset": 290,
"endOffset": 295,
"type": "CITY"
},
{
"value": "South Korea Phone",
"startOffset": 297,
"endOffset": 314,
"type": "COUNTRY"
}
],
"26": [
{
"value": "Umbrella Corporation",
"startOffset": 209,
"endOffset": 229,
"type": "ORG"
}
],
"28": [
{
"value": "Purity Hint",
"startOffset": 9,
"endOffset": 20,
"type": "ORG"
},
{
"value": "Hint",
"startOffset": 35,
"endOffset": 39,
"type": "ORG"
}
],
"29": [
{
"value": "Ignore",
"startOffset": 9,
"endOffset": 15,
"type": "STREET"
}
],
"30": [
{
"value": "Redact Signatures Redact",
"startOffset": 12,
"endOffset": 36,
"type": "ORG"
},
{
"value": "Dilara",
"startOffset": 139,
"endOffset": 145,
"type": "ORG"
},
{
"value": "Tobias Müller",
"startOffset": 197,
"endOffset": 210,
"type": "ORG"
}
],
"31": [
{
"value": "Redact Logo Redact",
"startOffset": 9,
"endOffset": 27,
"type": "ORG"
}
],
"41": [
{
"value": "Page-Footer",
"startOffset": 10,
"endOffset": 21,
"type": "STREET"
}
],
"42": [
{
"value": "Page-Footer",
"startOffset": 10,
"endOffset": 21,
"type": "STREET"
}
],
"43": [
{
"value": "Page-Footer",
"startOffset": 10,
"endOffset": 21,
"type": "STREET"
}
],
"44": [
{
"value": "Page-Footer",
"startOffset": 10,
"endOffset": 21,
"type": "STREET"
}
],
"45": [
{
"value": "Page-Footer",
"startOffset": 10,
"endOffset": 21,
"type": "STREET"
}
],
"46": [
{
"value": "Page-Footer",
"startOffset": 10,
"endOffset": 21,
"type": "STREET"
}
],
"47": [
{
"value": "Page-Footer",
"startOffset": 10,
"endOffset": 21,
"type": "STREET"
}
],
"48": [
{
"value": "Page-Footer",
"startOffset": 10,
"endOffset": 21,
"type": "STREET"
}
],
"49": [
{
"value": "Page-Footer",
"startOffset": 10,
"endOffset": 21,
"type": "STREET"
}
]
}
}