From 3d5455d7297e5805ad30a578d00d6d1737820c7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thierry=20G=C3=B6ckel?= Date: Wed, 2 Sep 2020 14:31:38 +0200 Subject: [PATCH] Fix redaction in single cell tables --- .../service/EntityRedactionService.java | 3 +- .../service/EntityRedactionServiceTest.java | 62 +++++++++++++++++- .../src/test/resources/drools/rules.drl | 6 +- .../Applicant Producer Table.pdf | Bin 0 -> 8643 bytes 4 files changed, 65 insertions(+), 6 deletions(-) create mode 100644 redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Applicant Producer Table.pdf diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java index a5e7e0f4..b41f1123 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java @@ -52,12 +52,13 @@ public class EntityRedactionService { List tables = paragraph.getTables(); for (Table table : tables) { + boolean singleCellTable = table.getRowCount() == 1 && table.getColCount() == 1; for (List row : table.getRows()) { SearchableText searchableRow = new SearchableText(); Map tabularData = new HashMap<>(); int start = 0; for (Cell cell : row) { - if (cell.isHeaderCell() || CollectionUtils.isEmpty(cell.getTextBlocks())) { + if (!singleCellTable && cell.isHeaderCell() || CollectionUtils.isEmpty(cell.getTextBlocks())) { continue; } addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber); diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java index f29e83d3..736c98bc 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java @@ -57,6 +57,7 @@ public class EntityRedactionServiceTest { private static final String ADDRESS_CODE = "address"; private static final AtomicLong DICTIONARY_VERSION = new AtomicLong(); + private static final AtomicLong RULES_VERSION = new AtomicLong(); @MockBean private DictionaryClient dictionaryClient; @@ -69,6 +70,9 @@ public class EntityRedactionServiceTest { @Autowired private PdfSegmentationService pdfSegmentationService; + @Autowired + private DroolsExecutionService droolsExecutionService; + @TestConfiguration public static class RedactionIntegrationTestConfiguration { @@ -185,7 +189,7 @@ public class EntityRedactionServiceTest { try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); entityRedactionService.processDocument(classifiedDoc, null); - assertThat(classifiedDoc.getEntities()).hasSize(1); // two pages + assertThat(classifiedDoc.getEntities()).hasSize(1); // one page assertThat(classifiedDoc.getEntities().get(1).stream() .filter(entity -> entity.getMatchedRule() == 9) .count()).isEqualTo(10); @@ -194,6 +198,60 @@ public class EntityRedactionServiceTest { } + @Test + public void testApplicantInTableRedaction() throws IOException { + + String tableRules = "package drools\n" + + "\n" + + "import com.iqser.red.service.redaction.v1.server.redaction.model.Section\n" + + "\n" + + "global Section section\n" + + "rule \"6: Redact contact information if applicant is found\"\n" + + " when\n" + + " eval(section.headlineContainsWord(\"applicant\") || section.getText().contains(\"Applicant\"));\n" + + " then\n" + + " section.redactLineAfter(\"Name:\", \"address\", 6, \"Applicant information was found\");\n" + + " section.redactBetween(\"Address:\", \"Contact\", \"address\", 6, \"Applicant information was found\");\n" + + " section.redactLineAfter(\"Contact point:\", \"address\", 6, \"Applicant information was found\");\n" + + " section.redactLineAfter(\"Phone:\", \"address\", 6, \"Applicant information was found\");\n" + + " section.redactLineAfter(\"Fax:\", \"address\", 6, \"Applicant information was found\");\n" + + " section.redactLineAfter(\"Tel.:\", \"address\", 6, \"Applicant information was found\");\n" + + " section.redactLineAfter(\"Tel:\", \"address\", 6, \"Applicant information was found\");\n" + + " section.redactLineAfter(\"E-mail:\", \"address\", 6, \"Applicant information was found\");\n" + + " section.redactLineAfter(\"Email:\", \"address\", 6, \"Applicant information was found\");\n" + + " section.redactLineAfter(\"Contact:\", \"address\", 6, \"Applicant information was found\");\n" + + " section.redactLineAfter(\"Telephone number:\", \"address\", 6, \"Applicant information was found\");\n" + + " section.redactLineAfter(\"Fax number:\", \"address\", 6, \"Applicant information was found\");\n" + + " section.redactLineAfter(\"Telephone:\", \"address\", 6, \"Applicant information was found\");\n" + + " section.redactBetween(\"No:\", \"Fax\", \"address\", 6, \"Applicant information was found\");\n" + + " section.redactBetween(\"Contact:\", \"Tel.:\", \"address\", 6, \"Applicant information was found\");\n" + + " end"; + when(rulesClient.getVersion()).thenReturn(RULES_VERSION.incrementAndGet()); + when(rulesClient.getRules()).thenReturn(new RulesResponse(tableRules)); + droolsExecutionService.updateRules(); + + ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Applicant Producer Table.pdf"); + when(dictionaryClient.getVersion()).thenReturn(DICTIONARY_VERSION.incrementAndGet()); + DictionaryResponse dictionaryResponse = DictionaryResponse.builder() + .entries(new ArrayList<>(ResourceLoader.load("dictionaries/names.txt"))) + .build(); + when(dictionaryClient.getDictionaryForType(NAME_CODE)).thenReturn(dictionaryResponse); + DictionaryResponse addressResponse = DictionaryResponse.builder() + .entries(new ArrayList<>(ResourceLoader.load("dictionaries/addresses.txt"))) + .build(); + when(dictionaryClient.getDictionaryForType(ADDRESS_CODE)).thenReturn(addressResponse); + try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { + Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); + entityRedactionService.processDocument(classifiedDoc, null); + assertThat(classifiedDoc.getEntities()).hasSize(1); // one page + assertThat(classifiedDoc.getEntities().get(1).stream() + .filter(entity -> entity.getMatchedRule() == 6) + .count()).isEqualTo(18); + } + + } + + @Test public void headerPropagation() throws IOException { @@ -268,7 +326,7 @@ public class EntityRedactionServiceTest { " section.redact(\"address\", 9, \"Redacted because row is a vertebrate study\");\n" + " section.highlightCell(\"Vertebrate study Y/N\", 9, \"must_redact\");\n" + " end"; - when(rulesClient.getVersion()).thenReturn(1L); + when(rulesClient.getVersion()).thenReturn(RULES_VERSION.incrementAndGet()); when(rulesClient.getRules()).thenReturn(new RulesResponse(tableRules)); TypeResponse typeResponse = TypeResponse.builder() .types(Arrays.asList( diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl index e461ff8b..c991b0b5 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl @@ -49,7 +49,7 @@ rule "5: Do not redact in guideline sections" section.redactNot("address", 5, "Section is a guideline section."); end -rule "6: Redact contact information, if applicant is found" +rule "6: Redact contact information if applicant is found" when eval(section.headlineContainsWord("applicant") || section.getText().contains("Applicant")); then @@ -70,7 +70,7 @@ rule "6: Redact contact information, if applicant is found" section.redactBetween("Contact:", "Tel.:", "address", 6, "Applicant information was found"); end -rule "7: Redact contact information, if Producer is found" +rule "7: Redact contact information if Producer is found" when eval(section.getText().toLowerCase().contains("producer of the plant protection") || section.getText().toLowerCase().contains("producer of the active substance") || section.getText().contains("Manufacturer of the active substance") || section.getText().contains("Manufacturer:") || section.getText().contains("Producer or producers of the active substance")); then @@ -110,7 +110,7 @@ rule "9: Redact if must redact entry is found" end -rule "10: Redact Authors and Addresses in Reference Table, if it is a Vertebrate study" +rule "10: Redact Authors and Addresses in Reference Table if it is a Vertebrate study" when Section(rowEquals("Vertebrate study Y/N", "Y")) then diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Applicant Producer Table.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Applicant Producer Table.pdf new file mode 100644 index 0000000000000000000000000000000000000000..7a8785613767e7cd79483ed81266565d4c8b53b9 GIT binary patch literal 8643 zcmcgy2|QHY`!8!`i?UQ0Nuij16S5AHB}-%}V+1XsrL^U7ru z*Z1)A!XYpS&G9fluc|84l;}ccKoCHU83bzK=ShZY6PZMJnv1$RKQEc;#F7J>mld>W z-c%+84mG4WF>E0ymZJTlCd<}JJ>*I~I+@6%(5N~@CRtub1p!0AV0ah;jzHmHa0RF~ z*i56#16V2E9tu!XI?c(OME+5+(k$5NpKz#5GL^}I!0{|7fFYWv7R}cdxPw8^c(gJS z0l}f*%6No5RF6*c_5=p4Z4T8TAEA)QW_nssbEpNKNM(2u>0~O&Z*e3nL3MSg5!s1C z{5xx=payf}698rowKOwW6xAewlLy|* zdk`t^OqvSGl}=$WDMYHW50UOcrZY4cepDC0BSd8q%|ij`T+1A=VFiJKBZZ)#W&lQ8 z2m+5$hQT3d99|iQgP@QYWh4rMfx(o~IQyUA0%*}tb8kl`h;%Q7>b4Ro=HDPSBGLiu z$}U6~Duqa=lQoD=M2~-j5(R?ALr@5`G6DfX!?4O|{4Xdme~S|PZ%{Ir-b6Z6*`3UA zr;s(AJxKq=jYt$)83RG$VahlN5{*@c|AG$tx9GrS@{ecRNtsG?^QJPG8m?q_U={uo za0moW8HI+R0Jp)h5CrOX@Nj<%58Tho@Cd+)Bh#S-cVJKHkV!NrU`3FrE=*Sl3Izw& z7lR3e4-bA`-;{j21B~8VqCtP0{?0C)c`NH=1f*k(Iqu$8?a@{8!9%)J9fly~#>iR5 zbzQwvb?%Q8?PHttL$h#+4{TNq-ir(?CeBU_%{!JIsxc9p>rbY&L1*TeUdi^{wDv+w z@9wrjbz1SKp0QSX>STQKblEE|lQcE{G`rNalso;QJOwW=rQLuW3*M=JxkF!K>oy*e zlwF@@VheUBFLlzyr2nG&M$5n|RW-j|mCsv;-OJOq7xTRgOu7U|8V+*Z$6vC`b2KMuGMmnW6rhqd`EOP$3F4Z zb@jaxQD4PRrDJltkJE~@D(@&!9oHecHI2ee7jCr{_qX?R7_3&-&Azazx$#AZi)$Qr zZr24Y=epWix2ye)*S`CS>g4m>vleeM&LVH##@U6h-8rQ8pp@@|^TmaO2#5Mn=VwuV zs=d@VY&r{kS<*E-JEv3N3Bqb7G22etDlMpoo%H`cS5 zd#IRV{=we9)^^7ZK91?V8ZzkKyg9Cfu#Ou+9BmFo0*NK}*inafO*uH#)%fvlalgFZ z@72@CEj7oA45n_-4n!9ehJD`HFt(BMl$B!e&zUjm2+t8>wDKT&K(HRmwMUPejrwCZ%N-dRFM{+nl?Y9d*gUrsm$y8=ndYb4<~Y} zlzqnI6f5)w6f(7l=;8V!m0EKp5c86k-132&0uOLc-RxcOXI-slqwn1Xoy{JyknxkK z4p!f7==^={`{W(Ij;G|N-njVB5Zid`zh4f#PmK}WtzI*cSUIdR7w&lMj)(bt4s~0w zx*9%-f1O+g+z8W_MP+BoSeFnsJ(HSa&PmLbsAQ=<$)H5^XY5Vl=V~Q9=#Ra;N)+3l z-r%@;q%nI|FE4M_YrTU!TjbLrOl8 zjuyWnZCiKwfkfsdMRm7x^1;{iP*Ei#RK7{$t!!LRx;Z+S_XpF;cchk2JRbfCXIRBO zk2v1{?kzu0=hx#ST`E>)@(!r@v`Gw(y{y(P#r2C!(;F+oCukXT(@m{fPQ~&T_0Y{c z$r54Xtr2~~d-OU6_uZck@Y@3`nM8B6$E0(^6n8m=#&(DvLUdx>zJ$tijO@*SfUa@K zuU`<)Q3*lxKVYur-$s)c$Q*Ya6*KL9#%~iAz$ba+>Cw-mTI+4a(~sr(vZV&53S=b? zA8yhR%aKVo@l0~vJSwCRJ-Z9}`b)#!R&AZb?YaZ6WfGGj^&`Esi+55r+qxuYzDl2x z{Z5Efu(!XG8(pwJcc-jRsPD=55d(&*Xl;kIyXD^RXV-L!h^*HxmA@{;;R16Kb`qDp zaSghCvb3T9mB;6hbBJ3qk~gz+Wey0qorYGFD57?E&gJz4NbKt9uX61Uy^rm59*=qA zC60Z=X0uxO`|!f0(^>a<>kLY`^}PE1J^89)uCi-Jg8s z#X6P4#d@%Yi#)RLFU$CzeN1XNUDO0Wah1F$tu1PlEq_+s{_UM3goy`Pa2e9z*(}>W zs7!-6|5{B(;i$A>1K!5lIvI^A}HP4nSeseZNa=W}vjG$0e1BA++4ev!np`7oTF zXV>4plwz!K(wIBjD(U`^6Z{zpYt<3(UcgF*KJEQpZXrfPXvg&6RB?IRhOAcoyYV^u zbcSZky45LPbS^qAAkdjSdPalH#+K}qlt=m{pT9cOg6ck~?peEVoodD=CJil-fW zT2g6t=e_ut_bbdEL^qxF@)Cx{iM1qR5>7gnduinM8{UIW^oll}VM9p7ZtPdA+_xe9 zEg^UBuIhDBFlTME3wifGh+WUxLWe1Q=S_LZo0nd7rW+cjy z8|Jv>UUkpvb+$Who>MFoj@Wm#G^?sPS6cPxxsl`K5;n5p?5%opKvOTx%a+u@TcOni#7}~`zq60TR zBPn!W|5nvk)sZV>Ca62jGp^;yJP@qVk&KEK3DR{@4I^$dXje{C32I&Dwc~KL#AZd? zH<3r3#Iy{_2ALg|^6vvu*1}7h&l{Y6+x`AdWXJA5*5*R;ghck_2F`poG0tkR)**(k zeWX|9q;yJV6~a0F#Dk_-gJ=H7*aLlQ1rD@#?RU*Gq$P{qe4z2_>2QT*MInFt8+@1b zdOL5#biDfE2ETyRoX&97$w~~P=awTHJ=toBY>LU}Y>l~x*A&Ys35322 z6%1}Kmv7X3Rvk>WY0%x37}?12u7m!_f6Um1x%OT2C5|Ino1(i;{NWXDWHmv+Z8F%L*lf?Vf<2S`d?mmQ<^rF;jA`oJ+X?zw%;UmY!pO@{Q2^oL< zXDQp^pc1o-b3pVXJgAX>PLk$m%Y=8fl1eA zV+qpLHSv413qF?SaOD(CR~x^(2puO1MaK<przuzdWa#BHW z$4lCJHr_3Zi_{9!M8@A9=yppK4tjtvFv55xaBRCUp@03tJ%#W#%^OKYHM&)+wA^%+ z3Kb9S%;D{Dd^8{@<-0{q{tpqav=*tx9Vd5R18pWs()J2y*Ar+;`ee}*P}ySyR>2k&-Oaq!1x#YJ^-w))z& z#-akQK0RX@d3Q?iK==smL8^iM9`@=TCRNcJzxGtrO2eJzFK~rtn>4SVx0>}5Z=W>B z4oW^-w{6TJrgt5bRaNRWQJ0Bx1N!0g~(v)=IH?IqUB!x35>Sj9JPa)OrK=V+{ ztO7DIsaPod@UYh?hk)+7-4VltyW26O4^3Q+)!fL2v%K26k3t&_PbQ;O&kGy3J%Y93 z6NB8*{{8*6F*Y5P?9O9ViA|Z7lXU5Z#(51`6eK0VcU10LhEKF#WDxzz$z8~&q|c8( zP!DCCNr|i8F2sB1g|m(zex!kaKksdfwZy|%{{5G4qpk1xd_U(=p>JPdG3+Hf?o;RHgsLSl`MkJ?btb6T67K`_} zT;6;A+%VURi7ajN&5^2x0y`m}wq$5F_IQyM?1isXZDR@*t^aC#~D|wRpSVmPiOkPj0#Is-ePDI=X5ov!`|LOhZVT=Znw*L!R^U zhC5DN+vG%%j8I!hZ@J)q()%Qns^5J&z`u42b|22hETh6fHrpx2J0;Jvfv4i&MNg?y zRtJXC;#KymdbQrdUU@gw7LI9yUA@aW6m{SLDq|fSKKapG^Go85=QbMe9}tt8#2e!& zn;+{)ZnW@89DJ&Iv0}LW91bI5l+F98%8Px!)xp#wl40JrsIS!R$g!4XSrURBf!OkA zMeHM*dNYDYJ9TmR@NoWPu+Mi*P^sNQLUr=pGG;ZFY^xRdH*!RrON`ds9<;^H^;E2X zDBk3{)rs6#e@lqTb!Hg4PQQYZvL&cAqgV=8Uhv7B zCTY_*^`;(SYvVuqW%sM>ho79L*I9UHE#O{?4yAua4j-@ALu6il;y0maE;L-m%vL>P z>SUr>X+9Gx*DcWbGOyO!Kb#job$8@VqY+iQNl>bhwmH4#stBb5#wbzy^y%@pb`frB z@4`Z5;c%BH3kV?@aN;U?ZlT#STAM0z&P_-RVRM`zbAEeRvuJ*is&yKyu zm#;>d+q(wI&{H!=v-|SU-{zE!VVOa3GrSyMEy7D9*rwF#cBkc?z#LM+_07+4+^f^o zGtA?Afv*2(UY6^6=R|M8q0KMc8;S-)%5tP?K4mC#pJ4{u3gt{q)QQw?-)Duf zcj5N8_w=T|*ybX;W0r0LqbE8pRwiHNs-sUSDqc?HDk$i9t6~?n5mg+C_Ra6_2*UOh z+kT$n*`-s0eNVa#`5yF|zyEgfsDxo&xcJGotg@5SoSt?agODhTqSjnuWCbzGd6ktz z)~bTzw~vtJ-i*#a4M1*E;!}*7+gI|HJzG|6`^`u?`H~~Foyki)mB1@=&jv^Sc%pcG z-M6RXu}X~Wd>OJzLg}RJSw@S96CwPJ(}siYFMP(BUPWG(jRA#T)uV2x&y}rcspG!Z zA$Rg!YMsU0eIPHAC%9qrQG7MMeO+5_RYzJsE|kgIal*)Z>Qm#+^iIJkPs68beVdaz zBOX=v2(3*wm8(8F>HsY$@{!NJyD9HXwZ)849CuP))wY=UsT{+=QO~|xRW^)LyJ)wP zDe}cF4++|ta|VJ}`C$@$X))KhJzFz_kX|1s!BO!ILRu;q*C{!IcjC+78}$d>n_F`< zviod1{MYw|9kA;^7c;YcV}#PXeeWtO4~0Z3+NzCJR!aMb_n+u)8N|OvzDUn<&b?81 z(7mJD^)ci4*P%Dx*)JiG!7Q?OnMPUWeOa{SG6$n&1U077J&5j$gy0X36G|dN;48T~ z7QgwcmbNCq8nbA}AF4lTtt6SXhzv4FjzTRc9%P0r0%lD10e;PB9z-fsmkJO-6sil< zibB<-GAKXp2^2bmsqIRnLy!m-x2i*CkmwXofLDgVK`t7gxc3s5L=b2M)R^c&2H4jh z6qE(so6JJU`e6aEqBt>K0p@g(=>7@tN_Jb5K?1p893BqUJmSJq0(dVtK&E@@lPNB) zOb7;#1W4yaF(m{X1Cm)TjAf)+ERtCXjln^{MhHNT!$1{#);L%=f9DDiqSc~(6buG6 z0GN9UNt5d0PKLk$hKvZh31BawMnvC5ObDO{fMq3n><8d2&IV}XPe(zmm+&Ic0I&9E z3V`JR)}#UyE*hk>7e6>85`x9z0MZ!-K>}s)L<88-7!*qmya(kGfDEf1Z2K=KD9e%u zY=Cya6NN%S@CY~r3-GWQ43>4W%D`$vqcN;{@ScSMhhLNf+Hgx{ux(im)CbELpdG9S zPds3MxeV5Wb{0pph!@5}03rvGEMI?&7O4M$dZ`S0<*y@MMz_)%EM!X_0c|h(6^#P3 z1pn6`V9`L(D3ECdKWHpSt1o_#ti8W{_wTOYeEoNgKgas3EaSSFFXAd5ZrO zDt2no+@1cH@d9+k%6I|C!I$C%;-AC|kopI6JS$NDl(YC<2KFBY4fKzof%sQJ<3|GI=b!-xqZ=4Gz339cM)cY*u;$KD2G2e|Knp1~6d#D~Sz21@;i0zk_TsinxV(h6$;i^i9O1laxJ zLS}7xAiS^w#nKgQU9MjqG#G-wGUy1fyxb0!L0u3UOK+*|_km)u&;PGqAVjR(h$~Y_ zi;pl6$X~Lqze(N@fENky84J7-Eah=|tL89IJo- z>hdFfWD8l$3yaI3O4_8i|I0ElUTGB4d>x)IaMmK+FJ-6?Jf6IsUZ{ z0puwET8BmhK^Qz%=wcDS*5UAgTYi>NMGVeyDx?cp&ie9EK)`E4>C{C5OD zFVu`i1ODv*E*#*-fa*+xfZH6rF{k*Gfk}lL=-8^Fkh+?{R>kRJ(F8P-fYT))U>FPv ofzpP