From f84a3663284cc8c3b1e5c76be2fc0362e8cee717 Mon Sep 17 00:00:00 2001 From: deiflaender Date: Thu, 11 Aug 2022 11:44:31 +0200 Subject: [PATCH 1/2] RED-3974: Use first row as header if header detection does not find a header --- .../service/SectionTextBuilderService.java | 88 ++++++++++++------ .../v1/server/RedactionIntegrationTest.java | 30 ++++++ .../src/test/resources/drools/rules.drl | 9 ++ .../files/Minimal Examples/NoHeaderTable.pdf | Bin 0 -> 19604 bytes 4 files changed, 96 insertions(+), 31 deletions(-) create mode 100644 redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/NoHeaderTable.pdf diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/SectionTextBuilderService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/SectionTextBuilderService.java index fb4b0e81..1d83a4b5 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/SectionTextBuilderService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/SectionTextBuilderService.java @@ -1,19 +1,35 @@ package com.iqser.red.service.redaction.v1.server.redaction.service; -import com.iqser.red.service.redaction.v1.model.Point; -import com.iqser.red.service.redaction.v1.model.SectionArea; -import com.iqser.red.service.redaction.v1.server.classification.model.*; -import com.iqser.red.service.redaction.v1.server.redaction.model.*; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; -import lombok.RequiredArgsConstructor; -import lombok.extern.slf4j.Slf4j; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; + import org.apache.commons.collections4.CollectionUtils; import org.springframework.stereotype.Service; -import java.util.*; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.stream.Collectors; +import com.iqser.red.service.redaction.v1.model.Point; +import com.iqser.red.service.redaction.v1.model.SectionArea; +import com.iqser.red.service.redaction.v1.server.classification.model.Document; +import com.iqser.red.service.redaction.v1.server.classification.model.Footer; +import com.iqser.red.service.redaction.v1.server.classification.model.Header; +import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph; +import com.iqser.red.service.redaction.v1.server.classification.model.SectionText; +import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; +import com.iqser.red.service.redaction.v1.server.classification.model.UnclassifiedText; +import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue; +import com.iqser.red.service.redaction.v1.server.redaction.model.Image; +import com.iqser.red.service.redaction.v1.server.redaction.model.ImageType; +import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage; +import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText; +import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; +import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; + +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; @Slf4j @Service @@ -35,8 +51,7 @@ public class SectionTextBuilderService { } sectionNumber.incrementAndGet(); } - sectionTexts.add(processText(paragraph.getSearchableText(), paragraph.getTextBlocks(), paragraph.getHeadline(), sectionNumber, paragraph - .getImages())); + sectionTexts.add(processText(paragraph.getSearchableText(), paragraph.getTextBlocks(), paragraph.getHeadline(), sectionNumber, paragraph.getImages())); sectionNumber.incrementAndGet(); } @@ -62,20 +77,26 @@ public class SectionTextBuilderService { private List processTablePerRow(Table table, AtomicInteger sectionNumber) { List sectionTexts = new ArrayList<>(); - for (List row : table.getRows()) { + + boolean hasHeader = table.getRows() + .stream() + .anyMatch(row -> row.stream() + .anyMatch(cell -> !cell.isHeaderCell() && !cell.getHeaderCells().isEmpty())); + + for (int rowNum = 0; rowNum < table.getRows().size(); rowNum++) { SearchableText searchableRow = new SearchableText(); Map tabularData = new HashMap<>(); int start = 0; List cellStarts = new ArrayList<>(); SectionText sectionText = new SectionText(); - for (Cell cell : row) { + for (int cellNum = 0; cellNum < table.getRows().get(rowNum).size(); cellNum++) { + Cell cell = table.getRows().get(rowNum).get(cellNum); if (CollectionUtils.isEmpty(cell.getTextBlocks())) { continue; } - SectionArea sectionArea = new SectionArea(new Point((float) cell.getX(), (float) cell.getY()), (float) cell - .getWidth(), (float) cell.getHeight(), cell.getTextBlocks() + SectionArea sectionArea = new SectionArea(new Point((float) cell.getX(), (float) cell.getY()), (float) cell.getWidth(), (float) cell.getHeight(), cell.getTextBlocks() .get(0) .getSequences() .get(0) @@ -86,16 +107,17 @@ public class SectionTextBuilderService { int cellStart = start; if (!cell.isHeaderCell()) { - cell.getHeaderCells().forEach(headerCell -> { - StringBuilder headerBuilder = new StringBuilder(); - headerCell.getTextBlocks().forEach(textBlock -> headerBuilder.append(textBlock.getText())); - String headerName = headerBuilder.toString() - .replaceAll("\n", "") - .replaceAll(" ", "") - .replaceAll("-", ""); + if (hasHeader) { + cell.getHeaderCells().forEach(headerCell -> { + String headerName = buildHeaderName(headerCell); + sectionArea.setHeader(headerName); + tabularData.put(headerName, new CellValue(cell.getTextBlocks(), cellStart)); + }); + } else { + String headerName = buildHeaderName(table.getRows().get(0).get(cellNum)); sectionArea.setHeader(headerName); tabularData.put(headerName, new CellValue(cell.getTextBlocks(), cellStart)); - }); + } } for (TextBlock textBlock : cell.getTextBlocks()) { @@ -122,6 +144,14 @@ public class SectionTextBuilderService { } + private String buildHeaderName(Cell cell) { + + StringBuilder headerBuilder = new StringBuilder(); + cell.getTextBlocks().forEach(textBlock -> headerBuilder.append(textBlock.getText())); + return headerBuilder.toString().replaceAll("\n", "").replaceAll(" ", "").replaceAll("-", ""); + } + + private SectionText processTableAsOneText(Table table, AtomicInteger sectionNumber) { SearchableText entireTableText = new SearchableText(); @@ -130,14 +160,12 @@ public class SectionTextBuilderService { List cellStarts = new ArrayList<>(); for (List row : table.getRows()) { - for (Cell cell : row) { if (CollectionUtils.isEmpty(cell.getTextBlocks())) { continue; } - SectionArea sectionArea = new SectionArea(new Point((float) cell.getX(), (float) cell.getY()), (float) cell - .getWidth(), (float) cell.getHeight(), cell.getTextBlocks() + SectionArea sectionArea = new SectionArea(new Point((float) cell.getX(), (float) cell.getY()), (float) cell.getWidth(), (float) cell.getHeight(), cell.getTextBlocks() .get(0) .getSequences() .get(0) @@ -149,7 +177,6 @@ public class SectionTextBuilderService { entireTableText.addAll(textBlock.getSequences()); } - cellStarts.add(start); start = start + cell.toString().trim().length() + 1; } @@ -170,8 +197,7 @@ public class SectionTextBuilderService { SectionText sectionText = new SectionText(); for (TextBlock paragraphTextBlock : paragraphTextBlocks) { - SectionArea sectionArea = new SectionArea(new Point(paragraphTextBlock.getMinX(), paragraphTextBlock.getMinY()), paragraphTextBlock - .getWidth(), paragraphTextBlock.getHeight(), paragraphTextBlock.getPage(), null); + SectionArea sectionArea = new SectionArea(new Point(paragraphTextBlock.getMinX(), paragraphTextBlock.getMinY()), paragraphTextBlock.getWidth(), paragraphTextBlock.getHeight(), paragraphTextBlock.getPage(), null); sectionText.getSectionAreas().add(sectionArea); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java index 6b97870d..fc3debce 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java @@ -882,6 +882,36 @@ public class RedactionIntegrationTest { } + @Test + public void testTableHeader() throws IOException { + + System.out.println("testTableHeader"); + long start = System.currentTimeMillis(); + + AnalyzeRequest request = prepareStorage("files/Minimal Examples/NoHeaderTable.pdf"); + analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId())); + AnalyzeResult result = analyzeService.analyze(request); + + AnnotateResponse annotateResponse = annotationService.annotate(AnnotateRequest.builder() + .dossierId(TEST_DOSSIER_ID) + .fileId(TEST_FILE_ID) + .build()); + + try (FileOutputStream fileOutputStream = new FileOutputStream(OsUtils.getTemporaryDirectory() + "/Annotated.pdf")) { + fileOutputStream.write(annotateResponse.getDocument()); + } + + var redactionLog = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID); + assertThat(redactionLog.getRedactionLogEntry().size()).isEqualTo(5); + + long end = System.currentTimeMillis(); + + System.out.println("duration: " + (end - start)); + System.out.println("numberOfPages: " + result.getNumberOfPages()); + } + + + @Test public void testFindDictionaryEntryInResizedEntryPosition() throws IOException { diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl index 35d751b7..8e7122a6 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl @@ -381,4 +381,13 @@ rule "30: Ignore dossier_redactions if confidential" Section(!fileAttributeByLabelEqualsIgnoreCase("Confidentiality","confidential") && matchesType("dossier_redactions")); then section.ignore("dossier_redactions"); + end + +// ex. "New Rules for PAD" - "Annex A" - page 21, page 35 (table without header), page 38 (in-text) +// https://www.regexplanet.com/share/index.html?share=yyyypb71xkr +rule "101: Redact CAS numbers" + when + Section(hasTableHeader("Sample #")) + then + section.redactByRegEx("\\b[1-9]{1}[0-9]{1,5}-\\d{2}-\\R?\\d{1,2}\\b", true, 0, "PII", 101, "compound/sample identifier", "Article 4(2) first indent of Regulation No. 1049/2001"); end \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/NoHeaderTable.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/NoHeaderTable.pdf new file mode 100644 index 0000000000000000000000000000000000000000..3a1e959a9d33db7ac222f8d221f0817d2864ce45 GIT binary patch literal 19604 zcma&NW00mnvn|@TZQHhOPusR_+qSJ~+qN~+wvD&lcfNhjJ^MzStEi|asuES}$zK&K zGs%_2BR|6`X7BpXo|TCCpR=g5gQJLprw-FU6Aoq;B2I1= zJw|ymQ!AtYO{!$%{Lk}$UZi5?;^5|N@}G;@{tJbQgR7CN84(kss;je`iR=F`{-@<; z{4Y%Z0r}s6xc?iFgq5wUnKPq=?LTiZGZP0>Ge$Wxdka@fB6cpOf2X+_yZ+~&{&NVk z=)V(OT%FB~>|ms#Y}KsP&@?do#Z^~l*I>k<-MRJfvCzSi^ptSLhe#ktfP#r)py&%{ zNsy2zxehp-r1|D5Gx6}Xx#PhkflHvEtdYbWgAj_@Iyl;(j9o`^b&isDwPcR(H;uO7OZ(H*4|rdF?d zd~i|g6o*9@9h_sI z52X|dy(3Z$r2@%9o=BD;Y@`AzDqB{j5eb5lMw*hS1S{eTRUmQ2DEr zk%N){E3Y3ZBEBweh-3EG_&r!xc86G!3Y)zky8K}@=)m|sfb1M4u2hu8I^D8u`--+% zz=icOVdikAJ^PZ*T=S~I)28!J(@@hqO6^u%Qyon$gVuSiP?zMpXY^w#zG}W)Yk{|O z_0HcB?;yAOXM{^h4wf9#oX3_>gd&@BBb_<-D!U}RxNDtoBbF>ZId)dQS*z^b77h1+ z7TPw8Ymd!!MQ^FFb}`V!?S>XyfqJsnr6?(iz^S1cDO>r~AsT1qG%L1R5nyHOp41Vw zGaZ>Q)=9_}?g=uZiys{GZ3H!VI`V)k_YywQEX2#Ys^wrwn5dkV#M)I$V)5?&V8RXi0DAO$vkdwJFJh6tzQFd?CSa6K9Ru?a`8&n z?~e;DUd+C|+@vw`5gxC{9dfPC(%>U@+|!mj>tjTFlE3RO)SO*G^KnXL95aljC1VU_ zWFM1ITb*ZZax7s3(3= zOd0Xmo1cZFGuuQT{3%3VZ(X>g!25>S zoZO^hbd(Df-=hcpjiH1P6!NKo1jdinJEAp%+i)4|c$XMt0TwzB!RtlG9*VDXM{-3! ztRJO$9tNIRtP48b3Wm27{fbj*>c~?HdFYx;02jOvY^XBiHqd-opn>uZAo=1t+)$=L z_@k2CqEnAzGtVJ#IG{pD7J=UjwlZi3;Q$+KC$X^OBqGQVj0JqaW$&75Nudsno(|(P zbS#*WVuzNhn3}Rw?9-zl$%a)pM-;4z3O&uj1t+m^C_9g+#5k@*N^Nd@{= zF)ZDFRClSK3uBU2UNXzTg+oa>@8&M+4ex}s&c`z)4X~!z3KK&YC8^x&H>Bvt%9eT+ zs~r=YwD24=mk!nJ(wH|mqx1|2iTqKSm&;a${9RDLY~cTx_l3Bt*H@9TO=T-vld)bg zN(gJhKg1zs<_KP0SJfRtCc2W5fuN?5o5+@H$>WuVe-){Z*X-{mM#0HQz&mSz+J@h) zc)FU}A_Yl#Gi>q2F)Qjgf@ECiwE@L*!t|>>p>#K)wfxKjfL7?U4 z3_{wnj>9H4OpDYz5xYs-uZBCj-{qm8Zg*O!YmdOZ@z9vu{Y_Uu?Am6-!0ol;NcVWO znnQg(w}-5Vjq*@Y*QoQIG0BGv1bB~cKoSZM4h9cS1SkYFoM`VW5VW5{H}+OhBE{ag zgJob31O}!shHb58eUOiTkP(e*5);8_DKt7KS3;mvd#-@{?^wKhDDR9e@;OSPt9^qe zRp&v9s8@Dea@y`oHKdlKFDrtJ(Eba|yp+^#a&tN_codoL`@pdLw#h6jzSBK78&vJxU3xH_{n$;7u1%= z^{P%=GcoBCErN~-;mP?bdVRj~KI|uHw*j2n)(Y-?UI6{oUS;FEu0wu#1IjJjpn_9} z9196tN%AQY;!YwxbGlm>B8(e{Teu_W6*AaYyoDEQI6=2ZWA#ftoY)SJZAe{js%A?q zO*5ysLcdO8QlAL*@Up&7APCxT_yM@m-8(_baTG2(pj!hOc&+qI9IQ;wdwY-RQ2~VZ z?4{dq9^=mw$x+W&u~tF`0 z2?&fb#HFsIYwRR;8H{EZa}$J3+^vBZd)Z$)s1cxmk&YQ-()HX+hQon$gL%FSxV+O| zzNK-_o3YBG!xzyTQB;w6E?^1~()yn=WUsjM-(RmqHC}=yuf0$lP>?2YTzmOHyNA&d zQl_sgxe5r8l*fDH(}UxcIQ%N$ku5(*hNJe^WA z(~Y~Ocpq55N+e;vHi`&K|K91`Jz;P?ZRUExu)=>DJU^vGb$;5PESs772><15q)XqY z-XNGiI-`yME6(_v1JA)Od09~%zjk+H^kuvDremgcyV|QVMVLk(T1GdVzCr##gt4>P zA~g^lLK?Vq4~c7m^YvNK``F84lH}*ij#eu->~z>RGRSSHM;9V1_#_0~lbGXixYWU4 z*=%+2YyA7gpt^>iE}AqdAji!7Wnph}C(%XH;>d{r2RH-^aUTsC41G|N2t`RKF@uhX z?qR}Mj>voOQdM}dl8S~187+1B4oL~&5OlDxqcCJ5n3G1DExBgr^7zL4Ue?+FOA@fI z_e*v4v8LJ}14`5}edy2rRcj1#Pyp&faLQ@97;UQjWlFJXNoT@P_TND{zy-4vE!msE zxo{~^k(iWmw-d^9_8536>;63(14-Nkq&B29+SRR5V|w`gf+8|G@=IhPI=v71S=uK# z`9&*8(oK(ZZJUHrd~OYcz4fni(j5Hx`%(%uwbm^S_KJ18%A*<%uz=&)eF*I#56y*O z#$~nq948sBJbx+ts^}?&jnp}dQnC~O86$(w&@leQIbDHf_Ghq0ZxI(qlUSOU<~S=H z0e1ObH+&1_BILyes+x?WaBNFRScWJC{*ADln|vfBDmpDRlbBiQWnPE%^U>cuYiXak zA)|tQfdi;RAFc{_r%r#x!~?y(@CGjna)pE$*n|+_G~4xN<#svLX|1eMO%~zr3^=4AKTP|ZjID+%KSbbC)u*`zfXk9~ zR#WU5`brkQj^vF!EF5-H`+|GNR$U|DMA~^iWaS1+GCvo3v4ZWoc>&1^YznPg`ZS2(-n&y+9Bz(7rp1XQxzdV) zn?G)c{h}(VvELZPur(ARg*MxC!t@@(HU!@ZVh)H!w56}TrBVK@S5=aURRO%f2pG?i z@75Y(?)RPNRYb4t#8G6Pt+f4H@U0CCci-f63<+9fQw0hf!Zh@WeM6#jyS{+RFH+qu z$J%(cFX~6vjrehTia??{I2fWxY+D2;7g!i4jGdR+2I>O01?41}Fg&C6`o8{L8N`Hn=);DTZHF7~pk8eHE4Vv7J%!Hqd@FD*QtJ$QhX%loc_6L zP?-;9+wuLLCY|c6)JX-iyoK|UY^Z8Z%)zjl%TV!^*HEzdQ`2~pe!p1o?Xy6#N(7l} zBKYR#x!*?KFvCmzYr>pcjX^0?16imF!Z&Lxhj$u{wTr0;PCgzSQw)9I7(=m&rJ@bv1wk zr!QjkrZB6=LwPFuLNOcy^2FElKw}~)4zC`uM=I&0%tdk#WBRkKN!GFW0V zps-^${U94hpcSReVk2A#-+@#cQ+${@q+lIt!!rZ4qWFIc;Ni6jmN_MJjqcJQla74F*xSj4Sm5Jog;Uwx0lu z;DC(0qIb&ocGZTfl#E15kk=m{;unLbWy@CleJhXN+afKV>A%2K7pmrkzi~B>k9!!~ zZa2or{k>mXfOc7u6-mu8U(3WtAF3t`Y90mD`N|!xYZz|#yKGW|!`LX?o86@m5@uVo zHaYE;Un|H$-z!tGSvNub?Hw4btPVq7@)hj%5tx%%KSfUGvVZH*YSs`4g=X3(M1%&Ah^s}IUznnUI=3L} zH*Nm5<_cYNFyCPJ`F+dQnSD38wf^^h-C^gy8S2yFW5h?mT6s zw39LlI>aIqNDAN&Y^R(<%OD^vosum58kMX8T<;_W#NL{|nA&b@Xd(h?SFiTMDR;PoLlmTzSYrw9zd@C7JD0d_;e8nSJ2 z>YcB5Q(M=eCu7$!_j6F+LR;=;*tfzvhDvaV4;|-*|bII8+*McLke z3cZxHyJ)2+2^7_AqtHtFpL+0DFLeaX)i>JS2I*1{U%C(i2r9q?%PQd`2EczpS2PPL z;TKPq@w%9l*NQTE!xiO0*|6~XIR(nnDJU2=1NMX@NG|35Y9YfhWaTJMB9di9Z+6+4b4mA`^C#wLeM zFd>@ZafcG7$(t5t`;53|Z~y+eL~QA3XzC4SFXpgg1Z*-0!-XY4mKQa>+~1;@=~~j$Ozizu;t98ISG< zfHkknlE<3yc%+Fbhh0yI%ZuQf@D2)BpXbIqRCSLV|_s^y%SCsPJ8X?{HRSt)BSm&3I8 ze6{d_3B_HyJd3G2soYz+xIB9`HZF39G&#sU9PG8pZ;B^qWUmoR$e5aHp0jQ)vlmCV zo_t08*~%MKd>c2^ouAToS^g%;(iH2YP0d`P z)`tHr$WHDPZ8EPYP6mAfQY`0?l2Xr6=Qo9Z7FbJXOe~f~)3a%p;xN;H?KDrXD3!3V zmbon6zBu@M}i@);p@P+Oh14&s!mAK2WGb*+ku`H<} zy_Jb29>>wUz6-;JuJ<$)e2ew?7(SEugw=)}$G!^}1*wIH5#m4+>PTk7rA{PNLD6&! z04ZVu2}4l??<>kw7GuuafbQ3K zfKrLpexK2HRL1g4nz~cmY2lbPW_P3iSPAnvu<#_&*QAoegM9f35>*VQ+ib@X z`VGfY7FW%9)q_Lm+CGK!q|WQAkaMkbJ)FNu(g&@LZSrW_m(|kO@CA`;1>y9Vg3Cc| zja3t5IO|Ivx%zaeMg@TnYMlI^M~fgssH}mwEnmyMBb`rsrrg9I53*kBueJQ1i9-xe z+h1M^i{MTkmT9yx>NsEQWTu%RSXkBdO!i)9Z5w5}Xg#J2I7d{KA;B#clI@R~QB z!`_i%`GEy4{wJEie@P|Ta3_=S*N$8gd7B_9=`eZ*3Gj?CEWW@MIfDaApvGV+SuL47 znU{~8tJL@_nsv6<>N0A|bOYbgF7C=?hlW8z)K_#nbyZ!$aFJvODIV}v68383w5?A2 zKtCGP-}_PzPqQ=87?~zd2`PFNmWI`7iaKmP!o;>6i@)hG6ScZJm7U>gicsv8{0I4t z1!3qWO9#|$MS{q`48S?kcIRRx=SJ93);%nYrP2)Le_1nuF-HOon3X3>*C=KWU1 zkK&$zg4)X%ynQCZ*(TG@#XQL_ziyFoSh8wLE@^zMEGh%SZu{iys^{u@&KVIA(MjzD zH)ca4ohCsZg>gS%g(XZSjY?ML5#{4<2PTY98?RsAI?G636GuIA)ez^3(ifg#l<}Z{Ms*ICS<4#5-gzOF8J;bU^O_Bx!ZxZ^ zj29_zeikE3kJX3fRj5JKab|11=L`STBbtsWCV_s8UXMP6E{6WHZm0RLott8Z@CIFO z*3alCu??h+!6n?~;~Gy+7>5Ex)WU1>`?VPtS-3oxc7aqe;vxiOY7hyvsd88?drhW2 zQpvP0o9JMwQzppz0M!Zn$wHzuWO7RHy3b8Z9E`p22M<-SI~o~Ek=-lRBj&=M+5t&< z;t+lrsUq4WD0#Oh-gmH^f!q^Nf^I8K><(qOf1#=>i+KQsHj=Mp#}5U9!H@vTH7K_L z*cPOxram{qF{2dZ0H^^Dr+eC^=#-EvSbU~29kJ5%0 zjOL_D@MtDsO3BOzQ2a=O=z@$!FU1b110Rc$1Jj1drXZM*b|^g)F)%ER;@8q~SL{}! z2zIo9B!k-8n<`?^`5o*9x0faLlvn*+t(5=!G9g9u0wk)F83huuBph z6{K8jt668ToqP{M^E=_O+JkXayRZywEFLf(_vY(#u;Oq{MFKP_x=TS9o;}$+d!l^ zt+#sMZIL&51fmH^#Ief5J%JGGK0v+@jw9EDzB95i6)elSP>CScS1xTisz<)Zb5~%4 z0=W)micN3S+g7)^FEM{YX8`Nc%EbpVE)@@6raeRik;jy=wBApz>m5n*J}>wL zu$$~6ARP^r^f!hrNf_8#(EDp_iUT}HJ!8`?D!bBW(JiFY<=Fob^Ejpbv$fTv6^Y98U5Tgh@Rrb32R?y7U&hG-a>o#KP;&$VALIKZ? z5-HL#8r`W*%KTn!a6F4{!Xu3DB9W1zuj^?@Xp#LP?8PR8vW zb{~BJE3~r8-^hIW;Y~LR-8QGOy>JsB5RdVSij5i-lP2S{!i|fyAxO)MKS&7`fsDulI#ECnsW0wTzh+RCp zPH4{!CgGm!UN#f%{uQTRJ=NNN6j4x==hljA+w53>h>=^XjqP;6P}i7~*(3yUYR;}q zirzvt9obzgZm1k2+gP#}*wgHv2o#<7vRRta>mqpx+QOV=9pPS#_(0XgxE68A(&(LP zt2s1Wva;*_m)Iw^dinl*N9Nmo1z7DX_YK$)@k2r9D-UL?zTEj8KM#om&=+N5wn}TD@Jatv#NewSo~DIl4%fpGi=KiNk{Z8 zd}KuZwaq`%e*GM>t7`G1o%P6`qMxCLp&u)-un`5Zp07Y}Q*@!ArtypEl+p5g+_e>g zJ;}_7)O~n{E7eSq?%Kv}UZJzOUggjRTpTjg2uK-8c}KB?$YJo8KJ>m3k~9g-gUpa@ zB2gk#%>znsM`d9T^n@tAYe=^?&#w|y_I6eLnimQu>)OxrIc~qX&eNS+-rZlZ!9g!q zD>bfe65_ffS+p2>Qf0BOCF2DynRxspO}ePPatGLvJP(fP?@+N=J*MGAx?wLH^Ra^{ zwkA_IqI3`(KCX%r3{x=ZY)4GwP=NvLxnp9D#fHF>HL$kQrN#CMR)K2QE)xPrCZrZOxYWvgY{H(9!J= z3G#QmayOlrpBEB7PU5tyDin;p+t%bwGwnz0`8U+Gsc3p73^Fkb^U-<`dY$=0`2M_s zin@N?)!M6S;_K5n^-mc6RUIVPsXD~1`#RH}bHWkXt25yEi{PZnHc@QLT0X%Va~dNV zrc>aTk@ZL6o52c99IL(+F$6gmp|6lA^^Urq-=3Q4-XLj7^fU1Zen7bEea1E&%X#0t zek=%N5X<&_KbnYp7)4s1YV123q6s@!{mf9YzC7a_Gfw7ici*zG<&fV~X;aKqc+6B) zGW}__rP>Ex&V&uK0!yn22=~!O+&U9T;z}lL=tVa&xDfM{^}CnlxY=mdqk(DoeZ2dY zuZjcgJs$oY?&@wC{u||1I3NiQ5+6s>Vh6hKk7f#&k!<{7>Q)hi#;Y4rTlWx{fg{^A zBhZvq#W7#TGxzQB_F7-`)m7CZ3^pw!ORw=8thrz5`%V@-pmx1)`q1AK%U@zjv6zn0 z3q5$1cs;_`X#gjp>$`%Od|GsgDI!*uDJy%@U8TQ9^r;xkR^i!HGQVbr*rnxoxDRNS0( zwcs3InU`22Up5IRjdi5bS&v`8??uh}!>1I7H=d*V`T~GQayOmKF!fBmM(0%MMn+D_ zyxvNOP4{kLRiE+oQp+W|OG8I&P6o7%qh_Cv5ZWPcALTrF(8YX4y(I}iZpPVQL(+!V zZbw?r$d5d4)C+~lP1>EF>(#4W9#CV$>lFq4{ zQD|!u(=$?FfjzEXz{@-dOK*mG*fzzfbgki^#K=fQVOxf^U`6TQhqyEsaZyfUq%vca z1fNp>%;I+zQNJVbCX-LV0iD8N|TM2DAt+ti{Ge9(#Z5AA=@nsV{PdUlKI9Xqc%t1OPUmFxJ75sW9TC-AzY0swt0M-$G!6I$p{Ju)Pfdi^j;-)if7_~9QQh51zucEYzEY-s1nF2_6oIlW zsxt>`{Uqg1GEwaQSU8(7MnS7`8UV}dxLWE55fv!%Hr@(enuBv+u6Iv61tb>;n#Eg4 z+^Ifd|Abo|e>Z$&ssiD=6Q%3f%Qj)S#7j}n@8seZb`jPPoeG);SR1rttBi(HBY+lx zdX$rH?C=hgQYX5{J@KEz=nkCyS)Or?jrUxEzk3IIxA#hDq)X~d-HKr9a?6maHo}*?{45j=^ zandWKT|I#}{_hZfo_TWPY-*F*&J3F}a`dB8pLq(*->Im47p6~wU!qdKaia*6E3?>s z)${!(^TEIG;%b*llr4Wed=sHLb`50$>Bv{IjxHr%Qt{^X|BmH~QM}253s=9py*=5} z5N)LAtyhU(#bWjoOV?Z$T1z#ksV=>^$dTb46kW|iC5dtpA;uA5oi~qtL|}e%Zppv) zWZeqBsrsr98jpSy+3Yew#P+7yT7D%=ky; z1y3jBD_GP5>meGRd6#Ho#KN0lO3iTVL2md&$(*$fo{&@bv3)$A{NANlC|*iCQ|u97 zbXq+HTRo^&;9VYKu6|Z0Pu)60Gx7*nSPy^h*rSBo8H8^cWdiCwcuD-fVMUn# zf3YGQZ2xbph}}G2I6!(;_S#Oi- zabm9FF(8e~@2v@9KA3$eK~muzSQ|dAK^T5*FRS2L=P&H>4)HXXFg~Fewx} z$`2tO>#xE68UzZ?J20qF8*UCUOpJ%ml#7ROM2|OtKV&6hZG-KGzJ=xbVUf>5$M3bM z;9u3WpHe-9dR+aUJ?!S8g;hsJMnu2`!!W=Kkw6MLUbf)tSYp_VW-G+WBqhpy+-5k$ zF^Dm1xP=>Yzyt5Z5hIcZ9dArbRhd#40k*IyTt=Ujh2CeR9E*gR#ttpnT*=My_Qc|^ zZ&GnC;89U>a%o=FS9ir6YdUE50s4sf8Mm>}b&f}K)JoY-L4b@+RjJ8dH)NDe{tJ^21j6O zxA_*7V8i6e!>vx^x2B1Rxw)nz%HR7lti+BxQM@9Q_Jw3;sQ?%o;i53$g+rx#7&;1e@VZae%=c*I!29$~G(I=S=K%YR*3a|Ti(lm1*XgwxkU{}!^_EUGE_p^Lo zY?>Y8s^V$y4BN-~A{gi?~%hs!_#oji&>nch3 zdL*6)Dks{9Q(4Vx;^K5n`WKy>Tki6SJo`4%^G?!J`YQuK_|5h>ELO-h;})-`MHw(- z3*w1?;QK>}n5uV@XT2bjO1&ke@1+QZP`HvR^w$}^V;JPbmG^^&L}|8TQYhouTAK_{ zS7S&WOV{C2BYutQvOjix1C@AX3C7s=BYFl){-7Fu%-8|X@-;PNE%tUSnMP}y%SFaA z`G1S+q_m2@V$Cykeb1GMLEQC!DYDLm6)$97~9tz274kx@PM?t&JrD!3@Z)}0UxLN zHIa1!UdK^jIde!D6_bSU{jPf(XSTd3@n8zJ)19Mk&kE#<5Vv#7|lTr-i@XnXGkDpwZzA!&fP2^MuTWRq}RNltKoJm+eW+ZJiy&o(^`EG^RXT)>(uJ2H#2pR zg||~S7>EP$X_dF3mCW1owiT%n5gxM*T3VYX}fLqTP76{&k8g7j|T3!Jr83W9*F6i0rBzbF7%`J3}uBGYeyg| z#X?39-&2Ne^E zZ#-~mq*bK(eWojaob<|+m36f?)N5dLCoFZuy5oOE1{Vh^DE&E5#_swf##Rq33=IQV(?5D%34RLxSfkoLg~p)HdD9$f*Zf1Sli6o zCf6wIhfb$%kmIkBCg)*yh3rM$aWinROT^Lc3lOXXTKlQ9>E1Xo*RZ53J)q|EeX-^E z2Fdx8bpY*I`~^fkhcORnNJ6f|fk(s*L;iG5R5_;aFP`A5j-9I?=hR49TT;;s5UnDZ+daT8+u@5mZ@+qwk|t6fO0x|Dsj zU31HX!;E$5gikO;SOwx7x?r;YhMOz%D+u0c#$PucrrjH8o(|Cg)b9Zp2*J6rNB)C# zaohBkl_`pbev;7;vZxgb7e4TrBPip248-Y;^3QEr#4<9n+l=HARpSG6g>g~_i0Fnh zm~Kkad)*=uXm_%_M!MBYh8`_I%7L<)ykV3k+ELy)gC$sAi9sxRd$*$*X7T}p)I$-h zQ^jx_EeNWJ$>E5fPYHqpGThL)VG2B|=`jE(R=ro&8%uL^3FINpx*0B1H@59EL)B`kF%d$hWb@k&S`JsRC z#;ANO+~W@Xe>t6A;`AR=iqW_~<(k6um4)st0wHcj4RuEbde}g;fZ;B}%$fnS%amzL zAJzK$($o3cKSxs~HP9zD&uZzlYCTn-Z*9{)xu5)6lls-}Lf}TLL?X#8EFEo*HN>nv zdfK=l;{wSj;c&DMU;<(f0-qB&zZ#S}7iCg3Zo6F4yB zxdI1~ZGN&0Tbo3Hh|Dx&2#G;^C!8!o0*nr8^y-0EgFPGw3X4*jbdoa`82#m$_ylv1 z#jbl`#Q1k+k5NnRHEMQ`I^PJP3`abl&=jeIHIMEoI`|NeVG+ z*(>7Bg@YP$=E#=?!E3f)TC$EeE#;+hm!_AtAIr7nNtG4Ei;vJQ3GOVUpJpa|XFO3fC99+^ z{(X?+@|ghTZ*2oLxj`=942K$(IgUrQ zYCO(XgcpsrR@oI*qH8Jj(i}*GdnWe53rRXC&!kbf9*~-M^C6Mf&tI5#*eL1^Q<)2T zmtf=caiXghH1n7;>nyM=vSAS#?%d`wKafusl7$m9iLo1l8&DO^twA}?dEhfS`+`O` z5xK|YxMW_d5N?kz=0#;B?d6&zsaX?0N=4abC~-mPDk@@BlFZ@{r1d(DU}#SKyd z?EJ?g(&E=a-5(B-U%;@Jmi_q-3Vpu1cMk>@i73pC{nviLdBW`@_VAljuh(1w^fG@7 z++H7&tb0rRd;#YNISFA0LmLZp)TA`@51IW0c@9!jANu_%Uvh%**~$bCs>3H0KkSVB ztZ!0XTqF3~_?bBLn(wyyM4b$+R|{Mr$j&#!EkneDh{y#7LH<%}rYO=Ih|5QM6L?i# zxC-AoCBHrTx&td_hEp2&znd@K_@VGP-36I|ga&zuUIE+;NSVd)mpFyTC zOnkr^zMr8d$ej_T?;N<`6Am_Sxo0>w&#v|kwwHch<`CkQ+PrqSoj;{b3PN5nthc{Wj1PmD;9?(eye$Z7@=^?zxE)9bg3OKT-a z5-@Op1cM21tyT<25FiuCCv(KqM_}^S|F)>&2IS4+x?A}AIvy0Qm_U;dO#5XR_>#$b zYN;&I!gKobRQ2Mgn@(PwNi&}9FUr}LidP>Ie8PL?P6Wco^oJK1g4-6A9V|l?f{Nk( zegU%T%I6uE?zx-<`M_mb+nac>UeYi#zSmx@G|DjCv1iCuh_nOM8OH z=i&8=oF{l#iQzg$L-A8k`f9U?Pq3M7jID0G3Pb?zZpaG7qKRPdBCtoULSz0q*IkWW zj+;82Zf{G7S8IPugpEKs^DaJJD@htm1F2}Y^^+&Z5v`R_Jf&3ZHyjWQyx}V53bvJY zl(z~zCte?%q7ezH{L`V7SIqY>Ij?a!5UBq4D&XXVBgKPxe4%)g=KRohlf&6PpC|o& zQw%kxdl58DEgigxV-R5|UGG1iRLqPMZuoR0V|)z*GW**X2w%L!%P>K+)d&?>(V%w7 z1Z3`}2a8%SR-nL>5#x$eE)9S_UNJ6772GD#2@#qrmwZ24Kzz1;(;5+%?`_BsD!{Ol#sQ~zu;I#q$1MCz31NIdk z{DT1kp!Ycj9t{}@scB}a9T*au=tV#R8CjSSor_!Fnw(Zb*p!o7Ts#eMenZ7V0GK=i zzYYSX(Lbk1U#B1tI^C85Ia>grt;1y$zdr!9O90(9*Chbq59(JSpan3CehSf=?`uWq z4T<_47#$Q75`tq8=a=D1@K6Zhn-Y7Z=licc6OaUgu25o3JB=7wq+f zkaG-6RXL^I>L3k=!y`q()xD1vMy}VIx!o7m5_? zs1USz@ZM7h`f%_*`*6b_4Kx^iIez@^PFzC_hK5-I9gDtRpy(zI{TEhk&qUne&c*5L zU;Bjoem9I5(~1~L6hqCA?v|55wg~n2>Rrv{!}UW`&%f5P@xyyhK7xetpSSz#-kQ=E z3@KO>>fh3vW%tGKAY*RQL2FKWBB&XImz++kJ$-sRy%oP=6@pXsbr(WpV>Y_@;z0kZ zC(GS9fbuA*pCJw8;|l*39>B_s+Kll58TJ9?&Eo6%zm89fUZ&giG)kz{cmlbtqTB?6 zonDPgSrw#-;=~;ODLs2-AEKgK*$Au%J0y}IiHtVk^T?1?P;6ayNt&A-HGikYHDQ7H zg-#_=rLx3{APu8mC|r^`C9ov0>UFCmi7mfu@jUC=jE{+>YS|s=82{ohO_4WV+X+O0 zBvz38{>>o%{CEIrDa!Lj{)FI`2$v`+{4ZC3dOQp0RmD|<^8fu&_1mTJSOKuLY6TkP z-qqj56H@rB0FX4O7`XZRURnnq=rvee{iEt5czLOCeZ3|rfxme#s$qd%nAaXs3)!7; z+g?8>edp7!=$8lH3izLEMWIqv8tm+fsyNuaoGVaE!NzLQFRna#`0D48CZ3SlGAk~s2Iq$HtsjE^7nY+-Pk09wSW)H-g`|9ts)v*AGa(bXZ} zQPhWoxsH1P`_1*;g*q~^47TIjWCSM+rH&*+spd;F{NDfiH2rYT5oL#>W{P-x){FC} z1Q5@z9x7@gK7ngf5;4~N@YfSgy-vJxx!LXz;+Xnd*bBx;$^RB2?I4;iQjUjY>sIN5 zu$p*u)XQXOVu`?xw)qOfuI9pD4s(;rAxRv^!3@(L7t@l~#V;!+qE@vc-MFt@p$=U9 z(U)t>1FBgS8f9y)>T25dGtik7)V`oe@%M#;>JxGA%C4VOp_QdqLQ@*He+=*@o%yQ*QhQ4k|#ShYR9Amn0+;I zP(_p69tAE+$7DEIJ~#dUn)Xe^K{>Wk|4TK=5>!zww>QaD4VHgYg5;^9MSe$SlQxi2 ziw?q>Mu6d8U@89g zSnDPV@_5WG(8m`skkKDMfOSyX2{}+Rn%FPTKHz{FcgFjR{FP%hvR9s6PQhY-BAcP9K0rb5!sY(W3kP# zY}w34AAXy?(RpJuGAX0SW?Cg8a ztUGVthx6sm{qVo%-ua(9_ssp_wBEQn?d=U`3n#>O)ff>&#Y0pNHrlNd@^oZ>a*wT7 zH>x<-b}K7Lp6>pz>sA)#q&rf(`Icxw|H3tkg!0Y7#b_m=y#DjemhvIDc@xST`xc`2 zYvuN9G{P%0mo{9BPF24^)8#1z9GHa7zU%;|X-zRi_eNk2jB;e5@8Q>@XIW~#WUaK) z#uj573!`kR6AV359f7V1jzGTzofXYd4LmoNwVIbOOReucPflki{d5zZ?Jhsf*bZ4G zKcLB({wi`kuDOS1GgB8SW0zzq-zj2u4%^Or$;dL-Jbl@1Z9??w$5pHmpRB&;>-kx; zi)|eb2yrWeZT-oHR8hap> zy5zho|G0+tjsRgV0FZHv8@${Q=p=(vPG|etI;T{I*bz>m@F*1k008rX0hIf!SYJvc zWu!MRVO~tm&I0c2o`qkWd9ZfI;#q6z^w3VF0dbyHk^ej;j`ZbK$tLzzS$E>bq5Rh` z*6T(qx)U_}*V=zq2+x0z9Y<_h+0vyA_wL!lSBBRAPS&9RrlSUhLBGAyV4b=nXo@1t z1d6Xxt+->S2bNX$eoU3J&dXaXi9}4dkG~>ywvKzT6F2WHJ<#%VPDK!=5Y+Q#DxHypF+ZP|Fl1tljJaaM{g}K;6d0 zVQVjpsB!jou8|ABldP}!lz+YeTAzv`k`y6&f1ybbV34CC>`$@@I)LCx3ja2VFt`WU z3G57pyMjHycbr;&9*;xm=xTUyhnxiOw6^{{{_(_+gHCQPe{041iFG>(VNN&9l~OZo zZWq8oYVLR80K_I;v){7z>>5i(eMjt9;>!OLi$MG(76O4AaOFav|D?Ck(>s!Z5J_2U zo(oJr139MRD~6M9spyDi9L*t;3eY$r?qI+I-eWAj@_l=__idGuPzcZ`&oJ*iUJK{gYx%TjXvuyPic`YnQA_P(RIzu+s=N4n zRFz@GWZ+x!$~D4ysUsu)msIO!v(m2Cu=W^&k`7jcCh1l@d7e-|`T8z{U6POf zk;)iz<+zzYyv0`bY3)|q*5bsQgYSuf)`X*tjw&2sbEgHbb=~0KIOOcD7_9au2LY;?7 z!x7t8w^@Q5624e1Lr9b+GjXa<(Pc9I-jq#qktq|-S5&Tt*$PWaqh*p78!O~*c-dr| zlBpPPIAA# zTaR@R%yU$-?Db_m*H41O@}z5f$%xZP>D~}TIxsmcGD=u2N+^UaH{#ECIBK6~)B&N# z;53v>zmUZbijO2<9T^%!xb^{j@#(BoV7j!JEzm&(I^dRz%8aXHElB`^idW&8NdwnT zD=8UR)u{>!3^hwEv-kAOiAu3$Qvxf0k7NUP>XW04M|j7pSrG=kQpvKc!>i5*La-7> zH-811^rZVBm|re%=!eHY^Q6+uw1#Tn&n=nh9erOM%@lvDUGEzCWt5U_sn#J5^oD~J z&;G_V8G;;3Wb=rte+aC&;hpPMf7{W6&Fjsx|LCX8hMD2`&k9*`B|=O*VRa&(1F5@G zI&>hN0vp<2ka?AIe#+Vgm<|PJMK`IMjeD_fr<_lzcqvk!sl!{SOWl0DeUCzDKoi{r z)*Uut@3l#oXIk+3;hAoMfObq literal 0 HcmV?d00001 From cba81ce061df867936314dcbdbc565248c8db006 Mon Sep 17 00:00:00 2001 From: deiflaender Date: Thu, 11 Aug 2022 14:18:36 +0200 Subject: [PATCH 2/2] RED-3974: Refactored processTablePerRow --- .../classification/model/SectionText.java | 6 +- .../redaction/model/SearchableText.java | 2 +- .../service/SectionTextBuilderService.java | 106 ++++++++++-------- 3 files changed, 68 insertions(+), 46 deletions(-) diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java index cf6ced53..7077e98c 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java @@ -27,11 +27,15 @@ public class SectionText { private boolean isTable; private String headline; + @Builder.Default private List sectionAreas = new ArrayList<>(); + @Builder.Default private Set images = new HashSet<>(); - + @Builder.Default private List textBlocks = new ArrayList<>(); + @Builder.Default private Map tabularData = new HashMap<>(); + @Builder.Default private List cellStarts = new ArrayList<>(); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java index f7f63e07..4503f5d6 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java @@ -194,7 +194,7 @@ public class SearchableText { } - public String buildString(List sequences) { + public static String buildString(List sequences) { StringBuilder sb = new StringBuilder(); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/SectionTextBuilderService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/SectionTextBuilderService.java index 1d83a4b5..7034753e 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/SectionTextBuilderService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/SectionTextBuilderService.java @@ -77,65 +77,48 @@ public class SectionTextBuilderService { private List processTablePerRow(Table table, AtomicInteger sectionNumber) { List sectionTexts = new ArrayList<>(); + boolean hasHeader = hasTableHeader(table); - boolean hasHeader = table.getRows() - .stream() - .anyMatch(row -> row.stream() - .anyMatch(cell -> !cell.isHeaderCell() && !cell.getHeaderCells().isEmpty())); + for (List row : table.getRows()) { - for (int rowNum = 0; rowNum < table.getRows().size(); rowNum++) { - SearchableText searchableRow = new SearchableText(); + List textBlocks = new ArrayList<>(); + List areas = new ArrayList<>(); Map tabularData = new HashMap<>(); - int start = 0; - List cellStarts = new ArrayList<>(); - SectionText sectionText = new SectionText(); - for (int cellNum = 0; cellNum < table.getRows().get(rowNum).size(); cellNum++) { - Cell cell = table.getRows().get(rowNum).get(cellNum); + List startOffsets = new ArrayList<>(); + int startOffset = 0; + for (int cellNum = 0; cellNum < row.size(); cellNum++) { + + Cell cell = row.get(cellNum); if (CollectionUtils.isEmpty(cell.getTextBlocks())) { continue; } - SectionArea sectionArea = new SectionArea(new Point((float) cell.getX(), (float) cell.getY()), (float) cell.getWidth(), (float) cell.getHeight(), cell.getTextBlocks() - .get(0) - .getSequences() - .get(0) - .getPage(), null); - sectionText.getSectionAreas().add(sectionArea); - sectionText.getTextBlocks().addAll(cell.getTextBlocks()); - - int cellStart = start; + SectionArea sectionArea = getSectionArea(cell); + areas.add(sectionArea); if (!cell.isHeaderCell()) { - if (hasHeader) { - cell.getHeaderCells().forEach(headerCell -> { - String headerName = buildHeaderName(headerCell); - sectionArea.setHeader(headerName); - tabularData.put(headerName, new CellValue(cell.getTextBlocks(), cellStart)); - }); - } else { - String headerName = buildHeaderName(table.getRows().get(0).get(cellNum)); - sectionArea.setHeader(headerName); - tabularData.put(headerName, new CellValue(cell.getTextBlocks(), cellStart)); - } + String headerName = getHeaderName(hasHeader, cell, table.getRows().get(0).get(cellNum)); + sectionArea.setHeader(headerName); + tabularData.put(headerName, new CellValue(cell.getTextBlocks(), startOffset)); } - for (TextBlock textBlock : cell.getTextBlocks()) { - // TODO avoid cell overlap merging. - searchableRow.addAll(textBlock.getSequences()); - } - cellStarts.add(cellStart); - start = start + cell.toString().trim().length() + 1; + textBlocks.addAll(cell.getTextBlocks()); + startOffsets.add(startOffset); + startOffset = startOffset + cell.toString().trim().length() + 1; } - sectionText.setText(searchableRow.toString()); - sectionText.setHeadline(table.getHeadline()); - sectionText.setSectionNumber(sectionNumber.intValue()); - sectionText.setTable(true); - sectionText.setTabularData(tabularData); - sectionText.setCellStarts(cellStarts); - sectionTexts.add(sectionText); + sectionTexts.add(SectionText.builder() + .text(getRowText(textBlocks)) + .headline(table.getHeadline()) + .sectionNumber(sectionNumber.intValue()) + .isTable(true) + .tabularData(tabularData) + .cellStarts(startOffsets) + .textBlocks(textBlocks) + .sectionAreas(areas) + .build()); sectionNumber.incrementAndGet(); } @@ -144,6 +127,41 @@ public class SectionTextBuilderService { } + public String getRowText(List rowTextBlocks) { + + return SearchableText.buildString(rowTextBlocks.stream() + .map(textBlock -> textBlock.getSequences()) + .flatMap(List::stream) + .collect(Collectors.toList())); + } + + + private boolean hasTableHeader(Table table) { + + return table.getRows() + .stream() + .anyMatch(row -> row.stream() + .anyMatch(cell -> !cell.isHeaderCell() && !cell.getHeaderCells().isEmpty())); + } + + + private SectionArea getSectionArea(Cell cell) { + + return new SectionArea(new Point((float) cell.getX(), (float) cell.getY()), (float) cell.getWidth(), (float) cell.getHeight(), cell.getTextBlocks() + .get(0) + .getSequences() + .get(0) + .getPage(), null); + } + + + private String getHeaderName(boolean hasHeader, Cell currentCell, Cell cellInFirstRow) { + + return hasHeader ? buildHeaderName(currentCell.getHeaderCells() + .get(currentCell.getHeaderCells().size() - 1)) : buildHeaderName(cellInFirstRow); + } + + private String buildHeaderName(Cell cell) { StringBuilder headerBuilder = new StringBuilder();