diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/SectionTextBuilderService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/SectionTextBuilderService.java index fb4b0e81..1d83a4b5 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/SectionTextBuilderService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/SectionTextBuilderService.java @@ -1,19 +1,35 @@ package com.iqser.red.service.redaction.v1.server.redaction.service; -import com.iqser.red.service.redaction.v1.model.Point; -import com.iqser.red.service.redaction.v1.model.SectionArea; -import com.iqser.red.service.redaction.v1.server.classification.model.*; -import com.iqser.red.service.redaction.v1.server.redaction.model.*; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; -import lombok.RequiredArgsConstructor; -import lombok.extern.slf4j.Slf4j; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; + import org.apache.commons.collections4.CollectionUtils; import org.springframework.stereotype.Service; -import java.util.*; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.stream.Collectors; +import com.iqser.red.service.redaction.v1.model.Point; +import com.iqser.red.service.redaction.v1.model.SectionArea; +import com.iqser.red.service.redaction.v1.server.classification.model.Document; +import com.iqser.red.service.redaction.v1.server.classification.model.Footer; +import 
com.iqser.red.service.redaction.v1.server.classification.model.Header; +import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph; +import com.iqser.red.service.redaction.v1.server.classification.model.SectionText; +import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; +import com.iqser.red.service.redaction.v1.server.classification.model.UnclassifiedText; +import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue; +import com.iqser.red.service.redaction.v1.server.redaction.model.Image; +import com.iqser.red.service.redaction.v1.server.redaction.model.ImageType; +import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage; +import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText; +import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; +import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; + +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; @Slf4j @Service @@ -35,8 +51,7 @@ public class SectionTextBuilderService { } sectionNumber.incrementAndGet(); } - sectionTexts.add(processText(paragraph.getSearchableText(), paragraph.getTextBlocks(), paragraph.getHeadline(), sectionNumber, paragraph - .getImages())); + sectionTexts.add(processText(paragraph.getSearchableText(), paragraph.getTextBlocks(), paragraph.getHeadline(), sectionNumber, paragraph.getImages())); sectionNumber.incrementAndGet(); } @@ -62,20 +77,26 @@ public class SectionTextBuilderService { private List processTablePerRow(Table table, AtomicInteger sectionNumber) { List sectionTexts = new ArrayList<>(); - for (List row : table.getRows()) { + + boolean hasHeader = table.getRows() + .stream() + .anyMatch(row -> row.stream() + .anyMatch(cell -> !cell.isHeaderCell() && !cell.getHeaderCells().isEmpty())); /* true as soon as any non-header cell is linked to at least one header cell */ + + for (int rowNum = 0; rowNum < table.getRows().size(); rowNum++) { SearchableText searchableRow = new SearchableText(); Map 
tabularData = new HashMap<>(); int start = 0; List cellStarts = new ArrayList<>(); SectionText sectionText = new SectionText(); - for (Cell cell : row) { + for (int cellNum = 0; cellNum < table.getRows().get(rowNum).size(); cellNum++) { + Cell cell = table.getRows().get(rowNum).get(cellNum); if (CollectionUtils.isEmpty(cell.getTextBlocks())) { continue; } - SectionArea sectionArea = new SectionArea(new Point((float) cell.getX(), (float) cell.getY()), (float) cell - .getWidth(), (float) cell.getHeight(), cell.getTextBlocks() + SectionArea sectionArea = new SectionArea(new Point((float) cell.getX(), (float) cell.getY()), (float) cell.getWidth(), (float) cell.getHeight(), cell.getTextBlocks() .get(0) .getSequences() .get(0) @@ -86,16 +107,17 @@ public class SectionTextBuilderService { int cellStart = start; if (!cell.isHeaderCell()) { - cell.getHeaderCells().forEach(headerCell -> { - StringBuilder headerBuilder = new StringBuilder(); - headerCell.getTextBlocks().forEach(textBlock -> headerBuilder.append(textBlock.getText())); - String headerName = headerBuilder.toString() - .replaceAll("\n", "") - .replaceAll(" ", "") - .replaceAll("-", ""); + if (hasHeader) { + cell.getHeaderCells().forEach(headerCell -> { + String headerName = buildHeaderName(headerCell); + sectionArea.setHeader(headerName); + tabularData.put(headerName, new CellValue(cell.getTextBlocks(), cellStart)); + }); + } else if (cellNum < table.getRows().get(0).size()) { /* no header cells in the table: use the first-row cell at the same index as header; a row wider than the first row gets no header instead of failing with an IndexOutOfBoundsException */ + String headerName = buildHeaderName(table.getRows().get(0).get(cellNum)); sectionArea.setHeader(headerName); tabularData.put(headerName, new CellValue(cell.getTextBlocks(), cellStart)); - }); + } } for (TextBlock textBlock : cell.getTextBlocks()) { @@ -122,6 +144,14 @@ public class SectionTextBuilderService { } + private String buildHeaderName(Cell cell) { /* concatenates the text of the cell's text blocks and removes line breaks, spaces and hyphens */ + + StringBuilder headerBuilder = new StringBuilder(); + cell.getTextBlocks().forEach(textBlock -> headerBuilder.append(textBlock.getText())); + return headerBuilder.toString().replace("\n", "").replace(" ", 
"").replace("-", ""); + } + + + private SectionText processTableAsOneText(Table table, AtomicInteger sectionNumber) { SearchableText entireTableText = new SearchableText(); @@ -130,14 +160,12 @@ public class SectionTextBuilderService { List cellStarts = new ArrayList<>(); for (List row : table.getRows()) { - for (Cell cell : row) { if (CollectionUtils.isEmpty(cell.getTextBlocks())) { continue; } - SectionArea sectionArea = new SectionArea(new Point((float) cell.getX(), (float) cell.getY()), (float) cell - .getWidth(), (float) cell.getHeight(), cell.getTextBlocks() + SectionArea sectionArea = new SectionArea(new Point((float) cell.getX(), (float) cell.getY()), (float) cell.getWidth(), (float) cell.getHeight(), cell.getTextBlocks() .get(0) .getSequences() .get(0) @@ -149,7 +177,6 @@ public class SectionTextBuilderService { entireTableText.addAll(textBlock.getSequences()); } - cellStarts.add(start); start = start + cell.toString().trim().length() + 1; } @@ -170,8 +197,7 @@ public class SectionTextBuilderService { SectionText sectionText = new SectionText(); for (TextBlock paragraphTextBlock : paragraphTextBlocks) { - SectionArea sectionArea = new SectionArea(new Point(paragraphTextBlock.getMinX(), paragraphTextBlock.getMinY()), paragraphTextBlock - .getWidth(), paragraphTextBlock.getHeight(), paragraphTextBlock.getPage(), null); + SectionArea sectionArea = new SectionArea(new Point(paragraphTextBlock.getMinX(), paragraphTextBlock.getMinY()), paragraphTextBlock.getWidth(), paragraphTextBlock.getHeight(), paragraphTextBlock.getPage(), null); sectionText.getSectionAreas().add(sectionArea); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java index 6b97870d..fc3debce 100644 --- 
a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java @@ -882,6 +882,36 @@ public class RedactionIntegrationTest { } + @Test + public void testTableHeader() throws IOException { /* Analyzes the NoHeaderTable.pdf fixture, writes the annotated PDF to the temporary directory and expects exactly 5 redaction log entries */ + + System.out.println("testTableHeader"); + long start = System.currentTimeMillis(); + + AnalyzeRequest request = prepareStorage("files/Minimal Examples/NoHeaderTable.pdf"); + analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId())); + AnalyzeResult result = analyzeService.analyze(request); + + AnnotateResponse annotateResponse = annotationService.annotate(AnnotateRequest.builder() + .dossierId(TEST_DOSSIER_ID) + .fileId(TEST_FILE_ID) + .build()); + + try (FileOutputStream fileOutputStream = new FileOutputStream(OsUtils.getTemporaryDirectory() + "/Annotated.pdf")) { + fileOutputStream.write(annotateResponse.getDocument()); + } + + var redactionLog = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID); + assertThat(redactionLog.getRedactionLogEntry().size()).isEqualTo(5); + + long end = System.currentTimeMillis(); + + System.out.println("duration: " + (end - start)); + System.out.println("numberOfPages: " + result.getNumberOfPages()); + } + + + @Test public void testFindDictionaryEntryInResizedEntryPosition() throws IOException { diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl index 35d751b7..8e7122a6 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl @@ -381,4 +381,13 @@ rule "30: Ignore dossier_redactions if confidential" 
Section(!fileAttributeByLabelEqualsIgnoreCase("Confidentiality","confidential") && matchesType("dossier_redactions")); then section.ignore("dossier_redactions"); + end + +// ex. "New Rules for PAD" - "Annex A" - page 21, page 35 (table without header), page 38 (in-text) +// https://www.regexplanet.com/share/index.html?share=yyyypb71xkr +rule "101: Redact CAS numbers" + when + Section(hasTableHeader("Sample #")) /* restricts the regex redaction below to sections for which hasTableHeader("Sample #") is true; NOTE(review): buildHeaderName in SectionTextBuilderService removes spaces from header names, so confirm that hasTableHeader normalizes "Sample #" the same way */ + then + section.redactByRegEx("\\b[1-9]{1}[0-9]{1,5}-\\d{2}-\\R?\\d{1,2}\\b", true, 0, "PII", 101, "compound/sample identifier", "Article 4(2) first indent of Regulation No. 1049/2001"); end \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/NoHeaderTable.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/NoHeaderTable.pdf new file mode 100644 index 00000000..3a1e959a Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/NoHeaderTable.pdf differ