diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java index cf6ced53..7077e98c 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java @@ -27,11 +27,15 @@ public class SectionText { private boolean isTable; private String headline; + @Builder.Default private List sectionAreas = new ArrayList<>(); + @Builder.Default private Set images = new HashSet<>(); - + @Builder.Default private List textBlocks = new ArrayList<>(); + @Builder.Default private Map tabularData = new HashMap<>(); + @Builder.Default private List cellStarts = new ArrayList<>(); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java index f7f63e07..4503f5d6 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java @@ -194,7 +194,7 @@ public class SearchableText { } - public String buildString(List sequences) { + public static String buildString(List sequences) { StringBuilder sb = new StringBuilder(); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/SectionTextBuilderService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/SectionTextBuilderService.java index fb4b0e81..7034753e 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/SectionTextBuilderService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/SectionTextBuilderService.java @@ -1,19 +1,35 @@ package com.iqser.red.service.redaction.v1.server.redaction.service; -import com.iqser.red.service.redaction.v1.model.Point; -import com.iqser.red.service.redaction.v1.model.SectionArea; -import com.iqser.red.service.redaction.v1.server.classification.model.*; -import com.iqser.red.service.redaction.v1.server.redaction.model.*; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; -import lombok.RequiredArgsConstructor; -import lombok.extern.slf4j.Slf4j; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; + import org.apache.commons.collections4.CollectionUtils; import org.springframework.stereotype.Service; -import java.util.*; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.stream.Collectors; +import com.iqser.red.service.redaction.v1.model.Point; +import com.iqser.red.service.redaction.v1.model.SectionArea; +import com.iqser.red.service.redaction.v1.server.classification.model.Document; +import com.iqser.red.service.redaction.v1.server.classification.model.Footer; +import com.iqser.red.service.redaction.v1.server.classification.model.Header; +import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph; +import com.iqser.red.service.redaction.v1.server.classification.model.SectionText; +import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; +import com.iqser.red.service.redaction.v1.server.classification.model.UnclassifiedText; +import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue; +import com.iqser.red.service.redaction.v1.server.redaction.model.Image; +import com.iqser.red.service.redaction.v1.server.redaction.model.ImageType; +import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage; +import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText; +import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; +import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; + +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; @Slf4j @Service @@ -35,8 +51,7 @@ public class SectionTextBuilderService { } sectionNumber.incrementAndGet(); } - sectionTexts.add(processText(paragraph.getSearchableText(), paragraph.getTextBlocks(), paragraph.getHeadline(), sectionNumber, paragraph - .getImages())); + sectionTexts.add(processText(paragraph.getSearchableText(), paragraph.getTextBlocks(), paragraph.getHeadline(), sectionNumber, paragraph.getImages())); sectionNumber.incrementAndGet(); } @@ -62,58 +77,48 @@ public class SectionTextBuilderService { private List processTablePerRow(Table table, AtomicInteger sectionNumber) { List sectionTexts = new ArrayList<>(); - for (List row : table.getRows()) { - SearchableText searchableRow = new SearchableText(); - Map tabularData = new HashMap<>(); - int start = 0; - List cellStarts = new ArrayList<>(); - SectionText sectionText = new SectionText(); - for (Cell cell : row) { + boolean hasHeader = hasTableHeader(table); + for (List row : table.getRows()) { + + List textBlocks = new ArrayList<>(); + List areas = new ArrayList<>(); + Map tabularData = new HashMap<>(); + List startOffsets = new ArrayList<>(); + + int startOffset = 0; + for (int cellNum = 0; cellNum < row.size(); cellNum++) { + + Cell cell = row.get(cellNum); if (CollectionUtils.isEmpty(cell.getTextBlocks())) { continue; } - SectionArea sectionArea = new SectionArea(new Point((float) cell.getX(), (float) cell.getY()), (float) cell - .getWidth(), (float) cell.getHeight(), cell.getTextBlocks() - .get(0) - .getSequences() - .get(0) - .getPage(), null); - sectionText.getSectionAreas().add(sectionArea); - sectionText.getTextBlocks().addAll(cell.getTextBlocks()); - - int cellStart = start; + SectionArea sectionArea = getSectionArea(cell); + areas.add(sectionArea); if (!cell.isHeaderCell()) { - cell.getHeaderCells().forEach(headerCell -> { - StringBuilder headerBuilder = new StringBuilder(); - headerCell.getTextBlocks().forEach(textBlock -> headerBuilder.append(textBlock.getText())); - String headerName = headerBuilder.toString() - .replaceAll("\n", "") - .replaceAll(" ", "") - .replaceAll("-", ""); - sectionArea.setHeader(headerName); - tabularData.put(headerName, new CellValue(cell.getTextBlocks(), cellStart)); - }); + String headerName = getHeaderName(hasHeader, cell, table.getRows().get(0).get(cellNum)); + sectionArea.setHeader(headerName); + tabularData.put(headerName, new CellValue(cell.getTextBlocks(), startOffset)); } - for (TextBlock textBlock : cell.getTextBlocks()) { - // TODO avoid cell overlap merging. - searchableRow.addAll(textBlock.getSequences()); - } - cellStarts.add(cellStart); - start = start + cell.toString().trim().length() + 1; + textBlocks.addAll(cell.getTextBlocks()); + startOffsets.add(startOffset); + startOffset = startOffset + cell.toString().trim().length() + 1; } - sectionText.setText(searchableRow.toString()); - sectionText.setHeadline(table.getHeadline()); - sectionText.setSectionNumber(sectionNumber.intValue()); - sectionText.setTable(true); - sectionText.setTabularData(tabularData); - sectionText.setCellStarts(cellStarts); - sectionTexts.add(sectionText); + sectionTexts.add(SectionText.builder() + .text(getRowText(textBlocks)) + .headline(table.getHeadline()) + .sectionNumber(sectionNumber.intValue()) + .isTable(true) + .tabularData(tabularData) + .cellStarts(startOffsets) + .textBlocks(textBlocks) + .sectionAreas(areas) + .build()); sectionNumber.incrementAndGet(); } @@ -122,6 +127,49 @@ public class SectionTextBuilderService { } + public String getRowText(List rowTextBlocks) { + + return SearchableText.buildString(rowTextBlocks.stream() + .map(textBlock -> textBlock.getSequences()) + .flatMap(List::stream) + .collect(Collectors.toList())); + } + + + private boolean hasTableHeader(Table table) { + + return table.getRows() + .stream() + .anyMatch(row -> row.stream() + .anyMatch(cell -> !cell.isHeaderCell() && !cell.getHeaderCells().isEmpty())); + } + + + private SectionArea getSectionArea(Cell cell) { + + return new SectionArea(new Point((float) cell.getX(), (float) cell.getY()), (float) cell.getWidth(), (float) cell.getHeight(), cell.getTextBlocks() + .get(0) + .getSequences() + .get(0) + .getPage(), null); + } + + + private String getHeaderName(boolean hasHeader, Cell currentCell, Cell cellInFirstRow) { + + return hasHeader ? buildHeaderName(currentCell.getHeaderCells() + .get(currentCell.getHeaderCells().size() - 1)) : buildHeaderName(cellInFirstRow); + } + + + private String buildHeaderName(Cell cell) { + + StringBuilder headerBuilder = new StringBuilder(); + cell.getTextBlocks().forEach(textBlock -> headerBuilder.append(textBlock.getText())); + return headerBuilder.toString().replaceAll("\n", "").replaceAll(" ", "").replaceAll("-", ""); + } + + private SectionText processTableAsOneText(Table table, AtomicInteger sectionNumber) { SearchableText entireTableText = new SearchableText(); @@ -130,14 +178,12 @@ public class SectionTextBuilderService { List cellStarts = new ArrayList<>(); for (List row : table.getRows()) { - for (Cell cell : row) { if (CollectionUtils.isEmpty(cell.getTextBlocks())) { continue; } - SectionArea sectionArea = new SectionArea(new Point((float) cell.getX(), (float) cell.getY()), (float) cell - .getWidth(), (float) cell.getHeight(), cell.getTextBlocks() + SectionArea sectionArea = new SectionArea(new Point((float) cell.getX(), (float) cell.getY()), (float) cell.getWidth(), (float) cell.getHeight(), cell.getTextBlocks() .get(0) .getSequences() .get(0) @@ -149,7 +195,6 @@ public class SectionTextBuilderService { entireTableText.addAll(textBlock.getSequences()); } - cellStarts.add(start); start = start + cell.toString().trim().length() + 1; } @@ -170,8 +215,7 @@ public class SectionTextBuilderService { SectionText sectionText = new SectionText(); for (TextBlock paragraphTextBlock : paragraphTextBlocks) { - SectionArea sectionArea = new SectionArea(new Point(paragraphTextBlock.getMinX(), paragraphTextBlock.getMinY()), paragraphTextBlock - .getWidth(), paragraphTextBlock.getHeight(), paragraphTextBlock.getPage(), null); + SectionArea sectionArea = new SectionArea(new Point(paragraphTextBlock.getMinX(), paragraphTextBlock.getMinY()), paragraphTextBlock.getWidth(), paragraphTextBlock.getHeight(), paragraphTextBlock.getPage(), null); sectionText.getSectionAreas().add(sectionArea); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java index 6b97870d..fc3debce 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java @@ -882,6 +882,36 @@ public class RedactionIntegrationTest { } + @Test + public void testTableHeader() throws IOException { + + System.out.println("testTableHeader"); + long start = System.currentTimeMillis(); + + AnalyzeRequest request = prepareStorage("files/Minimal Examples/NoHeaderTable.pdf"); + analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId())); + AnalyzeResult result = analyzeService.analyze(request); + + AnnotateResponse annotateResponse = annotationService.annotate(AnnotateRequest.builder() + .dossierId(TEST_DOSSIER_ID) + .fileId(TEST_FILE_ID) + .build()); + + try (FileOutputStream fileOutputStream = new FileOutputStream(OsUtils.getTemporaryDirectory() + "/Annotated.pdf")) { + fileOutputStream.write(annotateResponse.getDocument()); + } + + var redactionLog = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID); + assertThat(redactionLog.getRedactionLogEntry().size()).isEqualTo(5); + + long end = System.currentTimeMillis(); + + System.out.println("duration: " + (end - start)); + System.out.println("numberOfPages: " + result.getNumberOfPages()); + } + + + @Test public void testFindDictionaryEntryInResizedEntryPosition() throws IOException { diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl index 35d751b7..8e7122a6 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl @@ -381,4 +381,13 @@ rule "30: Ignore dossier_redactions if confidential" Section(!fileAttributeByLabelEqualsIgnoreCase("Confidentiality","confidential") && matchesType("dossier_redactions")); then section.ignore("dossier_redactions"); + end + +// ex. "New Rules for PAD" - "Annex A" - page 21, page 35 (table without header), page 38 (in-text) +// https://www.regexplanet.com/share/index.html?share=yyyypb71xkr +rule "101: Redact CAS numbers" + when + Section(hasTableHeader("Sample #")) + then + section.redactByRegEx("\\b[1-9]{1}[0-9]{1,5}-\\d{2}-\\R?\\d{1,2}\\b", true, 0, "PII", 101, "compound/sample identifier", "Article 4(2) first indent of Regulation No. 1049/2001"); end \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/NoHeaderTable.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/NoHeaderTable.pdf new file mode 100644 index 00000000..3a1e959a Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/NoHeaderTable.pdf differ