diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/CellValue.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/CellValue.java new file mode 100644 index 00000000..e646cbef --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/CellValue.java @@ -0,0 +1,16 @@ +package com.iqser.red.service.redaction.v1.server.redaction.model; + +import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; + +import lombok.RequiredArgsConstructor; +import lombok.Value; + +@Value +@RequiredArgsConstructor +public class CellValue { + + TextBlock textBlock; + + int rowSpanStart; + +} \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java index 3a517ef2..feee9f99 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java @@ -9,8 +9,6 @@ import java.util.regex.Pattern; import org.apache.commons.lang3.StringUtils; -import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; - import lombok.Builder; import lombok.Data; import lombok.extern.slf4j.Slf4j; @@ -32,7 +30,7 @@ public class Section { private int sectionNumber; - private Map tabularData; + private Map tabularData; public boolean rowEquals(String headerName, String value){ @@ -40,7 +38,8 @@ public class Section { .replaceAll(" ", "") .replaceAll("-", ""); - return tabularData != null && tabularData.containsKey(cleanHeaderName) && tabularData.get(cleanHeaderName).getText().equals(value); + return tabularData != null && tabularData.containsKey(cleanHeaderName) + && tabularData.get(cleanHeaderName).getTextBlock().getText().equals(value); } @@ -177,15 +176,18 @@ public class Section { .replaceAll(" ", "") .replaceAll("-", ""); - TextBlock value = tabularData.get(cleanHeaderName); + CellValue value = tabularData.get(cleanHeaderName); if (value == null) { log.warn("Could not find any data for {}.", cellHeader); } else { - Entity entity = new Entity(value.getText(), type, 0, value.getText().length(), headline, sectionNumber); + Entity entity = new Entity(value.getTextBlock() + .getText(), type, value.getRowSpanStart(), value.getRowSpanStart() + value.getTextBlock() + .getText() + .length(), headline, sectionNumber); entity.setRedaction(false); entity.setMatchedRule(ruleNumber); entity.setRedactionReason(cellHeader); - entity.setTargetSequences(value.getSequences()); // Make sure no other cells with same content are highlighted + entity.setTargetSequences(value.getTextBlock().getSequences()); // Make sure no other cells with same content are highlighted entities.add(entity); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java index b00a59b8..a5e7e0f4 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java @@ -18,6 +18,7 @@ import com.iqser.red.service.redaction.v1.model.Rectangle; import com.iqser.red.service.redaction.v1.server.classification.model.Document; import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph; import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; +import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue; import com.iqser.red.service.redaction.v1.server.redaction.model.Entity; import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence; import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText; @@ -53,26 +54,27 @@ public class EntityRedactionService { for (Table table : tables) { for (List row : table.getRows()) { SearchableText searchableRow = new SearchableText(); - Map tabularData = new HashMap<>(); + Map tabularData = new HashMap<>(); + int start = 0; for (Cell cell : row) { if (cell.isHeaderCell() || CollectionUtils.isEmpty(cell.getTextBlocks())) { continue; } addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber); + int cellStart = start; cell.getHeaderCells().forEach(headerCell -> { - StringBuilder headerBuilder = new StringBuilder(); headerCell.getTextBlocks().forEach(textBlock -> headerBuilder.append(textBlock.getText())); String headerName = headerBuilder.toString() .replaceAll("\n", "") .replaceAll(" ", "") .replaceAll("-", ""); - tabularData.put(headerName, cell.getTextBlocks().get(0)); + tabularData.put(headerName, new CellValue(cell.getTextBlocks().get(0), cellStart)); }); + start = start + cell.getTextBlocks().get(0).toString().length(); for (TextBlock textBlock : cell.getTextBlocks()) { searchableRow.addAll(textBlock.getSequences()); } - } Set rowEntities = findEntities(searchableRow, table.getHeadline(), sectionNumber); @@ -142,7 +144,7 @@ public class EntityRedactionService { private Set findEntities(SearchableText searchableText, String headline, int sectionNumber) { Set found = new HashSet<>(); - if (StringUtils.isEmpty(searchableText.toString()) && StringUtils.isEmpty(headline)) { + if (StringUtils.isEmpty(searchableText.toString())) { return found; } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java index 11d5d5d4..4862ac0d 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java @@ -85,8 +85,7 @@ public class SectionsBuilderService { }).collect(Collectors.toList()); } if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) { - for (int i = currentTable.getRows() - .size() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table + for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table List row = currentTable.getRows().get(i); if (row.size() == tableNonHeaderRow.size() && row.stream() .allMatch(cell -> cell.getHeaderCells().isEmpty())) { @@ -185,7 +184,7 @@ public class SectionsBuilderService { private List getRowWithNonHeaderCells(Table table) { - for (int i = table.getRows().size() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table + for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table List row = table.getRows().get(i); boolean allNonHeader = true; for (Cell cell : row) { diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java index c118d0e0..36f46b39 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java @@ -29,11 +29,13 @@ public class Table extends AbstractTextContainer { @Setter private String headline; - @Getter - private int rowCount; + private int unrotatedRowCount; - @Getter - private int colCount; + private int unrotatedColCount; + + private int rowCount = -1; + + private int colCount = -1; private final int rotation; @@ -65,6 +67,25 @@ public class Table extends AbstractTextContainer { } + public int getRowCount() { + + if (rowCount == -1) { + rowCount = getRows().size(); + } + return rowCount; + } + + + public int getColCount() { + + if (colCount == -1) { + colCount = getRows().stream().mapToInt(List::size).max().orElse(0); + } + return colCount; + + } + + /** * Detect header cells (either first row or first column): * Column is marked as header if cell text is bold and row cell text is not bold. @@ -72,100 +93,54 @@ public class Table extends AbstractTextContainer { */ private void computeHeaders() { + if (rows == null) { + rows = computeRows(); + } // A bold cell is a header cell as long as every cell to the left/top is bold, too - cells.forEach((position, cell) -> { - List cellsToTheLeft = getCellsToTheLeft(position); - Cell lastHeaderCell = null; - for (Cell leftCell : cellsToTheLeft) { - if (CollectionUtils.isNotEmpty(leftCell.getTextBlocks()) && leftCell.getTextBlocks() + // we move from left to right and top to bottom + for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) { + List rowCells = rows.get(rowIndex); + for (int colIndex = 0; colIndex < rowCells.size(); colIndex++) { + Cell cell = rowCells.get(colIndex); + List cellsToTheLeft = rowCells.subList(0, colIndex); + Cell lastHeaderCell = null; + for (Cell leftCell : cellsToTheLeft) { + if (leftCell.isHeaderCell()) { + lastHeaderCell = leftCell; + } else { + break; + } + } + if (lastHeaderCell != null) { + cell.getHeaderCells().add(lastHeaderCell); + } + List cellsToTheTop = new ArrayList<>(); + for (int i = 0; i < rowIndex; i++) { + try { + cellsToTheTop.add(rows.get(i).get(colIndex)); + } catch (IndexOutOfBoundsException e) { + log.debug("No cell {} in row {}, ignoring.", colIndex, rowIndex); + } + } + for (Cell topCell : cellsToTheTop) { + if (topCell.isHeaderCell()) { + lastHeaderCell = topCell; + } else { + break; + } + } + if (lastHeaderCell != null) { + cell.getHeaderCells().add(lastHeaderCell); + } + if (CollectionUtils.isNotEmpty(cell.getTextBlocks()) && cell.getTextBlocks() .get(0) .getMostPopularWordStyle() .equals("bold")) { - lastHeaderCell = leftCell; - } else { - break; + cell.setHeaderCell(true); } } - if (lastHeaderCell != null) { - cell.getHeaderCells().add(lastHeaderCell); - } - lastHeaderCell = null; - List cellsToTheTop = getCellToTheTop(position); - for (Cell topCell : cellsToTheTop) { - if (CollectionUtils.isNotEmpty(topCell.getTextBlocks()) && topCell.getTextBlocks() - .get(0) - .getMostPopularWordStyle() - .equals("bold")) { - lastHeaderCell = topCell; - } else { - break; - } - } - if (lastHeaderCell != null) { - cell.getHeaderCells().add(lastHeaderCell); - } - if (CollectionUtils.isNotEmpty(cell.getTextBlocks()) && cell.getTextBlocks() - .get(0) - .getMostPopularWordStyle() - .equals("bold")) { - cell.setHeaderCell(true); - } - }); - - } - - - private List getCellsToTheLeft(CellPosition cellPosition) { - - List result = new ArrayList<>(); - if (cellPosition.getCol() == 0) { - return result; - } - int row = cellPosition.getRow(); - for (int i = cellPosition.getCol() - 1; i >= 0; i--) { - if (cells.get(new CellPosition(row, i)) != null) { - result.add(cells.get(new CellPosition(row, i))); - } else { - Cell spanningCell = null; - while (spanningCell == null && row >= 0) { - row--; - spanningCell = cells.get(new CellPosition(row, i)); - } - if (spanningCell != null) { - result.add(spanningCell); - } - row = cellPosition.getRow(); - } } - Collections.reverse(result); - return result; - } - - private List getCellToTheTop(CellPosition cellPosition) { - - List result = new ArrayList<>(); - if (cellPosition.getRow() == 0) { - return result; - } - int col = cellPosition.getCol(); - for (int i = cellPosition.getRow() - 1; i >= 0; i--) { - if (cells.get(new CellPosition(i, col)) != null) { - result.add(cells.get(new CellPosition(i, col))); - } else { - Cell spanningCell = null; - while (spanningCell == null && col >= 0) { - col--; - spanningCell = cells.get(new CellPosition(i, col)); - } - if (spanningCell != null) { - result.add(spanningCell); - } - col = cellPosition.getCol(); - } - } - Collections.reverse(result); - return result; } @@ -173,9 +148,9 @@ public class Table extends AbstractTextContainer { List> rows = new ArrayList<>(); if (rotation == 90) { - for (int i = 0; i < colCount; i++) { // rows + for (int i = 0; i < unrotatedColCount; i++) { // rows List lastRow = new ArrayList<>(); - for (int j = rowCount - 1; j >= 0; j--) { // cols + for (int j = unrotatedRowCount - 1; j >= 0; j--) { // cols Cell cell = cells.get(new CellPosition(j, i)); if (cell != null) { lastRow.add(cell); @@ -184,9 +159,9 @@ public class Table extends AbstractTextContainer { rows.add(lastRow); } } else if (rotation == 270) { - for (int i = colCount - 1; i >= 0; i--) { // rows + for (int i = unrotatedColCount - 1; i >= 0; i--) { // rows List lastRow = new ArrayList<>(); - for (int j = 0; j < rowCount; j++) { // cols + for (int j = 0; j < unrotatedRowCount; j++) { // cols Cell cell = cells.get(new CellPosition(i, j)); if (cell != null) { lastRow.add(cell); @@ -195,9 +170,9 @@ public class Table extends AbstractTextContainer { rows.add(lastRow); } } else { - for (int i = 0; i < rowCount; i++) { + for (int i = 0; i < unrotatedRowCount; i++) { List lastRow = new ArrayList<>(); - for (int j = 0; j < colCount; j++) { + for (int j = 0; j < unrotatedColCount; j++) { Cell cell = cells.get(new CellPosition(i, j)); // JAVA_8 use getOrDefault() if (cell != null) { lastRow.add(cell); @@ -214,8 +189,8 @@ public class Table extends AbstractTextContainer { private void add(Cell chunk, int row, int col) { - rowCount = Math.max(rowCount, row + 1); - colCount = Math.max(colCount, col + 1); + unrotatedRowCount = Math.max(unrotatedRowCount, row + 1); + unrotatedColCount = Math.max(unrotatedColCount, col + 1); CellPosition cp = new CellPosition(row, col); cells.put(cp, chunk); diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java index da4ed03b..f29e83d3 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java @@ -130,7 +130,7 @@ public class EntityRedactionServiceTest { Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); entityRedactionService.processDocument(classifiedDoc, null); assertThat(classifiedDoc.getEntities()).hasSize(1); // one page - assertThat(classifiedDoc.getEntities().get(1)).hasSize(5); // 4 out of 5 entities recognized on page 1 + assertThat(classifiedDoc.getEntities().get(1)).hasSize(7);// 3 names, 1 address, 1 Y and 2 N entities } } @@ -193,6 +193,7 @@ public class EntityRedactionServiceTest { } + @Test public void headerPropagation() throws IOException { @@ -219,6 +220,31 @@ public class EntityRedactionServiceTest { } + @Test + public void testNGuideline() throws IOException { + + ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Empty Tabular Data.pdf"); + + DictionaryResponse dictionaryResponse = DictionaryResponse.builder() + .entries(Collections.singletonList("Aldershof S.")) + .build(); + + when(dictionaryClient.getVersion()).thenReturn(DICTIONARY_VERSION.incrementAndGet()); + when(dictionaryClient.getDictionaryForType(NAME_CODE)).thenReturn(dictionaryResponse); + DictionaryResponse addressResponse = DictionaryResponse.builder() + .entries(Collections.singletonList("Novartis Crop Protection AG, Basel, Switzerland")) + .build(); + when(dictionaryClient.getDictionaryForType(ADDRESS_CODE)).thenReturn(addressResponse); + + try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { + Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); + entityRedactionService.processDocument(classifiedDoc, null); + assertThat(classifiedDoc.getEntities()).hasSize(1); // one page + assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 8).count()).isEqualTo(6); + } + } + + @Before public void stubRedaction() { String tableRules = "package drools\n" + @@ -226,12 +252,20 @@ public class EntityRedactionServiceTest { "import com.iqser.red.service.redaction.v1.server.redaction.model.Section\n" + "\n" + "global Section section\n" + + "rule \"8: Not redacted because Vertebrate Study = N\"\n" + + " when\n" + + " Section(rowEquals(\"Vertebrate study Y/N\", \"N\"))\n" + + " then\n" + + " section.redactNot(\"name\", 8, \"Not redacted because row is not a vertebrate study\");\n" + + " section.redactNot(\"address\", 8, \"Not redacted because row is not a vertebrate study\");\n" + + " section.highlightCell(\"Vertebrate study Y/N\", 8, \"hint_only\");\n" + + " end\n" + "rule \"9: Redact Authors and Addresses in Reference Table, if it is a Vertebrate study\"\n" + " when\n" + " Section(rowEquals(\"Vertebrate study Y/N\", \"Y\"))\n" + " then\n" + " section.redact(\"name\", 9, \"Redacted because row is a vertebrate study\");\n" + - " section.redact(\"address\", 9, \"Redacted because rows is a vertebrate study\");\n" + + " section.redact(\"address\", 9, \"Redacted because row is a vertebrate study\");\n" + " section.highlightCell(\"Vertebrate study Y/N\", 9, \"must_redact\");\n" + " end"; when(rulesClient.getVersion()).thenReturn(1L); diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java index 852dc91b..bee1719a 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java @@ -146,4 +146,44 @@ public class PdfSegmentationServiceTest { } } + + @Test + public void testHeaderCellsForRotatedTable() throws IOException { + + ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Rotated Table Headers.pdf"); + + try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { + Document document = pdfSegmentationService.parseDocument(pdDocument); + assertThat(document.getParagraphs() + .stream() + .flatMap(paragraph -> paragraph.getTables().stream()) + .collect(Collectors.toList())).isNotEmpty(); + Table firstTable = document.getParagraphs() + .stream() + .flatMap(paragraph -> paragraph.getTables().stream()) + .collect(Collectors.toList()) + .get(0); + assertThat(firstTable.getColCount()).isEqualTo(8); + assertThat(firstTable.getRowCount()).isEqualTo(1); + Table secondTable = document.getParagraphs() + .stream() + .flatMap(paragraph -> paragraph.getTables().stream()) + .collect(Collectors.toList()) + .get(1); + assertThat(secondTable.getColCount()).isEqualTo(8); + assertThat(secondTable.getRowCount()).isEqualTo(6); + List> firstTableHeaderCells = firstTable.getRows() + .get(0) + .stream() + .map(Collections::singletonList) + .collect(Collectors.toList()); + assertThat(secondTable.getRows().stream() + .allMatch(row -> row.stream() + .map(Cell::getHeaderCells) + .collect(Collectors.toList()) + .equals(firstTableHeaderCells))) + .isTrue(); + } + } + } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Empty Tabular Data.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Empty Tabular Data.pdf new file mode 100644 index 00000000..e8b494c8 Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Empty Tabular Data.pdf differ diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Rotated Table Headers.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Rotated Table Headers.pdf new file mode 100644 index 00000000..f4512499 Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Rotated Table Headers.pdf differ