Pull request #33: Fix entity span in table rows and detection of headers in rotated tables

Merge in RED/redaction-service from bugfix/rowspan-and-header-in-rotated-table-fix to master

* commit '4954aafed78e06484531d6264bdf215176ee0ef2':
  Reduce log level.
  Remove unused import
  Adjust test to added rule and fix vertical header propagation for row > 2
  Fix entity span in table rows and detection of headers in rotated tables
2020-08-25 17:30:57 +02:00 · 2020-08-25 17:30:57 +02:00 · 81ea2e91ef
commit 81ea2e91ef
parent 6483e637c6 4954aafed7
9 changed files with 183 additions and 115 deletions
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/CellValue.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/CellValue.java
@@ -0,0 +1,16 @@
+package com.iqser.red.service.redaction.v1.server.redaction.model;
+
+import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
+
+import lombok.RequiredArgsConstructor;
+import lombok.Value;
+
+@Value
+@RequiredArgsConstructor
+public class CellValue {
+
+    TextBlock textBlock;
+
+    int rowSpanStart;
+
+}
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java
@@ -9,8 +9,6 @@ import java.util.regex.Pattern;

 import org.apache.commons.lang3.StringUtils;

-import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
-
 import lombok.Builder;
 import lombok.Data;
 import lombok.extern.slf4j.Slf4j;
@@ -32,7 +30,7 @@ public class Section {

    private int sectionNumber;

-    private Map<String, TextBlock> tabularData;
+    private Map<String, CellValue> tabularData;


    public boolean rowEquals(String headerName, String value){
@@ -40,7 +38,8 @@ public class Section {
                .replaceAll(" ", "")
                .replaceAll("-", "");

-        return tabularData != null && tabularData.containsKey(cleanHeaderName) && tabularData.get(cleanHeaderName).getText().equals(value);
+        return tabularData != null && tabularData.containsKey(cleanHeaderName)
+                && tabularData.get(cleanHeaderName).getTextBlock().getText().equals(value);
    }


@@ -177,15 +176,18 @@ public class Section {
                .replaceAll(" ", "")
                .replaceAll("-", "");

-        TextBlock value = tabularData.get(cleanHeaderName);
+        CellValue value = tabularData.get(cleanHeaderName);
        if (value == null) {
            log.warn("Could not find any data for {}.", cellHeader);
        } else {
-            Entity entity = new Entity(value.getText(), type, 0, value.getText().length(), headline, sectionNumber);
+            Entity entity = new Entity(value.getTextBlock()
+                    .getText(), type, value.getRowSpanStart(), value.getRowSpanStart() + value.getTextBlock()
+                    .getText()
+                    .length(), headline, sectionNumber);
            entity.setRedaction(false);
            entity.setMatchedRule(ruleNumber);
            entity.setRedactionReason(cellHeader);
-            entity.setTargetSequences(value.getSequences()); // Make sure no other cells with same content are highlighted
+            entity.setTargetSequences(value.getTextBlock().getSequences()); // Make sure no other cells with same content are highlighted
            entities.add(entity);
        }

--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java
@@ -18,6 +18,7 @@ import com.iqser.red.service.redaction.v1.model.Rectangle;
 import com.iqser.red.service.redaction.v1.server.classification.model.Document;
 import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
 import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
+import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
 import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
 import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
 import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
@@ -53,26 +54,27 @@ public class EntityRedactionService {
            for (Table table : tables) {
                for (List<Cell> row : table.getRows()) {
                    SearchableText searchableRow = new SearchableText();
-                    Map<String, TextBlock> tabularData = new HashMap<>();
+                    Map<String, CellValue> tabularData = new HashMap<>();
+                    int start = 0;
                    for (Cell cell : row) {
                        if (cell.isHeaderCell() || CollectionUtils.isEmpty(cell.getTextBlocks())) {
                            continue;
                        }
                        addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber);
+                        int cellStart = start;
                        cell.getHeaderCells().forEach(headerCell -> {
-
                            StringBuilder headerBuilder = new StringBuilder();
                            headerCell.getTextBlocks().forEach(textBlock -> headerBuilder.append(textBlock.getText()));
                            String headerName = headerBuilder.toString()
                                    .replaceAll("\n", "")
                                    .replaceAll(" ", "")
                                    .replaceAll("-", "");
-                            tabularData.put(headerName, cell.getTextBlocks().get(0));
+                            tabularData.put(headerName, new CellValue(cell.getTextBlocks().get(0), cellStart));
                        });
+                        start = start + cell.getTextBlocks().get(0).toString().length();
                        for (TextBlock textBlock : cell.getTextBlocks()) {
                            searchableRow.addAll(textBlock.getSequences());
                        }
-
                    }
                    Set<Entity> rowEntities = findEntities(searchableRow, table.getHeadline(), sectionNumber);

@@ -142,7 +144,7 @@ public class EntityRedactionService {
    private Set<Entity> findEntities(SearchableText searchableText, String headline, int sectionNumber) {

        Set<Entity> found = new HashSet<>();
-        if (StringUtils.isEmpty(searchableText.toString()) && StringUtils.isEmpty(headline)) {
+        if (StringUtils.isEmpty(searchableText.toString())) {
            return found;
        }

--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java
@@ -85,8 +85,7 @@ public class SectionsBuilderService {
                }).collect(Collectors.toList());
            }
            if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) {
-                for (int i = currentTable.getRows()
-                        .size() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
+                for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
                    List<Cell> row = currentTable.getRows().get(i);
                    if (row.size() == tableNonHeaderRow.size() && row.stream()
                            .allMatch(cell -> cell.getHeaderCells().isEmpty())) {
@@ -185,7 +184,7 @@ public class SectionsBuilderService {

    private List<Cell> getRowWithNonHeaderCells(Table table) {

-        for (int i = table.getRows().size() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
+        for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
            List<Cell> row = table.getRows().get(i);
            boolean allNonHeader = true;
            for (Cell cell : row) {
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java
@@ -29,11 +29,13 @@ public class Table extends AbstractTextContainer {
    @Setter
    private String headline;

-    @Getter
-    private int rowCount;
+    private int unrotatedRowCount;

-    @Getter
-    private int colCount;
+    private int unrotatedColCount;
+
+    private int rowCount = -1;
+
+    private int colCount = -1;

    private final int rotation;

@@ -65,6 +67,25 @@ public class Table extends AbstractTextContainer {
    }


+    public int getRowCount() {
+
+        if (rowCount == -1) {
+            rowCount = getRows().size();
+        }
+        return rowCount;
+    }
+
+
+    public int getColCount() {
+
+        if (colCount == -1) {
+            colCount = getRows().stream().mapToInt(List::size).max().orElse(0);
+        }
+        return colCount;
+
+    }
+
+
    /**
     * Detect header cells (either first row or first column):
     * Column is marked as header if cell text is bold and row cell text is not bold.
@@ -72,100 +93,54 @@ public class Table extends AbstractTextContainer {
     */
    private void computeHeaders() {

+        if (rows == null) {
+            rows = computeRows();
+        }
        // A bold cell is a header cell as long as every cell to the left/top is bold, too
-        cells.forEach((position, cell) -> {
-            List<Cell> cellsToTheLeft = getCellsToTheLeft(position);
-            Cell lastHeaderCell = null;
-            for (Cell leftCell : cellsToTheLeft) {
-                if (CollectionUtils.isNotEmpty(leftCell.getTextBlocks()) && leftCell.getTextBlocks()
+        // we move from left to right and top to bottom
+        for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
+            List<Cell> rowCells = rows.get(rowIndex);
+            for (int colIndex = 0; colIndex < rowCells.size(); colIndex++) {
+                Cell cell = rowCells.get(colIndex);
+                List<Cell> cellsToTheLeft = rowCells.subList(0, colIndex);
+                Cell lastHeaderCell = null;
+                for (Cell leftCell : cellsToTheLeft) {
+                    if (leftCell.isHeaderCell()) {
+                        lastHeaderCell = leftCell;
+                    } else {
+                        break;
+                    }
+                }
+                if (lastHeaderCell != null) {
+                    cell.getHeaderCells().add(lastHeaderCell);
+                }
+                List<Cell> cellsToTheTop = new ArrayList<>();
+                for (int i = 0; i < rowIndex; i++) {
+                    try {
+                        cellsToTheTop.add(rows.get(i).get(colIndex));
+                    } catch (IndexOutOfBoundsException e) {
+                        log.debug("No cell {} in row {}, ignoring.", colIndex, rowIndex);
+                    }
+                }
+                for (Cell topCell : cellsToTheTop) {
+                    if (topCell.isHeaderCell()) {
+                        lastHeaderCell = topCell;
+                    } else {
+                        break;
+                    }
+                }
+                if (lastHeaderCell != null) {
+                    cell.getHeaderCells().add(lastHeaderCell);
+                }
+                if (CollectionUtils.isNotEmpty(cell.getTextBlocks()) && cell.getTextBlocks()
                        .get(0)
                        .getMostPopularWordStyle()
                        .equals("bold")) {
-                    lastHeaderCell = leftCell;
-                } else {
-                    break;
+                    cell.setHeaderCell(true);
                }
            }
-            if (lastHeaderCell != null) {
-                cell.getHeaderCells().add(lastHeaderCell);
-            }
-            lastHeaderCell = null;
-            List<Cell> cellsToTheTop = getCellToTheTop(position);
-            for (Cell topCell : cellsToTheTop) {
-                if (CollectionUtils.isNotEmpty(topCell.getTextBlocks()) && topCell.getTextBlocks()
-                        .get(0)
-                        .getMostPopularWordStyle()
-                        .equals("bold")) {
-                    lastHeaderCell = topCell;
-                } else {
-                    break;
-                }
-            }
-            if (lastHeaderCell != null) {
-                cell.getHeaderCells().add(lastHeaderCell);
-            }
-            if (CollectionUtils.isNotEmpty(cell.getTextBlocks()) && cell.getTextBlocks()
-                    .get(0)
-                    .getMostPopularWordStyle()
-                    .equals("bold")) {
-                cell.setHeaderCell(true);
-            }
-        });
-
-    }
-
-
-    private List<Cell> getCellsToTheLeft(CellPosition cellPosition) {
-
-        List<Cell> result = new ArrayList<>();
-        if (cellPosition.getCol() == 0) {
-            return result;
-        }
-        int row = cellPosition.getRow();
-        for (int i = cellPosition.getCol() - 1; i >= 0; i--) {
-            if (cells.get(new CellPosition(row, i)) != null) {
-                result.add(cells.get(new CellPosition(row, i)));
-            } else {
-                Cell spanningCell = null;
-                while (spanningCell == null && row >= 0) {
-                    row--;
-                    spanningCell = cells.get(new CellPosition(row, i));
-                }
-                if (spanningCell != null) {
-                    result.add(spanningCell);
-                }
-                row = cellPosition.getRow();
-            }
        }
-        Collections.reverse(result);
-        return result;
-    }

-
-    private List<Cell> getCellToTheTop(CellPosition cellPosition) {
-
-        List<Cell> result = new ArrayList<>();
-        if (cellPosition.getRow() == 0) {
-            return result;
-        }
-        int col = cellPosition.getCol();
-        for (int i = cellPosition.getRow() - 1; i >= 0; i--) {
-            if (cells.get(new CellPosition(i, col)) != null) {
-                result.add(cells.get(new CellPosition(i, col)));
-            } else {
-                Cell spanningCell = null;
-                while (spanningCell == null && col >= 0) {
-                    col--;
-                    spanningCell = cells.get(new CellPosition(i, col));
-                }
-                if (spanningCell != null) {
-                    result.add(spanningCell);
-                }
-                col = cellPosition.getCol();
-            }
-        }
-        Collections.reverse(result);
-        return result;
    }


@@ -173,9 +148,9 @@ public class Table extends AbstractTextContainer {

        List<List<Cell>> rows = new ArrayList<>();
        if (rotation == 90) {
-            for (int i = 0; i < colCount; i++) { // rows
+            for (int i = 0; i < unrotatedColCount; i++) { // rows
                List<Cell> lastRow = new ArrayList<>();
-                for (int j = rowCount - 1; j >= 0; j--) { // cols
+                for (int j = unrotatedRowCount - 1; j >= 0; j--) { // cols
                    Cell cell = cells.get(new CellPosition(j, i));
                    if (cell != null) {
                        lastRow.add(cell);
@@ -184,9 +159,9 @@ public class Table extends AbstractTextContainer {
                rows.add(lastRow);
            }
        } else if (rotation == 270) {
-            for (int i = colCount - 1; i >= 0; i--) { // rows
+            for (int i = unrotatedColCount - 1; i >= 0; i--) { // rows
                List<Cell> lastRow = new ArrayList<>();
-                for (int j = 0; j < rowCount; j++) { // cols
+                for (int j = 0; j < unrotatedRowCount; j++) { // cols
                    Cell cell = cells.get(new CellPosition(i, j));
                    if (cell != null) {
                        lastRow.add(cell);
@@ -195,9 +170,9 @@ public class Table extends AbstractTextContainer {
                rows.add(lastRow);
            }
        } else {
-            for (int i = 0; i < rowCount; i++) {
+            for (int i = 0; i < unrotatedRowCount; i++) {
                List<Cell> lastRow = new ArrayList<>();
-                for (int j = 0; j < colCount; j++) {
+                for (int j = 0; j < unrotatedColCount; j++) {
                    Cell cell = cells.get(new CellPosition(i, j)); // JAVA_8 use getOrDefault()
                    if (cell != null) {
                        lastRow.add(cell);
@@ -214,8 +189,8 @@ public class Table extends AbstractTextContainer {

    private void add(Cell chunk, int row, int col) {

-        rowCount = Math.max(rowCount, row + 1);
-        colCount = Math.max(colCount, col + 1);
+        unrotatedRowCount = Math.max(unrotatedRowCount, row + 1);
+        unrotatedColCount = Math.max(unrotatedColCount, col + 1);

        CellPosition cp = new CellPosition(row, col);
        cells.put(cp, chunk);
--- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java
@@ -130,7 +130,7 @@ public class EntityRedactionServiceTest {
            Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
            entityRedactionService.processDocument(classifiedDoc, null);
            assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
-            assertThat(classifiedDoc.getEntities().get(1)).hasSize(5); // 4 out of 5 entities recognized on page 1
+            assertThat(classifiedDoc.getEntities().get(1)).hasSize(7);// 3 names, 1 address, 1 Y and 2 N entities
        }
    }

@@ -193,6 +193,7 @@ public class EntityRedactionServiceTest {

    }

+
    @Test
    public void headerPropagation() throws IOException {

@@ -219,6 +220,31 @@ public class EntityRedactionServiceTest {
    }


+    @Test
+    public void testNGuideline() throws IOException {
+
+        ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Empty Tabular Data.pdf");
+
+        DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
+                .entries(Collections.singletonList("Aldershof S."))
+                .build();
+
+        when(dictionaryClient.getVersion()).thenReturn(DICTIONARY_VERSION.incrementAndGet());
+        when(dictionaryClient.getDictionaryForType(NAME_CODE)).thenReturn(dictionaryResponse);
+        DictionaryResponse addressResponse = DictionaryResponse.builder()
+                .entries(Collections.singletonList("Novartis Crop Protection AG, Basel, Switzerland"))
+                .build();
+        when(dictionaryClient.getDictionaryForType(ADDRESS_CODE)).thenReturn(addressResponse);
+
+        try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
+            Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
+            entityRedactionService.processDocument(classifiedDoc, null);
+            assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
+            assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 8).count()).isEqualTo(6);
+        }
+    }
+
+
    @Before
    public void stubRedaction() {
        String tableRules = "package drools\n" +
@@ -226,12 +252,20 @@ public class EntityRedactionServiceTest {
                "import com.iqser.red.service.redaction.v1.server.redaction.model.Section\n" +
                "\n" +
                "global Section section\n" +
+                "rule \"8: Not redacted because Vertebrate Study = N\"\n" +
+                "    when\n" +
+                "        Section(rowEquals(\"Vertebrate study Y/N\", \"N\"))\n" +
+                "    then\n" +
+                "        section.redactNot(\"name\", 8, \"Not redacted because row is not a vertebrate study\");\n" +
+                "        section.redactNot(\"address\", 8, \"Not redacted because row is not a vertebrate study\");\n" +
+                "        section.highlightCell(\"Vertebrate study Y/N\", 8, \"hint_only\");\n" +
+                "    end\n" +
                "rule \"9: Redact Authors and Addresses in Reference Table, if it is a Vertebrate study\"\n" +
                "    when\n" +
                "        Section(rowEquals(\"Vertebrate study Y/N\", \"Y\"))\n" +
                "    then\n" +
                "        section.redact(\"name\", 9, \"Redacted because row is a vertebrate study\");\n" +
-                "        section.redact(\"address\", 9, \"Redacted because rows is a vertebrate study\");\n" +
+                "        section.redact(\"address\", 9, \"Redacted because row is a vertebrate study\");\n" +
                "        section.highlightCell(\"Vertebrate study Y/N\", 9, \"must_redact\");\n" +
                "    end";
        when(rulesClient.getVersion()).thenReturn(1L);
--- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java
@@ -146,4 +146,44 @@ public class PdfSegmentationServiceTest {
        }
    }

+
+    @Test
+    public void testHeaderCellsForRotatedTable() throws IOException {
+
+        ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Rotated Table Headers.pdf");
+
+        try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
+            Document document = pdfSegmentationService.parseDocument(pdDocument);
+            assertThat(document.getParagraphs()
+                    .stream()
+                    .flatMap(paragraph -> paragraph.getTables().stream())
+                    .collect(Collectors.toList())).isNotEmpty();
+            Table firstTable = document.getParagraphs()
+                    .stream()
+                    .flatMap(paragraph -> paragraph.getTables().stream())
+                    .collect(Collectors.toList())
+                    .get(0);
+            assertThat(firstTable.getColCount()).isEqualTo(8);
+            assertThat(firstTable.getRowCount()).isEqualTo(1);
+            Table secondTable = document.getParagraphs()
+                    .stream()
+                    .flatMap(paragraph -> paragraph.getTables().stream())
+                    .collect(Collectors.toList())
+                    .get(1);
+            assertThat(secondTable.getColCount()).isEqualTo(8);
+            assertThat(secondTable.getRowCount()).isEqualTo(6);
+            List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
+                    .get(0)
+                    .stream()
+                    .map(Collections::singletonList)
+                    .collect(Collectors.toList());
+            assertThat(secondTable.getRows().stream()
+                    .allMatch(row -> row.stream()
+                            .map(Cell::getHeaderCells)
+                            .collect(Collectors.toList())
+                            .equals(firstTableHeaderCells)))
+                    .isTrue();
+        }
+    }
+
 }
--- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Empty Tabular Data.pdf
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Empty Tabular Data.pdf
--- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Rotated Table Headers.pdf
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Rotated Table Headers.pdf