Pull request #25: RED-101 & RED-102: Make table structure detection more robust

Merge in RED/redaction-service from bugfix/RED-101 to master. * commit '76369f13f8d41154f3dd3690af81b9567ca1133e': Remove redundant comment; RED-101: Add features as requested in PR; Add unit test for table structure requirements; Make table structure detection more robust
2020-08-24 11:09:17 +02:00 · 2020-08-24 11:09:17 +02:00 · 8c08bb3664
commit 8c08bb3664
parent 14b4f4ab8a 76369f13f8
17 changed files with 582 additions and 359 deletions
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Entity.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Entity.java
@ -3,6 +3,8 @@ package com.iqser.red.service.redaction.v1.server.redaction.model;
 import java.util.ArrayList;
 import java.util.List;

+import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
+
 import lombok.Data;
 import lombok.EqualsAndHashCode;

@ -16,6 +18,7 @@ public class Entity {
    private boolean redaction;
    private String redactionReason;
    private List<EntityPositionSequence> positionSequences = new ArrayList<>();
+    private List<TextPositionSequence> targetSequences;
    private Integer start;
    private Integer end;

--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java
@ -1,6 +1,7 @@
 package com.iqser.red.service.redaction.v1.server.redaction.model;

 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;
 import java.util.regex.Pattern;

@ -25,9 +26,16 @@ public class SearchableText {
    }


-    @SuppressWarnings("checkstyle:ModifiedControlVariable")
    public List<EntityPositionSequence> getSequences(String searchString, boolean caseInsensitive) {

+        return getSequences(searchString, caseInsensitive, null);
+
+    }
+
+    @SuppressWarnings("checkstyle:ModifiedControlVariable")
+    public List<EntityPositionSequence> getSequences(String searchString, boolean caseInsensitive,
+                                                     List<TextPositionSequence> sequencesSubList) {
+
        String normalizedSearchString;
        if (caseInsensitive) {
            normalizedSearchString = searchString.toLowerCase();
@ -40,37 +48,50 @@ public class SearchableText {

        List<TextPositionSequence> crossSequenceParts = new ArrayList<>();
        List<EntityPositionSequence> finalMatches = new ArrayList<>();
-        for (int i = 0; i < sequences.size(); i++) {
-            TextPositionSequence partMatch = new TextPositionSequence(sequences.get(i).getPage());
-            for (int j = 0; j < sequences.get(i).length(); j++) {

-                if (i > 0 && j == 0 && sequences.get(i).charAt(0, caseInsensitive) == ' ' && sequences.get(i - 1)
-                        .charAt(sequences.get(i - 1).length() - 1, caseInsensitive) == ' ' || j > 0 && sequences.get(i)
-                        .charAt(j, caseInsensitive) == ' ' && sequences.get(i).charAt(j - 1, caseInsensitive) == ' ') {
-                    if (j == sequences.get(i).length() - 1 && counter != 0 && !partMatch.getTextPositions().isEmpty()) {
+        List<TextPositionSequence> searchSpace;
+        if (sequencesSubList != null) {
+            int subListIndex = Collections.indexOfSubList(sequences, sequencesSubList);
+            if (subListIndex != -1) {
+                searchSpace = sequences.subList(subListIndex, subListIndex + sequencesSubList.size());
+            } else {
+                searchSpace = sequences;
+            }
+        } else {
+            searchSpace = sequences;
+        }
+
+        for (int i = 0; i < searchSpace.size(); i++) {
+            TextPositionSequence partMatch = new TextPositionSequence(searchSpace.get(i).getPage());
+            for (int j = 0; j < searchSpace.get(i).length(); j++) {
+
+                if (i > 0 && j == 0 && searchSpace.get(i).charAt(0, caseInsensitive) == ' ' && searchSpace.get(i - 1)
+                        .charAt(searchSpace.get(i - 1).length() - 1, caseInsensitive) == ' ' || j > 0 && searchSpace.get(i)
+                        .charAt(j, caseInsensitive) == ' ' && searchSpace.get(i).charAt(j - 1, caseInsensitive) == ' ') {
+                    if (j == searchSpace.get(i).length() - 1 && counter != 0 && !partMatch.getTextPositions().isEmpty()) {
                        crossSequenceParts.add(partMatch);
                    }
                    continue;
                }

-                if (j == 0 && sequences.get(i).charAt(j, caseInsensitive) != ' ' && i != 0 && sequences.get(i - 1)
-                        .charAt(sequences.get(i - 1)
+                if (j == 0 && searchSpace.get(i).charAt(j, caseInsensitive) != ' ' && i != 0 && searchSpace.get(i - 1)
+                        .charAt(searchSpace.get(i - 1)
                                .length() - 1, caseInsensitive) != ' ' && searchChars[counter] == ' ') {
                    counter++;
                }

-                if (sequences.get(i)
-                        .charAt(j, caseInsensitive) == searchChars[counter] || counter != 0 && sequences.get(i)
+                if (searchSpace.get(i)
+                        .charAt(j, caseInsensitive) == searchChars[counter] || counter != 0 && searchSpace.get(i)
                        .charAt(j, caseInsensitive) == '-') {

-                    if (counter != 0 || i == 0 && j == 0 || j != 0 && isSeparator(sequences.get(i)
-                            .charAt(j - 1, caseInsensitive)) || j == 0 && i != 0 && isSeparator(sequences.get(i - 1)
-                            .charAt(sequences.get(i - 1)
-                                    .length() - 1, caseInsensitive)) || j == 0 && i != 0 && sequences.get(i - 1)
-                            .charAt(sequences.get(i - 1).length() - 1, caseInsensitive) != ' ' && sequences.get(i)
+                    if (counter != 0 || i == 0 && j == 0 || j != 0 && isSeparator(searchSpace.get(i)
+                            .charAt(j - 1, caseInsensitive)) || j == 0 && i != 0 && isSeparator(searchSpace.get(i - 1)
+                            .charAt(searchSpace.get(i - 1)
+                                    .length() - 1, caseInsensitive)) || j == 0 && i != 0 && searchSpace.get(i - 1)
+                            .charAt(searchSpace.get(i - 1).length() - 1, caseInsensitive) != ' ' && searchSpace.get(i)
                            .charAt(j, caseInsensitive) != ' ') {
-                        partMatch.add(sequences.get(i).textPositionAt(j));
-                        if (!(j == sequences.get(i).length() - 1 && sequences.get(i)
+                        partMatch.add(searchSpace.get(i).textPositionAt(j));
+                        if (!(j == searchSpace.get(i).length() - 1 && searchSpace.get(i)
                                .charAt(j, caseInsensitive) == '-' && searchChars[counter] != '-')) {
                            counter++;
                        }
@ -79,19 +100,19 @@ public class SearchableText {
                    if (counter == searchString.length()) {
                        crossSequenceParts.add(partMatch);

-                        if (i == sequences.size() - 1 && j == sequences.get(i).length() - 1 || j != sequences.get(i)
-                                .length() - 1 && isSeparator(sequences.get(i)
-                                .charAt(j + 1, caseInsensitive)) || j == sequences.get(i)
-                                .length() - 1 && isSeparator(sequences.get(i + 1)
-                                .charAt(0, caseInsensitive)) || j == sequences.get(i).length() - 1 && sequences.get(i)
-                                .charAt(j, caseInsensitive) != ' ' && sequences.get(i + 1)
+                        if (i == searchSpace.size() - 1 && j == searchSpace.get(i).length() - 1 || j != searchSpace.get(i)
+                                .length() - 1 && isSeparator(searchSpace.get(i)
+                                .charAt(j + 1, caseInsensitive)) || j == searchSpace.get(i)
+                                .length() - 1 && isSeparator(searchSpace.get(i + 1)
+                                .charAt(0, caseInsensitive)) || j == searchSpace.get(i).length() - 1 && searchSpace.get(i)
+                                .charAt(j, caseInsensitive) != ' ' && searchSpace.get(i + 1)
                                .charAt(0, caseInsensitive) != ' ') {
                            finalMatches.addAll(buildEntityPositionSequence(crossSequenceParts));
                        }

                        counter = 0;
                        crossSequenceParts = new ArrayList<>();
-                        partMatch = new TextPositionSequence(sequences.get(i).getPage());
+                        partMatch = new TextPositionSequence(searchSpace.get(i).getPage());
                    }
                } else {
                    counter = 0;
@ -99,16 +120,17 @@ public class SearchableText {
                        j--;
                    }
                    crossSequenceParts = new ArrayList<>();
-                    partMatch = new TextPositionSequence(sequences.get(i).getPage());
+                    partMatch = new TextPositionSequence(searchSpace.get(i).getPage());
                }

-                if (j == sequences.get(i).length() - 1 && counter != 0) {
+                if (j == searchSpace.get(i).length() - 1 && counter != 0) {
                    crossSequenceParts.add(partMatch);
                }
            }
        }

        return finalMatches;
+
    }


--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java
@ -7,9 +7,10 @@ import java.util.Map;
 import java.util.Set;
 import java.util.regex.Pattern;

-import org.apache.commons.collections4.CollectionUtils;
 import org.apache.commons.lang3.StringUtils;

+import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
+
 import lombok.Builder;
 import lombok.Data;
 import lombok.extern.slf4j.Slf4j;
@ -31,7 +32,21 @@ public class Section {

    private int sectionNumber;

-    private Map<String, String> tabularData;
+    private Map<String, TextBlock> tabularData;
+
+
+    /**
+     * Tells whether this section's tabular data marks the row as a vertebrate study.
+     *
+     * @return {@code true} only if tabular data is present and the text of the
+     *         "Vertebrate study Y/N" entry equals "Y"; {@code false} when the map,
+     *         the entry or its text is missing
+     */
+    public boolean isVertebrateStudy() {
+        if (tabularData == null) {
+            return false;
+        }
+        // Single lookup; a missing key and a null value are treated the same way.
+        TextBlock vertebrateCell = tabularData.get("Vertebrate study Y/N");
+        // Constant-first equals keeps the check null-safe if the block has no text.
+        return vertebrateCell != null && "Y".equals(vertebrateCell.getText());
+    }
+
+
+    /**
+     * Tells whether this section's tabular data explicitly marks the row as not being a vertebrate study.
+     *
+     * @return {@code true} only if tabular data is present and the text of the
+     *         "Vertebrate study Y/N" entry equals "N"; {@code false} when the map,
+     *         the entry or its text is missing
+     */
+    public boolean isNotVertebrateStudy() {
+        if (tabularData == null) {
+            return false;
+        }
+        // Single lookup; a missing key and a null value are treated the same way.
+        TextBlock vertebrateCell = tabularData.get("Vertebrate study Y/N");
+        // Constant-first equals keeps the check null-safe if the block has no text.
+        return vertebrateCell != null && "N".equals(vertebrateCell.getText());
+    }


    public boolean contains(String type) {
@ -163,20 +178,16 @@ public class Section {

    public void highlightCell(String cellHeader, int ruleNumber) {

+        // Adds a non-redacting "must_redact" entity for the table cell stored under cellHeader,
+        // recording ruleNumber as the matched rule and cellHeader as the redaction reason.
+        // NOTE(review): tabularData is dereferenced here without a null check, while
+        // isVertebrateStudy()/isNotVertebrateStudy() guard against null - confirm it is always set.
-        String value = tabularData.get(cellHeader);
+        TextBlock value = tabularData.get(cellHeader);
        if (value == null) {
            log.warn("Could not find any data for {}.", cellHeader);
        } else {
-            Set<Entity> found = findEntities(value, "must_redact");
-            if (CollectionUtils.isEmpty(found)) {
-                log.warn("Could not identify value {} in row.", value);
-            } else {
-                Entity entity = found.iterator().next();
-                entity.setRedaction(false);
-                entity.setMatchedRule(ruleNumber);
-                entity.setRedactionReason(cellHeader);
-                entities.add(entity);
-            }
+            // The entity is built directly from the cell text (offsets 0..text length)
+            // instead of being searched for in the section text.
+            Entity entity = new Entity(value.getText(), "must_redact", 0, value.getText().length(), headline, sectionNumber);
+            entity.setRedaction(false);
+            entity.setMatchedRule(ruleNumber);
+            entity.setRedactionReason(cellHeader);
+            entity.setTargetSequences(value.getSequences()); // Make sure no other cells with same content are highlighted
+            entities.add(entity);
        }

    }
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java
@ -51,24 +51,27 @@ public class EntityRedactionService {
            List<Table> tables = paragraph.getTables();

            for (Table table : tables) {
-                List<String> metadata = table.getHeaders();
                for (List<Cell> row : table.getRows()) {
                    SearchableText searchableRow = new SearchableText();
-                    List<String> cellValues = new ArrayList<>();
+                    Map<String, TextBlock> tabularData = new HashMap<>();
                    for (Cell cell : row) {
-                        if (cell == null || CollectionUtils.isEmpty(cell.getTextBlocks())) {
-                            cellValues.add(null);
+                        if (cell.isHeaderCell() || CollectionUtils.isEmpty(cell.getTextBlocks())) {
                            continue;
                        }
-                        cellValues.add(cell.getTextBlocks().get(0).getText());
                        addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber);
+                        cell.getHeaderCells().forEach(headerCell -> {
+                            String headerName = headerCell.getTextBlocks().get(0).getText()
+                                    .replaceAll("\n", " ")
+                                    .replaceAll(" ", " ");
+                            tabularData.put(headerName, cell.getTextBlocks().get(0));
+                        });
                        for (TextBlock textBlock : cell.getTextBlocks()) {
                            searchableRow.addAll(textBlock.getSequences());
                        }
+
                    }
                    Set<Entity> rowEntities = findEntities(searchableRow, table.getHeadline(), sectionNumber);

-                    Map<String, String> tabularData = toMap(metadata, cellValues);
                    Section analysedRowSection = droolsExecutionService.executeRules(Section.builder()
                            .entities(rowEntities)
                            .text(searchableRow.getAsStringWithLinebreaks())
@ -116,35 +119,15 @@ public class EntityRedactionService {
    }


-    private Map<String, String> toMap(List<String> keys, List<String> values) {
-
-        if (keys.size() != values.size()) {
-            log.warn("Cannot merge lists of unequal size, returning empty map.");
-            return new HashMap<>();
-        }
-        Map<String, String> result = new HashMap<>();
-        for (int i = 0; i < keys.size(); i++) {
-            String value = values.get(i);
-            if (value == null) {
-                continue;
-            }
-            result.put(keys.get(i), value);
-        }
-
-        return result;
-
-    }
-
-
    private Set<Entity> clearAndFindPositions(Set<Entity> entities, SearchableText text) {

        removeEntitiesContainedInLarger(entities);

        for (Entity entity : entities) {
            if (dictionaryService.getCaseInsensitiveTypes().contains(entity.getType())) {
-                entity.setPositionSequences(text.getSequences(entity.getWord(), true));
+                entity.setPositionSequences(text.getSequences(entity.getWord(), true, entity.getTargetSequences()));
            } else {
-                entity.setPositionSequences(text.getSequences(entity.getWord(), false));
+                entity.setPositionSequences(text.getSequences(entity.getWord(), false, entity.getTargetSequences()));
            }
        }

--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java
@ -1,11 +1,11 @@
 package com.iqser.red.service.redaction.v1.server.segmentation;

 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.Iterator;
 import java.util.List;

 import org.apache.commons.collections4.CollectionUtils;
-import org.apache.commons.lang3.StringUtils;
 import org.springframework.stereotype.Service;

 import com.iqser.red.service.redaction.v1.server.classification.model.Document;
@ -13,6 +13,7 @@ import com.iqser.red.service.redaction.v1.server.classification.model.Page;
 import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
 import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
 import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
+import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
 import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;

@Service
@ -85,10 +86,20 @@ public class SectionsBuilderService {
                    table.setHeadline("Table in: " + lastHeadline);
                }
                // Distribute header information for subsequent tables
-                if (previousTable != null && hasInvalidHeaderInformation(table) && hasValidHeaderInformation(previousTable) &&
-                        (previousTable.isVerticalHeader() && previousTable.getRowCount() == table.getRowCount() ||
-                                previousTable.getColCount() == table.getColCount())) {
-                    table.setHeaders(previousTable.getHeaders());
+                if (previousTable != null && hasInvalidHeaderInformation(table) && hasValidHeaderInformation(previousTable)) {
+                    List<Cell> previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable);
+                    List<Cell> tableNonHeaderRow = getRowWithNonHeaderCells(table);
+                    if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) {
+                        for (int i = table.getRows().size() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
+                            List<Cell> row = table.getRows().get(i);
+                            if (row.size() == previousTableNonHeaderRow.size()
+                                    && row.stream().allMatch(cell -> cell.getHeaderCells().isEmpty())) {
+                                for (int j = 0; j < row.size(); j++) {
+                                    row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells());
+                                }
+                            }
+                        }
+                    }
                }

                if (textBlock != null && !alreadyAdded) {
@ -141,14 +152,32 @@ public class SectionsBuilderService {

    private boolean hasInvalidHeaderInformation(Table table) {

-        if (CollectionUtils.isEmpty(table.getHeaders())) {
-            return true;
-        }
-        if (table.getHeaders().stream().anyMatch(StringUtils::isEmpty)) {
-            return true;
+        // Header information counts as invalid when no cell in any row of the table
+        // has at least one header cell assigned.
+        return table.getRows().stream()
+                .flatMap(row -> row.stream()
+                        .filter(cell -> CollectionUtils.isNotEmpty(cell.getHeaderCells())))
+                .findAny()
+                .isEmpty();
+
+    }
+
+
+    /**
+     * Finds the bottom-most row of the given table in which no cell is flagged as a header cell.
+     * A row without any cells also satisfies this condition.
+     *
+     * @param table the table whose rows are scanned from last to first
+     * @return the first matching row found when scanning upwards, or an empty list if every row
+     *         contains at least one header cell
+     */
+    private List<Cell> getRowWithNonHeaderCells(Table table) {
+
+        for (int i = table.getRows().size() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
+            List<Cell> row = table.getRows().get(i);
+            boolean allNonHeader = true;
+            for (Cell cell : row) {
+                if (cell.isHeaderCell()) {
+                    // One header cell disqualifies the whole row.
+                    allNonHeader = false;
+                    break;
+                }
+            }
+            if (allNonHeader) {
+                return row;
+            }
        }

-        return false;
+        return Collections.emptyList();

    }

--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Cell.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Cell.java
@ -16,10 +16,14 @@ public class Cell extends Rectangle {

    private List<TextBlock> textBlocks = new ArrayList<>();

+    private List<Cell> headerCells = new ArrayList<>();
+
+    private boolean isHeaderCell;

    public Cell(Point2D topLeft, Point2D bottomRight) {

-        super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()), (float) (bottomRight
+        super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()),
+                (float) (bottomRight
                .getY() - topLeft.getY()));
    }

@ -29,4 +33,4 @@ public class Cell extends Rectangle {
        textBlocks.add(textBlock);
    }

-}
+}
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/CellPosition.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/CellPosition.java
@ -0,0 +1,22 @@
+package com.iqser.red.service.redaction.v1.server.tableextraction.model;
+
+import lombok.RequiredArgsConstructor;
+import lombok.Value;
+
+/**
+ * Zero-based (row, column) coordinate of a cell inside a table.
+ * Instances are value objects (Lombok {@code @Value}) ordered row by row, then column by column.
+ */
+@Value
+@RequiredArgsConstructor
+public class CellPosition implements Comparable<CellPosition> {
+
+    int row;
+
+    int col;
+
+
+    /**
+     * Orders positions by row first and by column second.
+     * Uses {@link Integer#compare(int, int)} instead of subtraction, because a difference of two
+     * ints can overflow and flip the sign of the result.
+     *
+     * @param other the position to compare against
+     * @return a negative value, zero or a positive value if this position sorts before,
+     *         equal to or after {@code other}
+     */
+    @Override
+    public int compareTo(CellPosition other) {
+
+        int rowOrder = Integer.compare(row, other.row);
+        return rowOrder != 0 ? rowOrder : Integer.compare(col, other.col);
+    }
+
+}
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java
@ -8,12 +8,10 @@ import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.TreeMap;
-import java.util.stream.Collectors;

 import org.apache.commons.collections4.CollectionUtils;

 import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
-import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
 import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;

 import lombok.Getter;
@ -41,12 +39,6 @@ public class Table extends AbstractTextContainer {

    private List<List<Cell>> rows;

-    @Getter
-    @Setter
-    private List<String> headers;
-
-    @Getter
-    private boolean verticalHeader;

    public Table(List<Cell> cells, Rectangle area, int rotation) {

@ -65,7 +57,7 @@ public class Table extends AbstractTextContainer {

        if (rows == null) {
            rows = computeRows();
-            headers = computeHeaders();
+            computeHeaders();
        }

        return rows;
@ -78,72 +70,105 @@ public class Table extends AbstractTextContainer {
     * Column is marked as header if cell text is bold and row cell text is not bold.
     * Defaults to row.
     */
-    private List<String> computeHeaders() {
+    private void computeHeaders() {

-        boolean allBold = true;
-        if (rows.isEmpty()) {
-            return Collections.emptyList();
-        }
-        List<Cell> rowCells = rows.get(0);
-        for (Cell cell : rowCells) {
-            if (cell == null || CollectionUtils.isEmpty(cell.getTextBlocks()) ||
-                    !cell.getTextBlocks().get(0).getMostPopularWordStyle().equals("bold")) {
-                allBold = false;
-                break;
-            }
-        }
-        if (!allBold) {
-            allBold = true;
-            List<Cell> firstColCells = new ArrayList<>();
-            for (List<Cell> row : rows) {
-                Cell firstInRow = row.get(0);
-                if (firstInRow == null || CollectionUtils.isEmpty(firstInRow.getTextBlocks()) ||
-                        !firstInRow.getTextBlocks().get(0).getMostPopularWordStyle().equals("bold")) {
-                    allBold = false;
+        // A bold cell is a header cell as long as every cell to the left/top is bold, too
+        cells.forEach((position, cell) -> {
+            List<Cell> cellsToTheLeft = getCellsToTheLeft(position);
+            Cell lastHeaderCell = null;
+            for (Cell leftCell : cellsToTheLeft) {
+                if (CollectionUtils.isNotEmpty(leftCell.getTextBlocks()) && leftCell.getTextBlocks()
+                        .get(0)
+                        .getMostPopularWordStyle()
+                        .equals("bold")) {
+                    lastHeaderCell = leftCell;
+                } else {
                    break;
                }
-                firstColCells.add(firstInRow);
            }
-            if (allBold) {
-                log.info("Headers are in first column");
-                verticalHeader = true;
-                return firstColCells.stream().map(cell -> {
-                    if (CollectionUtils.isNotEmpty(cell.getTextBlocks())) {
-                        return TextNormalizationUtilities.removeHyphenLineBreaks(cell.getTextBlocks().get(0).getText())
-                                .replaceAll("\n", " ")
-                                .replaceAll(" ", " ");
-                    } else {
-                        return null;
-                    }
-                }).collect(Collectors.toList());
-            } else {
-                log.info("Headers are defaulted in first row.");
-                return rowCells.stream().map(cell -> {
-                    if (cell != null && CollectionUtils.isNotEmpty(cell.getTextBlocks())) {
-                        return TextNormalizationUtilities.removeHyphenLineBreaks(cell.getTextBlocks().get(0).getText())
-                                .replaceAll("\n", " ")
-                                .replaceAll(" ", " ");
-                    } else {
-                        return null;
-                    }
-                }).collect(Collectors.toList());
+            if (lastHeaderCell != null) {
+                cell.getHeaderCells().add(lastHeaderCell);
            }
-        } else {
-            log.info("Headers are in first row.");
-            return rowCells.stream().map(cell -> {
-                if (CollectionUtils.isNotEmpty(cell.getTextBlocks())) {
-                    return TextNormalizationUtilities.removeHyphenLineBreaks(cell.getTextBlocks().get(0).getText())
-                            .replaceAll("\n", " ")
-                            .replaceAll(" ", " ");
+            lastHeaderCell = null;
+            List<Cell> cellsToTheTop = getCellToTheTop(position);
+            for (Cell topCell : cellsToTheTop) {
+                if (CollectionUtils.isNotEmpty(topCell.getTextBlocks()) && topCell.getTextBlocks()
+                        .get(0)
+                        .getMostPopularWordStyle()
+                        .equals("bold")) {
+                    lastHeaderCell = topCell;
                } else {
-                    return null;
+                    break;
                }
-            }).collect(Collectors.toList());
-        }
+            }
+            if (lastHeaderCell != null) {
+                cell.getHeaderCells().add(lastHeaderCell);
+            }
+            if (CollectionUtils.isNotEmpty(cell.getTextBlocks()) && cell.getTextBlocks()
+                    .get(0)
+                    .getMostPopularWordStyle()
+                    .equals("bold")) {
+                cell.setHeaderCell(true);
+            }
+        });

    }


+    /**
+     * Collects the cells located left of the given position in the same row.
+     * Where a column has no cell registered at that row, the nearest cell further up in that
+     * column is used instead (presumably a cell spanning several rows - confirm against add()).
+     *
+     * @param cellPosition position whose left-hand neighbours are requested
+     * @return the neighbouring cells ordered from the left-most column to the column directly
+     *         left of {@code cellPosition}; empty if the position is in column 0
+     */
+    private List<Cell> getCellsToTheLeft(CellPosition cellPosition) {
+
+        List<Cell> result = new ArrayList<>();
+        int row = cellPosition.getRow();
+        for (int col = cellPosition.getCol() - 1; col >= 0; col--) {
+            Cell cell = cells.get(new CellPosition(row, col));
+            // Nothing at this exact position: walk upwards within the column, stopping at row 0
+            // so that no negative row index is ever probed.
+            for (int spanRow = row - 1; cell == null && spanRow >= 0; spanRow--) {
+                cell = cells.get(new CellPosition(spanRow, col));
+            }
+            if (cell != null) {
+                result.add(cell);
+            }
+        }
+        // Cells were gathered from nearest to farthest; callers expect farthest first.
+        Collections.reverse(result);
+        return result;
+    }
+
+
+    /**
+     * Collects the cells located above the given position in the same column.
+     * Where a row has no cell registered at that column, the nearest cell further left in that
+     * row is used instead (presumably a cell spanning several columns - confirm against add()).
+     *
+     * @param cellPosition position whose upper neighbours are requested
+     * @return the neighbouring cells ordered from the top-most row to the row directly above
+     *         {@code cellPosition}; empty if the position is in row 0
+     */
+    private List<Cell> getCellToTheTop(CellPosition cellPosition) {
+
+        List<Cell> result = new ArrayList<>();
+        int col = cellPosition.getCol();
+        for (int row = cellPosition.getRow() - 1; row >= 0; row--) {
+            Cell cell = cells.get(new CellPosition(row, col));
+            // Nothing at this exact position: walk leftwards within the row, stopping at column 0
+            // so that no negative column index is ever probed.
+            for (int spanCol = col - 1; cell == null && spanCol >= 0; spanCol--) {
+                cell = cells.get(new CellPosition(row, spanCol));
+            }
+            if (cell != null) {
+                result.add(cell);
+            }
+        }
+        // Cells were gathered from nearest to farthest; callers expect farthest first.
+        Collections.reverse(result);
+        return result;
+    }
+
+
    private List<List<Cell>> computeRows() {

        List<List<Cell>> rows = new ArrayList<>();
@ -152,7 +177,9 @@ public class Table extends AbstractTextContainer {
                List<Cell> lastRow = new ArrayList<>();
                for (int j = rowCount - 1; j >= 0; j--) { // cols
                    Cell cell = cells.get(new CellPosition(j, i));
-                    lastRow.add(cell);
+                    if (cell != null) {
+                        lastRow.add(cell);
+                    }
                }
                rows.add(lastRow);
            }
@ -161,7 +188,9 @@ public class Table extends AbstractTextContainer {
                List<Cell> lastRow = new ArrayList<>();
                for (int j = 0; j < rowCount; j++) { // cols
                    Cell cell = cells.get(new CellPosition(i, j));
-                    lastRow.add(cell);
+                    if (cell != null) {
+                        lastRow.add(cell);
+                    }
                }
                rows.add(lastRow);
            }
@ -170,7 +199,9 @@ public class Table extends AbstractTextContainer {
                List<Cell> lastRow = new ArrayList<>();
                for (int j = 0; j < colCount; j++) {
                    Cell cell = cells.get(new CellPosition(i, j)); // JAVA_8 use getOrDefault()
-                    lastRow.add(cell);
+                    if (cell != null) {
+                        lastRow.add(cell);
+                    }
                }
                rows.add(lastRow);
            }
@ -220,20 +251,21 @@ public class Table extends AbstractTextContainer {
            while (rowCells.hasNext()) {
                Cell cell = rowCells.next();
                if (i > 0) {
-                    List<List<Cell>> others = rowsOfCells(si.contains(new Rectangle(cell.getBottom(), si.getBounds()
-                            .getLeft(), cell.getLeft() - si.getBounds().getLeft() + 1, si.getBounds().getBottom() - cell
-                            .getBottom())));
+                    Rectangle rectangle = new Rectangle(cell.getBottom(),
+                            si.getBounds().getLeft(),
+                            cell.getLeft() - si.getBounds().getLeft() + 1,
+                            si.getBounds().getBottom() - cell.getBottom());
+                    List<List<Cell>> others = rowsOfCells(si.contains(rectangle));

                    for (List<Cell> r : others) {
                        jumpToColumn = Math.max(jumpToColumn, r.size());
                    }
-                }

-                while (startColumn != jumpToColumn) {
-                    add(previousNonNullCellForColumnIndex.get(startColumn), i, startColumn);
-                    startColumn++;
+                    while (startColumn != jumpToColumn) {
+                        add(previousNonNullCellForColumnIndex.get(startColumn), i, startColumn);
+                        startColumn++;
+                    }
                }
-
                add(cell, i, startColumn);
                previousNonNullCellForColumnIndex.put(startColumn, cell);
                startColumn++;
@ -243,26 +275,23 @@ public class Table extends AbstractTextContainer {
    }


-    private static List<List<Cell>> rowsOfCells(List<Cell> cells) {
+    private List<List<Cell>> rowsOfCells(List<Cell> cells) {

-        Cell c;
-        float lastTop;
        List<List<Cell>> rv = new ArrayList<>();
-        List<Cell> lastRow;

        if (cells.isEmpty()) {
            return rv;
        }
-
        cells.sort(Comparator.comparingDouble(Rectangle::getLeft));

-        cells.sort(Collections.reverseOrder((arg0, arg1) -> Float.compare(Utils.round(arg0.getBottom(), 2), Utils.round(arg1
+        cells.sort(Collections.reverseOrder((arg0, arg1) -> Float.compare(Utils.round(arg0.getBottom(), 2),
+                Utils.round(arg1
                .getBottom(), 2))));

        Iterator<Cell> iter = cells.iterator();
-        c = iter.next();
-        lastTop = c.getBottom();
-        lastRow = new ArrayList<>();
+        Cell c = iter.next();
+        float lastTop = c.getBottom();
+        List<Cell> lastRow = new ArrayList<>();
        lastRow.add(c);
        rv.add(lastRow);

@ -349,51 +378,4 @@ public class Table extends AbstractTextContainer {
        return sb.toString();
    }

-
-    static class CellPosition implements Comparable<CellPosition> {
-
-        CellPosition(int row, int col) {
-
-            this.row = row;
-            this.col = col;
-        }
-
-
-        final int row;
-        final int col;
-
-
-        @Override
-        public int hashCode() {
-
-            return row + 101 * col;
-        }
-
-
-        @Override
-        public boolean equals(Object obj) {
-
-            if (this == obj) {
-                return true;
-            }
-            if (obj == null) {
-                return false;
-            }
-            if (getClass() != obj.getClass()) {
-                return false;
-            }
-            CellPosition other = (CellPosition) obj;
-            return row == other.row && col == other.col;
-        }
-
-
-        @Override
-        public int compareTo(CellPosition other) {
-
-            int rowDiff = row - other.row;
-            return rowDiff != 0 ? rowDiff : col - other.col;
-        }
-
-    }
-
 }
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/service/TableExtractionService.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/service/TableExtractionService.java
@ -2,7 +2,6 @@ package com.iqser.red.service.redaction.v1.server.tableextraction.service;

 import java.awt.geom.Point2D;
 import java.util.ArrayList;
-import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashMap;
 import java.util.HashSet;
@ -25,26 +24,28 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
 import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;

@Service
-@SuppressWarnings("all")
 public class TableExtractionService {

-    public void extractTables(CleanRulings cleanRulings, Page page){
+    public void extractTables(CleanRulings cleanRulings, Page page) {

        List<Cell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());

-        Iterator<AbstractTextContainer> itty = page.getTextBlocks().iterator();
-        while (itty.hasNext()) {
-            TextBlock textBlock = (TextBlock) itty.next();
+        for (AbstractTextContainer abstractTextContainer : page.getTextBlocks()) {
+            TextBlock textBlock = (TextBlock) abstractTextContainer;
            for (Cell cell : cells) {
-                if (cell.intersects(textBlock.getMinX(), textBlock.getMinY(), textBlock.getWidth(), textBlock.getHeight())) {
+                if (cell.intersects(textBlock.getMinX(), textBlock.getMinY(), textBlock.getWidth(),
+                        textBlock.getHeight())) {
                    cell.addTextBlock(textBlock);
                    break;
                }
            }
        }

-        List<Rectangle> spreadsheetAreas = findSpreadsheetsFromCells(cells)
-                .stream()
+        cells = new ArrayList<>(new HashSet<>(cells));
+        Utils.sort(cells, Rectangle.ILL_DEFINED_ORDER);
+
+
+        List<Rectangle> spreadsheetAreas = findSpreadsheetsFromCells(cells).stream()
                .filter(r -> r.getWidth() > 0f && r.getHeight() > 0f)
                .collect(Collectors.toList());

@ -63,9 +64,9 @@ public class TableExtractionService {
        for (Table table : tables) {
            int position = -1;

-            itty = page.getTextBlocks().iterator();
+            Iterator<AbstractTextContainer> itty = page.getTextBlocks().iterator();
            while (itty.hasNext()) {
-                AbstractTextContainer textBlock = (AbstractTextContainer) itty.next();
+                AbstractTextContainer textBlock = itty.next();
                if (table.contains(textBlock)) {
                    if (position == -1) {
                        position = page.getTextBlocks().indexOf(textBlock);
@ -79,17 +80,18 @@ public class TableExtractionService {
        }
    }

+
    public List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
+
        List<Cell> cellsFound = new ArrayList<>();
-        Map<Point2D, Ruling[]> intersectionPoints = Ruling.findIntersections(horizontalRulingLines, verticalRulingLines);
+        Map<Point2D, Ruling[]> intersectionPoints = Ruling.findIntersections(horizontalRulingLines,
+                verticalRulingLines);
        List<Point2D> intersectionPointsList = new ArrayList<>(intersectionPoints.keySet());
-        Collections.sort(intersectionPointsList, POINT_COMPARATOR);
-        boolean doBreak;
+        intersectionPointsList.sort(POINT_COMPARATOR);

        for (int i = 0; i < intersectionPointsList.size(); i++) {
            Point2D topLeft = intersectionPointsList.get(i);
            Ruling[] hv = intersectionPoints.get(topLeft);
-            doBreak = false;

            // CrossingPointsDirectlyBelow( topLeft );
            List<Point2D> xPoints = new ArrayList<>();
@ -106,10 +108,6 @@ public class TableExtractionService {
            }
            outer:
            for (Point2D xPoint : xPoints) {
-                if (doBreak) {
-                    break;
-                }
-
                // is there a vertical edge b/w topLeft and xPoint?
                if (!hv[1].equals(intersectionPoints.get(xPoint)[1])) {
                    continue;
@ -120,11 +118,9 @@ public class TableExtractionService {
                        continue;
                    }
                    Point2D btmRight = new Point2D.Float((float) yPoint.getX(), (float) xPoint.getY());
-                    if (intersectionPoints.containsKey(btmRight)
-                            && intersectionPoints.get(btmRight)[0].equals(intersectionPoints.get(xPoint)[0])
-                            && intersectionPoints.get(btmRight)[1].equals(intersectionPoints.get(yPoint)[1])) {
+                    if (intersectionPoints.containsKey(btmRight) && intersectionPoints.get(btmRight)[0].equals(intersectionPoints
+                            .get(xPoint)[0]) && intersectionPoints.get(btmRight)[1].equals(intersectionPoints.get(yPoint)[1])) {
                        cellsFound.add(new Cell(topLeft, btmRight));
-                        doBreak = true;
                        break outer;
                    }
                }
@ -139,7 +135,7 @@ public class TableExtractionService {
    }


-    public List<Rectangle> findSpreadsheetsFromCells(List<? extends Rectangle> cells) {
+    private List<Rectangle> findSpreadsheetsFromCells(List<? extends Rectangle> cells) {
        // via: http://stackoverflow.com/questions/13746284/merging-multiple-adjacent-rectangles-into-one-polygon
        List<Rectangle> rectangles = new ArrayList<>();
        Set<Point2D> pointSet = new HashSet<>();
@ -147,10 +143,6 @@ public class TableExtractionService {
        Map<Point2D, Point2D> edgesV = new HashMap<>();
        int i = 0;

-        cells = new ArrayList<>(new HashSet<>(cells));
-
-        Utils.sort(cells, Rectangle.ILL_DEFINED_ORDER);
-
        for (Rectangle cell : cells) {
            for (Point2D pt : cell.getPoints()) {
                if (pointSet.contains(pt)) { // shared vertex, remove it
@ -163,10 +155,10 @@ public class TableExtractionService {

        // X first sort
        List<Point2D> pointsSortX = new ArrayList<>(pointSet);
-        Collections.sort(pointsSortX, X_FIRST_POINT_COMPARATOR);
+        pointsSortX.sort(X_FIRST_POINT_COMPARATOR);
        // Y first sort
        List<Point2D> pointsSortY = new ArrayList<>(pointSet);
-        Collections.sort(pointsSortY, POINT_COMPARATOR);
+        pointsSortY.sort(POINT_COMPARATOR);

        while (i < pointSet.size()) {
            float currY = (float) pointsSortY.get(i).getY();
@ -203,13 +195,12 @@ public class TableExtractionService {
                    nextVertex = edgesV.get(curr.point);
                    edgesV.remove(curr.point);
                    lastAddedVertex = new PolygonVertex(nextVertex, Direction.VERTICAL);
-                    polygon.add(lastAddedVertex);
                } else {
                    nextVertex = edgesH.get(curr.point);
                    edgesH.remove(curr.point);
                    lastAddedVertex = new PolygonVertex(nextVertex, Direction.HORIZONTAL);
-                    polygon.add(lastAddedVertex);
                }
+                polygon.add(lastAddedVertex);

                if (lastAddedVertex.equals(polygon.get(0))) {
                    // closed polygon
@ -227,10 +218,10 @@ public class TableExtractionService {

        // calculate grid-aligned minimum area rectangles for each found polygon
        for (List<PolygonVertex> poly : polygons) {
-            float top = java.lang.Float.MAX_VALUE;
-            float left = java.lang.Float.MAX_VALUE;
-            float bottom = java.lang.Float.MIN_VALUE;
-            float right = java.lang.Float.MIN_VALUE;
+            float top = Float.MAX_VALUE;
+            float left = Float.MAX_VALUE;
+            float bottom = Float.MIN_VALUE;
+            float right = Float.MIN_VALUE;
            for (PolygonVertex pt : poly) {
                top = (float) Math.min(top, pt.point.getY());
                left = (float) Math.min(left, pt.point.getX());
@ -244,69 +235,66 @@ public class TableExtractionService {
    }


-    private static final Comparator<Point2D> X_FIRST_POINT_COMPARATOR = new Comparator<Point2D>() {
-        @Override
-        public int compare(Point2D arg0, Point2D arg1) {
-            int rv = 0;
-            float arg0X = Utils.round(arg0.getX(), 2);
-            float arg0Y = Utils.round(arg0.getY(), 2);
-            float arg1X = Utils.round(arg1.getX(), 2);
-            float arg1Y = Utils.round(arg1.getY(), 2);
+    private static final Comparator<Point2D> X_FIRST_POINT_COMPARATOR = (arg0, arg1) -> {

-            if (arg0X > arg1X) {
-                rv = 1;
-            } else if (arg0X < arg1X) {
-                rv = -1;
-            } else if (arg0Y > arg1Y) {
-                rv = 1;
-            } else if (arg0Y < arg1Y) {
-                rv = -1;
-            }
-            return rv;
+        int rv = 0;
+        float arg0X = Utils.round(arg0.getX(), 2);
+        float arg0Y = Utils.round(arg0.getY(), 2);
+        float arg1X = Utils.round(arg1.getX(), 2);
+        float arg1Y = Utils.round(arg1.getY(), 2);
+
+        if (arg0X > arg1X) {
+            rv = 1;
+        } else if (arg0X < arg1X) {
+            rv = -1;
+        } else if (arg0Y > arg1Y) {
+            rv = 1;
+        } else if (arg0Y < arg1Y) {
+            rv = -1;
        }
+        return rv;
    };

+    private static final Comparator<Point2D> POINT_COMPARATOR = (arg0, arg1) -> {

-    private static final Comparator<Point2D> POINT_COMPARATOR = new Comparator<Point2D>() {
-        @Override
-        public int compare(Point2D arg0, Point2D arg1) {
-            int rv = 0;
-            float arg0X = Utils.round(arg0.getX(), 2);
-            float arg0Y = Utils.round(arg0.getY(), 2);
-            float arg1X = Utils.round(arg1.getX(), 2);
-            float arg1Y = Utils.round(arg1.getY(), 2);
+        int rv = 0;
+        float arg0X = Utils.round(arg0.getX(), 2);
+        float arg0Y = Utils.round(arg0.getY(), 2);
+        float arg1X = Utils.round(arg1.getX(), 2);
+        float arg1Y = Utils.round(arg1.getY(), 2);

-
-            if (arg0Y > arg1Y) {
-                rv = 1;
-            } else if (arg0Y < arg1Y) {
-                rv = -1;
-            } else if (arg0X > arg1X) {
-                rv = 1;
-            } else if (arg0X < arg1X) {
-                rv = -1;
-            }
-            return rv;
+        if (arg0Y > arg1Y) {
+            rv = 1;
+        } else if (arg0Y < arg1Y) {
+            rv = -1;
+        } else if (arg0X > arg1X) {
+            rv = 1;
+        } else if (arg0X < arg1X) {
+            rv = -1;
        }
+        return rv;
    };

-
    private enum Direction {
-        HORIZONTAL,
-        VERTICAL
+        HORIZONTAL, VERTICAL
    }

    static class PolygonVertex {
+
        Point2D point;
        Direction direction;

-        public PolygonVertex(Point2D point, Direction direction) {
+
+        PolygonVertex(Point2D point, Direction direction) {
+
            this.direction = direction;
            this.point = point;
        }

+
        @Override
        public boolean equals(Object other) {
+
            if (this == other) {
                return true;
            }
@ -316,15 +304,21 @@ public class TableExtractionService {
            return this.point.equals(((PolygonVertex) other).point);
        }

+
        @Override
        public int hashCode() {
+
            return this.point.hashCode();
        }

+
        @Override
        public String toString() {
-            return String.format("%s[point=%s,direction=%s]", this.getClass().getName(), this.point.toString(), this.direction.toString());
+
+            return String.format("%s[point=%s,direction=%s]", this.getClass()
+                    .getName(), this.point.toString(), this.direction.toString());
        }
+
    }

 }
--- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java
@ -227,6 +227,7 @@ public class RedactionIntegrationTest {
    @Test
    public void noExceptionShouldBeThrownForAnyFiles() throws IOException {

+        System.out.println("noExceptionShouldBeThrownForAnyFiles");
        ClassLoader loader = getClass().getClassLoader();
        URL url = loader.getResource("files");
        File[] files = new File(url.getPath()).listFiles();
@ -266,6 +267,7 @@ public class RedactionIntegrationTest {
    @Test
    public void redactionTest() throws IOException {

+        System.out.println("redactionTest");
        long start = System.currentTimeMillis();
        ClassPathResource pdfFileResource = new ClassPathResource("files/Trinexapac/96 Trinexapac-ethyl_RAR_09_Volume_3CA_B-7_2018-02-23.pdf");

@ -289,8 +291,9 @@ public class RedactionIntegrationTest {
    @Test
    public void testTableRedaction() throws IOException {

+        System.out.println("testTableRedaction");
        long start = System.currentTimeMillis();
-        ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Single Table.pdf");
+        ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");

        RedactionRequest request = RedactionRequest.builder()
                .document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
@ -311,6 +314,7 @@ public class RedactionIntegrationTest {
    @Test
    public void testManualRedaction() throws IOException {

+        System.out.println("testManualRedaction");
        long start = System.currentTimeMillis();
        ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Single Table.pdf");

@ -345,6 +349,7 @@ public class RedactionIntegrationTest {
    @Test
    public void classificationTest() throws IOException {

+        System.out.println("classificationTest");
        ClassPathResource pdfFileResource = new ClassPathResource("files/Fludioxonil/51 " +
                "Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf");

@ -363,6 +368,7 @@ public class RedactionIntegrationTest {
    @Test
    public void sectionsTest() throws IOException {

+        System.out.println("sectionsTest");
        ClassPathResource pdfFileResource = new ClassPathResource("files/Fludioxonil/51 " +
                "Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf");

@ -381,6 +387,7 @@ public class RedactionIntegrationTest {
    @Test
    public void htmlTablesTest() throws IOException {

+        System.out.println("htmlTablesTest");
        ClassPathResource pdfFileResource = new ClassPathResource("files/Fludioxonil/51 " +
                "Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf");

@ -399,6 +406,7 @@ public class RedactionIntegrationTest {
    @Test
    public void htmlTableRotationTest() throws IOException {

+        System.out.println("htmlTableRotationTest");
        ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S" +
                "-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");

--- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java
@ -10,13 +10,16 @@ import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.net.URL;
 import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashSet;
 import java.util.Set;
+import java.util.concurrent.atomic.AtomicLong;

 import org.apache.commons.io.IOUtils;
 import org.apache.pdfbox.pdmodel.PDDocument;
+import org.junit.Before;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.kie.api.KieServices;
@ -45,14 +48,15 @@ import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
 import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader;
 import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService;

-@RunWith(SpringRunner.class)
@SpringBootTest
+@RunWith(SpringRunner.class)
 public class EntityRedactionServiceTest {

    private static final String DEFAULT_RULES = loadFromClassPath("drools/rules.drl");
    private static final String NAME_CODE = "name";
    private static final String ADDRESS_CODE = "address";

+    private static final AtomicLong DICTIONARY_VERSION = new AtomicLong();
    @MockBean
    private DictionaryClient dictionaryClient;

@ -112,6 +116,111 @@ public class EntityRedactionServiceTest {
                .document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
                .build();

+        DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
+                .entries(Arrays.asList("Casey, H.W.", "O’Loughlin,  C.K.", "Salamon, C.M.", "Smith, S.H."))
+                .build();
+        when(dictionaryClient.getVersion()).thenReturn(DICTIONARY_VERSION.incrementAndGet());
+        when(dictionaryClient.getDictionaryForType(NAME_CODE)).thenReturn(dictionaryResponse);
+        DictionaryResponse addressResponse = DictionaryResponse.builder()
+                .entries(Collections.singletonList("Toxigenics, Inc., Decatur, IL 62526, USA"))
+                .build();
+        when(dictionaryClient.getDictionaryForType(ADDRESS_CODE)).thenReturn(addressResponse);
+
+        try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(redactionRequest.getDocument()))) {
+            Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
+            entityRedactionService.processDocument(classifiedDoc, null);
+            assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
+            assertThat(classifiedDoc.getEntities().get(1)).hasSize(5); // 4 out of 5 entities recognized on page 1
+        }
+    }
+
+
+    @Test
+    public void testTrueNegativesInTable() throws IOException {
+
+        ClassPathResource pdfFileResource = new ClassPathResource("files/Cyprodinil/40 Cyprodinil - EU AIR3 - LCA Section 1" +
+                " Supplement - Identity of the active substance - Reference list.pdf");
+        when(dictionaryClient.getVersion()).thenReturn(DICTIONARY_VERSION.incrementAndGet());
+        DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
+                .entries(new ArrayList<>(ResourceLoader.load("dictionaries/names.txt")))
+                .build();
+        when(dictionaryClient.getDictionaryForType(NAME_CODE)).thenReturn(dictionaryResponse);
+        DictionaryResponse addressResponse = DictionaryResponse.builder()
+                .entries(new ArrayList<>(ResourceLoader.load("dictionaries/addresses.txt")))
+                .build();
+        when(dictionaryClient.getDictionaryForType(ADDRESS_CODE)).thenReturn(addressResponse);
+        try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
+            Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
+            entityRedactionService.processDocument(classifiedDoc, null);
+            assertThat(classifiedDoc.getEntities()
+                    .entrySet()
+                    .stream()
+                    .noneMatch(entry -> entry.getValue().stream().anyMatch(e -> e.getMatchedRule() == 9))).isTrue();
+        }
+        pdfFileResource = new ClassPathResource("files/Compounds/27 A8637C - EU AIR3 - MCP Section 1 - Identity of " +
+                "the plant protection product.pdf");
+        try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
+            Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
+            entityRedactionService.processDocument(classifiedDoc, null);
+            assertThat(classifiedDoc.getEntities()
+                    .entrySet()
+                    .stream()
+                    .noneMatch(entry -> entry.getValue().stream().anyMatch(e -> e.getMatchedRule() == 9))).isTrue();
+        }
+    }
+
+    @Test
+    public void testFalsePositiveInWrongCell() throws IOException {
+
+        ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Row With Ambiguous Redaction.pdf");
+        when(dictionaryClient.getVersion()).thenReturn(DICTIONARY_VERSION.incrementAndGet());
+        DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
+                .entries(new ArrayList<>(ResourceLoader.load("dictionaries/names.txt")))
+                .build();
+        when(dictionaryClient.getDictionaryForType(NAME_CODE)).thenReturn(dictionaryResponse);
+        DictionaryResponse addressResponse = DictionaryResponse.builder()
+                .entries(new ArrayList<>(ResourceLoader.load("dictionaries/addresses.txt")))
+                .build();
+        when(dictionaryClient.getDictionaryForType(ADDRESS_CODE)).thenReturn(addressResponse);
+        try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
+            Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
+            entityRedactionService.processDocument(classifiedDoc, null);
+            assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
+            assertThat(classifiedDoc.getEntities().get(1).stream()
+                    .filter(entity -> entity.getMatchedRule() == 9)
+                    .count()).isEqualTo(10);
+        }
+
+    }
+
+    @Test
+    public void headerPropagation() throws IOException {
+
+        ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Header Propagation.pdf");
+
+        DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
+                .entries(Arrays.asList("Bissig R.", "Thanei P."))
+                .build();
+
+        when(dictionaryClient.getVersion()).thenReturn(DICTIONARY_VERSION.incrementAndGet());
+        when(dictionaryClient.getDictionaryForType(NAME_CODE)).thenReturn(dictionaryResponse);
+        DictionaryResponse addressResponse = DictionaryResponse.builder()
+                .entries(Collections.singletonList("Novartis Crop Protection AG, Basel, Switzerland"))
+                .build();
+        when(dictionaryClient.getDictionaryForType(ADDRESS_CODE)).thenReturn(addressResponse);
+
+        try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
+            Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
+            entityRedactionService.processDocument(classifiedDoc, null);
+            assertThat(classifiedDoc.getEntities()).hasSize(2); // two pages
+            assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(8);
+            assertThat(classifiedDoc.getEntities().get(2).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(4);
+        }
+    }
+
+
+    @Before
+    public void stubRedaction() {
        String tableRules = "package drools\n" +
                "\n" +
                "import com.iqser.red.service.redaction.v1.server.redaction.model.Section\n" +
@ -119,10 +228,7 @@ public class EntityRedactionServiceTest {
                "global Section section\n" +
                "rule \"9: Redact Authors and Addresses in Reference Table, if it is a Vertebrate study\"\n" +
                "    when\n" +
-                "        Section(tabularData != null && tabularData.size() > 0\n" +
-                "            && tabularData.containsKey(\"Vertebrate study Y/N\")\n" +
-                "            && tabularData.get(\"Vertebrate study Y/N\").equals(\"Y\")\n" +
-                "        )\n" +
+                "        Section(isVertebrateStudy())\n" +
                "    then\n" +
                "        section.redact(\"name\", 9, \"Redacted because row is a vertebrate study\");\n" +
                "        section.redact(\"address\", 9, \"Redacted because rows is a vertebrate study\");\n" +
@ -135,22 +241,9 @@ public class EntityRedactionServiceTest {
                        TypeResult.builder().type(NAME_CODE).color(new float[]{1, 1, 0}).build(),
                        TypeResult.builder().type(ADDRESS_CODE).color(new float[]{0, 1, 1}).build()))
                .build();
+        when(dictionaryClient.getVersion()).thenReturn(DICTIONARY_VERSION.incrementAndGet());
        when(dictionaryClient.getAllTypes()).thenReturn(typeResponse);
-        DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
-                .entries(Arrays.asList("Casey, H.W.", "O’Loughlin,  C.K.", "Salamon, C.M.", "Smith, S.H."))
-                .build();
-        when(dictionaryClient.getDictionaryForType(NAME_CODE)).thenReturn(dictionaryResponse);
-        DictionaryResponse addressResponse = DictionaryResponse.builder()
-                .entries(Collections.singletonList("Toxigenics, Inc., Decatur, IL 62526, USA"))
-                .build();
-        when(dictionaryClient.getDictionaryForType(ADDRESS_CODE)).thenReturn(addressResponse);
        when(dictionaryClient.getDefaultColor()).thenReturn(new DefaultColor());
-        try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(redactionRequest.getDocument()))) {
-            Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
-            entityRedactionService.processDocument(classifiedDoc, null);
-            assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
-            assertThat(classifiedDoc.getEntities().get(1)).hasSize(5); // 4 out of 5 entities recognized on page 1
-        }
    }


--- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java
@ -0,0 +1,67 @@
+package com.iqser.red.service.redaction.v1.server.segmentation;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.stream.Collectors;
+
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.kie.api.runtime.KieContainer;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.test.context.SpringBootTest;
+import org.springframework.boot.test.mock.mockito.MockBean;
+import org.springframework.core.io.ClassPathResource;
+import org.springframework.test.context.junit4.SpringRunner;
+
+import com.iqser.red.service.redaction.v1.server.classification.model.Document;
+import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService;
+import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
+import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService;
+import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService;
+
+@SpringBootTest
+@RunWith(SpringRunner.class)
+public class PdfSegmentationServiceTest {
+
+    @Autowired
+    private PdfSegmentationService pdfSegmentationService;
+
+    @Autowired
+    private RulingCleaningService rulingCleaningService;
+
+    @Autowired
+    private TableExtractionService tableExtractionService;
+
+    @Autowired
+    private BlockificationService blockificationService;
+
+    @MockBean
+    private KieContainer kieContainer;
+
+
+    @Test
+    public void testPDFSegmentationWithComplexTable() throws IOException {
+
+        ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Spanning Cells.pdf");
+
+        try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
+            Document document = pdfSegmentationService.parseDocument(pdDocument);
+            assertThat(document.getParagraphs()
+                    .stream()
+                    .flatMap(paragraph -> paragraph.getTables().stream())
+                    .collect(Collectors.toList())).isNotEmpty();
+            Table table = document.getParagraphs()
+                    .stream()
+                    .flatMap(paragraph -> paragraph.getTables().stream())
+                    .collect(Collectors.toList())
+                    .get(0);
+            assertThat(table.getColCount()).isEqualTo(6);
+            assertThat(table.getRowCount()).isEqualTo(13);
+            assertThat(table.getRows().stream().mapToInt(List::size).sum()).isEqualTo(6 * 13);
+        }
+    }
+
+}
--- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl
@ -49,64 +49,69 @@ rule "5: Do not redact in guideline sections"
 		section.redactNot("address", 5, "Section is a guideline section.");
 	end

-rule "6: Redact if must redact entry is found"
-	when
-		eval(section.contains("must_redact")==true);
-	then
-		section.redact("name", 6, "must_redact entry was found.");
-		section.redact("address", 6, "must_redact entry was found.");
-	end
-
-
-rule "7: Redact contact information, if applicant is found"
+rule "6: Redact contact information, if applicant is found"
 	when
 		eval(section.headlineContainsWord("applicant") || section.getText().contains("Applicant"));
 	then
-		section.redactLineAfter("Name:", "address", 7, "Applicant information was found");
-		section.redactBetween("Address:", "Contact", "address", 7, "Applicant information was found");
-		section.redactLineAfter("Contact point:",  "address", 7, "Applicant information was found");
-		section.redactLineAfter("Phone:", "address", 7, "Applicant information was found");
-		section.redactLineAfter("Fax:", "address", 7, "Applicant information was found");
-		section.redactLineAfter("Tel.:", "address", 7, "Applicant information was found");
-		section.redactLineAfter("Tel:", "address", 7, "Applicant information was found");
-		section.redactLineAfter("E-mail:", "address", 7, "Applicant information was found");
-		section.redactLineAfter("Email:", "address", 7, "Applicant information was found");
-		section.redactLineAfter("Contact:", "address", 7, "Applicant information was found");
-		section.redactLineAfter("Telephone number:", "address", 7, "Applicant information was found");
-		section.redactLineAfter("Fax number:", "address", 7, "Applicant information was found");
-		section.redactLineAfter("Telephone:", "address", 7, "Applicant information was found");
-		section.redactBetween("No:", "Fax", "address", 7, "Applicant information was found");
-		section.redactBetween("Contact:", "Tel.:", "address", 7, "Applicant information was found");
+		section.redactLineAfter("Name:", "address", 6, "Applicant information was found"); // every call in this rule passes 6, matching the "6:" prefix of the rule name
+		section.redactBetween("Address:", "Contact", "address", 6, "Applicant information was found");
+		section.redactLineAfter("Contact point:",  "address", 6, "Applicant information was found");
+		section.redactLineAfter("Phone:", "address", 6, "Applicant information was found");
+		section.redactLineAfter("Fax:", "address", 6, "Applicant information was found");
+		section.redactLineAfter("Tel.:", "address", 6, "Applicant information was found");
+		section.redactLineAfter("Tel:", "address", 6, "Applicant information was found");
+		section.redactLineAfter("E-mail:", "address", 6, "Applicant information was found");
+		section.redactLineAfter("Email:", "address", 6, "Applicant information was found");
+		section.redactLineAfter("Contact:", "address", 6, "Applicant information was found");
+		section.redactLineAfter("Telephone number:", "address", 6, "Applicant information was found");
+		section.redactLineAfter("Fax number:", "address", 6, "Applicant information was found");
+		section.redactLineAfter("Telephone:", "address", 6, "Applicant information was found");
+		section.redactBetween("No:", "Fax", "address", 6, "Applicant information was found");
+		section.redactBetween("Contact:", "Tel.:", "address", 6, "Applicant information was found");
 	end

-rule "8: Redact contact information, if Producer is found"
+rule "7: Redact contact information, if Producer is found"
 	when
 		eval(section.getText().toLowerCase().contains("producer of the plant protection") || section.getText().toLowerCase().contains("producer of the active substance") || section.getText().contains("Manufacturer of the active substance") || section.getText().contains("Manufacturer:") || section.getText().contains("Producer or producers of the active substance"));
 	then
-		section.redactLineAfter("Name:", "address", 8, "Producer was found");
-		section.redactBetween("Address:", "Contact", "address", 8, "Producer was found");
-		section.redactBetween("Contact:", "Phone", "address", 8, "Producer was found");
-		section.redactBetween("Contact:", "Telephone number:", "address", 8, "Producer was found");
-		section.redactBetween("Address:", "Manufacturing", "address", 8, "Producer was found");
-		section.redactLineAfter("Telephone:", "address", 8, "Producer was found");
-		section.redactLineAfter("Phone:", "address", 8, "Producer was found");
-		section.redactLineAfter("Fax:", "address", 8, "Producer was found");
-		section.redactLineAfter("E-mail:", "address", 8, "Producer was found");
-		section.redactLineAfter("Contact:", "address", 8, "Producer was found");
-		section.redactLineAfter("Fax number:", "address", 8, "Producer was found");
-		section.redactLineAfter("Telephone number:", "address", 8, "Producer was found");
-		section.redactLineAfter("Tel:", "address", 8, "Producer was found");
-		section.redactBetween("No:", "Fax", "address", 8, "Producer was found");
+		section.redactLineAfter("Name:", "address", 7, "Producer was found"); // every call in this rule passes 7, matching the "7:" prefix of the rule name
+		section.redactBetween("Address:", "Contact", "address", 7, "Producer was found");
+		section.redactBetween("Contact:", "Phone", "address", 7, "Producer was found");
+		section.redactBetween("Contact:", "Telephone number:", "address", 7, "Producer was found");
+		section.redactBetween("Address:", "Manufacturing", "address", 7, "Producer was found");
+		section.redactLineAfter("Telephone:", "address", 7, "Producer was found");
+		section.redactLineAfter("Phone:", "address", 7, "Producer was found");
+		section.redactLineAfter("Fax:", "address", 7, "Producer was found");
+		section.redactLineAfter("E-mail:", "address", 7, "Producer was found");
+		section.redactLineAfter("Contact:", "address", 7, "Producer was found");
+		section.redactLineAfter("Fax number:", "address", 7, "Producer was found");
+		section.redactLineAfter("Telephone number:", "address", 7, "Producer was found");
+		section.redactLineAfter("Tel:", "address", 7, "Producer was found");
+		section.redactBetween("No:", "Fax", "address", 7, "Producer was found");
 	end

-rule "9: Redact Authors and Addresses in Reference Table, if it is a Vertebrate study"
+rule "8: Redact Authors and Addresses in Reference Table, if it is a Vertebrate study"
    when
-        Section(tabularData != null
-            && tabularData.containsKey("Vertebrate study Y/N")
-            && tabularData.get("Vertebrate study Y/N").equals("Y")
-        )
+        Section(isVertebrateStudy())
    then
-        section.redact("name", 9, "Redacted because row is a vertebrate study");
-        section.redact("address", 9, "Redacted because rows is a vertebrate study");
+        section.redact("name", 8, "Redacted because row is a vertebrate study");
+        section.redact("address", 8, "Redacted because row is a vertebrate study");
-        section.highlightCell("Vertebrate study Y/N", 9);
+        section.highlightCell("Vertebrate study Y/N", 8); // keep in step with the "8:" rule name and the redact(...) calls above; 9 now names the "Not redacted" rule
-    end
+    end
+
+rule "9: Not redacted because Vertebrate Study = N"
+    when
+        Section(isNotVertebrateStudy())
+    then
+        section.redactNot("name", 9, "Not redacted because row is not a vertebrate study"); // mirrors rule 8, which calls redact(...) for the same two types
+        section.redactNot("address", 9, "Not redacted because row is not a vertebrate study");
+    end
+
+
+rule "10: Redact if must redact entry is found"
+	when
+		eval(section.contains("must_redact"));
+	then
+		section.redact("name", 10, "must_redact entry was found."); // both calls pass 10, the same number as the "10:" rule-name prefix
+		section.redact("address", 10, "must_redact entry was found.");
+	end
--- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal
--- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal
--- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal
--- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal