diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Entity.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Entity.java index 776ae1b0..7f7493b8 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Entity.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Entity.java @@ -3,6 +3,8 @@ package com.iqser.red.service.redaction.v1.server.redaction.model; import java.util.ArrayList; import java.util.List; +import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; + import lombok.Data; import lombok.EqualsAndHashCode; @@ -16,6 +18,7 @@ public class Entity { private boolean redaction; private String redactionReason; private List positionSequences = new ArrayList<>(); + private List targetSequences; private Integer start; private Integer end; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java index 41ee4548..8652bf57 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java @@ -1,6 +1,7 @@ package com.iqser.red.service.redaction.v1.server.redaction.model; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import java.util.regex.Pattern; @@ -25,9 +26,16 @@ public class SearchableText { } - 
@SuppressWarnings("checkstyle:ModifiedControlVariable") public List getSequences(String searchString, boolean caseInsensitive) { + return getSequences(searchString, caseInsensitive, null); + + } + + @SuppressWarnings("checkstyle:ModifiedControlVariable") + public List getSequences(String searchString, boolean caseInsensitive, + List sequencesSubList) { + String normalizedSearchString; if (caseInsensitive) { normalizedSearchString = searchString.toLowerCase(); @@ -40,37 +48,50 @@ public class SearchableText { List crossSequenceParts = new ArrayList<>(); List finalMatches = new ArrayList<>(); - for (int i = 0; i < sequences.size(); i++) { - TextPositionSequence partMatch = new TextPositionSequence(sequences.get(i).getPage()); - for (int j = 0; j < sequences.get(i).length(); j++) { - if (i > 0 && j == 0 && sequences.get(i).charAt(0, caseInsensitive) == ' ' && sequences.get(i - 1) - .charAt(sequences.get(i - 1).length() - 1, caseInsensitive) == ' ' || j > 0 && sequences.get(i) - .charAt(j, caseInsensitive) == ' ' && sequences.get(i).charAt(j - 1, caseInsensitive) == ' ') { - if (j == sequences.get(i).length() - 1 && counter != 0 && !partMatch.getTextPositions().isEmpty()) { + List searchSpace; + if (sequencesSubList != null) { + int subListIndex = Collections.indexOfSubList(sequences, sequencesSubList); + if (subListIndex != -1) { + searchSpace = sequences.subList(subListIndex, subListIndex + sequencesSubList.size()); + } else { + searchSpace = sequences; + } + } else { + searchSpace = sequences; + } + + for (int i = 0; i < searchSpace.size(); i++) { + TextPositionSequence partMatch = new TextPositionSequence(searchSpace.get(i).getPage()); + for (int j = 0; j < searchSpace.get(i).length(); j++) { + + if (i > 0 && j == 0 && searchSpace.get(i).charAt(0, caseInsensitive) == ' ' && searchSpace.get(i - 1) + .charAt(searchSpace.get(i - 1).length() - 1, caseInsensitive) == ' ' || j > 0 && searchSpace.get(i) + .charAt(j, caseInsensitive) == ' ' && searchSpace.get(i).charAt(j - 
1, caseInsensitive) == ' ') { + if (j == searchSpace.get(i).length() - 1 && counter != 0 && !partMatch.getTextPositions().isEmpty()) { crossSequenceParts.add(partMatch); } continue; } - if (j == 0 && sequences.get(i).charAt(j, caseInsensitive) != ' ' && i != 0 && sequences.get(i - 1) - .charAt(sequences.get(i - 1) + if (j == 0 && searchSpace.get(i).charAt(j, caseInsensitive) != ' ' && i != 0 && searchSpace.get(i - 1) + .charAt(searchSpace.get(i - 1) .length() - 1, caseInsensitive) != ' ' && searchChars[counter] == ' ') { counter++; } - if (sequences.get(i) - .charAt(j, caseInsensitive) == searchChars[counter] || counter != 0 && sequences.get(i) + if (searchSpace.get(i) + .charAt(j, caseInsensitive) == searchChars[counter] || counter != 0 && searchSpace.get(i) .charAt(j, caseInsensitive) == '-') { - if (counter != 0 || i == 0 && j == 0 || j != 0 && isSeparator(sequences.get(i) - .charAt(j - 1, caseInsensitive)) || j == 0 && i != 0 && isSeparator(sequences.get(i - 1) - .charAt(sequences.get(i - 1) - .length() - 1, caseInsensitive)) || j == 0 && i != 0 && sequences.get(i - 1) - .charAt(sequences.get(i - 1).length() - 1, caseInsensitive) != ' ' && sequences.get(i) + if (counter != 0 || i == 0 && j == 0 || j != 0 && isSeparator(searchSpace.get(i) + .charAt(j - 1, caseInsensitive)) || j == 0 && i != 0 && isSeparator(searchSpace.get(i - 1) + .charAt(searchSpace.get(i - 1) + .length() - 1, caseInsensitive)) || j == 0 && i != 0 && searchSpace.get(i - 1) + .charAt(searchSpace.get(i - 1).length() - 1, caseInsensitive) != ' ' && searchSpace.get(i) .charAt(j, caseInsensitive) != ' ') { - partMatch.add(sequences.get(i).textPositionAt(j)); - if (!(j == sequences.get(i).length() - 1 && sequences.get(i) + partMatch.add(searchSpace.get(i).textPositionAt(j)); + if (!(j == searchSpace.get(i).length() - 1 && searchSpace.get(i) .charAt(j, caseInsensitive) == '-' && searchChars[counter] != '-')) { counter++; } @@ -79,19 +100,19 @@ public class SearchableText { if (counter == 
searchString.length()) { crossSequenceParts.add(partMatch); - if (i == sequences.size() - 1 && j == sequences.get(i).length() - 1 || j != sequences.get(i) - .length() - 1 && isSeparator(sequences.get(i) - .charAt(j + 1, caseInsensitive)) || j == sequences.get(i) - .length() - 1 && isSeparator(sequences.get(i + 1) - .charAt(0, caseInsensitive)) || j == sequences.get(i).length() - 1 && sequences.get(i) - .charAt(j, caseInsensitive) != ' ' && sequences.get(i + 1) + if (i == searchSpace.size() - 1 && j == searchSpace.get(i).length() - 1 || j != searchSpace.get(i) + .length() - 1 && isSeparator(searchSpace.get(i) + .charAt(j + 1, caseInsensitive)) || j == searchSpace.get(i) + .length() - 1 && isSeparator(searchSpace.get(i + 1) + .charAt(0, caseInsensitive)) || j == searchSpace.get(i).length() - 1 && searchSpace.get(i) + .charAt(j, caseInsensitive) != ' ' && searchSpace.get(i + 1) .charAt(0, caseInsensitive) != ' ') { finalMatches.addAll(buildEntityPositionSequence(crossSequenceParts)); } counter = 0; crossSequenceParts = new ArrayList<>(); - partMatch = new TextPositionSequence(sequences.get(i).getPage()); + partMatch = new TextPositionSequence(searchSpace.get(i).getPage()); } } else { counter = 0; @@ -99,16 +120,17 @@ public class SearchableText { j--; } crossSequenceParts = new ArrayList<>(); - partMatch = new TextPositionSequence(sequences.get(i).getPage()); + partMatch = new TextPositionSequence(searchSpace.get(i).getPage()); } - if (j == sequences.get(i).length() - 1 && counter != 0) { + if (j == searchSpace.get(i).length() - 1 && counter != 0) { crossSequenceParts.add(partMatch); } } } return finalMatches; + } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java index dd6f9419..ca571a84 100644 --- 
a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java @@ -7,9 +7,10 @@ import java.util.Map; import java.util.Set; import java.util.regex.Pattern; -import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.lang3.StringUtils; +import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; + import lombok.Builder; import lombok.Data; import lombok.extern.slf4j.Slf4j; @@ -31,7 +32,21 @@ public class Section { private int sectionNumber; - private Map tabularData; + private Map tabularData; + + + public boolean isVertebrateStudy() { + return tabularData != null + && tabularData.containsKey("Vertebrate study Y/N") + && tabularData.get("Vertebrate study Y/N").getText().equals("Y"); + } + + + public boolean isNotVertebrateStudy() { + return tabularData != null + && tabularData.containsKey("Vertebrate study Y/N") + && tabularData.get("Vertebrate study Y/N").getText().equals("N"); + } public boolean contains(String type) { @@ -163,20 +178,16 @@ public class Section { public void highlightCell(String cellHeader, int ruleNumber) { - String value = tabularData.get(cellHeader); + TextBlock value = tabularData.get(cellHeader); if (value == null) { log.warn("Could not find any data for {}.", cellHeader); } else { - Set found = findEntities(value, "must_redact"); - if (CollectionUtils.isEmpty(found)) { - log.warn("Could not identify value {} in row.", value); - } else { - Entity entity = found.iterator().next(); - entity.setRedaction(false); - entity.setMatchedRule(ruleNumber); - entity.setRedactionReason(cellHeader); - entities.add(entity); - } + Entity entity = new Entity(value.getText(), "must_redact", 0, value.getText().length(), headline, sectionNumber); + entity.setRedaction(false); + 
entity.setMatchedRule(ruleNumber); + entity.setRedactionReason(cellHeader); + entity.setTargetSequences(value.getSequences()); // Make sure no other cells with same content are highlighted + entities.add(entity); } } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java index 1cacb00e..9bfd27cd 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java @@ -51,24 +51,27 @@ public class EntityRedactionService { List tables = paragraph.getTables(); for (Table table : tables) { - List metadata = table.getHeaders(); for (List row : table.getRows()) { SearchableText searchableRow = new SearchableText(); - List cellValues = new ArrayList<>(); + Map tabularData = new HashMap<>(); for (Cell cell : row) { - if (cell == null || CollectionUtils.isEmpty(cell.getTextBlocks())) { - cellValues.add(null); + if (cell.isHeaderCell() || CollectionUtils.isEmpty(cell.getTextBlocks())) { continue; } - cellValues.add(cell.getTextBlocks().get(0).getText()); addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber); + cell.getHeaderCells().forEach(headerCell -> { + String headerName = headerCell.getTextBlocks().get(0).getText() + .replaceAll("\n", " ") + .replaceAll(" ", " "); + tabularData.put(headerName, cell.getTextBlocks().get(0)); + }); for (TextBlock textBlock : cell.getTextBlocks()) { searchableRow.addAll(textBlock.getSequences()); } + } Set rowEntities = findEntities(searchableRow, table.getHeadline(), sectionNumber); - 
Map tabularData = toMap(metadata, cellValues); Section analysedRowSection = droolsExecutionService.executeRules(Section.builder() .entities(rowEntities) .text(searchableRow.getAsStringWithLinebreaks()) @@ -116,35 +119,15 @@ public class EntityRedactionService { } - private Map toMap(List keys, List values) { - - if (keys.size() != values.size()) { - log.warn("Cannot merge lists of unequal size, returning empty map."); - return new HashMap<>(); - } - Map result = new HashMap<>(); - for (int i = 0; i < keys.size(); i++) { - String value = values.get(i); - if (value == null) { - continue; - } - result.put(keys.get(i), value); - } - - return result; - - } - - private Set clearAndFindPositions(Set entities, SearchableText text) { removeEntitiesContainedInLarger(entities); for (Entity entity : entities) { if (dictionaryService.getCaseInsensitiveTypes().contains(entity.getType())) { - entity.setPositionSequences(text.getSequences(entity.getWord(), true)); + entity.setPositionSequences(text.getSequences(entity.getWord(), true, entity.getTargetSequences())); } else { - entity.setPositionSequences(text.getSequences(entity.getWord(), false)); + entity.setPositionSequences(text.getSequences(entity.getWord(), false, entity.getTargetSequences())); } } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java index 8d005ccd..a5893161 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java @@ -1,11 +1,11 @@ package com.iqser.red.service.redaction.v1.server.segmentation; import 
java.util.ArrayList; +import java.util.Collections; import java.util.Iterator; import java.util.List; import org.apache.commons.collections4.CollectionUtils; -import org.apache.commons.lang3.StringUtils; import org.springframework.stereotype.Service; import com.iqser.red.service.redaction.v1.server.classification.model.Document; @@ -13,6 +13,7 @@ import com.iqser.red.service.redaction.v1.server.classification.model.Page; import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph; import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; +import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; @Service @@ -85,10 +86,20 @@ public class SectionsBuilderService { table.setHeadline("Table in: " + lastHeadline); } // Distribute header information for subsequent tables - if (previousTable != null && hasInvalidHeaderInformation(table) && hasValidHeaderInformation(previousTable) && - (previousTable.isVerticalHeader() && previousTable.getRowCount() == table.getRowCount() || - previousTable.getColCount() == table.getColCount())) { - table.setHeaders(previousTable.getHeaders()); + if (previousTable != null && hasInvalidHeaderInformation(table) && hasValidHeaderInformation(previousTable)) { + List previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable); + List tableNonHeaderRow = getRowWithNonHeaderCells(table); + if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) { + for (int i = table.getRows().size() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table + List row = table.getRows().get(i); + if (row.size() == previousTableNonHeaderRow.size() + && row.stream().allMatch(cell -> cell.getHeaderCells().isEmpty())) { + for (int j = 0; j < row.size(); j++) { + 
row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells()); + } + } + } + } } if (textBlock != null && !alreadyAdded) { @@ -141,14 +152,32 @@ public class SectionsBuilderService { private boolean hasInvalidHeaderInformation(Table table) { - if (CollectionUtils.isEmpty(table.getHeaders())) { - return true; - } - if (table.getHeaders().stream().anyMatch(StringUtils::isEmpty)) { - return true; + return table.getRows().stream() + .flatMap(row -> row.stream() + .filter(cell -> CollectionUtils.isNotEmpty(cell.getHeaderCells()))) + .findAny() + .isEmpty(); + + } + + + private List getRowWithNonHeaderCells(Table table) { + + for (int i = table.getRows().size() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table + List row = table.getRows().get(i); + boolean allNonHeader = true; + for (Cell cell : row) { + if (cell.isHeaderCell()) { + allNonHeader = false; + break; + } + } + if (allNonHeader) { + return row; + } } - return false; + return Collections.emptyList(); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Cell.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Cell.java index 9342533b..6b884f69 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Cell.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Cell.java @@ -16,10 +16,14 @@ public class Cell extends Rectangle { private List textBlocks = new ArrayList<>(); + private List headerCells = new ArrayList<>(); + + private boolean isHeaderCell; public Cell(Point2D topLeft, Point2D bottomRight) { - super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()), (float) (bottomRight + super((float) 
topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()), + (float) (bottomRight .getY() - topLeft.getY())); } @@ -29,4 +33,4 @@ public class Cell extends Rectangle { textBlocks.add(textBlock); } -} +} \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/CellPosition.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/CellPosition.java new file mode 100644 index 00000000..70a9800c --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/CellPosition.java @@ -0,0 +1,22 @@ +package com.iqser.red.service.redaction.v1.server.tableextraction.model; + +import lombok.RequiredArgsConstructor; +import lombok.Value; + +@Value +@RequiredArgsConstructor +public class CellPosition implements Comparable { + + int row; + + int col; + + + @Override + public int compareTo(CellPosition other) { + + int rowDiff = row - other.row; + return rowDiff != 0 ? 
rowDiff : col - other.col; + } + +} \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java index 14d2f7d2..c118d0e0 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java @@ -8,12 +8,10 @@ import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.TreeMap; -import java.util.stream.Collectors; import org.apache.commons.collections4.CollectionUtils; import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; -import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities; import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils; import lombok.Getter; @@ -41,12 +39,6 @@ public class Table extends AbstractTextContainer { private List> rows; - @Getter - @Setter - private List headers; - - @Getter - private boolean verticalHeader; public Table(List cells, Rectangle area, int rotation) { @@ -65,7 +57,7 @@ public class Table extends AbstractTextContainer { if (rows == null) { rows = computeRows(); - headers = computeHeaders(); + computeHeaders(); } return rows; @@ -78,72 +70,105 @@ public class Table extends AbstractTextContainer { * Column is marked as header if cell text is bold and row cell text is not bold. * Defaults to row. 
*/ - private List computeHeaders() { + private void computeHeaders() { - boolean allBold = true; - if (rows.isEmpty()) { - return Collections.emptyList(); - } - List rowCells = rows.get(0); - for (Cell cell : rowCells) { - if (cell == null || CollectionUtils.isEmpty(cell.getTextBlocks()) || - !cell.getTextBlocks().get(0).getMostPopularWordStyle().equals("bold")) { - allBold = false; - break; - } - } - if (!allBold) { - allBold = true; - List firstColCells = new ArrayList<>(); - for (List row : rows) { - Cell firstInRow = row.get(0); - if (firstInRow == null || CollectionUtils.isEmpty(firstInRow.getTextBlocks()) || - !firstInRow.getTextBlocks().get(0).getMostPopularWordStyle().equals("bold")) { - allBold = false; + // A bold cell is a header cell as long as every cell to the left/top is bold, too + cells.forEach((position, cell) -> { + List cellsToTheLeft = getCellsToTheLeft(position); + Cell lastHeaderCell = null; + for (Cell leftCell : cellsToTheLeft) { + if (CollectionUtils.isNotEmpty(leftCell.getTextBlocks()) && leftCell.getTextBlocks() + .get(0) + .getMostPopularWordStyle() + .equals("bold")) { + lastHeaderCell = leftCell; + } else { break; } - firstColCells.add(firstInRow); } - if (allBold) { - log.info("Headers are in first column"); - verticalHeader = true; - return firstColCells.stream().map(cell -> { - if (CollectionUtils.isNotEmpty(cell.getTextBlocks())) { - return TextNormalizationUtilities.removeHyphenLineBreaks(cell.getTextBlocks().get(0).getText()) - .replaceAll("\n", " ") - .replaceAll(" ", " "); - } else { - return null; - } - }).collect(Collectors.toList()); - } else { - log.info("Headers are defaulted in first row."); - return rowCells.stream().map(cell -> { - if (cell != null && CollectionUtils.isNotEmpty(cell.getTextBlocks())) { - return TextNormalizationUtilities.removeHyphenLineBreaks(cell.getTextBlocks().get(0).getText()) - .replaceAll("\n", " ") - .replaceAll(" ", " "); - } else { - return null; - } - }).collect(Collectors.toList()); + if 
(lastHeaderCell != null) { + cell.getHeaderCells().add(lastHeaderCell); } - } else { - log.info("Headers are in first row."); - return rowCells.stream().map(cell -> { - if (CollectionUtils.isNotEmpty(cell.getTextBlocks())) { - return TextNormalizationUtilities.removeHyphenLineBreaks(cell.getTextBlocks().get(0).getText()) - .replaceAll("\n", " ") - .replaceAll(" ", " "); + lastHeaderCell = null; + List cellsToTheTop = getCellToTheTop(position); + for (Cell topCell : cellsToTheTop) { + if (CollectionUtils.isNotEmpty(topCell.getTextBlocks()) && topCell.getTextBlocks() + .get(0) + .getMostPopularWordStyle() + .equals("bold")) { + lastHeaderCell = topCell; } else { - return null; + break; } - }).collect(Collectors.toList()); - } + } + if (lastHeaderCell != null) { + cell.getHeaderCells().add(lastHeaderCell); + } + if (CollectionUtils.isNotEmpty(cell.getTextBlocks()) && cell.getTextBlocks() + .get(0) + .getMostPopularWordStyle() + .equals("bold")) { + cell.setHeaderCell(true); + } + }); } + private List getCellsToTheLeft(CellPosition cellPosition) { + + List result = new ArrayList<>(); + if (cellPosition.getCol() == 0) { + return result; + } + int row = cellPosition.getRow(); + for (int i = cellPosition.getCol() - 1; i >= 0; i--) { + if (cells.get(new CellPosition(row, i)) != null) { + result.add(cells.get(new CellPosition(row, i))); + } else { + Cell spanningCell = null; + while (spanningCell == null && row >= 0) { + row--; + spanningCell = cells.get(new CellPosition(row, i)); + } + if (spanningCell != null) { + result.add(spanningCell); + } + row = cellPosition.getRow(); + } + } + Collections.reverse(result); + return result; + } + + + private List getCellToTheTop(CellPosition cellPosition) { + + List result = new ArrayList<>(); + if (cellPosition.getRow() == 0) { + return result; + } + int col = cellPosition.getCol(); + for (int i = cellPosition.getRow() - 1; i >= 0; i--) { + if (cells.get(new CellPosition(i, col)) != null) { + result.add(cells.get(new CellPosition(i, 
col))); + } else { + Cell spanningCell = null; + while (spanningCell == null && col >= 0) { + col--; + spanningCell = cells.get(new CellPosition(i, col)); + } + if (spanningCell != null) { + result.add(spanningCell); + } + col = cellPosition.getCol(); + } + } + Collections.reverse(result); + return result; + } + + private List> computeRows() { List> rows = new ArrayList<>(); @@ -152,7 +177,9 @@ public class Table extends AbstractTextContainer { List lastRow = new ArrayList<>(); for (int j = rowCount - 1; j >= 0; j--) { // cols Cell cell = cells.get(new CellPosition(j, i)); - lastRow.add(cell); + if (cell != null) { + lastRow.add(cell); + } } rows.add(lastRow); } @@ -161,7 +188,9 @@ public class Table extends AbstractTextContainer { List lastRow = new ArrayList<>(); for (int j = 0; j < rowCount; j++) { // cols Cell cell = cells.get(new CellPosition(i, j)); - lastRow.add(cell); + if (cell != null) { + lastRow.add(cell); + } } rows.add(lastRow); } @@ -170,7 +199,9 @@ public class Table extends AbstractTextContainer { List lastRow = new ArrayList<>(); for (int j = 0; j < colCount; j++) { Cell cell = cells.get(new CellPosition(i, j)); // JAVA_8 use getOrDefault() - lastRow.add(cell); + if (cell != null) { + lastRow.add(cell); + } } rows.add(lastRow); } @@ -220,20 +251,21 @@ public class Table extends AbstractTextContainer { while (rowCells.hasNext()) { Cell cell = rowCells.next(); if (i > 0) { - List> others = rowsOfCells(si.contains(new Rectangle(cell.getBottom(), si.getBounds() - .getLeft(), cell.getLeft() - si.getBounds().getLeft() + 1, si.getBounds().getBottom() - cell - .getBottom()))); + Rectangle rectangle = new Rectangle(cell.getBottom(), + si.getBounds().getLeft(), + cell.getLeft() - si.getBounds().getLeft() + 1, + si.getBounds().getBottom() - cell.getBottom()); + List> others = rowsOfCells(si.contains(rectangle)); for (List r : others) { jumpToColumn = Math.max(jumpToColumn, r.size()); } - } - while (startColumn != jumpToColumn) { - 
add(previousNonNullCellForColumnIndex.get(startColumn), i, startColumn); - startColumn++; + while (startColumn != jumpToColumn) { + add(previousNonNullCellForColumnIndex.get(startColumn), i, startColumn); + startColumn++; + } } - add(cell, i, startColumn); previousNonNullCellForColumnIndex.put(startColumn, cell); startColumn++; @@ -243,26 +275,23 @@ public class Table extends AbstractTextContainer { } - private static List> rowsOfCells(List cells) { + private List> rowsOfCells(List cells) { - Cell c; - float lastTop; List> rv = new ArrayList<>(); - List lastRow; if (cells.isEmpty()) { return rv; } - cells.sort(Comparator.comparingDouble(Rectangle::getLeft)); - cells.sort(Collections.reverseOrder((arg0, arg1) -> Float.compare(Utils.round(arg0.getBottom(), 2), Utils.round(arg1 + cells.sort(Collections.reverseOrder((arg0, arg1) -> Float.compare(Utils.round(arg0.getBottom(), 2), + Utils.round(arg1 .getBottom(), 2)))); Iterator iter = cells.iterator(); - c = iter.next(); - lastTop = c.getBottom(); - lastRow = new ArrayList<>(); + Cell c = iter.next(); + float lastTop = c.getBottom(); + List lastRow = new ArrayList<>(); lastRow.add(c); rv.add(lastRow); @@ -349,51 +378,4 @@ public class Table extends AbstractTextContainer { return sb.toString(); } - - static class CellPosition implements Comparable { - - CellPosition(int row, int col) { - - this.row = row; - this.col = col; - } - - - final int row; - final int col; - - - @Override - public int hashCode() { - - return row + 101 * col; - } - - - @Override - public boolean equals(Object obj) { - - if (this == obj) { - return true; - } - if (obj == null) { - return false; - } - if (getClass() != obj.getClass()) { - return false; - } - CellPosition other = (CellPosition) obj; - return row == other.row && col == other.col; - } - - - @Override - public int compareTo(CellPosition other) { - - int rowDiff = row - other.row; - return rowDiff != 0 ? 
rowDiff : col - other.col; - } - - } - } \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/service/TableExtractionService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/service/TableExtractionService.java index cb8ae9ce..69c2fe69 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/service/TableExtractionService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/service/TableExtractionService.java @@ -2,7 +2,6 @@ package com.iqser.red.service.redaction.v1.server.tableextraction.service; import java.awt.geom.Point2D; import java.util.ArrayList; -import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; @@ -25,26 +24,28 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils; @Service -@SuppressWarnings("all") public class TableExtractionService { - public void extractTables(CleanRulings cleanRulings, Page page){ + public void extractTables(CleanRulings cleanRulings, Page page) { List cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical()); - Iterator itty = page.getTextBlocks().iterator(); - while (itty.hasNext()) { - TextBlock textBlock = (TextBlock) itty.next(); + for (AbstractTextContainer abstractTextContainer : page.getTextBlocks()) { + TextBlock textBlock = (TextBlock) abstractTextContainer; for (Cell cell : cells) { - if (cell.intersects(textBlock.getMinX(), textBlock.getMinY(), textBlock.getWidth(), textBlock.getHeight())) { + if (cell.intersects(textBlock.getMinX(), textBlock.getMinY(), textBlock.getWidth(), + textBlock.getHeight())) { 
cell.addTextBlock(textBlock); break; } } } - List spreadsheetAreas = findSpreadsheetsFromCells(cells) - .stream() + cells = new ArrayList<>(new HashSet<>(cells)); + Utils.sort(cells, Rectangle.ILL_DEFINED_ORDER); + + + List spreadsheetAreas = findSpreadsheetsFromCells(cells).stream() .filter(r -> r.getWidth() > 0f && r.getHeight() > 0f) .collect(Collectors.toList()); @@ -63,9 +64,9 @@ public class TableExtractionService { for (Table table : tables) { int position = -1; - itty = page.getTextBlocks().iterator(); + Iterator itty = page.getTextBlocks().iterator(); while (itty.hasNext()) { - AbstractTextContainer textBlock = (AbstractTextContainer) itty.next(); + AbstractTextContainer textBlock = itty.next(); if (table.contains(textBlock)) { if (position == -1) { position = page.getTextBlocks().indexOf(textBlock); @@ -79,17 +80,18 @@ public class TableExtractionService { } } + public List findCells(List horizontalRulingLines, List verticalRulingLines) { + List cellsFound = new ArrayList<>(); - Map intersectionPoints = Ruling.findIntersections(horizontalRulingLines, verticalRulingLines); + Map intersectionPoints = Ruling.findIntersections(horizontalRulingLines, + verticalRulingLines); List intersectionPointsList = new ArrayList<>(intersectionPoints.keySet()); - Collections.sort(intersectionPointsList, POINT_COMPARATOR); - boolean doBreak; + intersectionPointsList.sort(POINT_COMPARATOR); for (int i = 0; i < intersectionPointsList.size(); i++) { Point2D topLeft = intersectionPointsList.get(i); Ruling[] hv = intersectionPoints.get(topLeft); - doBreak = false; // CrossingPointsDirectlyBelow( topLeft ); List xPoints = new ArrayList<>(); @@ -106,10 +108,6 @@ public class TableExtractionService { } outer: for (Point2D xPoint : xPoints) { - if (doBreak) { - break; - } - // is there a vertical edge b/w topLeft and xPoint? 
if (!hv[1].equals(intersectionPoints.get(xPoint)[1])) { continue; @@ -120,11 +118,9 @@ public class TableExtractionService { continue; } Point2D btmRight = new Point2D.Float((float) yPoint.getX(), (float) xPoint.getY()); - if (intersectionPoints.containsKey(btmRight) - && intersectionPoints.get(btmRight)[0].equals(intersectionPoints.get(xPoint)[0]) - && intersectionPoints.get(btmRight)[1].equals(intersectionPoints.get(yPoint)[1])) { + if (intersectionPoints.containsKey(btmRight) && intersectionPoints.get(btmRight)[0].equals(intersectionPoints + .get(xPoint)[0]) && intersectionPoints.get(btmRight)[1].equals(intersectionPoints.get(yPoint)[1])) { cellsFound.add(new Cell(topLeft, btmRight)); - doBreak = true; break outer; } } @@ -139,7 +135,7 @@ public class TableExtractionService { } - public List findSpreadsheetsFromCells(List cells) { + private List findSpreadsheetsFromCells(List cells) { // via: http://stackoverflow.com/questions/13746284/merging-multiple-adjacent-rectangles-into-one-polygon List rectangles = new ArrayList<>(); Set pointSet = new HashSet<>(); @@ -147,10 +143,6 @@ public class TableExtractionService { Map edgesV = new HashMap<>(); int i = 0; - cells = new ArrayList<>(new HashSet<>(cells)); - - Utils.sort(cells, Rectangle.ILL_DEFINED_ORDER); - for (Rectangle cell : cells) { for (Point2D pt : cell.getPoints()) { if (pointSet.contains(pt)) { // shared vertex, remove it @@ -163,10 +155,10 @@ public class TableExtractionService { // X first sort List pointsSortX = new ArrayList<>(pointSet); - Collections.sort(pointsSortX, X_FIRST_POINT_COMPARATOR); + pointsSortX.sort(X_FIRST_POINT_COMPARATOR); // Y first sort List pointsSortY = new ArrayList<>(pointSet); - Collections.sort(pointsSortY, POINT_COMPARATOR); + pointsSortY.sort(POINT_COMPARATOR); while (i < pointSet.size()) { float currY = (float) pointsSortY.get(i).getY(); @@ -203,13 +195,12 @@ public class TableExtractionService { nextVertex = edgesV.get(curr.point); edgesV.remove(curr.point); 
lastAddedVertex = new PolygonVertex(nextVertex, Direction.VERTICAL); - polygon.add(lastAddedVertex); } else { nextVertex = edgesH.get(curr.point); edgesH.remove(curr.point); lastAddedVertex = new PolygonVertex(nextVertex, Direction.HORIZONTAL); - polygon.add(lastAddedVertex); } + polygon.add(lastAddedVertex); if (lastAddedVertex.equals(polygon.get(0))) { // closed polygon @@ -227,10 +218,10 @@ public class TableExtractionService { // calculate grid-aligned minimum area rectangles for each found polygon for (List poly : polygons) { - float top = java.lang.Float.MAX_VALUE; - float left = java.lang.Float.MAX_VALUE; - float bottom = java.lang.Float.MIN_VALUE; - float right = java.lang.Float.MIN_VALUE; + float top = Float.MAX_VALUE; + float left = Float.MAX_VALUE; + float bottom = Float.MIN_VALUE; + float right = Float.MIN_VALUE; for (PolygonVertex pt : poly) { top = (float) Math.min(top, pt.point.getY()); left = (float) Math.min(left, pt.point.getX()); @@ -244,69 +235,66 @@ public class TableExtractionService { } - private static final Comparator X_FIRST_POINT_COMPARATOR = new Comparator() { - @Override - public int compare(Point2D arg0, Point2D arg1) { - int rv = 0; - float arg0X = Utils.round(arg0.getX(), 2); - float arg0Y = Utils.round(arg0.getY(), 2); - float arg1X = Utils.round(arg1.getX(), 2); - float arg1Y = Utils.round(arg1.getY(), 2); + private static final Comparator X_FIRST_POINT_COMPARATOR = (arg0, arg1) -> { - if (arg0X > arg1X) { - rv = 1; - } else if (arg0X < arg1X) { - rv = -1; - } else if (arg0Y > arg1Y) { - rv = 1; - } else if (arg0Y < arg1Y) { - rv = -1; - } - return rv; + int rv = 0; + float arg0X = Utils.round(arg0.getX(), 2); + float arg0Y = Utils.round(arg0.getY(), 2); + float arg1X = Utils.round(arg1.getX(), 2); + float arg1Y = Utils.round(arg1.getY(), 2); + + if (arg0X > arg1X) { + rv = 1; + } else if (arg0X < arg1X) { + rv = -1; + } else if (arg0Y > arg1Y) { + rv = 1; + } else if (arg0Y < arg1Y) { + rv = -1; } + return rv; }; + private static 
final Comparator POINT_COMPARATOR = (arg0, arg1) -> { - private static final Comparator POINT_COMPARATOR = new Comparator() { - @Override - public int compare(Point2D arg0, Point2D arg1) { - int rv = 0; - float arg0X = Utils.round(arg0.getX(), 2); - float arg0Y = Utils.round(arg0.getY(), 2); - float arg1X = Utils.round(arg1.getX(), 2); - float arg1Y = Utils.round(arg1.getY(), 2); + int rv = 0; + float arg0X = Utils.round(arg0.getX(), 2); + float arg0Y = Utils.round(arg0.getY(), 2); + float arg1X = Utils.round(arg1.getX(), 2); + float arg1Y = Utils.round(arg1.getY(), 2); - - if (arg0Y > arg1Y) { - rv = 1; - } else if (arg0Y < arg1Y) { - rv = -1; - } else if (arg0X > arg1X) { - rv = 1; - } else if (arg0X < arg1X) { - rv = -1; - } - return rv; + if (arg0Y > arg1Y) { + rv = 1; + } else if (arg0Y < arg1Y) { + rv = -1; + } else if (arg0X > arg1X) { + rv = 1; + } else if (arg0X < arg1X) { + rv = -1; } + return rv; }; - private enum Direction { - HORIZONTAL, - VERTICAL + HORIZONTAL, VERTICAL } static class PolygonVertex { + Point2D point; Direction direction; - public PolygonVertex(Point2D point, Direction direction) { + + PolygonVertex(Point2D point, Direction direction) { + this.direction = direction; this.point = point; } + @Override public boolean equals(Object other) { + if (this == other) { return true; } @@ -316,15 +304,21 @@ public class TableExtractionService { return this.point.equals(((PolygonVertex) other).point); } + @Override public int hashCode() { + return this.point.hashCode(); } + @Override public String toString() { - return String.format("%s[point=%s,direction=%s]", this.getClass().getName(), this.point.toString(), this.direction.toString()); + + return String.format("%s[point=%s,direction=%s]", this.getClass() + .getName(), this.point.toString(), this.direction.toString()); } + } } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java 
b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java index 04003f7f..e771bb49 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java @@ -227,6 +227,7 @@ public class RedactionIntegrationTest { @Test public void noExceptionShouldBeThrownForAnyFiles() throws IOException { + System.out.println("noExceptionShouldBeThrownForAnyFiles"); ClassLoader loader = getClass().getClassLoader(); URL url = loader.getResource("files"); File[] files = new File(url.getPath()).listFiles(); @@ -266,6 +267,7 @@ public class RedactionIntegrationTest { @Test public void redactionTest() throws IOException { + System.out.println("redactionTest"); long start = System.currentTimeMillis(); ClassPathResource pdfFileResource = new ClassPathResource("files/Trinexapac/96 Trinexapac-ethyl_RAR_09_Volume_3CA_B-7_2018-02-23.pdf"); @@ -289,8 +291,9 @@ public class RedactionIntegrationTest { @Test public void testTableRedaction() throws IOException { + System.out.println("testTableRedaction"); long start = System.currentTimeMillis(); - ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Single Table.pdf"); + ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf"); RedactionRequest request = RedactionRequest.builder() .document(IOUtils.toByteArray(pdfFileResource.getInputStream())) @@ -311,6 +314,7 @@ public class RedactionIntegrationTest { @Test public void testManualRedaction() throws IOException { + System.out.println("testManualRedaction"); long start = System.currentTimeMillis(); ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Single Table.pdf"); @@ -345,6 
+349,7 @@ public class RedactionIntegrationTest { @Test public void classificationTest() throws IOException { + System.out.println("classificationTest"); ClassPathResource pdfFileResource = new ClassPathResource("files/Fludioxonil/51 " + "Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf"); @@ -363,6 +368,7 @@ public class RedactionIntegrationTest { @Test public void sectionsTest() throws IOException { + System.out.println("sectionsTest"); ClassPathResource pdfFileResource = new ClassPathResource("files/Fludioxonil/51 " + "Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf"); @@ -381,6 +387,7 @@ public class RedactionIntegrationTest { @Test public void htmlTablesTest() throws IOException { + System.out.println("htmlTablesTest"); ClassPathResource pdfFileResource = new ClassPathResource("files/Fludioxonil/51 " + "Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf"); @@ -399,6 +406,7 @@ public class RedactionIntegrationTest { @Test public void htmlTableRotationTest() throws IOException { + System.out.println("htmlTableRotationTest"); ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S" + "-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf"); diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java index ed2999c6..625479a2 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java @@ -10,13 +10,16 @@ import java.io.InputStream; import java.io.InputStreamReader; import java.net.URL; import java.nio.charset.StandardCharsets; +import 
java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashSet; import java.util.Set; +import java.util.concurrent.atomic.AtomicLong; import org.apache.commons.io.IOUtils; import org.apache.pdfbox.pdmodel.PDDocument; +import org.junit.Before; import org.junit.Test; import org.junit.runner.RunWith; import org.kie.api.KieServices; @@ -45,14 +48,15 @@ import com.iqser.red.service.redaction.v1.server.redaction.model.Entity; import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader; import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService; -@RunWith(SpringRunner.class) @SpringBootTest +@RunWith(SpringRunner.class) public class EntityRedactionServiceTest { private static final String DEFAULT_RULES = loadFromClassPath("drools/rules.drl"); private static final String NAME_CODE = "name"; private static final String ADDRESS_CODE = "address"; + private static final AtomicLong DICTIONARY_VERSION = new AtomicLong(); @MockBean private DictionaryClient dictionaryClient; @@ -112,6 +116,111 @@ public class EntityRedactionServiceTest { .document(IOUtils.toByteArray(pdfFileResource.getInputStream())) .build(); + DictionaryResponse dictionaryResponse = DictionaryResponse.builder() + .entries(Arrays.asList("Casey, H.W.", "O’Loughlin, C.K.", "Salamon, C.M.", "Smith, S.H.")) + .build(); + when(dictionaryClient.getVersion()).thenReturn(DICTIONARY_VERSION.incrementAndGet()); + when(dictionaryClient.getDictionaryForType(NAME_CODE)).thenReturn(dictionaryResponse); + DictionaryResponse addressResponse = DictionaryResponse.builder() + .entries(Collections.singletonList("Toxigenics, Inc., Decatur, IL 62526, USA")) + .build(); + when(dictionaryClient.getDictionaryForType(ADDRESS_CODE)).thenReturn(addressResponse); + + try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(redactionRequest.getDocument()))) { + Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); + 
entityRedactionService.processDocument(classifiedDoc, null); + assertThat(classifiedDoc.getEntities()).hasSize(1); // one page + assertThat(classifiedDoc.getEntities().get(1)).hasSize(5); // 4 out of 5 entities recognized on page 1 + } + } + + + @Test + public void testTrueNegativesInTable() throws IOException { + + ClassPathResource pdfFileResource = new ClassPathResource("files/Cyprodinil/40 Cyprodinil - EU AIR3 - LCA Section 1" + + " Supplement - Identity of the active substance - Reference list.pdf"); + when(dictionaryClient.getVersion()).thenReturn(DICTIONARY_VERSION.incrementAndGet()); + DictionaryResponse dictionaryResponse = DictionaryResponse.builder() + .entries(new ArrayList<>(ResourceLoader.load("dictionaries/names.txt"))) + .build(); + when(dictionaryClient.getDictionaryForType(NAME_CODE)).thenReturn(dictionaryResponse); + DictionaryResponse addressResponse = DictionaryResponse.builder() + .entries(new ArrayList<>(ResourceLoader.load("dictionaries/addresses.txt"))) + .build(); + when(dictionaryClient.getDictionaryForType(ADDRESS_CODE)).thenReturn(addressResponse); + try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { + Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); + entityRedactionService.processDocument(classifiedDoc, null); + assertThat(classifiedDoc.getEntities() + .entrySet() + .stream() + .noneMatch(entry -> entry.getValue().stream().anyMatch(e -> e.getMatchedRule() == 9))).isTrue(); + } + pdfFileResource = new ClassPathResource("files/Compounds/27 A8637C - EU AIR3 - MCP Section 1 - Identity of " + + "the plant protection product.pdf"); + try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { + Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); + entityRedactionService.processDocument(classifiedDoc, null); + assertThat(classifiedDoc.getEntities() + .entrySet() + .stream() + .noneMatch(entry -> entry.getValue().stream().anyMatch(e -> 
e.getMatchedRule() == 9))).isTrue(); + } + } + + @Test + public void testFalsePositiveInWrongCell() throws IOException { + + ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Row With Ambiguous Redaction.pdf"); + when(dictionaryClient.getVersion()).thenReturn(DICTIONARY_VERSION.incrementAndGet()); + DictionaryResponse dictionaryResponse = DictionaryResponse.builder() + .entries(new ArrayList<>(ResourceLoader.load("dictionaries/names.txt"))) + .build(); + when(dictionaryClient.getDictionaryForType(NAME_CODE)).thenReturn(dictionaryResponse); + DictionaryResponse addressResponse = DictionaryResponse.builder() + .entries(new ArrayList<>(ResourceLoader.load("dictionaries/addresses.txt"))) + .build(); + when(dictionaryClient.getDictionaryForType(ADDRESS_CODE)).thenReturn(addressResponse); + try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { + Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); + entityRedactionService.processDocument(classifiedDoc, null); + assertThat(classifiedDoc.getEntities()).hasSize(1); // one page + assertThat(classifiedDoc.getEntities().get(1).stream() + .filter(entity -> entity.getMatchedRule() == 9) + .count()).isEqualTo(10); + } + + } + + @Test + public void headerPropagation() throws IOException { + + ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Header Propagation.pdf"); + + DictionaryResponse dictionaryResponse = DictionaryResponse.builder() + .entries(Arrays.asList("Bissig R.", "Thanei P.")) + .build(); + + when(dictionaryClient.getVersion()).thenReturn(DICTIONARY_VERSION.incrementAndGet()); + when(dictionaryClient.getDictionaryForType(NAME_CODE)).thenReturn(dictionaryResponse); + DictionaryResponse addressResponse = DictionaryResponse.builder() + .entries(Collections.singletonList("Novartis Crop Protection AG, Basel, Switzerland")) + .build(); +
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE)).thenReturn(addressResponse); + + try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { + Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); + entityRedactionService.processDocument(classifiedDoc, null); + assertThat(classifiedDoc.getEntities()).hasSize(2); // two pages + assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(8); + assertThat(classifiedDoc.getEntities().get(2).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(4); + } + } + + + @Before + public void stubRedaction() { String tableRules = "package drools\n" + "\n" + "import com.iqser.red.service.redaction.v1.server.redaction.model.Section\n" + @@ -119,10 +228,7 @@ public class EntityRedactionServiceTest { "global Section section\n" + "rule \"9: Redact Authors and Addresses in Reference Table, if it is a Vertebrate study\"\n" + " when\n" + - " Section(tabularData != null && tabularData.size() > 0\n" + - " && tabularData.containsKey(\"Vertebrate study Y/N\")\n" + - " && tabularData.get(\"Vertebrate study Y/N\").equals(\"Y\")\n" + - " )\n" + + " Section(isVertebrateStudy())\n" + " then\n" + " section.redact(\"name\", 9, \"Redacted because row is a vertebrate study\");\n" + " section.redact(\"address\", 9, \"Redacted because rows is a vertebrate study\");\n" + @@ -135,22 +241,9 @@ public class EntityRedactionServiceTest { TypeResult.builder().type(NAME_CODE).color(new float[]{1, 1, 0}).build(), TypeResult.builder().type(ADDRESS_CODE).color(new float[]{0, 1, 1}).build())) .build(); + when(dictionaryClient.getVersion()).thenReturn(DICTIONARY_VERSION.incrementAndGet()); when(dictionaryClient.getAllTypes()).thenReturn(typeResponse); - DictionaryResponse dictionaryResponse = DictionaryResponse.builder() - .entries(Arrays.asList("Casey, H.W.", "O’Loughlin, C.K.", "Salamon, C.M.", "Smith, S.H.")) - .build(); - 
when(dictionaryClient.getDictionaryForType(NAME_CODE)).thenReturn(dictionaryResponse); - DictionaryResponse addressResponse = DictionaryResponse.builder() - .entries(Collections.singletonList("Toxigenics, Inc., Decatur, IL 62526, USA")) - .build(); - when(dictionaryClient.getDictionaryForType(ADDRESS_CODE)).thenReturn(addressResponse); when(dictionaryClient.getDefaultColor()).thenReturn(new DefaultColor()); - try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(redactionRequest.getDocument()))) { - Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); - entityRedactionService.processDocument(classifiedDoc, null); - assertThat(classifiedDoc.getEntities()).hasSize(1); // one page - assertThat(classifiedDoc.getEntities().get(1)).hasSize(5); // 4 out of 5 entities recognized on page 1 - } } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java new file mode 100644 index 00000000..537fa91b --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java @@ -0,0 +1,67 @@ +package com.iqser.red.service.redaction.v1.server.segmentation; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.util.List; +import java.util.stream.Collectors; + +import org.apache.pdfbox.pdmodel.PDDocument; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.kie.api.runtime.KieContainer; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.boot.test.mock.mockito.MockBean; +import org.springframework.core.io.ClassPathResource; +import 
org.springframework.test.context.junit4.SpringRunner; + +import com.iqser.red.service.redaction.v1.server.classification.model.Document; +import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService; +import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; +import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService; +import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService; + +@SpringBootTest +@RunWith(SpringRunner.class) +public class PdfSegmentationServiceTest { + + @Autowired + private PdfSegmentationService pdfSegmentationService; + + @Autowired + private RulingCleaningService rulingCleaningService; + + @Autowired + private TableExtractionService tableExtractionService; + + @Autowired + private BlockificationService blockificationService; + + @MockBean + private KieContainer kieContainer; + + + @Test + public void testPDFSegmentationWithComplexTable() throws IOException { + + ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Spanning Cells.pdf"); + + try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { + Document document = pdfSegmentationService.parseDocument(pdDocument); + assertThat(document.getParagraphs() + .stream() + .flatMap(paragraph -> paragraph.getTables().stream()) + .collect(Collectors.toList())).isNotEmpty(); + Table table = document.getParagraphs() + .stream() + .flatMap(paragraph -> paragraph.getTables().stream()) + .collect(Collectors.toList()) + .get(0); + assertThat(table.getColCount()).isEqualTo(6); + assertThat(table.getRowCount()).isEqualTo(13); + assertThat(table.getRows().stream().mapToInt(List::size).sum()).isEqualTo(6 * 13); + } + } + +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl index e89e8eb7..faaeda84 
100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl @@ -49,64 +49,69 @@ rule "5: Do not redact in guideline sections" section.redactNot("address", 5, "Section is a guideline section."); end -rule "6: Redact if must redact entry is found" - when - eval(section.contains("must_redact")==true); - then - section.redact("name", 6, "must_redact entry was found."); - section.redact("address", 6, "must_redact entry was found."); - end - - -rule "7: Redact contact information, if applicant is found" +rule "6: Redact contact information, if applicant is found" when eval(section.headlineContainsWord("applicant") || section.getText().contains("Applicant")); then - section.redactLineAfter("Name:", "address", 7, "Applicant information was found"); - section.redactBetween("Address:", "Contact", "address", 7, "Applicant information was found"); - section.redactLineAfter("Contact point:", "address", 7, "Applicant information was found"); - section.redactLineAfter("Phone:", "address", 7, "Applicant information was found"); - section.redactLineAfter("Fax:", "address", 7, "Applicant information was found"); - section.redactLineAfter("Tel.:", "address", 7, "Applicant information was found"); - section.redactLineAfter("Tel:", "address", 7, "Applicant information was found"); - section.redactLineAfter("E-mail:", "address", 7, "Applicant information was found"); - section.redactLineAfter("Email:", "address", 7, "Applicant information was found"); - section.redactLineAfter("Contact:", "address", 7, "Applicant information was found"); - section.redactLineAfter("Telephone number:", "address", 7, "Applicant information was found"); - section.redactLineAfter("Fax number:", "address", 7, "Applicant information was found"); - section.redactLineAfter("Telephone:", "address", 7, "Applicant information was found"); - section.redactBetween("No:", "Fax", "address", 
7, "Applicant information was found"); - section.redactBetween("Contact:", "Tel.:", "address", 7, "Applicant information was found"); + section.redactLineAfter("Name:", "address", 6, "Applicant information was found"); + section.redactBetween("Address:", "Contact", "address", 6, "Applicant information was found"); + section.redactLineAfter("Contact point:", "address", 6, "Applicant information was found"); + section.redactLineAfter("Phone:", "address", 6, "Applicant information was found"); + section.redactLineAfter("Fax:", "address", 6, "Applicant information was found"); + section.redactLineAfter("Tel.:", "address", 6, "Applicant information was found"); + section.redactLineAfter("Tel:", "address", 6, "Applicant information was found"); + section.redactLineAfter("E-mail:", "address", 6, "Applicant information was found"); + section.redactLineAfter("Email:", "address", 6, "Applicant information was found"); + section.redactLineAfter("Contact:", "address", 6, "Applicant information was found"); + section.redactLineAfter("Telephone number:", "address", 6, "Applicant information was found"); + section.redactLineAfter("Fax number:", "address", 6, "Applicant information was found"); + section.redactLineAfter("Telephone:", "address", 6, "Applicant information was found"); + section.redactBetween("No:", "Fax", "address", 6, "Applicant information was found"); + section.redactBetween("Contact:", "Tel.:", "address", 6, "Applicant information was found"); end -rule "8: Redact contact information, if Producer is found" +rule "7: Redact contact information, if Producer is found" when eval(section.getText().toLowerCase().contains("producer of the plant protection") || section.getText().toLowerCase().contains("producer of the active substance") || section.getText().contains("Manufacturer of the active substance") || section.getText().contains("Manufacturer:") || section.getText().contains("Producer or producers of the active substance")); then - section.redactLineAfter("Name:", 
"address", 8, "Producer was found"); - section.redactBetween("Address:", "Contact", "address", 8, "Producer was found"); - section.redactBetween("Contact:", "Phone", "address", 8, "Producer was found"); - section.redactBetween("Contact:", "Telephone number:", "address", 8, "Producer was found"); - section.redactBetween("Address:", "Manufacturing", "address", 8, "Producer was found"); - section.redactLineAfter("Telephone:", "address", 8, "Producer was found"); - section.redactLineAfter("Phone:", "address", 8, "Producer was found"); - section.redactLineAfter("Fax:", "address", 8, "Producer was found"); - section.redactLineAfter("E-mail:", "address", 8, "Producer was found"); - section.redactLineAfter("Contact:", "address", 8, "Producer was found"); - section.redactLineAfter("Fax number:", "address", 8, "Producer was found"); - section.redactLineAfter("Telephone number:", "address", 8, "Producer was found"); - section.redactLineAfter("Tel:", "address", 8, "Producer was found"); - section.redactBetween("No:", "Fax", "address", 8, "Producer was found"); + section.redactLineAfter("Name:", "address", 7, "Producer was found"); + section.redactBetween("Address:", "Contact", "address", 7, "Producer was found"); + section.redactBetween("Contact:", "Phone", "address", 7, "Producer was found"); + section.redactBetween("Contact:", "Telephone number:", "address", 7, "Producer was found"); + section.redactBetween("Address:", "Manufacturing", "address", 7, "Producer was found"); + section.redactLineAfter("Telephone:", "address", 7, "Producer was found"); + section.redactLineAfter("Phone:", "address", 7, "Producer was found"); + section.redactLineAfter("Fax:", "address", 7, "Producer was found"); + section.redactLineAfter("E-mail:", "address", 7, "Producer was found"); + section.redactLineAfter("Contact:", "address", 7, "Producer was found"); + section.redactLineAfter("Fax number:", "address", 7, "Producer was found"); + section.redactLineAfter("Telephone number:", "address", 7, 
"Producer was found"); + section.redactLineAfter("Tel:", "address", 7, "Producer was found"); + section.redactBetween("No:", "Fax", "address", 7, "Producer was found"); end -rule "9: Redact Authors and Addresses in Reference Table, if it is a Vertebrate study" +rule "8: Redact Authors and Addresses in Reference Table, if it is a Vertebrate study" when - Section(tabularData != null - && tabularData.containsKey("Vertebrate study Y/N") - && tabularData.get("Vertebrate study Y/N").equals("Y") - ) + Section(isVertebrateStudy()) then - section.redact("name", 9, "Redacted because row is a vertebrate study"); - section.redact("address", 9, "Redacted because rows is a vertebrate study"); + section.redact("name", 8, "Redacted because row is a vertebrate study"); + section.redact("address", 8, "Redacted because row is a vertebrate study"); section.highlightCell("Vertebrate study Y/N", 9); - end \ No newline at end of file + end + +rule "9: Not redacted because Vertebrate Study = N" + when + Section(isNotVertebrateStudy()) + then + section.redactNot("name", 9, "Not redacted because row is not a vertebrate study"); + section.redactNot("address", 9, "Not redacted because row is not a vertebrate study"); + end + + +rule "10: Redact if must redact entry is found" + when + eval(section.contains("must_redact")==true); + then + section.redact("name", 10, "must_redact entry was found."); + section.redact("address", 10, "must_redact entry was found."); + end \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Complex Table.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Complex Table.pdf new file mode 100644 index 00000000..c482af2e Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Complex Table.pdf differ diff --git 
a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Header Propagation.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Header Propagation.pdf new file mode 100644 index 00000000..357009a8 Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Header Propagation.pdf differ diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Row With Ambiguous Redaction.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Row With Ambiguous Redaction.pdf new file mode 100644 index 00000000..4943f1a0 Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Row With Ambiguous Redaction.pdf differ diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Spanning Cells.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Spanning Cells.pdf new file mode 100644 index 00000000..db5abbcd Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Spanning Cells.pdf differ