diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java
index 1cacb00e..a250b4a6 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java
@@ -51,24 +51,26 @@ public class EntityRedactionService {
List
tables = paragraph.getTables();
for (Table table : tables) {
- List metadata = table.getHeaders();
for (List row : table.getRows()) {
SearchableText searchableRow = new SearchableText();
- List cellValues = new ArrayList<>();
+ Map tabularData = new HashMap<>();
for (Cell cell : row) {
- if (cell == null || CollectionUtils.isEmpty(cell.getTextBlocks())) {
- cellValues.add(null);
+ if (cell.isHeaderCell() || CollectionUtils.isEmpty(cell.getTextBlocks())) {
continue;
}
- cellValues.add(cell.getTextBlocks().get(0).getText());
addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber);
+ cell.getHeaderCells().forEach(headerCell -> {
+ String headerName = headerCell.getTextBlocks().get(0).getText()
+ .replaceAll("\n", " ")
+ .replaceAll(" ", " ");
+ tabularData.put(headerName, cell.getTextBlocks().get(0).getText());
+ });
for (TextBlock textBlock : cell.getTextBlocks()) {
searchableRow.addAll(textBlock.getSequences());
}
}
Set rowEntities = findEntities(searchableRow, table.getHeadline(), sectionNumber);
- Map tabularData = toMap(metadata, cellValues);
Section analysedRowSection = droolsExecutionService.executeRules(Section.builder()
.entities(rowEntities)
.text(searchableRow.getAsStringWithLinebreaks())
@@ -116,26 +118,6 @@ public class EntityRedactionService {
}
- private Map toMap(List keys, List values) {
-
- if (keys.size() != values.size()) {
- log.warn("Cannot merge lists of unequal size, returning empty map.");
- return new HashMap<>();
- }
- Map result = new HashMap<>();
- for (int i = 0; i < keys.size(); i++) {
- String value = values.get(i);
- if (value == null) {
- continue;
- }
- result.put(keys.get(i), value);
- }
-
- return result;
-
- }
-
-
private Set clearAndFindPositions(Set entities, SearchableText text) {
removeEntitiesContainedInLarger(entities);
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java
index 8d005ccd..a5893161 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java
@@ -1,11 +1,11 @@
package com.iqser.red.service.redaction.v1.server.segmentation;
import java.util.ArrayList;
+import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.collections4.CollectionUtils;
-import org.apache.commons.lang3.StringUtils;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
@@ -13,6 +13,7 @@ import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
+import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
@Service
@@ -85,10 +86,20 @@ public class SectionsBuilderService {
table.setHeadline("Table in: " + lastHeadline);
}
// Distribute header information for subsequent tables
- if (previousTable != null && hasInvalidHeaderInformation(table) && hasValidHeaderInformation(previousTable) &&
- (previousTable.isVerticalHeader() && previousTable.getRowCount() == table.getRowCount() ||
- previousTable.getColCount() == table.getColCount())) {
- table.setHeaders(previousTable.getHeaders());
+ if (previousTable != null && hasInvalidHeaderInformation(table) && hasValidHeaderInformation(previousTable)) {
+ List previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable);
+ List tableNonHeaderRow = getRowWithNonHeaderCells(table);
+ if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) {
+ for (int i = table.getRows().size() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
+ List row = table.getRows().get(i);
+ if (row.size() == previousTableNonHeaderRow.size()
+ && row.stream().allMatch(cell -> cell.getHeaderCells().isEmpty())) {
+ for (int j = 0; j < row.size(); j++) {
+ row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells());
+ }
+ }
+ }
+ }
}
if (textBlock != null && !alreadyAdded) {
@@ -141,14 +152,32 @@ public class SectionsBuilderService {
private boolean hasInvalidHeaderInformation(Table table) {
- if (CollectionUtils.isEmpty(table.getHeaders())) {
- return true;
- }
- if (table.getHeaders().stream().anyMatch(StringUtils::isEmpty)) {
- return true;
+ return table.getRows().stream() // "invalid" here means: not a single cell in any row has a header cell assigned
+ .flatMap(row -> row.stream()
+ .filter(cell -> CollectionUtils.isNotEmpty(cell.getHeaderCells()))) // keep only cells that already know their header(s)
+ .findAny()
+ .isEmpty(); // true when the filtered stream is empty, i.e. no cell has header information
+
+ }
+
+
+ private List getRowWithNonHeaderCells(Table table) {
+ // Returns the bottom-most row in which no cell is flagged as header cell (a row without cells also qualifies), or an empty list if every row contains a header cell.
+ for (int i = table.getRows().size() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
+ List row = table.getRows().get(i);
+ boolean allNonHeader = true;
+ for (Cell cell : row) {
+ if (cell.isHeaderCell()) {
+ allNonHeader = false; // one header cell disqualifies the whole row
+ break;
+ }
+ }
+ if (allNonHeader) {
+ return row;
+ }
}
- return false;
+ return Collections.emptyList(); // every row contains at least one header cell
}
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Cell.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Cell.java
index 9342533b..6b884f69 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Cell.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Cell.java
@@ -16,10 +16,14 @@ public class Cell extends Rectangle {
private List textBlocks = new ArrayList<>();
+ private List headerCells = new ArrayList<>();
+
+ private boolean isHeaderCell;
public Cell(Point2D topLeft, Point2D bottomRight) {
- super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()), (float) (bottomRight
+ super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()),
+ (float) (bottomRight
.getY() - topLeft.getY()));
}
@@ -29,4 +33,4 @@ public class Cell extends Rectangle {
textBlocks.add(textBlock);
}
-}
+}
\ No newline at end of file
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/CellPosition.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/CellPosition.java
new file mode 100644
index 00000000..70a9800c
--- /dev/null
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/CellPosition.java
@@ -0,0 +1,22 @@
+package com.iqser.red.service.redaction.v1.server.tableextraction.model;
+
+import lombok.RequiredArgsConstructor;
+import lombok.Value;
+
+@Value
+@RequiredArgsConstructor
+public class CellPosition implements Comparable<CellPosition> {
+ // Immutable (row, col) coordinate of a cell in a table grid (Lombok @Value); the natural order is row-major: by row first, then by column.
+ int row; // row index of the cell
+
+ int col; // column index of the cell
+
+ // Returns a negative, zero or positive value per the Comparable contract; Integer.compare is used because plain int subtraction can overflow and flip the sign.
+ @Override
+ public int compareTo(CellPosition other) {
+
+ int rowDiff = Integer.compare(row, other.row);
+ return rowDiff != 0 ? rowDiff : Integer.compare(col, other.col);
+ }
+
+}
\ No newline at end of file
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java
index 14d2f7d2..4e5628de 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java
@@ -8,12 +8,10 @@ import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
-import java.util.stream.Collectors;
import org.apache.commons.collections4.CollectionUtils;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
-import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
import lombok.Getter;
@@ -41,12 +39,6 @@ public class Table extends AbstractTextContainer {
private List> rows;
- @Getter
- @Setter
- private List headers;
-
- @Getter
- private boolean verticalHeader;
public Table(List cells, Rectangle area, int rotation) {
@@ -65,7 +57,7 @@ public class Table extends AbstractTextContainer {
if (rows == null) {
rows = computeRows();
- headers = computeHeaders();
+ computeHeaders();
}
return rows;
@@ -78,72 +70,105 @@ public class Table extends AbstractTextContainer {
* Column is marked as header if cell text is bold and row cell text is not bold.
* Defaults to row.
*/
- private List computeHeaders() {
+ private void computeHeaders() {
- boolean allBold = true;
- if (rows.isEmpty()) {
- return Collections.emptyList();
- }
- List rowCells = rows.get(0);
- for (Cell cell : rowCells) {
- if (cell == null || CollectionUtils.isEmpty(cell.getTextBlocks()) ||
- !cell.getTextBlocks().get(0).getMostPopularWordStyle().equals("bold")) {
- allBold = false;
- break;
- }
- }
- if (!allBold) {
- allBold = true;
- List firstColCells = new ArrayList<>();
- for (List row : rows) {
- Cell firstInRow = row.get(0);
- if (firstInRow == null || CollectionUtils.isEmpty(firstInRow.getTextBlocks()) ||
- !firstInRow.getTextBlocks().get(0).getMostPopularWordStyle().equals("bold")) {
- allBold = false;
+ // For every cell: attach one header from its row and one from its column (the last cell of the unbroken bold run that starts at the left/top edge), then flag the cell itself as header if its first text block is mostly bold.
+ cells.forEach((position, cell) -> {
+ List cellsToTheLeft = getCellsToTheLeft(position);
+ Cell lastHeaderCell = null;
+ for (Cell leftCell : cellsToTheLeft) {
+ if (CollectionUtils.isNotEmpty(leftCell.getTextBlocks()) && leftCell.getTextBlocks()
+ .get(0)
+ .getMostPopularWordStyle()
+ .equals("bold")) {
+ lastHeaderCell = leftCell; // list is ordered left to right, so this ends on the bold cell nearest to the current cell within the run
+ } else {
break;
}
- firstColCells.add(firstInRow);
}
- if (allBold) {
- log.info("Headers are in first column");
- verticalHeader = true;
- return firstColCells.stream().map(cell -> {
- if (CollectionUtils.isNotEmpty(cell.getTextBlocks())) {
- return TextNormalizationUtilities.removeHyphenLineBreaks(cell.getTextBlocks().get(0).getText())
- .replaceAll("\n", " ")
- .replaceAll(" ", " ");
- } else {
- return null;
- }
- }).collect(Collectors.toList());
- } else {
- log.info("Headers are defaulted in first row.");
- return rowCells.stream().map(cell -> {
- if (cell != null && CollectionUtils.isNotEmpty(cell.getTextBlocks())) {
- return TextNormalizationUtilities.removeHyphenLineBreaks(cell.getTextBlocks().get(0).getText())
- .replaceAll("\n", " ")
- .replaceAll(" ", " ");
- } else {
- return null;
- }
- }).collect(Collectors.toList());
+ if (lastHeaderCell != null) {
+ cell.getHeaderCells().add(lastHeaderCell);
}
- } else {
- log.info("Headers are in first row.");
- return rowCells.stream().map(cell -> {
- if (CollectionUtils.isNotEmpty(cell.getTextBlocks())) {
- return TextNormalizationUtilities.removeHyphenLineBreaks(cell.getTextBlocks().get(0).getText())
- .replaceAll("\n", " ")
- .replaceAll(" ", " ");
+ lastHeaderCell = null; // reset before repeating the same scan for the cells above
+ List cellsToTheTop = getCellToTheTop(position);
+ for (Cell topCell : cellsToTheTop) {
+ if (CollectionUtils.isNotEmpty(topCell.getTextBlocks()) && topCell.getTextBlocks()
+ .get(0)
+ .getMostPopularWordStyle()
+ .equals("bold")) {
+ lastHeaderCell = topCell;
} else {
- return null;
+ break;
}
- }).collect(Collectors.toList());
- }
+ }
+ if (lastHeaderCell != null) {
+ cell.getHeaderCells().add(lastHeaderCell);
+ }
+ if (CollectionUtils.isNotEmpty(cell.getTextBlocks()) && cell.getTextBlocks()
+ .get(0)
+ .getMostPopularWordStyle()
+ .equals("bold")) {
+ cell.setHeaderCell(true); // NOTE(review): any bold cell is flagged, regardless of whether its left/top neighbours are bold - confirm this is intended
+ }
+ });
}
+ private List getCellsToTheLeft(CellPosition cellPosition) {
+ // Returns, ordered left to right, one cell per column left of cellPosition: the cell in the same row or, if that slot is empty, the nearest cell above it in that column (presumably a cell spanning several rows); columns with neither are skipped.
+ List result = new ArrayList<>();
+ if (cellPosition.getCol() == 0) {
+ return result;
+ }
+ int row = cellPosition.getRow();
+ for (int i = cellPosition.getCol() - 1; i >= 0; i--) {
+ if (cells.get(new CellPosition(row, i)) != null) {
+ result.add(cells.get(new CellPosition(row, i)));
+ } else {
+ Cell spanningCell = null;
+ while (spanningCell == null && row > 0) { // stop at row 0 so that no lookup is made for the non-existent row -1
+ row--;
+ spanningCell = cells.get(new CellPosition(row, i));
+ }
+ if (spanningCell != null) {
+ result.add(spanningCell);
+ }
+ row = cellPosition.getRow(); // restore the row for the next column
+ }
+ }
+ Collections.reverse(result); // collected right to left; callers expect left to right
+ return result;
+ }
+
+
+ private List getCellToTheTop(CellPosition cellPosition) {
+ // Returns, ordered top to bottom, one cell per row above cellPosition: the cell in the same column or, if that slot is empty, the nearest cell to its left in that row (presumably a cell spanning several columns); rows with neither are skipped.
+ List result = new ArrayList<>();
+ if (cellPosition.getRow() == 0) {
+ return result;
+ }
+ int col = cellPosition.getCol();
+ for (int i = cellPosition.getRow() - 1; i >= 0; i--) {
+ if (cells.get(new CellPosition(i, col)) != null) {
+ result.add(cells.get(new CellPosition(i, col)));
+ } else {
+ Cell spanningCell = null;
+ while (spanningCell == null && col > 0) { // stop at column 0 so that no lookup is made for the non-existent column -1
+ col--;
+ spanningCell = cells.get(new CellPosition(i, col));
+ }
+ if (spanningCell != null) {
+ result.add(spanningCell);
+ }
+ col = cellPosition.getCol(); // restore the column for the next row
+ }
+ }
+ Collections.reverse(result); // collected bottom to top; callers expect top to bottom
+ return result;
+ }
+
+
private List> computeRows() {
List> rows = new ArrayList<>();
@@ -152,7 +177,9 @@ public class Table extends AbstractTextContainer {
List lastRow = new ArrayList<>();
for (int j = rowCount - 1; j >= 0; j--) { // cols
Cell cell = cells.get(new CellPosition(j, i));
- lastRow.add(cell);
+ if (cell != null) {
+ lastRow.add(cell);
+ }
}
rows.add(lastRow);
}
@@ -161,7 +188,9 @@ public class Table extends AbstractTextContainer {
List lastRow = new ArrayList<>();
for (int j = 0; j < rowCount; j++) { // cols
Cell cell = cells.get(new CellPosition(i, j));
- lastRow.add(cell);
+ if (cell != null) {
+ lastRow.add(cell);
+ }
}
rows.add(lastRow);
}
@@ -170,7 +199,9 @@ public class Table extends AbstractTextContainer {
List lastRow = new ArrayList<>();
for (int j = 0; j < colCount; j++) {
Cell cell = cells.get(new CellPosition(i, j)); // JAVA_8 use getOrDefault()
- lastRow.add(cell);
+ if (cell != null) {
+ lastRow.add(cell);
+ }
}
rows.add(lastRow);
}
@@ -216,53 +247,62 @@ public class Table extends AbstractTextContainer {
List row = rowsOfCells.get(i);
Iterator rowCells = row.iterator();
int startColumn = 0;
- int jumpToColumn = 0;
+// int jumpToColumn = 0;
while (rowCells.hasNext()) {
Cell cell = rowCells.next();
if (i > 0) {
- List> others = rowsOfCells(si.contains(new Rectangle(cell.getBottom(), si.getBounds()
- .getLeft(), cell.getLeft() - si.getBounds().getLeft() + 1, si.getBounds().getBottom() - cell
- .getBottom())));
+// Rectangle rectangle = new Rectangle(cell.getBottom(),
+// si.getBounds().getLeft(),
+// cell.getLeft() - si.getBounds().getLeft() + 1,
+// si.getBounds().getBottom() - cell.getBottom());
+// List> others = rowsOfCells(si.contains(rectangle));
+//
+// for (List r : others) {
+// jumpToColumn = Math.max(jumpToColumn, r.size());
+// }
+//
+// while (startColumn != jumpToColumn) {
+// add(previousNonNullCellForColumnIndex.get(startColumn), i, startColumn);
+// startColumn++;
+// }
- for (List r : others) {
- jumpToColumn = Math.max(jumpToColumn, r.size());
+ // Handle cells spanning several rows
+ while (previousNonNullCellForColumnIndex.get(startColumn) != null) {
+ Cell previouslyAddedCellForSameColumn = previousNonNullCellForColumnIndex.get(startColumn);
+ float previousRight = previouslyAddedCellForSameColumn.getRight();
+ float thisLeft = cell.getLeft();
+ if (previousRight > thisLeft) {
+ break;
+ }
+ startColumn++;
}
}
-
- while (startColumn != jumpToColumn) {
- add(previousNonNullCellForColumnIndex.get(startColumn), i, startColumn);
- startColumn++;
- }
-
add(cell, i, startColumn);
previousNonNullCellForColumnIndex.put(startColumn, cell);
startColumn++;
- jumpToColumn = startColumn;
+// jumpToColumn = startColumn;
}
}
}
- private static List> rowsOfCells(List cells) {
+ private List> rowsOfCells(List cells) {
- Cell c;
- float lastTop;
List> rv = new ArrayList<>();
- List lastRow;
if (cells.isEmpty()) {
return rv;
}
-
cells.sort(Comparator.comparingDouble(Rectangle::getLeft));
- cells.sort(Collections.reverseOrder((arg0, arg1) -> Float.compare(Utils.round(arg0.getBottom(), 2), Utils.round(arg1
+ cells.sort(Collections.reverseOrder((arg0, arg1) -> Float.compare(Utils.round(arg0.getBottom(), 2),
+ Utils.round(arg1
.getBottom(), 2))));
Iterator iter = cells.iterator();
- c = iter.next();
- lastTop = c.getBottom();
- lastRow = new ArrayList<>();
+ Cell c = iter.next();
+ float lastTop = c.getBottom();
+ List lastRow = new ArrayList<>();
lastRow.add(c);
rv.add(lastRow);
@@ -349,51 +389,4 @@ public class Table extends AbstractTextContainer {
return sb.toString();
}
-
- static class CellPosition implements Comparable {
-
- CellPosition(int row, int col) {
-
- this.row = row;
- this.col = col;
- }
-
-
- final int row;
- final int col;
-
-
- @Override
- public int hashCode() {
-
- return row + 101 * col;
- }
-
-
- @Override
- public boolean equals(Object obj) {
-
- if (this == obj) {
- return true;
- }
- if (obj == null) {
- return false;
- }
- if (getClass() != obj.getClass()) {
- return false;
- }
- CellPosition other = (CellPosition) obj;
- return row == other.row && col == other.col;
- }
-
-
- @Override
- public int compareTo(CellPosition other) {
-
- int rowDiff = row - other.row;
- return rowDiff != 0 ? rowDiff : col - other.col;
- }
-
- }
-
}
\ No newline at end of file
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/service/TableExtractionService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/service/TableExtractionService.java
index cb8ae9ce..69c2fe69 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/service/TableExtractionService.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/service/TableExtractionService.java
@@ -2,7 +2,6 @@ package com.iqser.red.service.redaction.v1.server.tableextraction.service;
import java.awt.geom.Point2D;
import java.util.ArrayList;
-import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
@@ -25,26 +24,28 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
@Service
-@SuppressWarnings("all")
public class TableExtractionService {
- public void extractTables(CleanRulings cleanRulings, Page page){
+ public void extractTables(CleanRulings cleanRulings, Page page) {
List cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
- Iterator itty = page.getTextBlocks().iterator();
- while (itty.hasNext()) {
- TextBlock textBlock = (TextBlock) itty.next();
+ for (AbstractTextContainer abstractTextContainer : page.getTextBlocks()) {
+ TextBlock textBlock = (TextBlock) abstractTextContainer;
for (Cell cell : cells) {
- if (cell.intersects(textBlock.getMinX(), textBlock.getMinY(), textBlock.getWidth(), textBlock.getHeight())) {
+ if (cell.intersects(textBlock.getMinX(), textBlock.getMinY(), textBlock.getWidth(),
+ textBlock.getHeight())) {
cell.addTextBlock(textBlock);
break;
}
}
}
- List spreadsheetAreas = findSpreadsheetsFromCells(cells)
- .stream()
+ cells = new ArrayList<>(new HashSet<>(cells));
+ Utils.sort(cells, Rectangle.ILL_DEFINED_ORDER);
+
+
+ List spreadsheetAreas = findSpreadsheetsFromCells(cells).stream()
.filter(r -> r.getWidth() > 0f && r.getHeight() > 0f)
.collect(Collectors.toList());
@@ -63,9 +64,9 @@ public class TableExtractionService {
for (Table table : tables) {
int position = -1;
- itty = page.getTextBlocks().iterator();
+ Iterator itty = page.getTextBlocks().iterator();
while (itty.hasNext()) {
- AbstractTextContainer textBlock = (AbstractTextContainer) itty.next();
+ AbstractTextContainer textBlock = itty.next();
if (table.contains(textBlock)) {
if (position == -1) {
position = page.getTextBlocks().indexOf(textBlock);
@@ -79,17 +80,18 @@ public class TableExtractionService {
}
}
+
public List findCells(List horizontalRulingLines, List verticalRulingLines) {
+
List| cellsFound = new ArrayList<>();
- Map intersectionPoints = Ruling.findIntersections(horizontalRulingLines, verticalRulingLines);
+ Map intersectionPoints = Ruling.findIntersections(horizontalRulingLines,
+ verticalRulingLines);
List intersectionPointsList = new ArrayList<>(intersectionPoints.keySet());
- Collections.sort(intersectionPointsList, POINT_COMPARATOR);
- boolean doBreak;
+ intersectionPointsList.sort(POINT_COMPARATOR);
for (int i = 0; i < intersectionPointsList.size(); i++) {
Point2D topLeft = intersectionPointsList.get(i);
Ruling[] hv = intersectionPoints.get(topLeft);
- doBreak = false;
// CrossingPointsDirectlyBelow( topLeft );
List xPoints = new ArrayList<>();
@@ -106,10 +108,6 @@ public class TableExtractionService {
}
outer:
for (Point2D xPoint : xPoints) {
- if (doBreak) {
- break;
- }
-
// is there a vertical edge b/w topLeft and xPoint?
if (!hv[1].equals(intersectionPoints.get(xPoint)[1])) {
continue;
@@ -120,11 +118,9 @@ public class TableExtractionService {
continue;
}
Point2D btmRight = new Point2D.Float((float) yPoint.getX(), (float) xPoint.getY());
- if (intersectionPoints.containsKey(btmRight)
- && intersectionPoints.get(btmRight)[0].equals(intersectionPoints.get(xPoint)[0])
- && intersectionPoints.get(btmRight)[1].equals(intersectionPoints.get(yPoint)[1])) {
+ if (intersectionPoints.containsKey(btmRight) && intersectionPoints.get(btmRight)[0].equals(intersectionPoints
+ .get(xPoint)[0]) && intersectionPoints.get(btmRight)[1].equals(intersectionPoints.get(yPoint)[1])) {
cellsFound.add(new Cell(topLeft, btmRight));
- doBreak = true;
break outer;
}
}
@@ -139,7 +135,7 @@ public class TableExtractionService {
}
- public List findSpreadsheetsFromCells(List extends Rectangle> cells) {
+ private List findSpreadsheetsFromCells(List extends Rectangle> cells) {
// via: http://stackoverflow.com/questions/13746284/merging-multiple-adjacent-rectangles-into-one-polygon
List rectangles = new ArrayList<>();
Set pointSet = new HashSet<>();
@@ -147,10 +143,6 @@ public class TableExtractionService {
Map edgesV = new HashMap<>();
int i = 0;
- cells = new ArrayList<>(new HashSet<>(cells));
-
- Utils.sort(cells, Rectangle.ILL_DEFINED_ORDER);
-
for (Rectangle cell : cells) {
for (Point2D pt : cell.getPoints()) {
if (pointSet.contains(pt)) { // shared vertex, remove it
@@ -163,10 +155,10 @@ public class TableExtractionService {
// X first sort
List pointsSortX = new ArrayList<>(pointSet);
- Collections.sort(pointsSortX, X_FIRST_POINT_COMPARATOR);
+ pointsSortX.sort(X_FIRST_POINT_COMPARATOR);
// Y first sort
List pointsSortY = new ArrayList<>(pointSet);
- Collections.sort(pointsSortY, POINT_COMPARATOR);
+ pointsSortY.sort(POINT_COMPARATOR);
while (i < pointSet.size()) {
float currY = (float) pointsSortY.get(i).getY();
@@ -203,13 +195,12 @@ public class TableExtractionService {
nextVertex = edgesV.get(curr.point);
edgesV.remove(curr.point);
lastAddedVertex = new PolygonVertex(nextVertex, Direction.VERTICAL);
- polygon.add(lastAddedVertex);
} else {
nextVertex = edgesH.get(curr.point);
edgesH.remove(curr.point);
lastAddedVertex = new PolygonVertex(nextVertex, Direction.HORIZONTAL);
- polygon.add(lastAddedVertex);
}
+ polygon.add(lastAddedVertex);
if (lastAddedVertex.equals(polygon.get(0))) {
// closed polygon
@@ -227,10 +218,10 @@ public class TableExtractionService {
// calculate grid-aligned minimum area rectangles for each found polygon
for (List poly : polygons) {
- float top = java.lang.Float.MAX_VALUE;
- float left = java.lang.Float.MAX_VALUE;
- float bottom = java.lang.Float.MIN_VALUE;
- float right = java.lang.Float.MIN_VALUE;
+ float top = Float.MAX_VALUE;
+ float left = Float.MAX_VALUE;
+ float bottom = Float.MIN_VALUE;
+ float right = Float.MIN_VALUE;
for (PolygonVertex pt : poly) {
top = (float) Math.min(top, pt.point.getY());
left = (float) Math.min(left, pt.point.getX());
@@ -244,69 +235,66 @@ public class TableExtractionService {
}
- private static final Comparator X_FIRST_POINT_COMPARATOR = new Comparator() {
- @Override
- public int compare(Point2D arg0, Point2D arg1) {
- int rv = 0;
- float arg0X = Utils.round(arg0.getX(), 2);
- float arg0Y = Utils.round(arg0.getY(), 2);
- float arg1X = Utils.round(arg1.getX(), 2);
- float arg1Y = Utils.round(arg1.getY(), 2);
+ private static final Comparator X_FIRST_POINT_COMPARATOR = (arg0, arg1) -> {
- if (arg0X > arg1X) {
- rv = 1;
- } else if (arg0X < arg1X) {
- rv = -1;
- } else if (arg0Y > arg1Y) {
- rv = 1;
- } else if (arg0Y < arg1Y) {
- rv = -1;
- }
- return rv;
+ int rv = 0; // stays 0 when both rounded coordinates are equal
+ float arg0X = Utils.round(arg0.getX(), 2); // coordinates are compared after Utils.round(..., 2) - presumably two decimal places, so near-identical points tie
+ float arg0Y = Utils.round(arg0.getY(), 2);
+ float arg1X = Utils.round(arg1.getX(), 2);
+ float arg1Y = Utils.round(arg1.getY(), 2);
+ // Primary key: x; y only breaks ties.
+ if (arg0X > arg1X) {
+ rv = 1;
+ } else if (arg0X < arg1X) {
+ rv = -1;
+ } else if (arg0Y > arg1Y) {
+ rv = 1;
+ } else if (arg0Y < arg1Y) {
+ rv = -1;
}
+ return rv;
};
+ private static final Comparator POINT_COMPARATOR = (arg0, arg1) -> {
- private static final Comparator POINT_COMPARATOR = new Comparator() {
- @Override
- public int compare(Point2D arg0, Point2D arg1) {
- int rv = 0;
- float arg0X = Utils.round(arg0.getX(), 2);
- float arg0Y = Utils.round(arg0.getY(), 2);
- float arg1X = Utils.round(arg1.getX(), 2);
- float arg1Y = Utils.round(arg1.getY(), 2);
+ int rv = 0; // stays 0 when both rounded coordinates are equal
+ float arg0X = Utils.round(arg0.getX(), 2); // coordinates are compared after Utils.round(..., 2) - presumably two decimal places, so near-identical points tie
+ float arg0Y = Utils.round(arg0.getY(), 2);
+ float arg1X = Utils.round(arg1.getX(), 2);
+ float arg1Y = Utils.round(arg1.getY(), 2);
-
- if (arg0Y > arg1Y) {
- rv = 1;
- } else if (arg0Y < arg1Y) {
- rv = -1;
- } else if (arg0X > arg1X) {
- rv = 1;
- } else if (arg0X < arg1X) {
- rv = -1;
- }
- return rv;
+ if (arg0Y > arg1Y) { // primary key: y; x only breaks ties
+ rv = 1;
+ } else if (arg0Y < arg1Y) {
+ rv = -1;
+ } else if (arg0X > arg1X) {
+ rv = 1;
+ } else if (arg0X < arg1X) {
+ rv = -1;
}
+ return rv;
};
-
private enum Direction {
- HORIZONTAL,
- VERTICAL
+ HORIZONTAL, VERTICAL // tag stored on each PolygonVertex: VERTICAL when the vertex was taken from edgesV, HORIZONTAL when taken from edgesH
}
static class PolygonVertex {
+
Point2D point;
Direction direction;
- public PolygonVertex(Point2D point, Direction direction) {
+
+ PolygonVertex(Point2D point, Direction direction) {
+
this.direction = direction;
this.point = point;
}
+
@Override
public boolean equals(Object other) {
+
if (this == other) {
return true;
}
@@ -316,15 +304,21 @@ public class TableExtractionService {
return this.point.equals(((PolygonVertex) other).point);
}
+
@Override
public int hashCode() {
+
return this.point.hashCode();
}
+
@Override
public String toString() {
- return String.format("%s[point=%s,direction=%s]", this.getClass().getName(), this.point.toString(), this.direction.toString());
+
+ return String.format("%s[point=%s,direction=%s]", this.getClass()
+ .getName(), this.point.toString(), this.direction.toString());
}
+
}
}
diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java
index ed2999c6..4f046a38 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java
@@ -10,6 +10,7 @@ import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
@@ -17,6 +18,7 @@ import java.util.Set;
import org.apache.commons.io.IOUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
+import org.junit.Before;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.kie.api.KieServices;
@@ -112,6 +114,75 @@ public class EntityRedactionServiceTest {
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
.build();
+ DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
+ .entries(Arrays.asList("Casey, H.W.", "O’Loughlin, C.K.", "Salamon, C.M.", "Smith, S.H."))
+ .build();
+ when(dictionaryClient.getDictionaryForType(NAME_CODE)).thenReturn(dictionaryResponse);
+ DictionaryResponse addressResponse = DictionaryResponse.builder()
+ .entries(Collections.singletonList("Toxigenics, Inc., Decatur, IL 62526, USA"))
+ .build();
+ when(dictionaryClient.getDictionaryForType(ADDRESS_CODE)).thenReturn(addressResponse);
+
+ try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(redactionRequest.getDocument()))) {
+ Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
+ entityRedactionService.processDocument(classifiedDoc, null);
+ assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
+ assertThat(classifiedDoc.getEntities().get(1)).hasSize(5); // 4 out of 5 entities recognized on page 1
+ }
+ }
+
+
+ @Test
+ public void complexTable() throws IOException {
+ // Smoke test: parses and processes "Complex Table.pdf" with empty name/address dictionaries; there are no assertions, so it only fails if an exception is thrown.
+ ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Complex Table.pdf");
+
+ RedactionRequest redactionRequest = RedactionRequest.builder()
+ .document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
+ .build();
+
+ when(dictionaryClient.getDictionaryForType(ADDRESS_CODE))
+ .thenReturn(DictionaryResponse.builder().entries(new ArrayList<>()).build());
+ when(dictionaryClient.getDictionaryForType(NAME_CODE))
+ .thenReturn(DictionaryResponse.builder().entries(new ArrayList<>()).build());
+
+ try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(redactionRequest.getDocument()))) {
+ Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
+ entityRedactionService.processDocument(classifiedDoc, null);
+ }
+ }
+
+
+ @Test
+ public void headerPropagation() throws IOException {
+ // Processes "Header Propagation.pdf" and counts, per page, the entities matched by rule 9; presumably page 2 only matches if table headers are carried over from the previous table (per test name) - confirm against the PDF.
+ ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Header Propagation.pdf");
+
+ RedactionRequest redactionRequest = RedactionRequest.builder()
+ .document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
+ .build();
+
+ DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
+ .entries(Arrays.asList("Bissig R.", "Thanei P."))
+ .build();
+ when(dictionaryClient.getDictionaryForType(NAME_CODE)).thenReturn(dictionaryResponse);
+ DictionaryResponse addressResponse = DictionaryResponse.builder()
+ .entries(Collections.singletonList("Novartis Crop Protection AG, Basel, Switzerland"))
+ .build();
+ when(dictionaryClient.getDictionaryForType(ADDRESS_CODE)).thenReturn(addressResponse);
+
+ try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(redactionRequest.getDocument()))) {
+ Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
+ entityRedactionService.processDocument(classifiedDoc, null);
+ assertThat(classifiedDoc.getEntities()).hasSize(2); // two pages
+ assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(8);
+ assertThat(classifiedDoc.getEntities().get(2).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(4); // FIXME including one false positive "Y"
+ }
+ }
+
+
+ @Before
+ public void stubRedaction() {
String tableRules = "package drools\n" +
"\n" +
"import com.iqser.red.service.redaction.v1.server.redaction.model.Section\n" +
@@ -119,7 +190,7 @@ public class EntityRedactionServiceTest {
"global Section section\n" +
"rule \"9: Redact Authors and Addresses in Reference Table, if it is a Vertebrate study\"\n" +
" when\n" +
- " Section(tabularData != null && tabularData.size() > 0\n" +
+ " Section(tabularData != null\n" +
" && tabularData.containsKey(\"Vertebrate study Y/N\")\n" +
" && tabularData.get(\"Vertebrate study Y/N\").equals(\"Y\")\n" +
" )\n" +
@@ -136,24 +207,9 @@ public class EntityRedactionServiceTest {
TypeResult.builder().type(ADDRESS_CODE).color(new float[]{0, 1, 1}).build()))
.build();
when(dictionaryClient.getAllTypes()).thenReturn(typeResponse);
- DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
- .entries(Arrays.asList("Casey, H.W.", "O’Loughlin, C.K.", "Salamon, C.M.", "Smith, S.H."))
- .build();
- when(dictionaryClient.getDictionaryForType(NAME_CODE)).thenReturn(dictionaryResponse);
- DictionaryResponse addressResponse = DictionaryResponse.builder()
- .entries(Collections.singletonList("Toxigenics, Inc., Decatur, IL 62526, USA"))
- .build();
- when(dictionaryClient.getDictionaryForType(ADDRESS_CODE)).thenReturn(addressResponse);
when(dictionaryClient.getDefaultColor()).thenReturn(new DefaultColor());
- try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(redactionRequest.getDocument()))) {
- Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
- entityRedactionService.processDocument(classifiedDoc, null);
- assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
- assertThat(classifiedDoc.getEntities().get(1)).hasSize(5); // 4 out of 5 entities recognized on page 1
- }
}
-
private static String loadFromClassPath(String path) {
URL resource = ResourceLoader.class.getClassLoader().getResource(path);
diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Complex Table.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Complex Table.pdf
new file mode 100644
index 00000000..c482af2e
Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Complex Table.pdf differ
diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Header Propagation.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Header Propagation.pdf
new file mode 100644
index 00000000..357009a8
Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Header Propagation.pdf differ
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |