Make table structure detection more robust

This commit is contained in:
Thierry Göckel 2020-08-13 17:44:11 +02:00
parent 031f10435d
commit 4236fa05cc
9 changed files with 349 additions and 269 deletions

View File

@ -51,24 +51,26 @@ public class EntityRedactionService {
List<Table> tables = paragraph.getTables();
for (Table table : tables) {
List<String> metadata = table.getHeaders();
for (List<Cell> row : table.getRows()) {
SearchableText searchableRow = new SearchableText();
List<String> cellValues = new ArrayList<>();
Map<String, String> tabularData = new HashMap<>();
for (Cell cell : row) {
if (cell == null || CollectionUtils.isEmpty(cell.getTextBlocks())) {
cellValues.add(null);
if (cell.isHeaderCell() || CollectionUtils.isEmpty(cell.getTextBlocks())) {
continue;
}
cellValues.add(cell.getTextBlocks().get(0).getText());
addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber);
cell.getHeaderCells().forEach(headerCell -> {
String headerName = headerCell.getTextBlocks().get(0).getText()
.replaceAll("\n", " ")
.replaceAll(" ", " ");
tabularData.put(headerName, cell.getTextBlocks().get(0).getText());
});
for (TextBlock textBlock : cell.getTextBlocks()) {
searchableRow.addAll(textBlock.getSequences());
}
}
Set<Entity> rowEntities = findEntities(searchableRow, table.getHeadline(), sectionNumber);
Map<String, String> tabularData = toMap(metadata, cellValues);
Section analysedRowSection = droolsExecutionService.executeRules(Section.builder()
.entities(rowEntities)
.text(searchableRow.getAsStringWithLinebreaks())
@ -116,26 +118,6 @@ public class EntityRedactionService {
}
/**
 * Builds a lookup from two parallel lists by pairing the key and the value found at the same index.
 * Entries whose value is {@code null} are left out. When the lists differ in length nothing is
 * paired at all: a warning is logged and an empty map is handed back.
 *
 * @param keys   map keys, positionally aligned with {@code values}
 * @param values map values, positionally aligned with {@code keys}; {@code null} elements are skipped
 * @return a new mutable map holding every pair with a non-null value, or an empty map on size mismatch
 */
private Map<String, String> toMap(List<String> keys, List<String> values) {
    Map<String, String> merged = new HashMap<>();
    int keyCount = keys.size();
    if (keyCount != values.size()) {
        log.warn("Cannot merge lists of unequal size, returning empty map.");
        return merged;
    }
    for (int index = 0; index < keyCount; index++) {
        String candidate = values.get(index);
        if (candidate != null) {
            merged.put(keys.get(index), candidate);
        }
    }
    return merged;
}
private Set<Entity> clearAndFindPositions(Set<Entity> entities, SearchableText text) {
removeEntitiesContainedInLarger(entities);

View File

@ -1,11 +1,11 @@
package com.iqser.red.service.redaction.v1.server.segmentation;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
@ -13,6 +13,7 @@ import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
@Service
@ -85,10 +86,20 @@ public class SectionsBuilderService {
table.setHeadline("Table in: " + lastHeadline);
}
// Distribute header information for subsequent tables
if (previousTable != null && hasInvalidHeaderInformation(table) && hasValidHeaderInformation(previousTable) &&
(previousTable.isVerticalHeader() && previousTable.getRowCount() == table.getRowCount() ||
previousTable.getColCount() == table.getColCount())) {
table.setHeaders(previousTable.getHeaders());
if (previousTable != null && hasInvalidHeaderInformation(table) && hasValidHeaderInformation(previousTable)) {
List<Cell> previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable);
List<Cell> tableNonHeaderRow = getRowWithNonHeaderCells(table);
if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) {
for (int i = table.getRows().size() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
List<Cell> row = table.getRows().get(i);
if (row.size() == previousTableNonHeaderRow.size()
&& row.stream().allMatch(cell -> cell.getHeaderCells().isEmpty())) {
for (int j = 0; j < row.size(); j++) {
row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells());
}
}
}
}
}
if (textBlock != null && !alreadyAdded) {
@ -141,14 +152,32 @@ public class SectionsBuilderService {
private boolean hasInvalidHeaderInformation(Table table) {
if (CollectionUtils.isEmpty(table.getHeaders())) {
return true;
}
if (table.getHeaders().stream().anyMatch(StringUtils::isEmpty)) {
return true;
return table.getRows().stream()
.flatMap(row -> row.stream()
.filter(cell -> CollectionUtils.isNotEmpty(cell.getHeaderCells())))
.findAny()
.isEmpty();
}
/**
 * Looks for a row of the given table in which no cell is flagged as a header cell.
 * Rows are visited from the last one up to the first, and the first row whose cells all
 * report {@code isHeaderCell() == false} is returned as-is (a row without any cells
 * also satisfies this condition).
 *
 * @param table the table whose rows are inspected
 * @return the bottom-most row made up exclusively of non-header cells; the final statement
 *         falls back to an empty list when no such row exists
 */
private List<Cell> getRowWithNonHeaderCells(Table table) {
for (int i = table.getRows().size() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
List<Cell> row = table.getRows().get(i);
// Assume the row qualifies until a header cell proves otherwise
boolean allNonHeader = true;
for (Cell cell : row) {
if (cell.isHeaderCell()) {
allNonHeader = false;
break;
}
}
if (allNonHeader) {
return row;
}
}
// NOTE(review): the next statement does not match the List<Cell> return type and makes the
// line after it unreachable — it looks like a leftover from another method; confirm and remove.
return false;
return Collections.emptyList();
}

View File

@ -16,10 +16,14 @@ public class Cell extends Rectangle {
private List<TextBlock> textBlocks = new ArrayList<>();
private List<Cell> headerCells = new ArrayList<>();
private boolean isHeaderCell;
public Cell(Point2D topLeft, Point2D bottomRight) {
super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()), (float) (bottomRight
super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()),
(float) (bottomRight
.getY() - topLeft.getY()));
}
@ -29,4 +33,4 @@ public class Cell extends Rectangle {
textBlocks.add(textBlock);
}
}
}

View File

@ -0,0 +1,22 @@
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
import lombok.RequiredArgsConstructor;
import lombok.Value;
@Value
@RequiredArgsConstructor
public class CellPosition implements Comparable<CellPosition> {
int row;
int col;
@Override
public int compareTo(CellPosition other) {
int rowDiff = row - other.row;
return rowDiff != 0 ? rowDiff : col - other.col;
}
}

View File

@ -8,12 +8,10 @@ import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.stream.Collectors;
import org.apache.commons.collections4.CollectionUtils;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
import lombok.Getter;
@ -41,12 +39,6 @@ public class Table extends AbstractTextContainer {
private List<List<Cell>> rows;
@Getter
@Setter
private List<String> headers;
@Getter
private boolean verticalHeader;
public Table(List<Cell> cells, Rectangle area, int rotation) {
@ -65,7 +57,7 @@ public class Table extends AbstractTextContainer {
if (rows == null) {
rows = computeRows();
headers = computeHeaders();
computeHeaders();
}
return rows;
@ -78,72 +70,105 @@ public class Table extends AbstractTextContainer {
* Column is marked as header if cell text is bold and row cell text is not bold.
* Defaults to row.
*/
private List<String> computeHeaders() {
private void computeHeaders() {
boolean allBold = true;
if (rows.isEmpty()) {
return Collections.emptyList();
}
List<Cell> rowCells = rows.get(0);
for (Cell cell : rowCells) {
if (cell == null || CollectionUtils.isEmpty(cell.getTextBlocks()) ||
!cell.getTextBlocks().get(0).getMostPopularWordStyle().equals("bold")) {
allBold = false;
break;
}
}
if (!allBold) {
allBold = true;
List<Cell> firstColCells = new ArrayList<>();
for (List<Cell> row : rows) {
Cell firstInRow = row.get(0);
if (firstInRow == null || CollectionUtils.isEmpty(firstInRow.getTextBlocks()) ||
!firstInRow.getTextBlocks().get(0).getMostPopularWordStyle().equals("bold")) {
allBold = false;
// A bold cell is a header cell as long as every cell to the left/top is bold, too
cells.forEach((position, cell) -> {
List<Cell> cellsToTheLeft = getCellsToTheLeft(position);
Cell lastHeaderCell = null;
for (Cell leftCell : cellsToTheLeft) {
if (CollectionUtils.isNotEmpty(leftCell.getTextBlocks()) && leftCell.getTextBlocks()
.get(0)
.getMostPopularWordStyle()
.equals("bold")) {
lastHeaderCell = leftCell;
} else {
break;
}
firstColCells.add(firstInRow);
}
if (allBold) {
log.info("Headers are in first column");
verticalHeader = true;
return firstColCells.stream().map(cell -> {
if (CollectionUtils.isNotEmpty(cell.getTextBlocks())) {
return TextNormalizationUtilities.removeHyphenLineBreaks(cell.getTextBlocks().get(0).getText())
.replaceAll("\n", " ")
.replaceAll(" ", " ");
} else {
return null;
}
}).collect(Collectors.toList());
} else {
log.info("Headers are defaulted in first row.");
return rowCells.stream().map(cell -> {
if (cell != null && CollectionUtils.isNotEmpty(cell.getTextBlocks())) {
return TextNormalizationUtilities.removeHyphenLineBreaks(cell.getTextBlocks().get(0).getText())
.replaceAll("\n", " ")
.replaceAll(" ", " ");
} else {
return null;
}
}).collect(Collectors.toList());
if (lastHeaderCell != null) {
cell.getHeaderCells().add(lastHeaderCell);
}
} else {
log.info("Headers are in first row.");
return rowCells.stream().map(cell -> {
if (CollectionUtils.isNotEmpty(cell.getTextBlocks())) {
return TextNormalizationUtilities.removeHyphenLineBreaks(cell.getTextBlocks().get(0).getText())
.replaceAll("\n", " ")
.replaceAll(" ", " ");
lastHeaderCell = null;
List<Cell> cellsToTheTop = getCellToTheTop(position);
for (Cell topCell : cellsToTheTop) {
if (CollectionUtils.isNotEmpty(topCell.getTextBlocks()) && topCell.getTextBlocks()
.get(0)
.getMostPopularWordStyle()
.equals("bold")) {
lastHeaderCell = topCell;
} else {
return null;
break;
}
}).collect(Collectors.toList());
}
}
if (lastHeaderCell != null) {
cell.getHeaderCells().add(lastHeaderCell);
}
if (CollectionUtils.isNotEmpty(cell.getTextBlocks()) && cell.getTextBlocks()
.get(0)
.getMostPopularWordStyle()
.equals("bold")) {
cell.setHeaderCell(true);
}
});
}
/**
 * Collects, for every column to the left of the given position, the cell that occupies that
 * column at the position's row.
 * <p>
 * If no cell is registered for a column in the position's own row, the rows above are searched
 * from the nearest one upwards and the first cell found in that column is used instead
 * (presumably a cell spanning several rows — confirm against how {@code cells} is populated).
 * Columns for which no cell is found at all are skipped.
 *
 * @param cellPosition the position whose left-hand neighbours are requested
 * @return the cells found, ordered from the leftmost column towards the given position;
 *         empty when the position is in column 0
 */
private List<Cell> getCellsToTheLeft(CellPosition cellPosition) {

    List<Cell> result = new ArrayList<>();
    int startRow = cellPosition.getRow();

    for (int col = cellPosition.getCol() - 1; col >= 0; col--) {
        Cell cell = cells.get(new CellPosition(startRow, col));
        // Fall back to the rows above; stop at row 0 instead of probing a non-existent row -1
        for (int row = startRow - 1; cell == null && row >= 0; row--) {
            cell = cells.get(new CellPosition(row, col));
        }
        if (cell != null) {
            result.add(cell);
        }
    }

    // Cells were gathered right-to-left; callers expect them left-to-right
    Collections.reverse(result);
    return result;
}
/**
 * Collects, for every row above the given position, the cell that occupies that row at the
 * position's column.
 * <p>
 * If no cell is registered for a row in the position's own column, the columns to the left are
 * searched from the nearest one outwards and the first cell found in that row is used instead
 * (presumably a cell spanning several columns — confirm against how {@code cells} is populated).
 * Rows for which no cell is found at all are skipped.
 *
 * @param cellPosition the position whose upper neighbours are requested
 * @return the cells found, ordered from the topmost row towards the given position;
 *         empty when the position is in row 0
 */
private List<Cell> getCellToTheTop(CellPosition cellPosition) {

    List<Cell> result = new ArrayList<>();
    int startCol = cellPosition.getCol();

    for (int row = cellPosition.getRow() - 1; row >= 0; row--) {
        Cell cell = cells.get(new CellPosition(row, startCol));
        // Fall back to the columns on the left; stop at column 0 instead of probing column -1
        for (int col = startCol - 1; cell == null && col >= 0; col--) {
            cell = cells.get(new CellPosition(row, col));
        }
        if (cell != null) {
            result.add(cell);
        }
    }

    // Cells were gathered bottom-to-top; callers expect them top-to-bottom
    Collections.reverse(result);
    return result;
}
private List<List<Cell>> computeRows() {
List<List<Cell>> rows = new ArrayList<>();
@ -152,7 +177,9 @@ public class Table extends AbstractTextContainer {
List<Cell> lastRow = new ArrayList<>();
for (int j = rowCount - 1; j >= 0; j--) { // cols
Cell cell = cells.get(new CellPosition(j, i));
lastRow.add(cell);
if (cell != null) {
lastRow.add(cell);
}
}
rows.add(lastRow);
}
@ -161,7 +188,9 @@ public class Table extends AbstractTextContainer {
List<Cell> lastRow = new ArrayList<>();
for (int j = 0; j < rowCount; j++) { // cols
Cell cell = cells.get(new CellPosition(i, j));
lastRow.add(cell);
if (cell != null) {
lastRow.add(cell);
}
}
rows.add(lastRow);
}
@ -170,7 +199,9 @@ public class Table extends AbstractTextContainer {
List<Cell> lastRow = new ArrayList<>();
for (int j = 0; j < colCount; j++) {
Cell cell = cells.get(new CellPosition(i, j)); // JAVA_8 use getOrDefault()
lastRow.add(cell);
if (cell != null) {
lastRow.add(cell);
}
}
rows.add(lastRow);
}
@ -216,53 +247,62 @@ public class Table extends AbstractTextContainer {
List<Cell> row = rowsOfCells.get(i);
Iterator<Cell> rowCells = row.iterator();
int startColumn = 0;
int jumpToColumn = 0;
// int jumpToColumn = 0;
while (rowCells.hasNext()) {
Cell cell = rowCells.next();
if (i > 0) {
List<List<Cell>> others = rowsOfCells(si.contains(new Rectangle(cell.getBottom(), si.getBounds()
.getLeft(), cell.getLeft() - si.getBounds().getLeft() + 1, si.getBounds().getBottom() - cell
.getBottom())));
// Rectangle rectangle = new Rectangle(cell.getBottom(),
// si.getBounds().getLeft(),
// cell.getLeft() - si.getBounds().getLeft() + 1,
// si.getBounds().getBottom() - cell.getBottom());
// List<List<Cell>> others = rowsOfCells(si.contains(rectangle));
//
// for (List<Cell> r : others) {
// jumpToColumn = Math.max(jumpToColumn, r.size());
// }
//
// while (startColumn != jumpToColumn) {
// add(previousNonNullCellForColumnIndex.get(startColumn), i, startColumn);
// startColumn++;
// }
for (List<Cell> r : others) {
jumpToColumn = Math.max(jumpToColumn, r.size());
// Handle cells spanning several rows
while (previousNonNullCellForColumnIndex.get(startColumn) != null) {
Cell previouslyAddedCellForSameColumn = previousNonNullCellForColumnIndex.get(startColumn);
float previousRight = previouslyAddedCellForSameColumn.getRight();
float thisLeft = cell.getLeft();
if (previousRight > thisLeft) {
break;
}
startColumn++;
}
}
while (startColumn != jumpToColumn) {
add(previousNonNullCellForColumnIndex.get(startColumn), i, startColumn);
startColumn++;
}
add(cell, i, startColumn);
previousNonNullCellForColumnIndex.put(startColumn, cell);
startColumn++;
jumpToColumn = startColumn;
// jumpToColumn = startColumn;
}
}
}
private static List<List<Cell>> rowsOfCells(List<Cell> cells) {
private List<List<Cell>> rowsOfCells(List<Cell> cells) {
Cell c;
float lastTop;
List<List<Cell>> rv = new ArrayList<>();
List<Cell> lastRow;
if (cells.isEmpty()) {
return rv;
}
cells.sort(Comparator.comparingDouble(Rectangle::getLeft));
cells.sort(Collections.reverseOrder((arg0, arg1) -> Float.compare(Utils.round(arg0.getBottom(), 2), Utils.round(arg1
cells.sort(Collections.reverseOrder((arg0, arg1) -> Float.compare(Utils.round(arg0.getBottom(), 2),
Utils.round(arg1
.getBottom(), 2))));
Iterator<Cell> iter = cells.iterator();
c = iter.next();
lastTop = c.getBottom();
lastRow = new ArrayList<>();
Cell c = iter.next();
float lastTop = c.getBottom();
List<Cell> lastRow = new ArrayList<>();
lastRow.add(c);
rv.add(lastRow);
@ -349,51 +389,4 @@ public class Table extends AbstractTextContainer {
return sb.toString();
}
/**
 * Immutable (row, column) coordinate of a cell. Two positions are equal when both coordinates
 * match; the natural ordering is row first, then column.
 */
static class CellPosition implements Comparable<CellPosition> {

    final int row;
    final int col;


    CellPosition(int row, int col) {

        this.row = row;
        this.col = col;
    }


    @Override
    public int hashCode() {

        return row + 101 * col;
    }


    @Override
    public boolean equals(Object obj) {

        if (this == obj) {
            return true;
        }
        // Only instances of exactly this class can be equal
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        CellPosition other = (CellPosition) obj;
        return row == other.row && col == other.col;
    }


    /**
     * Compares by row and, for equal rows, by column. {@link Integer#compare(int, int)} is used
     * rather than subtraction so that extreme coordinate values cannot overflow and flip the sign.
     *
     * @param other the position to compare against
     * @return a negative value, zero, or a positive value if this position sorts before,
     *         equal to, or after {@code other}
     */
    @Override
    public int compareTo(CellPosition other) {

        int byRow = Integer.compare(row, other.row);
        return byRow != 0 ? byRow : Integer.compare(col, other.col);
    }

}
}

View File

@ -2,7 +2,6 @@ package com.iqser.red.service.redaction.v1.server.tableextraction.service;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
@ -25,26 +24,28 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
@Service
@SuppressWarnings("all")
public class TableExtractionService {
public void extractTables(CleanRulings cleanRulings, Page page){
public void extractTables(CleanRulings cleanRulings, Page page) {
List<Cell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
Iterator<AbstractTextContainer> itty = page.getTextBlocks().iterator();
while (itty.hasNext()) {
TextBlock textBlock = (TextBlock) itty.next();
for (AbstractTextContainer abstractTextContainer : page.getTextBlocks()) {
TextBlock textBlock = (TextBlock) abstractTextContainer;
for (Cell cell : cells) {
if (cell.intersects(textBlock.getMinX(), textBlock.getMinY(), textBlock.getWidth(), textBlock.getHeight())) {
if (cell.intersects(textBlock.getMinX(), textBlock.getMinY(), textBlock.getWidth(),
textBlock.getHeight())) {
cell.addTextBlock(textBlock);
break;
}
}
}
List<Rectangle> spreadsheetAreas = findSpreadsheetsFromCells(cells)
.stream()
cells = new ArrayList<>(new HashSet<>(cells));
Utils.sort(cells, Rectangle.ILL_DEFINED_ORDER);
List<Rectangle> spreadsheetAreas = findSpreadsheetsFromCells(cells).stream()
.filter(r -> r.getWidth() > 0f && r.getHeight() > 0f)
.collect(Collectors.toList());
@ -63,9 +64,9 @@ public class TableExtractionService {
for (Table table : tables) {
int position = -1;
itty = page.getTextBlocks().iterator();
Iterator<AbstractTextContainer> itty = page.getTextBlocks().iterator();
while (itty.hasNext()) {
AbstractTextContainer textBlock = (AbstractTextContainer) itty.next();
AbstractTextContainer textBlock = itty.next();
if (table.contains(textBlock)) {
if (position == -1) {
position = page.getTextBlocks().indexOf(textBlock);
@ -79,17 +80,18 @@ public class TableExtractionService {
}
}
public List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
List<Cell> cellsFound = new ArrayList<>();
Map<Point2D, Ruling[]> intersectionPoints = Ruling.findIntersections(horizontalRulingLines, verticalRulingLines);
Map<Point2D, Ruling[]> intersectionPoints = Ruling.findIntersections(horizontalRulingLines,
verticalRulingLines);
List<Point2D> intersectionPointsList = new ArrayList<>(intersectionPoints.keySet());
Collections.sort(intersectionPointsList, POINT_COMPARATOR);
boolean doBreak;
intersectionPointsList.sort(POINT_COMPARATOR);
for (int i = 0; i < intersectionPointsList.size(); i++) {
Point2D topLeft = intersectionPointsList.get(i);
Ruling[] hv = intersectionPoints.get(topLeft);
doBreak = false;
// CrossingPointsDirectlyBelow( topLeft );
List<Point2D> xPoints = new ArrayList<>();
@ -106,10 +108,6 @@ public class TableExtractionService {
}
outer:
for (Point2D xPoint : xPoints) {
if (doBreak) {
break;
}
// is there a vertical edge b/w topLeft and xPoint?
if (!hv[1].equals(intersectionPoints.get(xPoint)[1])) {
continue;
@ -120,11 +118,9 @@ public class TableExtractionService {
continue;
}
Point2D btmRight = new Point2D.Float((float) yPoint.getX(), (float) xPoint.getY());
if (intersectionPoints.containsKey(btmRight)
&& intersectionPoints.get(btmRight)[0].equals(intersectionPoints.get(xPoint)[0])
&& intersectionPoints.get(btmRight)[1].equals(intersectionPoints.get(yPoint)[1])) {
if (intersectionPoints.containsKey(btmRight) && intersectionPoints.get(btmRight)[0].equals(intersectionPoints
.get(xPoint)[0]) && intersectionPoints.get(btmRight)[1].equals(intersectionPoints.get(yPoint)[1])) {
cellsFound.add(new Cell(topLeft, btmRight));
doBreak = true;
break outer;
}
}
@ -139,7 +135,7 @@ public class TableExtractionService {
}
public List<Rectangle> findSpreadsheetsFromCells(List<? extends Rectangle> cells) {
private List<Rectangle> findSpreadsheetsFromCells(List<? extends Rectangle> cells) {
// via: http://stackoverflow.com/questions/13746284/merging-multiple-adjacent-rectangles-into-one-polygon
List<Rectangle> rectangles = new ArrayList<>();
Set<Point2D> pointSet = new HashSet<>();
@ -147,10 +143,6 @@ public class TableExtractionService {
Map<Point2D, Point2D> edgesV = new HashMap<>();
int i = 0;
cells = new ArrayList<>(new HashSet<>(cells));
Utils.sort(cells, Rectangle.ILL_DEFINED_ORDER);
for (Rectangle cell : cells) {
for (Point2D pt : cell.getPoints()) {
if (pointSet.contains(pt)) { // shared vertex, remove it
@ -163,10 +155,10 @@ public class TableExtractionService {
// X first sort
List<Point2D> pointsSortX = new ArrayList<>(pointSet);
Collections.sort(pointsSortX, X_FIRST_POINT_COMPARATOR);
pointsSortX.sort(X_FIRST_POINT_COMPARATOR);
// Y first sort
List<Point2D> pointsSortY = new ArrayList<>(pointSet);
Collections.sort(pointsSortY, POINT_COMPARATOR);
pointsSortY.sort(POINT_COMPARATOR);
while (i < pointSet.size()) {
float currY = (float) pointsSortY.get(i).getY();
@ -203,13 +195,12 @@ public class TableExtractionService {
nextVertex = edgesV.get(curr.point);
edgesV.remove(curr.point);
lastAddedVertex = new PolygonVertex(nextVertex, Direction.VERTICAL);
polygon.add(lastAddedVertex);
} else {
nextVertex = edgesH.get(curr.point);
edgesH.remove(curr.point);
lastAddedVertex = new PolygonVertex(nextVertex, Direction.HORIZONTAL);
polygon.add(lastAddedVertex);
}
polygon.add(lastAddedVertex);
if (lastAddedVertex.equals(polygon.get(0))) {
// closed polygon
@ -227,10 +218,10 @@ public class TableExtractionService {
// calculate grid-aligned minimum area rectangles for each found polygon
for (List<PolygonVertex> poly : polygons) {
float top = java.lang.Float.MAX_VALUE;
float left = java.lang.Float.MAX_VALUE;
float bottom = java.lang.Float.MIN_VALUE;
float right = java.lang.Float.MIN_VALUE;
float top = Float.MAX_VALUE;
float left = Float.MAX_VALUE;
float bottom = Float.MIN_VALUE;
float right = Float.MIN_VALUE;
for (PolygonVertex pt : poly) {
top = (float) Math.min(top, pt.point.getY());
left = (float) Math.min(left, pt.point.getX());
@ -244,69 +235,66 @@ public class TableExtractionService {
}
private static final Comparator<Point2D> X_FIRST_POINT_COMPARATOR = new Comparator<Point2D>() {
@Override
public int compare(Point2D arg0, Point2D arg1) {
int rv = 0;
float arg0X = Utils.round(arg0.getX(), 2);
float arg0Y = Utils.round(arg0.getY(), 2);
float arg1X = Utils.round(arg1.getX(), 2);
float arg1Y = Utils.round(arg1.getY(), 2);
private static final Comparator<Point2D> X_FIRST_POINT_COMPARATOR = (arg0, arg1) -> {
if (arg0X > arg1X) {
rv = 1;
} else if (arg0X < arg1X) {
rv = -1;
} else if (arg0Y > arg1Y) {
rv = 1;
} else if (arg0Y < arg1Y) {
rv = -1;
}
return rv;
int rv = 0;
float arg0X = Utils.round(arg0.getX(), 2);
float arg0Y = Utils.round(arg0.getY(), 2);
float arg1X = Utils.round(arg1.getX(), 2);
float arg1Y = Utils.round(arg1.getY(), 2);
if (arg0X > arg1X) {
rv = 1;
} else if (arg0X < arg1X) {
rv = -1;
} else if (arg0Y > arg1Y) {
rv = 1;
} else if (arg0Y < arg1Y) {
rv = -1;
}
return rv;
};
private static final Comparator<Point2D> POINT_COMPARATOR = (arg0, arg1) -> {
private static final Comparator<Point2D> POINT_COMPARATOR = new Comparator<Point2D>() {
@Override
public int compare(Point2D arg0, Point2D arg1) {
int rv = 0;
float arg0X = Utils.round(arg0.getX(), 2);
float arg0Y = Utils.round(arg0.getY(), 2);
float arg1X = Utils.round(arg1.getX(), 2);
float arg1Y = Utils.round(arg1.getY(), 2);
int rv = 0;
float arg0X = Utils.round(arg0.getX(), 2);
float arg0Y = Utils.round(arg0.getY(), 2);
float arg1X = Utils.round(arg1.getX(), 2);
float arg1Y = Utils.round(arg1.getY(), 2);
if (arg0Y > arg1Y) {
rv = 1;
} else if (arg0Y < arg1Y) {
rv = -1;
} else if (arg0X > arg1X) {
rv = 1;
} else if (arg0X < arg1X) {
rv = -1;
}
return rv;
if (arg0Y > arg1Y) {
rv = 1;
} else if (arg0Y < arg1Y) {
rv = -1;
} else if (arg0X > arg1X) {
rv = 1;
} else if (arg0X < arg1X) {
rv = -1;
}
return rv;
};
private enum Direction {
HORIZONTAL,
VERTICAL
HORIZONTAL, VERTICAL
}
static class PolygonVertex {
Point2D point;
Direction direction;
public PolygonVertex(Point2D point, Direction direction) {
PolygonVertex(Point2D point, Direction direction) {
this.direction = direction;
this.point = point;
}
@Override
public boolean equals(Object other) {
if (this == other) {
return true;
}
@ -316,15 +304,21 @@ public class TableExtractionService {
return this.point.equals(((PolygonVertex) other).point);
}
/**
 * Hashes a vertex by its point only; the direction does not take part.
 *
 * @return the hash code of {@code point}
 */
@Override
public int hashCode() {

    return point.hashCode();
}
@Override
public String toString() {
return String.format("%s[point=%s,direction=%s]", this.getClass().getName(), this.point.toString(), this.direction.toString());
return String.format("%s[point=%s,direction=%s]", this.getClass()
.getName(), this.point.toString(), this.direction.toString());
}
}
}

View File

@ -10,6 +10,7 @@ import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
@ -17,6 +18,7 @@ import java.util.Set;
import org.apache.commons.io.IOUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.junit.Before;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.kie.api.KieServices;
@ -112,6 +114,75 @@ public class EntityRedactionServiceTest {
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
.build();
DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
.entries(Arrays.asList("Casey, H.W.", "OLoughlin, C.K.", "Salamon, C.M.", "Smith, S.H."))
.build();
when(dictionaryClient.getDictionaryForType(NAME_CODE)).thenReturn(dictionaryResponse);
DictionaryResponse addressResponse = DictionaryResponse.builder()
.entries(Collections.singletonList("Toxigenics, Inc., Decatur, IL 62526, USA"))
.build();
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE)).thenReturn(addressResponse);
try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(redactionRequest.getDocument()))) {
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
entityRedactionService.processDocument(classifiedDoc, null);
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1)).hasSize(5); // 4 out of 5 entities recognized on page 1
}
}
/**
 * Smoke test: runs segmentation and entity redaction on the "Complex Table" sample with empty
 * name and address dictionaries. There are no assertions; the test passes as long as
 * processing completes without throwing.
 */
@Test
public void complexTable() throws IOException {

    ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Complex Table.pdf");

    // IOUtils.toByteArray does not close the stream it reads, so close it here
    byte[] pdfBytes;
    try (InputStream pdfStream = pdfFileResource.getInputStream()) {
        pdfBytes = IOUtils.toByteArray(pdfStream);
    }

    RedactionRequest redactionRequest = RedactionRequest.builder()
            .document(pdfBytes)
            .build();

    when(dictionaryClient.getDictionaryForType(ADDRESS_CODE))
            .thenReturn(DictionaryResponse.builder().entries(new ArrayList<>()).build());
    when(dictionaryClient.getDictionaryForType(NAME_CODE))
            .thenReturn(DictionaryResponse.builder().entries(new ArrayList<>()).build());

    try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(redactionRequest.getDocument()))) {
        Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
        entityRedactionService.processDocument(classifiedDoc, null);
    }
}
/**
 * Processes the two-page "Header Propagation" sample with stubbed name and address
 * dictionaries and checks, per page, how many entities were matched by rule 9.
 */
@Test
public void headerPropagation() throws IOException {

    ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Header Propagation.pdf");

    // IOUtils.toByteArray does not close the stream it reads, so close it here
    byte[] pdfBytes;
    try (InputStream pdfStream = pdfFileResource.getInputStream()) {
        pdfBytes = IOUtils.toByteArray(pdfStream);
    }

    RedactionRequest redactionRequest = RedactionRequest.builder()
            .document(pdfBytes)
            .build();

    DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
            .entries(Arrays.asList("Bissig R.", "Thanei P."))
            .build();
    when(dictionaryClient.getDictionaryForType(NAME_CODE)).thenReturn(dictionaryResponse);

    DictionaryResponse addressResponse = DictionaryResponse.builder()
            .entries(Collections.singletonList("Novartis Crop Protection AG, Basel, Switzerland"))
            .build();
    when(dictionaryClient.getDictionaryForType(ADDRESS_CODE)).thenReturn(addressResponse);

    try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(redactionRequest.getDocument()))) {
        Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
        entityRedactionService.processDocument(classifiedDoc, null);
        assertThat(classifiedDoc.getEntities()).hasSize(2); // two pages
        assertThat(classifiedDoc.getEntities().get(1).stream()
                .filter(entity -> entity.getMatchedRule() == 9)
                .count()).isEqualTo(8);
        assertThat(classifiedDoc.getEntities().get(2).stream()
                .filter(entity -> entity.getMatchedRule() == 9)
                .count()).isEqualTo(4); // FIXME including one false positive "Y"
    }
}
@Before
public void stubRedaction() {
String tableRules = "package drools\n" +
"\n" +
"import com.iqser.red.service.redaction.v1.server.redaction.model.Section\n" +
@ -119,7 +190,7 @@ public class EntityRedactionServiceTest {
"global Section section\n" +
"rule \"9: Redact Authors and Addresses in Reference Table, if it is a Vertebrate study\"\n" +
" when\n" +
" Section(tabularData != null && tabularData.size() > 0\n" +
" Section(tabularData != null\n" +
" && tabularData.containsKey(\"Vertebrate study Y/N\")\n" +
" && tabularData.get(\"Vertebrate study Y/N\").equals(\"Y\")\n" +
" )\n" +
@ -136,24 +207,9 @@ public class EntityRedactionServiceTest {
TypeResult.builder().type(ADDRESS_CODE).color(new float[]{0, 1, 1}).build()))
.build();
when(dictionaryClient.getAllTypes()).thenReturn(typeResponse);
DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
.entries(Arrays.asList("Casey, H.W.", "OLoughlin, C.K.", "Salamon, C.M.", "Smith, S.H."))
.build();
when(dictionaryClient.getDictionaryForType(NAME_CODE)).thenReturn(dictionaryResponse);
DictionaryResponse addressResponse = DictionaryResponse.builder()
.entries(Collections.singletonList("Toxigenics, Inc., Decatur, IL 62526, USA"))
.build();
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE)).thenReturn(addressResponse);
when(dictionaryClient.getDefaultColor()).thenReturn(new DefaultColor());
try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(redactionRequest.getDocument()))) {
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
entityRedactionService.processDocument(classifiedDoc, null);
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1)).hasSize(5); // 4 out of 5 entities recognized on page 1
}
}
private static String loadFromClassPath(String path) {
URL resource = ResourceLoader.class.getClassLoader().getResource(path);