Fix entity span in table rows and detection of headers in rotated tables

This commit is contained in:
Thierry Göckel 2020-08-25 13:50:34 +02:00
parent 6483e637c6
commit 848c506c3f
8 changed files with 143 additions and 111 deletions

View File

@ -0,0 +1,16 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import lombok.RequiredArgsConstructor;
import lombok.Value;
/**
 * Value object that pairs the text block of a table cell with the offset at which
 * that cell starts inside its table row.
 * <p>
 * Lombok's {@code @Value} generates the accessors ({@code getTextBlock()},
 * {@code getRowSpanStart()}) as well as {@code equals}/{@code hashCode}/{@code toString}.
 */
@Value
@RequiredArgsConstructor
public class CellValue {
// Text block holding the content of the cell.
TextBlock textBlock;
// Start offset of the cell within its row; callers use it as the start position of
// entities created from this cell (end position = start + text length).
int rowSpanStart;
}

View File

@ -32,7 +32,7 @@ public class Section {
private int sectionNumber;
private Map<String, TextBlock> tabularData;
private Map<String, CellValue> tabularData;
public boolean rowEquals(String headerName, String value){
@ -40,7 +40,8 @@ public class Section {
.replaceAll(" ", "")
.replaceAll("-", "");
return tabularData != null && tabularData.containsKey(cleanHeaderName) && tabularData.get(cleanHeaderName).getText().equals(value);
return tabularData != null && tabularData.containsKey(cleanHeaderName)
&& tabularData.get(cleanHeaderName).getTextBlock().getText().equals(value);
}
@ -177,15 +178,18 @@ public class Section {
.replaceAll(" ", "")
.replaceAll("-", "");
TextBlock value = tabularData.get(cleanHeaderName);
CellValue value = tabularData.get(cleanHeaderName);
if (value == null) {
log.warn("Could not find any data for {}.", cellHeader);
} else {
Entity entity = new Entity(value.getText(), type, 0, value.getText().length(), headline, sectionNumber);
Entity entity = new Entity(value.getTextBlock()
.getText(), type, value.getRowSpanStart(), value.getRowSpanStart() + value.getTextBlock()
.getText()
.length(), headline, sectionNumber);
entity.setRedaction(false);
entity.setMatchedRule(ruleNumber);
entity.setRedactionReason(cellHeader);
entity.setTargetSequences(value.getSequences()); // Make sure no other cells with same content are highlighted
entity.setTargetSequences(value.getTextBlock().getSequences()); // Make sure no other cells with same content are highlighted
entities.add(entity);
}

View File

@ -18,6 +18,7 @@ import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
@ -53,26 +54,27 @@ public class EntityRedactionService {
for (Table table : tables) {
for (List<Cell> row : table.getRows()) {
SearchableText searchableRow = new SearchableText();
Map<String, TextBlock> tabularData = new HashMap<>();
Map<String, CellValue> tabularData = new HashMap<>();
int start = 0;
for (Cell cell : row) {
if (cell.isHeaderCell() || CollectionUtils.isEmpty(cell.getTextBlocks())) {
continue;
}
addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber);
int cellStart = start;
cell.getHeaderCells().forEach(headerCell -> {
StringBuilder headerBuilder = new StringBuilder();
headerCell.getTextBlocks().forEach(textBlock -> headerBuilder.append(textBlock.getText()));
String headerName = headerBuilder.toString()
.replaceAll("\n", "")
.replaceAll(" ", "")
.replaceAll("-", "");
tabularData.put(headerName, cell.getTextBlocks().get(0));
tabularData.put(headerName, new CellValue(cell.getTextBlocks().get(0), cellStart));
});
start = start + cell.getTextBlocks().get(0).toString().length();
for (TextBlock textBlock : cell.getTextBlocks()) {
searchableRow.addAll(textBlock.getSequences());
}
}
Set<Entity> rowEntities = findEntities(searchableRow, table.getHeadline(), sectionNumber);
@ -142,7 +144,7 @@ public class EntityRedactionService {
private Set<Entity> findEntities(SearchableText searchableText, String headline, int sectionNumber) {
Set<Entity> found = new HashSet<>();
if (StringUtils.isEmpty(searchableText.toString()) && StringUtils.isEmpty(headline)) {
if (StringUtils.isEmpty(searchableText.toString())) {
return found;
}

View File

@ -85,8 +85,7 @@ public class SectionsBuilderService {
}).collect(Collectors.toList());
}
if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) {
for (int i = currentTable.getRows()
.size() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
List<Cell> row = currentTable.getRows().get(i);
if (row.size() == tableNonHeaderRow.size() && row.stream()
.allMatch(cell -> cell.getHeaderCells().isEmpty())) {
@ -185,7 +184,7 @@ public class SectionsBuilderService {
private List<Cell> getRowWithNonHeaderCells(Table table) {
for (int i = table.getRows().size() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
List<Cell> row = table.getRows().get(i);
boolean allNonHeader = true;
for (Cell cell : row) {

View File

@ -29,11 +29,13 @@ public class Table extends AbstractTextContainer {
@Setter
private String headline;
@Getter
private int rowCount;
private int unrotatedRowCount;
@Getter
private int colCount;
private int unrotatedColCount;
private int rowCount = -1;
private int colCount = -1;
private final int rotation;
@ -65,6 +67,25 @@ public class Table extends AbstractTextContainer {
}
/**
 * Returns the number of rows of this table as delivered by {@link #getRows()}.
 * The value is computed on first access and cached afterwards.
 *
 * @return number of rows
 */
public int getRowCount() {
    // -1 is the marker for "not computed yet"
    if (rowCount != -1) {
        return rowCount;
    }
    rowCount = getRows().size();
    return rowCount;
}
/**
 * Returns the number of columns of this table, i.e. the size of the longest row
 * delivered by {@link #getRows()} (0 if there are no rows).
 * The value is computed on first access and cached afterwards.
 *
 * @return number of columns
 */
public int getColCount() {
    // -1 is the marker for "not computed yet"
    if (colCount != -1) {
        return colCount;
    }
    int widestRow = 0;
    for (List<Cell> row : getRows()) {
        widestRow = Math.max(widestRow, row.size());
    }
    colCount = widestRow;
    return colCount;
}
/**
* Detect header cells (either first row or first column):
* Column is marked as header if cell text is bold and row cell text is not bold.
@ -72,100 +93,50 @@ public class Table extends AbstractTextContainer {
*/
private void computeHeaders() {
if (rows == null) {
rows = computeRows();
}
// A bold cell is a header cell as long as every cell to the left/top is bold, too
cells.forEach((position, cell) -> {
List<Cell> cellsToTheLeft = getCellsToTheLeft(position);
Cell lastHeaderCell = null;
for (Cell leftCell : cellsToTheLeft) {
if (CollectionUtils.isNotEmpty(leftCell.getTextBlocks()) && leftCell.getTextBlocks()
// we move from left to right and top to bottom
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
List<Cell> rowCells = rows.get(rowIndex);
for (int colIndex = 0; colIndex < rowCells.size(); colIndex++) {
Cell cell = rowCells.get(colIndex);
List<Cell> cellsToTheLeft = rowCells.subList(0, colIndex);
Cell lastHeaderCell = null;
for (Cell leftCell : cellsToTheLeft) {
if (leftCell.isHeaderCell()) {
lastHeaderCell = leftCell;
} else {
break;
}
}
if (lastHeaderCell != null) {
cell.getHeaderCells().add(lastHeaderCell);
}
List<Cell> cellsToTheTop = new ArrayList<>();
for (int i = rowIndex - 1; i >= 0; i--) {
cellsToTheTop.add(rows.get(i).get(colIndex));
}
for (Cell topCell : cellsToTheTop) {
if (topCell.isHeaderCell()) {
lastHeaderCell = topCell;
} else {
break;
}
}
if (lastHeaderCell != null) {
cell.getHeaderCells().add(lastHeaderCell);
}
if (CollectionUtils.isNotEmpty(cell.getTextBlocks()) && cell.getTextBlocks()
.get(0)
.getMostPopularWordStyle()
.equals("bold")) {
lastHeaderCell = leftCell;
} else {
break;
cell.setHeaderCell(true);
}
}
if (lastHeaderCell != null) {
cell.getHeaderCells().add(lastHeaderCell);
}
lastHeaderCell = null;
List<Cell> cellsToTheTop = getCellToTheTop(position);
for (Cell topCell : cellsToTheTop) {
if (CollectionUtils.isNotEmpty(topCell.getTextBlocks()) && topCell.getTextBlocks()
.get(0)
.getMostPopularWordStyle()
.equals("bold")) {
lastHeaderCell = topCell;
} else {
break;
}
}
if (lastHeaderCell != null) {
cell.getHeaderCells().add(lastHeaderCell);
}
if (CollectionUtils.isNotEmpty(cell.getTextBlocks()) && cell.getTextBlocks()
.get(0)
.getMostPopularWordStyle()
.equals("bold")) {
cell.setHeaderCell(true);
}
});
}
/**
 * Collects the cells located to the left of the given position, ordered from the
 * leftmost column towards the position.
 * <p>
 * For every column left of the position the cell of the same row is taken. If no cell
 * is registered there, the rows above are searched bottom-up within that column and the
 * first cell found is used instead (presumably a cell spanning several rows - verify
 * against how cells are registered). Columns without any such cell are skipped.
 *
 * @param cellPosition position whose left neighbours are requested
 * @return cells to the left in ascending column order; empty if there is no column to the left
 */
private List<Cell> getCellsToTheLeft(CellPosition cellPosition) {
    List<Cell> result = new ArrayList<>();
    int row = cellPosition.getRow();
    for (int col = cellPosition.getCol() - 1; col >= 0; col--) {
        // single lookup for the direct neighbour in the same row
        Cell cell = cells.get(new CellPosition(row, col));
        // Fallback: walk upwards in the same column. The walk stops at row 0, it never
        // probes the non-existing row -1.
        for (int upperRow = row - 1; cell == null && upperRow >= 0; upperRow--) {
            cell = cells.get(new CellPosition(upperRow, col));
        }
        if (cell != null) {
            result.add(cell);
        }
    }
    // cells were gathered right-to-left, callers expect left-to-right
    Collections.reverse(result);
    return result;
}
/**
 * Collects the cells located above the given position, ordered from the topmost row
 * towards the position.
 * <p>
 * For every row above the position the cell of the same column is taken. If no cell is
 * registered there, the columns to the left are searched right-to-left within that row
 * and the first cell found is used instead (presumably a cell spanning several columns -
 * verify against how cells are registered). Rows without any such cell are skipped.
 *
 * @param cellPosition position whose upper neighbours are requested
 * @return cells above in ascending row order; empty if there is no row above
 */
private List<Cell> getCellToTheTop(CellPosition cellPosition) {
    List<Cell> result = new ArrayList<>();
    int col = cellPosition.getCol();
    for (int row = cellPosition.getRow() - 1; row >= 0; row--) {
        // single lookup for the direct neighbour in the same column
        Cell cell = cells.get(new CellPosition(row, col));
        // Fallback: walk to the left in the same row. The walk stops at column 0, it never
        // probes the non-existing column -1.
        for (int leftCol = col - 1; cell == null && leftCol >= 0; leftCol--) {
            cell = cells.get(new CellPosition(row, leftCol));
        }
        if (cell != null) {
            result.add(cell);
        }
    }
    // cells were gathered bottom-up, callers expect top-down
    Collections.reverse(result);
    return result;
}
@ -173,9 +144,9 @@ public class Table extends AbstractTextContainer {
List<List<Cell>> rows = new ArrayList<>();
if (rotation == 90) {
for (int i = 0; i < colCount; i++) { // rows
for (int i = 0; i < unrotatedColCount; i++) { // rows
List<Cell> lastRow = new ArrayList<>();
for (int j = rowCount - 1; j >= 0; j--) { // cols
for (int j = unrotatedRowCount - 1; j >= 0; j--) { // cols
Cell cell = cells.get(new CellPosition(j, i));
if (cell != null) {
lastRow.add(cell);
@ -184,9 +155,9 @@ public class Table extends AbstractTextContainer {
rows.add(lastRow);
}
} else if (rotation == 270) {
for (int i = colCount - 1; i >= 0; i--) { // rows
for (int i = unrotatedColCount - 1; i >= 0; i--) { // rows
List<Cell> lastRow = new ArrayList<>();
for (int j = 0; j < rowCount; j++) { // cols
for (int j = 0; j < unrotatedRowCount; j++) { // cols
Cell cell = cells.get(new CellPosition(i, j));
if (cell != null) {
lastRow.add(cell);
@ -195,9 +166,9 @@ public class Table extends AbstractTextContainer {
rows.add(lastRow);
}
} else {
for (int i = 0; i < rowCount; i++) {
for (int i = 0; i < unrotatedRowCount; i++) {
List<Cell> lastRow = new ArrayList<>();
for (int j = 0; j < colCount; j++) {
for (int j = 0; j < unrotatedColCount; j++) {
Cell cell = cells.get(new CellPosition(i, j)); // JAVA_8 use getOrDefault()
if (cell != null) {
lastRow.add(cell);
@ -214,8 +185,8 @@ public class Table extends AbstractTextContainer {
private void add(Cell chunk, int row, int col) {
rowCount = Math.max(rowCount, row + 1);
colCount = Math.max(colCount, col + 1);
unrotatedRowCount = Math.max(unrotatedRowCount, row + 1);
unrotatedColCount = Math.max(unrotatedColCount, col + 1);
CellPosition cp = new CellPosition(row, col);
cells.put(cp, chunk);

View File

@ -146,4 +146,44 @@ public class PdfSegmentationServiceTest {
}
}
/**
 * Parses the "Rotated Table Headers" sample and checks that
 * the first table has 8 columns and 1 row, the second table has 8 columns and 6 rows,
 * and every row of the second table references, cell by cell, exactly the corresponding
 * cell of the first table's only row as its single header cell.
 */
@Test
public void testHeaderCellsForRotatedTable() throws IOException {

    ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Rotated Table Headers.pdf");

    try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
        Document document = pdfSegmentationService.parseDocument(pdDocument);

        // gather all tables of the document once, in paragraph order
        List<Table> tables = document.getParagraphs()
                .stream()
                .flatMap(paragraph -> paragraph.getTables().stream())
                .collect(Collectors.toList());
        assertThat(tables).isNotEmpty();

        Table headerTable = tables.get(0);
        assertThat(headerTable.getColCount()).isEqualTo(8);
        assertThat(headerTable.getRowCount()).isEqualTo(1);

        Table bodyTable = tables.get(1);
        assertThat(bodyTable.getColCount()).isEqualTo(8);
        assertThat(bodyTable.getRowCount()).isEqualTo(6);

        // expected header cells per column: one single-element list for each cell of the header row
        List<List<Cell>> expectedHeaderCells = headerTable.getRows()
                .get(0)
                .stream()
                .map(Collections::singletonList)
                .collect(Collectors.toList());

        boolean everyRowMatches = bodyTable.getRows()
                .stream()
                .allMatch(row -> row.stream()
                        .map(Cell::getHeaderCells)
                        .collect(Collectors.toList())
                        .equals(expectedHeaderCells));
        assertThat(everyRowMatches).isTrue();
    }
}
}