Fix entity span in table rows and detection of headers in rotated tables
This commit is contained in:
parent
6483e637c6
commit
848c506c3f
@ -0,0 +1,16 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.Value;
|
||||
|
||||
/**
 * Pairs the text block of a table cell with the offset at which that cell
 * starts inside the text of its table row.
 *
 * <p>In the callers visible in this change, the offset is the running sum of
 * the lengths of the preceding non-header cells in the same row, and it is
 * used as the start position of the {@code Entity} created for the cell.
 */
@Value
|
||||
@RequiredArgsConstructor
|
||||
public class CellValue {
|
||||
|
||||
// Text block holding the cell content (callers pass the first text block of the cell).
TextBlock textBlock;
|
||||
|
||||
// Offset of the cell within the row text; the entity end is this value plus the text length.
int rowSpanStart;
|
||||
|
||||
}
|
||||
@ -32,7 +32,7 @@ public class Section {
|
||||
|
||||
private int sectionNumber;
|
||||
|
||||
private Map<String, TextBlock> tabularData;
|
||||
private Map<String, CellValue> tabularData;
|
||||
|
||||
|
||||
public boolean rowEquals(String headerName, String value){
|
||||
@ -40,7 +40,8 @@ public class Section {
|
||||
.replaceAll(" ", "")
|
||||
.replaceAll("-", "");
|
||||
|
||||
return tabularData != null && tabularData.containsKey(cleanHeaderName) && tabularData.get(cleanHeaderName).getText().equals(value);
|
||||
return tabularData != null && tabularData.containsKey(cleanHeaderName)
|
||||
&& tabularData.get(cleanHeaderName).getTextBlock().getText().equals(value);
|
||||
}
|
||||
|
||||
|
||||
@ -177,15 +178,18 @@ public class Section {
|
||||
.replaceAll(" ", "")
|
||||
.replaceAll("-", "");
|
||||
|
||||
TextBlock value = tabularData.get(cleanHeaderName);
|
||||
CellValue value = tabularData.get(cleanHeaderName);
|
||||
if (value == null) {
|
||||
log.warn("Could not find any data for {}.", cellHeader);
|
||||
} else {
|
||||
Entity entity = new Entity(value.getText(), type, 0, value.getText().length(), headline, sectionNumber);
|
||||
Entity entity = new Entity(value.getTextBlock()
|
||||
.getText(), type, value.getRowSpanStart(), value.getRowSpanStart() + value.getTextBlock()
|
||||
.getText()
|
||||
.length(), headline, sectionNumber);
|
||||
entity.setRedaction(false);
|
||||
entity.setMatchedRule(ruleNumber);
|
||||
entity.setRedactionReason(cellHeader);
|
||||
entity.setTargetSequences(value.getSequences()); // Make sure no other cells with same content are highlighted
|
||||
entity.setTargetSequences(value.getTextBlock().getSequences()); // Make sure no other cells with same content are highlighted
|
||||
entities.add(entity);
|
||||
}
|
||||
|
||||
|
||||
@ -18,6 +18,7 @@ import com.iqser.red.service.redaction.v1.model.Rectangle;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
||||
@ -53,26 +54,27 @@ public class EntityRedactionService {
|
||||
for (Table table : tables) {
|
||||
for (List<Cell> row : table.getRows()) {
|
||||
SearchableText searchableRow = new SearchableText();
|
||||
Map<String, TextBlock> tabularData = new HashMap<>();
|
||||
Map<String, CellValue> tabularData = new HashMap<>();
|
||||
int start = 0;
|
||||
for (Cell cell : row) {
|
||||
if (cell.isHeaderCell() || CollectionUtils.isEmpty(cell.getTextBlocks())) {
|
||||
continue;
|
||||
}
|
||||
addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber);
|
||||
int cellStart = start;
|
||||
cell.getHeaderCells().forEach(headerCell -> {
|
||||
|
||||
StringBuilder headerBuilder = new StringBuilder();
|
||||
headerCell.getTextBlocks().forEach(textBlock -> headerBuilder.append(textBlock.getText()));
|
||||
String headerName = headerBuilder.toString()
|
||||
.replaceAll("\n", "")
|
||||
.replaceAll(" ", "")
|
||||
.replaceAll("-", "");
|
||||
tabularData.put(headerName, cell.getTextBlocks().get(0));
|
||||
tabularData.put(headerName, new CellValue(cell.getTextBlocks().get(0), cellStart));
|
||||
});
|
||||
start = start + cell.getTextBlocks().get(0).toString().length();
|
||||
for (TextBlock textBlock : cell.getTextBlocks()) {
|
||||
searchableRow.addAll(textBlock.getSequences());
|
||||
}
|
||||
|
||||
}
|
||||
Set<Entity> rowEntities = findEntities(searchableRow, table.getHeadline(), sectionNumber);
|
||||
|
||||
@ -142,7 +144,7 @@ public class EntityRedactionService {
|
||||
private Set<Entity> findEntities(SearchableText searchableText, String headline, int sectionNumber) {
|
||||
|
||||
Set<Entity> found = new HashSet<>();
|
||||
if (StringUtils.isEmpty(searchableText.toString()) && StringUtils.isEmpty(headline)) {
|
||||
if (StringUtils.isEmpty(searchableText.toString())) {
|
||||
return found;
|
||||
}
|
||||
|
||||
|
||||
@ -85,8 +85,7 @@ public class SectionsBuilderService {
|
||||
}).collect(Collectors.toList());
|
||||
}
|
||||
if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) {
|
||||
for (int i = currentTable.getRows()
|
||||
.size() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
|
||||
for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
|
||||
List<Cell> row = currentTable.getRows().get(i);
|
||||
if (row.size() == tableNonHeaderRow.size() && row.stream()
|
||||
.allMatch(cell -> cell.getHeaderCells().isEmpty())) {
|
||||
@ -185,7 +184,7 @@ public class SectionsBuilderService {
|
||||
|
||||
private List<Cell> getRowWithNonHeaderCells(Table table) {
|
||||
|
||||
for (int i = table.getRows().size() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
|
||||
for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
|
||||
List<Cell> row = table.getRows().get(i);
|
||||
boolean allNonHeader = true;
|
||||
for (Cell cell : row) {
|
||||
|
||||
@ -29,11 +29,13 @@ public class Table extends AbstractTextContainer {
|
||||
@Setter
|
||||
private String headline;
|
||||
|
||||
@Getter
|
||||
private int rowCount;
|
||||
private int unrotatedRowCount;
|
||||
|
||||
@Getter
|
||||
private int colCount;
|
||||
private int unrotatedColCount;
|
||||
|
||||
private int rowCount = -1;
|
||||
|
||||
private int colCount = -1;
|
||||
|
||||
private final int rotation;
|
||||
|
||||
@ -65,6 +67,25 @@ public class Table extends AbstractTextContainer {
|
||||
}
|
||||
|
||||
|
||||
/**
 * Returns the number of rows of this table as reported by {@code getRows()}.
 *
 * <p>The value is computed on the first call and cached in {@code rowCount};
 * {@code -1} is the "not computed yet" marker the field is initialised with.
 *
 * @return the size of the list returned by {@code getRows()}
 */
public int getRowCount() {
|
||||
|
||||
// -1 means the count has not been derived from the rows yet.
if (rowCount == -1) {
|
||||
rowCount = getRows().size();
|
||||
}
|
||||
return rowCount;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Returns the number of columns of this table, defined as the size of the
 * longest row returned by {@code getRows()}.
 *
 * <p>The value is computed on the first call and cached in {@code colCount};
 * {@code -1} is the "not computed yet" marker the field is initialised with.
 *
 * @return the maximum row size, or {@code 0} when the table has no rows
 */
public int getColCount() {
|
||||
|
||||
// -1 means the count has not been derived from the rows yet.
if (colCount == -1) {
|
||||
// Rows may differ in length, so take the widest one; an empty table yields 0.
colCount = getRows().stream().mapToInt(List::size).max().orElse(0);
|
||||
}
|
||||
return colCount;
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Detect header cells (either first row or first column):
|
||||
* Column is marked as header if cell text is bold and row cell text is not bold.
|
||||
@ -72,100 +93,50 @@ public class Table extends AbstractTextContainer {
|
||||
*/
|
||||
private void computeHeaders() {
|
||||
|
||||
if (rows == null) {
|
||||
rows = computeRows();
|
||||
}
|
||||
// A bold cell is a header cell as long as every cell to the left/top is bold, too
|
||||
cells.forEach((position, cell) -> {
|
||||
List<Cell> cellsToTheLeft = getCellsToTheLeft(position);
|
||||
Cell lastHeaderCell = null;
|
||||
for (Cell leftCell : cellsToTheLeft) {
|
||||
if (CollectionUtils.isNotEmpty(leftCell.getTextBlocks()) && leftCell.getTextBlocks()
|
||||
// we move from left to right and top to bottom
|
||||
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
|
||||
List<Cell> rowCells = rows.get(rowIndex);
|
||||
for (int colIndex = 0; colIndex < rowCells.size(); colIndex++) {
|
||||
Cell cell = rowCells.get(colIndex);
|
||||
List<Cell> cellsToTheLeft = rowCells.subList(0, colIndex);
|
||||
Cell lastHeaderCell = null;
|
||||
for (Cell leftCell : cellsToTheLeft) {
|
||||
if (leftCell.isHeaderCell()) {
|
||||
lastHeaderCell = leftCell;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (lastHeaderCell != null) {
|
||||
cell.getHeaderCells().add(lastHeaderCell);
|
||||
}
|
||||
List<Cell> cellsToTheTop = new ArrayList<>();
|
||||
for (int i = rowIndex - 1; i >= 0; i--) {
|
||||
cellsToTheTop.add(rows.get(i).get(colIndex));
|
||||
}
|
||||
for (Cell topCell : cellsToTheTop) {
|
||||
if (topCell.isHeaderCell()) {
|
||||
lastHeaderCell = topCell;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (lastHeaderCell != null) {
|
||||
cell.getHeaderCells().add(lastHeaderCell);
|
||||
}
|
||||
if (CollectionUtils.isNotEmpty(cell.getTextBlocks()) && cell.getTextBlocks()
|
||||
.get(0)
|
||||
.getMostPopularWordStyle()
|
||||
.equals("bold")) {
|
||||
lastHeaderCell = leftCell;
|
||||
} else {
|
||||
break;
|
||||
cell.setHeaderCell(true);
|
||||
}
|
||||
}
|
||||
if (lastHeaderCell != null) {
|
||||
cell.getHeaderCells().add(lastHeaderCell);
|
||||
}
|
||||
lastHeaderCell = null;
|
||||
List<Cell> cellsToTheTop = getCellToTheTop(position);
|
||||
for (Cell topCell : cellsToTheTop) {
|
||||
if (CollectionUtils.isNotEmpty(topCell.getTextBlocks()) && topCell.getTextBlocks()
|
||||
.get(0)
|
||||
.getMostPopularWordStyle()
|
||||
.equals("bold")) {
|
||||
lastHeaderCell = topCell;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (lastHeaderCell != null) {
|
||||
cell.getHeaderCells().add(lastHeaderCell);
|
||||
}
|
||||
if (CollectionUtils.isNotEmpty(cell.getTextBlocks()) && cell.getTextBlocks()
|
||||
.get(0)
|
||||
.getMostPopularWordStyle()
|
||||
.equals("bold")) {
|
||||
cell.setHeaderCell(true);
|
||||
}
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
|
||||
private List<Cell> getCellsToTheLeft(CellPosition cellPosition) {
|
||||
|
||||
List<Cell> result = new ArrayList<>();
|
||||
if (cellPosition.getCol() == 0) {
|
||||
return result;
|
||||
}
|
||||
int row = cellPosition.getRow();
|
||||
for (int i = cellPosition.getCol() - 1; i >= 0; i--) {
|
||||
if (cells.get(new CellPosition(row, i)) != null) {
|
||||
result.add(cells.get(new CellPosition(row, i)));
|
||||
} else {
|
||||
Cell spanningCell = null;
|
||||
while (spanningCell == null && row >= 0) {
|
||||
row--;
|
||||
spanningCell = cells.get(new CellPosition(row, i));
|
||||
}
|
||||
if (spanningCell != null) {
|
||||
result.add(spanningCell);
|
||||
}
|
||||
row = cellPosition.getRow();
|
||||
}
|
||||
}
|
||||
Collections.reverse(result);
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
private List<Cell> getCellToTheTop(CellPosition cellPosition) {
|
||||
|
||||
List<Cell> result = new ArrayList<>();
|
||||
if (cellPosition.getRow() == 0) {
|
||||
return result;
|
||||
}
|
||||
int col = cellPosition.getCol();
|
||||
for (int i = cellPosition.getRow() - 1; i >= 0; i--) {
|
||||
if (cells.get(new CellPosition(i, col)) != null) {
|
||||
result.add(cells.get(new CellPosition(i, col)));
|
||||
} else {
|
||||
Cell spanningCell = null;
|
||||
while (spanningCell == null && col >= 0) {
|
||||
col--;
|
||||
spanningCell = cells.get(new CellPosition(i, col));
|
||||
}
|
||||
if (spanningCell != null) {
|
||||
result.add(spanningCell);
|
||||
}
|
||||
col = cellPosition.getCol();
|
||||
}
|
||||
}
|
||||
Collections.reverse(result);
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
@ -173,9 +144,9 @@ public class Table extends AbstractTextContainer {
|
||||
|
||||
List<List<Cell>> rows = new ArrayList<>();
|
||||
if (rotation == 90) {
|
||||
for (int i = 0; i < colCount; i++) { // rows
|
||||
for (int i = 0; i < unrotatedColCount; i++) { // rows
|
||||
List<Cell> lastRow = new ArrayList<>();
|
||||
for (int j = rowCount - 1; j >= 0; j--) { // cols
|
||||
for (int j = unrotatedRowCount - 1; j >= 0; j--) { // cols
|
||||
Cell cell = cells.get(new CellPosition(j, i));
|
||||
if (cell != null) {
|
||||
lastRow.add(cell);
|
||||
@ -184,9 +155,9 @@ public class Table extends AbstractTextContainer {
|
||||
rows.add(lastRow);
|
||||
}
|
||||
} else if (rotation == 270) {
|
||||
for (int i = colCount - 1; i >= 0; i--) { // rows
|
||||
for (int i = unrotatedColCount - 1; i >= 0; i--) { // rows
|
||||
List<Cell> lastRow = new ArrayList<>();
|
||||
for (int j = 0; j < rowCount; j++) { // cols
|
||||
for (int j = 0; j < unrotatedRowCount; j++) { // cols
|
||||
Cell cell = cells.get(new CellPosition(i, j));
|
||||
if (cell != null) {
|
||||
lastRow.add(cell);
|
||||
@ -195,9 +166,9 @@ public class Table extends AbstractTextContainer {
|
||||
rows.add(lastRow);
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < rowCount; i++) {
|
||||
for (int i = 0; i < unrotatedRowCount; i++) {
|
||||
List<Cell> lastRow = new ArrayList<>();
|
||||
for (int j = 0; j < colCount; j++) {
|
||||
for (int j = 0; j < unrotatedColCount; j++) {
|
||||
Cell cell = cells.get(new CellPosition(i, j)); // JAVA_8 use getOrDefault()
|
||||
if (cell != null) {
|
||||
lastRow.add(cell);
|
||||
@ -214,8 +185,8 @@ public class Table extends AbstractTextContainer {
|
||||
|
||||
private void add(Cell chunk, int row, int col) {
|
||||
|
||||
rowCount = Math.max(rowCount, row + 1);
|
||||
colCount = Math.max(colCount, col + 1);
|
||||
unrotatedRowCount = Math.max(unrotatedRowCount, row + 1);
|
||||
unrotatedColCount = Math.max(unrotatedColCount, col + 1);
|
||||
|
||||
CellPosition cp = new CellPosition(row, col);
|
||||
cells.put(cp, chunk);
|
||||
|
||||
@ -146,4 +146,44 @@ public class PdfSegmentationServiceTest {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
 * Parses the "Rotated Table Headers" sample PDF and checks the dimensions of
 * the first two tables found, and that every cell of the second table
 * references the matching cell of the first table's single row as its header.
 *
 * @throws IOException if the PDF resource cannot be read
 */
@Test
|
||||
public void testHeaderCellsForRotatedTable() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Rotated Table Headers.pdf");
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
|
||||
Document document = pdfSegmentationService.parseDocument(pdDocument);
|
||||
// The document must contain at least one table across all paragraphs.
assertThat(document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
// First table: expected to be a single row of 8 cells.
Table firstTable = document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())
|
||||
.get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(8);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(1);
|
||||
// Second table: expected to have 6 rows of 8 cells.
Table secondTable = document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())
|
||||
.get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(8);
|
||||
assertThat(secondTable.getRowCount()).isEqualTo(6);
|
||||
// Expected header list per column: a singleton list holding the first table's cell in that column.
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
|
||||
.get(0)
|
||||
.stream()
|
||||
.map(Collections::singletonList)
|
||||
.collect(Collectors.toList());
|
||||
// Every row of the second table must carry exactly those header cells, column by column.
assertThat(secondTable.getRows().stream()
|
||||
.allMatch(row -> row.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.collect(Collectors.toList())
|
||||
.equals(firstTableHeaderCells)))
|
||||
.isTrue();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user