Pull request #33: Fix entity span in table rows and detection of headers in rotated tables

Merge in RED/redaction-service from bugfix/rowspan-and-header-in-rotated-table-fix to master

* commit '4954aafed78e06484531d6264bdf215176ee0ef2':
  Reduce log level.
  Remove unused import
  Adjust test to added rule and fix vertical header propagation for row > 2
  Fix entity span in table rows and detection of headers in rotated tables
This commit is contained in:
Thierry Goeckel 2020-08-25 17:30:57 +02:00
commit 81ea2e91ef
9 changed files with 183 additions and 115 deletions

View File

@ -0,0 +1,16 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import lombok.RequiredArgsConstructor;
import lombok.Value;
/**
 * Immutable value object pairing a table cell's {@link TextBlock} with a character offset.
 *
 * <p>Lombok's {@code @Value} makes both fields private and final and generates the getters
 * ({@code getTextBlock()}, {@code getRowSpanStart()}) as well as {@code equals}, {@code hashCode}
 * and {@code toString}; the constructor takes the fields in declaration order.
 */
@Value
@RequiredArgsConstructor
public class CellValue {
// Text block stored for the cell.
// NOTE(review): the caller visible in this change passes only the cell's first text block — confirm
// that multi-block cells are intentionally reduced to their first block.
TextBlock textBlock;
// Start offset of this cell's text; the caller visible in this change passes the running sum of the
// lengths of the preceding non-header, non-empty cells in the same row.
// NOTE(review): presumably an offset into the row's concatenated searchable text — verify against
// how the Entity start/end positions are consumed.
int rowSpanStart;
}

View File

@ -9,8 +9,6 @@ import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import lombok.Builder;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;
@ -32,7 +30,7 @@ public class Section {
private int sectionNumber;
private Map<String, TextBlock> tabularData;
private Map<String, CellValue> tabularData;
public boolean rowEquals(String headerName, String value){
@ -40,7 +38,8 @@ public class Section {
.replaceAll(" ", "")
.replaceAll("-", "");
return tabularData != null && tabularData.containsKey(cleanHeaderName) && tabularData.get(cleanHeaderName).getText().equals(value);
return tabularData != null && tabularData.containsKey(cleanHeaderName)
&& tabularData.get(cleanHeaderName).getTextBlock().getText().equals(value);
}
@ -177,15 +176,18 @@ public class Section {
.replaceAll(" ", "")
.replaceAll("-", "");
TextBlock value = tabularData.get(cleanHeaderName);
CellValue value = tabularData.get(cleanHeaderName);
if (value == null) {
log.warn("Could not find any data for {}.", cellHeader);
} else {
Entity entity = new Entity(value.getText(), type, 0, value.getText().length(), headline, sectionNumber);
Entity entity = new Entity(value.getTextBlock()
.getText(), type, value.getRowSpanStart(), value.getRowSpanStart() + value.getTextBlock()
.getText()
.length(), headline, sectionNumber);
entity.setRedaction(false);
entity.setMatchedRule(ruleNumber);
entity.setRedactionReason(cellHeader);
entity.setTargetSequences(value.getSequences()); // Make sure no other cells with same content are highlighted
entity.setTargetSequences(value.getTextBlock().getSequences()); // Make sure no other cells with same content are highlighted
entities.add(entity);
}

View File

@ -18,6 +18,7 @@ import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
@ -53,26 +54,27 @@ public class EntityRedactionService {
for (Table table : tables) {
for (List<Cell> row : table.getRows()) {
SearchableText searchableRow = new SearchableText();
Map<String, TextBlock> tabularData = new HashMap<>();
Map<String, CellValue> tabularData = new HashMap<>();
int start = 0;
for (Cell cell : row) {
if (cell.isHeaderCell() || CollectionUtils.isEmpty(cell.getTextBlocks())) {
continue;
}
addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber);
int cellStart = start;
cell.getHeaderCells().forEach(headerCell -> {
StringBuilder headerBuilder = new StringBuilder();
headerCell.getTextBlocks().forEach(textBlock -> headerBuilder.append(textBlock.getText()));
String headerName = headerBuilder.toString()
.replaceAll("\n", "")
.replaceAll(" ", "")
.replaceAll("-", "");
tabularData.put(headerName, cell.getTextBlocks().get(0));
tabularData.put(headerName, new CellValue(cell.getTextBlocks().get(0), cellStart));
});
start = start + cell.getTextBlocks().get(0).toString().length();
for (TextBlock textBlock : cell.getTextBlocks()) {
searchableRow.addAll(textBlock.getSequences());
}
}
Set<Entity> rowEntities = findEntities(searchableRow, table.getHeadline(), sectionNumber);
@ -142,7 +144,7 @@ public class EntityRedactionService {
private Set<Entity> findEntities(SearchableText searchableText, String headline, int sectionNumber) {
Set<Entity> found = new HashSet<>();
if (StringUtils.isEmpty(searchableText.toString()) && StringUtils.isEmpty(headline)) {
if (StringUtils.isEmpty(searchableText.toString())) {
return found;
}

View File

@ -85,8 +85,7 @@ public class SectionsBuilderService {
}).collect(Collectors.toList());
}
if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) {
for (int i = currentTable.getRows()
.size() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
List<Cell> row = currentTable.getRows().get(i);
if (row.size() == tableNonHeaderRow.size() && row.stream()
.allMatch(cell -> cell.getHeaderCells().isEmpty())) {
@ -185,7 +184,7 @@ public class SectionsBuilderService {
private List<Cell> getRowWithNonHeaderCells(Table table) {
for (int i = table.getRows().size() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
List<Cell> row = table.getRows().get(i);
boolean allNonHeader = true;
for (Cell cell : row) {

View File

@ -29,11 +29,13 @@ public class Table extends AbstractTextContainer {
@Setter
private String headline;
@Getter
private int rowCount;
private int unrotatedRowCount;
@Getter
private int colCount;
private int unrotatedColCount;
private int rowCount = -1;
private int colCount = -1;
private final int rotation;
@ -65,6 +67,25 @@ public class Table extends AbstractTextContainer {
}
/**
 * Returns the number of rows of this table as exposed by {@code getRows()}.
 *
 * <p>The value is computed on first access and cached; {@code -1} marks "not computed yet".
 *
 * @return the cached size of the row list
 */
public int getRowCount() {
    // Already computed on an earlier call: hand back the cached value.
    if (rowCount != -1) {
        return rowCount;
    }
    rowCount = getRows().size();
    return rowCount;
}
/**
 * Returns the number of columns of this table, defined as the size of its longest row.
 *
 * <p>The value is computed on first access and cached; {@code -1} marks "not computed yet".
 * A table without rows reports {@code 0} columns.
 *
 * @return the cached maximum row length
 */
public int getColCount() {
    if (colCount == -1) {
        // Row sizes are never negative, so starting at 0 also covers the "no rows" case.
        int widestRow = 0;
        for (List<Cell> row : getRows()) {
            widestRow = Math.max(widestRow, row.size());
        }
        colCount = widestRow;
    }
    return colCount;
}
/**
* Detect header cells (either first row or first column):
* Column is marked as header if cell text is bold and row cell text is not bold.
@ -72,100 +93,54 @@ public class Table extends AbstractTextContainer {
*/
private void computeHeaders() {
if (rows == null) {
rows = computeRows();
}
// A bold cell is a header cell as long as every cell to the left/top is bold, too
cells.forEach((position, cell) -> {
List<Cell> cellsToTheLeft = getCellsToTheLeft(position);
Cell lastHeaderCell = null;
for (Cell leftCell : cellsToTheLeft) {
if (CollectionUtils.isNotEmpty(leftCell.getTextBlocks()) && leftCell.getTextBlocks()
// we move from left to right and top to bottom
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
List<Cell> rowCells = rows.get(rowIndex);
for (int colIndex = 0; colIndex < rowCells.size(); colIndex++) {
Cell cell = rowCells.get(colIndex);
List<Cell> cellsToTheLeft = rowCells.subList(0, colIndex);
Cell lastHeaderCell = null;
for (Cell leftCell : cellsToTheLeft) {
if (leftCell.isHeaderCell()) {
lastHeaderCell = leftCell;
} else {
break;
}
}
if (lastHeaderCell != null) {
cell.getHeaderCells().add(lastHeaderCell);
}
List<Cell> cellsToTheTop = new ArrayList<>();
for (int i = 0; i < rowIndex; i++) {
try {
cellsToTheTop.add(rows.get(i).get(colIndex));
} catch (IndexOutOfBoundsException e) {
log.debug("No cell {} in row {}, ignoring.", colIndex, rowIndex);
}
}
for (Cell topCell : cellsToTheTop) {
if (topCell.isHeaderCell()) {
lastHeaderCell = topCell;
} else {
break;
}
}
if (lastHeaderCell != null) {
cell.getHeaderCells().add(lastHeaderCell);
}
if (CollectionUtils.isNotEmpty(cell.getTextBlocks()) && cell.getTextBlocks()
.get(0)
.getMostPopularWordStyle()
.equals("bold")) {
lastHeaderCell = leftCell;
} else {
break;
cell.setHeaderCell(true);
}
}
if (lastHeaderCell != null) {
cell.getHeaderCells().add(lastHeaderCell);
}
lastHeaderCell = null;
List<Cell> cellsToTheTop = getCellToTheTop(position);
for (Cell topCell : cellsToTheTop) {
if (CollectionUtils.isNotEmpty(topCell.getTextBlocks()) && topCell.getTextBlocks()
.get(0)
.getMostPopularWordStyle()
.equals("bold")) {
lastHeaderCell = topCell;
} else {
break;
}
}
if (lastHeaderCell != null) {
cell.getHeaderCells().add(lastHeaderCell);
}
if (CollectionUtils.isNotEmpty(cell.getTextBlocks()) && cell.getTextBlocks()
.get(0)
.getMostPopularWordStyle()
.equals("bold")) {
cell.setHeaderCell(true);
}
});
}
private List<Cell> getCellsToTheLeft(CellPosition cellPosition) {
List<Cell> result = new ArrayList<>();
if (cellPosition.getCol() == 0) {
return result;
}
int row = cellPosition.getRow();
for (int i = cellPosition.getCol() - 1; i >= 0; i--) {
if (cells.get(new CellPosition(row, i)) != null) {
result.add(cells.get(new CellPosition(row, i)));
} else {
Cell spanningCell = null;
while (spanningCell == null && row >= 0) {
row--;
spanningCell = cells.get(new CellPosition(row, i));
}
if (spanningCell != null) {
result.add(spanningCell);
}
row = cellPosition.getRow();
}
}
Collections.reverse(result);
return result;
}
private List<Cell> getCellToTheTop(CellPosition cellPosition) {
List<Cell> result = new ArrayList<>();
if (cellPosition.getRow() == 0) {
return result;
}
int col = cellPosition.getCol();
for (int i = cellPosition.getRow() - 1; i >= 0; i--) {
if (cells.get(new CellPosition(i, col)) != null) {
result.add(cells.get(new CellPosition(i, col)));
} else {
Cell spanningCell = null;
while (spanningCell == null && col >= 0) {
col--;
spanningCell = cells.get(new CellPosition(i, col));
}
if (spanningCell != null) {
result.add(spanningCell);
}
col = cellPosition.getCol();
}
}
Collections.reverse(result);
return result;
}
@ -173,9 +148,9 @@ public class Table extends AbstractTextContainer {
List<List<Cell>> rows = new ArrayList<>();
if (rotation == 90) {
for (int i = 0; i < colCount; i++) { // rows
for (int i = 0; i < unrotatedColCount; i++) { // rows
List<Cell> lastRow = new ArrayList<>();
for (int j = rowCount - 1; j >= 0; j--) { // cols
for (int j = unrotatedRowCount - 1; j >= 0; j--) { // cols
Cell cell = cells.get(new CellPosition(j, i));
if (cell != null) {
lastRow.add(cell);
@ -184,9 +159,9 @@ public class Table extends AbstractTextContainer {
rows.add(lastRow);
}
} else if (rotation == 270) {
for (int i = colCount - 1; i >= 0; i--) { // rows
for (int i = unrotatedColCount - 1; i >= 0; i--) { // rows
List<Cell> lastRow = new ArrayList<>();
for (int j = 0; j < rowCount; j++) { // cols
for (int j = 0; j < unrotatedRowCount; j++) { // cols
Cell cell = cells.get(new CellPosition(i, j));
if (cell != null) {
lastRow.add(cell);
@ -195,9 +170,9 @@ public class Table extends AbstractTextContainer {
rows.add(lastRow);
}
} else {
for (int i = 0; i < rowCount; i++) {
for (int i = 0; i < unrotatedRowCount; i++) {
List<Cell> lastRow = new ArrayList<>();
for (int j = 0; j < colCount; j++) {
for (int j = 0; j < unrotatedColCount; j++) {
Cell cell = cells.get(new CellPosition(i, j)); // JAVA_8 use getOrDefault()
if (cell != null) {
lastRow.add(cell);
@ -214,8 +189,8 @@ public class Table extends AbstractTextContainer {
private void add(Cell chunk, int row, int col) {
rowCount = Math.max(rowCount, row + 1);
colCount = Math.max(colCount, col + 1);
unrotatedRowCount = Math.max(unrotatedRowCount, row + 1);
unrotatedColCount = Math.max(unrotatedColCount, col + 1);
CellPosition cp = new CellPosition(row, col);
cells.put(cp, chunk);

View File

@ -130,7 +130,7 @@ public class EntityRedactionServiceTest {
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
entityRedactionService.processDocument(classifiedDoc, null);
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1)).hasSize(5); // 4 out of 5 entities recognized on page 1
assertThat(classifiedDoc.getEntities().get(1)).hasSize(7);// 3 names, 1 address, 1 Y and 2 N entities
}
}
@ -193,6 +193,7 @@ public class EntityRedactionServiceTest {
}
@Test
public void headerPropagation() throws IOException {
@ -219,6 +220,31 @@ public class EntityRedactionServiceTest {
}
/**
 * Processes the "Empty Tabular Data" sample with one name and one address in the dictionaries
 * and expects exactly six entities on page 1 to carry matched rule 8.
 */
@Test
public void testNGuideline() throws IOException {
    // Dictionary fixtures: a single author name and a single address.
    DictionaryResponse nameResponse = DictionaryResponse.builder()
            .entries(Collections.singletonList("Aldershof S."))
            .build();
    DictionaryResponse addressResponse = DictionaryResponse.builder()
            .entries(Collections.singletonList("Novartis Crop Protection AG, Basel, Switzerland"))
            .build();

    when(dictionaryClient.getVersion()).thenReturn(DICTIONARY_VERSION.incrementAndGet());
    when(dictionaryClient.getDictionaryForType(NAME_CODE)).thenReturn(nameResponse);
    when(dictionaryClient.getDictionaryForType(ADDRESS_CODE)).thenReturn(addressResponse);

    ClassPathResource samplePdf = new ClassPathResource("files/Minimal Examples/Empty Tabular Data.pdf");
    try (PDDocument pdDocument = PDDocument.load(samplePdf.getInputStream())) {
        Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
        entityRedactionService.processDocument(classifiedDoc, null);

        assertThat(classifiedDoc.getEntities()).hasSize(1); // one page

        long ruleEightMatches = classifiedDoc.getEntities()
                .get(1)
                .stream()
                .filter(entity -> entity.getMatchedRule() == 8)
                .count();
        assertThat(ruleEightMatches).isEqualTo(6);
    }
}
@Before
public void stubRedaction() {
String tableRules = "package drools\n" +
@ -226,12 +252,20 @@ public class EntityRedactionServiceTest {
"import com.iqser.red.service.redaction.v1.server.redaction.model.Section\n" +
"\n" +
"global Section section\n" +
"rule \"8: Not redacted because Vertebrate Study = N\"\n" +
" when\n" +
" Section(rowEquals(\"Vertebrate study Y/N\", \"N\"))\n" +
" then\n" +
" section.redactNot(\"name\", 8, \"Not redacted because row is not a vertebrate study\");\n" +
" section.redactNot(\"address\", 8, \"Not redacted because row is not a vertebrate study\");\n" +
" section.highlightCell(\"Vertebrate study Y/N\", 8, \"hint_only\");\n" +
" end\n" +
"rule \"9: Redact Authors and Addresses in Reference Table, if it is a Vertebrate study\"\n" +
" when\n" +
" Section(rowEquals(\"Vertebrate study Y/N\", \"Y\"))\n" +
" then\n" +
" section.redact(\"name\", 9, \"Redacted because row is a vertebrate study\");\n" +
" section.redact(\"address\", 9, \"Redacted because rows is a vertebrate study\");\n" +
" section.redact(\"address\", 9, \"Redacted because row is a vertebrate study\");\n" +
" section.highlightCell(\"Vertebrate study Y/N\", 9, \"must_redact\");\n" +
" end";
when(rulesClient.getVersion()).thenReturn(1L);

View File

@ -146,4 +146,44 @@ public class PdfSegmentationServiceTest {
}
}
/**
 * Parses the "Rotated Table Headers" sample and checks the dimensions of its first two tables
 * (1x8 and 6x8) and that every row of the second table has, cell by cell, exactly the
 * corresponding cell of the first table's only row as its header cell.
 */
@Test
public void testHeaderCellsForRotatedTable() throws IOException {
    ClassPathResource samplePdf = new ClassPathResource("files/Minimal Examples/Rotated Table Headers.pdf");

    try (PDDocument pdDocument = PDDocument.load(samplePdf.getInputStream())) {
        Document document = pdfSegmentationService.parseDocument(pdDocument);

        // All tables of the document, in paragraph order.
        List<Table> tables = document.getParagraphs()
                .stream()
                .flatMap(paragraph -> paragraph.getTables().stream())
                .collect(Collectors.toList());
        assertThat(tables).isNotEmpty();

        Table firstTable = tables.get(0);
        assertThat(firstTable.getColCount()).isEqualTo(8);
        assertThat(firstTable.getRowCount()).isEqualTo(1);

        Table secondTable = tables.get(1);
        assertThat(secondTable.getColCount()).isEqualTo(8);
        assertThat(secondTable.getRowCount()).isEqualTo(6);

        // Expected header list per column: a singleton holding the first table's cell in that column.
        List<List<Cell>> expectedHeaders = firstTable.getRows()
                .get(0)
                .stream()
                .map(Collections::singletonList)
                .collect(Collectors.toList());

        boolean everyRowUsesFirstTableAsHeader = secondTable.getRows()
                .stream()
                .allMatch(row -> row.stream()
                        .map(Cell::getHeaderCells)
                        .collect(Collectors.toList())
                        .equals(expectedHeaders));
        assertThat(everyRowUsesFirstTableAsHeader).isTrue();
    }
}
}