Pull request #33: Fix entity span in table rows and detection of headers in rotated tables
Merge in RED/redaction-service from bugfix/rowspan-and-header-in-rotated-table-fix to master. * commit '4954aafed78e06484531d6264bdf215176ee0ef2': Reduce log level. Remove unused import. Adjust test to added rule and fix vertical header propagation for row > 2. Fix entity span in table rows and detection of headers in rotated tables.
This commit is contained in:
commit
81ea2e91ef
@ -0,0 +1,16 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.Value;
|
||||
|
||||
/**
 * Pairs the text block of a table cell with the offset at which that cell's
 * text starts.
 *
 * <p>EntityRedactionService stores one instance per header name in the row's
 * tabular data, and Section uses {@code rowSpanStart} as the start position
 * (and {@code rowSpanStart + text length} as the end position) of the entity
 * it creates for a highlighted cell.
 *
 * <p>The accessors {@code getTextBlock()} and {@code getRowSpanStart()} used by
 * callers are generated by Lombok.
 */
@Value
|
||||
@RequiredArgsConstructor
|
||||
public class CellValue {
|
||||
|
||||
// Text block holding the cell content (callers pass the cell's first text block).
TextBlock textBlock;
|
||||
|
||||
// Start offset of this cell; the caller passes the accumulated text length of the
// preceding non-header, non-empty cells of the same row.
int rowSpanStart;
|
||||
|
||||
}
|
||||
@ -9,8 +9,6 @@ import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
@ -32,7 +30,7 @@ public class Section {
|
||||
|
||||
private int sectionNumber;
|
||||
|
||||
private Map<String, TextBlock> tabularData;
|
||||
private Map<String, CellValue> tabularData;
|
||||
|
||||
|
||||
public boolean rowEquals(String headerName, String value){
|
||||
@ -40,7 +38,8 @@ public class Section {
|
||||
.replaceAll(" ", "")
|
||||
.replaceAll("-", "");
|
||||
|
||||
return tabularData != null && tabularData.containsKey(cleanHeaderName) && tabularData.get(cleanHeaderName).getText().equals(value);
|
||||
return tabularData != null && tabularData.containsKey(cleanHeaderName)
|
||||
&& tabularData.get(cleanHeaderName).getTextBlock().getText().equals(value);
|
||||
}
|
||||
|
||||
|
||||
@ -177,15 +176,18 @@ public class Section {
|
||||
.replaceAll(" ", "")
|
||||
.replaceAll("-", "");
|
||||
|
||||
TextBlock value = tabularData.get(cleanHeaderName);
|
||||
CellValue value = tabularData.get(cleanHeaderName);
|
||||
if (value == null) {
|
||||
log.warn("Could not find any data for {}.", cellHeader);
|
||||
} else {
|
||||
Entity entity = new Entity(value.getText(), type, 0, value.getText().length(), headline, sectionNumber);
|
||||
Entity entity = new Entity(value.getTextBlock()
|
||||
.getText(), type, value.getRowSpanStart(), value.getRowSpanStart() + value.getTextBlock()
|
||||
.getText()
|
||||
.length(), headline, sectionNumber);
|
||||
entity.setRedaction(false);
|
||||
entity.setMatchedRule(ruleNumber);
|
||||
entity.setRedactionReason(cellHeader);
|
||||
entity.setTargetSequences(value.getSequences()); // Make sure no other cells with same content are highlighted
|
||||
entity.setTargetSequences(value.getTextBlock().getSequences()); // Make sure no other cells with same content are highlighted
|
||||
entities.add(entity);
|
||||
}
|
||||
|
||||
|
||||
@ -18,6 +18,7 @@ import com.iqser.red.service.redaction.v1.model.Rectangle;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
||||
@ -53,26 +54,27 @@ public class EntityRedactionService {
|
||||
for (Table table : tables) {
|
||||
for (List<Cell> row : table.getRows()) {
|
||||
SearchableText searchableRow = new SearchableText();
|
||||
Map<String, TextBlock> tabularData = new HashMap<>();
|
||||
Map<String, CellValue> tabularData = new HashMap<>();
|
||||
int start = 0;
|
||||
for (Cell cell : row) {
|
||||
if (cell.isHeaderCell() || CollectionUtils.isEmpty(cell.getTextBlocks())) {
|
||||
continue;
|
||||
}
|
||||
addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber);
|
||||
int cellStart = start;
|
||||
cell.getHeaderCells().forEach(headerCell -> {
|
||||
|
||||
StringBuilder headerBuilder = new StringBuilder();
|
||||
headerCell.getTextBlocks().forEach(textBlock -> headerBuilder.append(textBlock.getText()));
|
||||
String headerName = headerBuilder.toString()
|
||||
.replaceAll("\n", "")
|
||||
.replaceAll(" ", "")
|
||||
.replaceAll("-", "");
|
||||
tabularData.put(headerName, cell.getTextBlocks().get(0));
|
||||
tabularData.put(headerName, new CellValue(cell.getTextBlocks().get(0), cellStart));
|
||||
});
|
||||
start = start + cell.getTextBlocks().get(0).toString().length();
|
||||
for (TextBlock textBlock : cell.getTextBlocks()) {
|
||||
searchableRow.addAll(textBlock.getSequences());
|
||||
}
|
||||
|
||||
}
|
||||
Set<Entity> rowEntities = findEntities(searchableRow, table.getHeadline(), sectionNumber);
|
||||
|
||||
@ -142,7 +144,7 @@ public class EntityRedactionService {
|
||||
private Set<Entity> findEntities(SearchableText searchableText, String headline, int sectionNumber) {
|
||||
|
||||
Set<Entity> found = new HashSet<>();
|
||||
if (StringUtils.isEmpty(searchableText.toString()) && StringUtils.isEmpty(headline)) {
|
||||
if (StringUtils.isEmpty(searchableText.toString())) {
|
||||
return found;
|
||||
}
|
||||
|
||||
|
||||
@ -85,8 +85,7 @@ public class SectionsBuilderService {
|
||||
}).collect(Collectors.toList());
|
||||
}
|
||||
if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) {
|
||||
for (int i = currentTable.getRows()
|
||||
.size() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
|
||||
for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
|
||||
List<Cell> row = currentTable.getRows().get(i);
|
||||
if (row.size() == tableNonHeaderRow.size() && row.stream()
|
||||
.allMatch(cell -> cell.getHeaderCells().isEmpty())) {
|
||||
@ -185,7 +184,7 @@ public class SectionsBuilderService {
|
||||
|
||||
private List<Cell> getRowWithNonHeaderCells(Table table) {
|
||||
|
||||
for (int i = table.getRows().size() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
|
||||
for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
|
||||
List<Cell> row = table.getRows().get(i);
|
||||
boolean allNonHeader = true;
|
||||
for (Cell cell : row) {
|
||||
|
||||
@ -29,11 +29,13 @@ public class Table extends AbstractTextContainer {
|
||||
@Setter
|
||||
private String headline;
|
||||
|
||||
@Getter
|
||||
private int rowCount;
|
||||
private int unrotatedRowCount;
|
||||
|
||||
@Getter
|
||||
private int colCount;
|
||||
private int unrotatedColCount;
|
||||
|
||||
private int rowCount = -1;
|
||||
|
||||
private int colCount = -1;
|
||||
|
||||
private final int rotation;
|
||||
|
||||
@ -65,6 +67,25 @@ public class Table extends AbstractTextContainer {
|
||||
}
|
||||
|
||||
|
||||
/**
 * Returns the number of rows of this table.
 *
 * <p>The value is computed on the first call as {@code getRows().size()} and
 * cached in {@code rowCount}; later calls return the cached value.
 * NOTE(review): presumably {@code getRows()} yields the rotation-adjusted rows,
 * so this differs from {@code unrotatedRowCount} for rotated tables — confirm.
 *
 * @return the cached row count
 */
public int getRowCount() {
|
||||
|
||||
// -1 is the "not computed yet" sentinel the field is initialised with.
if (rowCount == -1) {
|
||||
rowCount = getRows().size();
|
||||
}
|
||||
return rowCount;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Returns the number of columns of this table, defined as the size of the
 * longest row returned by {@code getRows()}.
 *
 * <p>The value is computed on the first call and cached in {@code colCount};
 * a table without rows yields {@code 0}.
 *
 * @return the cached column count
 */
public int getColCount() {
|
||||
|
||||
// -1 is the "not computed yet" sentinel the field is initialised with.
if (colCount == -1) {
|
||||
// Rows may differ in length, so take the widest one; fall back to 0 when there are no rows.
colCount = getRows().stream().mapToInt(List::size).max().orElse(0);
|
||||
}
|
||||
return colCount;
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Detect header cells (either first row or first column):
|
||||
* Column is marked as header if cell text is bold and row cell text is not bold.
|
||||
@ -72,100 +93,54 @@ public class Table extends AbstractTextContainer {
|
||||
*/
|
||||
private void computeHeaders() {
|
||||
|
||||
if (rows == null) {
|
||||
rows = computeRows();
|
||||
}
|
||||
// A bold cell is a header cell as long as every cell to the left/top is bold, too
|
||||
cells.forEach((position, cell) -> {
|
||||
List<Cell> cellsToTheLeft = getCellsToTheLeft(position);
|
||||
Cell lastHeaderCell = null;
|
||||
for (Cell leftCell : cellsToTheLeft) {
|
||||
if (CollectionUtils.isNotEmpty(leftCell.getTextBlocks()) && leftCell.getTextBlocks()
|
||||
// we move from left to right and top to bottom
|
||||
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
|
||||
List<Cell> rowCells = rows.get(rowIndex);
|
||||
for (int colIndex = 0; colIndex < rowCells.size(); colIndex++) {
|
||||
Cell cell = rowCells.get(colIndex);
|
||||
List<Cell> cellsToTheLeft = rowCells.subList(0, colIndex);
|
||||
Cell lastHeaderCell = null;
|
||||
for (Cell leftCell : cellsToTheLeft) {
|
||||
if (leftCell.isHeaderCell()) {
|
||||
lastHeaderCell = leftCell;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (lastHeaderCell != null) {
|
||||
cell.getHeaderCells().add(lastHeaderCell);
|
||||
}
|
||||
List<Cell> cellsToTheTop = new ArrayList<>();
|
||||
for (int i = 0; i < rowIndex; i++) {
|
||||
try {
|
||||
cellsToTheTop.add(rows.get(i).get(colIndex));
|
||||
} catch (IndexOutOfBoundsException e) {
|
||||
log.debug("No cell {} in row {}, ignoring.", colIndex, rowIndex);
|
||||
}
|
||||
}
|
||||
for (Cell topCell : cellsToTheTop) {
|
||||
if (topCell.isHeaderCell()) {
|
||||
lastHeaderCell = topCell;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (lastHeaderCell != null) {
|
||||
cell.getHeaderCells().add(lastHeaderCell);
|
||||
}
|
||||
if (CollectionUtils.isNotEmpty(cell.getTextBlocks()) && cell.getTextBlocks()
|
||||
.get(0)
|
||||
.getMostPopularWordStyle()
|
||||
.equals("bold")) {
|
||||
lastHeaderCell = leftCell;
|
||||
} else {
|
||||
break;
|
||||
cell.setHeaderCell(true);
|
||||
}
|
||||
}
|
||||
if (lastHeaderCell != null) {
|
||||
cell.getHeaderCells().add(lastHeaderCell);
|
||||
}
|
||||
lastHeaderCell = null;
|
||||
List<Cell> cellsToTheTop = getCellToTheTop(position);
|
||||
for (Cell topCell : cellsToTheTop) {
|
||||
if (CollectionUtils.isNotEmpty(topCell.getTextBlocks()) && topCell.getTextBlocks()
|
||||
.get(0)
|
||||
.getMostPopularWordStyle()
|
||||
.equals("bold")) {
|
||||
lastHeaderCell = topCell;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (lastHeaderCell != null) {
|
||||
cell.getHeaderCells().add(lastHeaderCell);
|
||||
}
|
||||
if (CollectionUtils.isNotEmpty(cell.getTextBlocks()) && cell.getTextBlocks()
|
||||
.get(0)
|
||||
.getMostPopularWordStyle()
|
||||
.equals("bold")) {
|
||||
cell.setHeaderCell(true);
|
||||
}
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
|
||||
private List<Cell> getCellsToTheLeft(CellPosition cellPosition) {
|
||||
|
||||
List<Cell> result = new ArrayList<>();
|
||||
if (cellPosition.getCol() == 0) {
|
||||
return result;
|
||||
}
|
||||
int row = cellPosition.getRow();
|
||||
for (int i = cellPosition.getCol() - 1; i >= 0; i--) {
|
||||
if (cells.get(new CellPosition(row, i)) != null) {
|
||||
result.add(cells.get(new CellPosition(row, i)));
|
||||
} else {
|
||||
Cell spanningCell = null;
|
||||
while (spanningCell == null && row >= 0) {
|
||||
row--;
|
||||
spanningCell = cells.get(new CellPosition(row, i));
|
||||
}
|
||||
if (spanningCell != null) {
|
||||
result.add(spanningCell);
|
||||
}
|
||||
row = cellPosition.getRow();
|
||||
}
|
||||
}
|
||||
Collections.reverse(result);
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
private List<Cell> getCellToTheTop(CellPosition cellPosition) {
|
||||
|
||||
List<Cell> result = new ArrayList<>();
|
||||
if (cellPosition.getRow() == 0) {
|
||||
return result;
|
||||
}
|
||||
int col = cellPosition.getCol();
|
||||
for (int i = cellPosition.getRow() - 1; i >= 0; i--) {
|
||||
if (cells.get(new CellPosition(i, col)) != null) {
|
||||
result.add(cells.get(new CellPosition(i, col)));
|
||||
} else {
|
||||
Cell spanningCell = null;
|
||||
while (spanningCell == null && col >= 0) {
|
||||
col--;
|
||||
spanningCell = cells.get(new CellPosition(i, col));
|
||||
}
|
||||
if (spanningCell != null) {
|
||||
result.add(spanningCell);
|
||||
}
|
||||
col = cellPosition.getCol();
|
||||
}
|
||||
}
|
||||
Collections.reverse(result);
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
@ -173,9 +148,9 @@ public class Table extends AbstractTextContainer {
|
||||
|
||||
List<List<Cell>> rows = new ArrayList<>();
|
||||
if (rotation == 90) {
|
||||
for (int i = 0; i < colCount; i++) { // rows
|
||||
for (int i = 0; i < unrotatedColCount; i++) { // rows
|
||||
List<Cell> lastRow = new ArrayList<>();
|
||||
for (int j = rowCount - 1; j >= 0; j--) { // cols
|
||||
for (int j = unrotatedRowCount - 1; j >= 0; j--) { // cols
|
||||
Cell cell = cells.get(new CellPosition(j, i));
|
||||
if (cell != null) {
|
||||
lastRow.add(cell);
|
||||
@ -184,9 +159,9 @@ public class Table extends AbstractTextContainer {
|
||||
rows.add(lastRow);
|
||||
}
|
||||
} else if (rotation == 270) {
|
||||
for (int i = colCount - 1; i >= 0; i--) { // rows
|
||||
for (int i = unrotatedColCount - 1; i >= 0; i--) { // rows
|
||||
List<Cell> lastRow = new ArrayList<>();
|
||||
for (int j = 0; j < rowCount; j++) { // cols
|
||||
for (int j = 0; j < unrotatedRowCount; j++) { // cols
|
||||
Cell cell = cells.get(new CellPosition(i, j));
|
||||
if (cell != null) {
|
||||
lastRow.add(cell);
|
||||
@ -195,9 +170,9 @@ public class Table extends AbstractTextContainer {
|
||||
rows.add(lastRow);
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < rowCount; i++) {
|
||||
for (int i = 0; i < unrotatedRowCount; i++) {
|
||||
List<Cell> lastRow = new ArrayList<>();
|
||||
for (int j = 0; j < colCount; j++) {
|
||||
for (int j = 0; j < unrotatedColCount; j++) {
|
||||
Cell cell = cells.get(new CellPosition(i, j)); // JAVA_8 use getOrDefault()
|
||||
if (cell != null) {
|
||||
lastRow.add(cell);
|
||||
@ -214,8 +189,8 @@ public class Table extends AbstractTextContainer {
|
||||
|
||||
private void add(Cell chunk, int row, int col) {
|
||||
|
||||
rowCount = Math.max(rowCount, row + 1);
|
||||
colCount = Math.max(colCount, col + 1);
|
||||
unrotatedRowCount = Math.max(unrotatedRowCount, row + 1);
|
||||
unrotatedColCount = Math.max(unrotatedColCount, col + 1);
|
||||
|
||||
CellPosition cp = new CellPosition(row, col);
|
||||
cells.put(cp, chunk);
|
||||
|
||||
@ -130,7 +130,7 @@ public class EntityRedactionServiceTest {
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
entityRedactionService.processDocument(classifiedDoc, null);
|
||||
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
|
||||
assertThat(classifiedDoc.getEntities().get(1)).hasSize(5); // 4 out of 5 entities recognized on page 1
|
||||
assertThat(classifiedDoc.getEntities().get(1)).hasSize(7);// 3 names, 1 address, 1 Y and 2 N entities
|
||||
}
|
||||
}
|
||||
|
||||
@ -193,6 +193,7 @@ public class EntityRedactionServiceTest {
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void headerPropagation() throws IOException {
|
||||
|
||||
@ -219,6 +220,31 @@ public class EntityRedactionServiceTest {
|
||||
}
|
||||
|
||||
|
||||
/**
 * Processes the "Empty Tabular Data" sample PDF with a stubbed name and address
 * dictionary and checks that exactly six entities on page 1 carry matched rule 8
 * (the "Vertebrate Study = N" rule defined in {@code stubRedaction}).
 */
@Test
|
||||
public void testNGuideline() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Empty Tabular Data.pdf");
|
||||
|
||||
// Name dictionary with a single entry.
DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
|
||||
.entries(Collections.singletonList("Aldershof S."))
|
||||
.build();
|
||||
|
||||
// A fresh dictionary version is returned on every call of this stub setup.
when(dictionaryClient.getVersion()).thenReturn(DICTIONARY_VERSION.incrementAndGet());
|
||||
when(dictionaryClient.getDictionaryForType(NAME_CODE)).thenReturn(dictionaryResponse);
|
||||
// Address dictionary with a single entry.
DictionaryResponse addressResponse = DictionaryResponse.builder()
|
||||
.entries(Collections.singletonList("Novartis Crop Protection AG, Basel, Switzerland"))
|
||||
.build();
|
||||
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE)).thenReturn(addressResponse);
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
entityRedactionService.processDocument(classifiedDoc, null);
|
||||
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
|
||||
// Count only the entities on page 1 that were matched by rule number 8.
assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 8).count()).isEqualTo(6);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Before
|
||||
public void stubRedaction() {
|
||||
String tableRules = "package drools\n" +
|
||||
@ -226,12 +252,20 @@ public class EntityRedactionServiceTest {
|
||||
"import com.iqser.red.service.redaction.v1.server.redaction.model.Section\n" +
|
||||
"\n" +
|
||||
"global Section section\n" +
|
||||
"rule \"8: Not redacted because Vertebrate Study = N\"\n" +
|
||||
" when\n" +
|
||||
" Section(rowEquals(\"Vertebrate study Y/N\", \"N\"))\n" +
|
||||
" then\n" +
|
||||
" section.redactNot(\"name\", 8, \"Not redacted because row is not a vertebrate study\");\n" +
|
||||
" section.redactNot(\"address\", 8, \"Not redacted because row is not a vertebrate study\");\n" +
|
||||
" section.highlightCell(\"Vertebrate study Y/N\", 8, \"hint_only\");\n" +
|
||||
" end\n" +
|
||||
"rule \"9: Redact Authors and Addresses in Reference Table, if it is a Vertebrate study\"\n" +
|
||||
" when\n" +
|
||||
" Section(rowEquals(\"Vertebrate study Y/N\", \"Y\"))\n" +
|
||||
" then\n" +
|
||||
" section.redact(\"name\", 9, \"Redacted because row is a vertebrate study\");\n" +
|
||||
" section.redact(\"address\", 9, \"Redacted because rows is a vertebrate study\");\n" +
|
||||
" section.redact(\"address\", 9, \"Redacted because row is a vertebrate study\");\n" +
|
||||
" section.highlightCell(\"Vertebrate study Y/N\", 9, \"must_redact\");\n" +
|
||||
" end";
|
||||
when(rulesClient.getVersion()).thenReturn(1L);
|
||||
|
||||
@ -146,4 +146,44 @@ public class PdfSegmentationServiceTest {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
 * Parses the "Rotated Table Headers" sample PDF and checks the dimensions of the
 * first two tables as well as the header cells assigned to the second table.
 *
 * <p>Expected: the first table has 8 columns and 1 row, the second table has
 * 8 columns and 6 rows, and in every row of the second table each cell's header
 * cells are exactly the single cell at the same column of the first table's row.
 */
@Test
|
||||
public void testHeaderCellsForRotatedTable() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Rotated Table Headers.pdf");
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
|
||||
Document document = pdfSegmentationService.parseDocument(pdDocument);
|
||||
// At least one table must have been detected across all paragraphs.
assertThat(document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
Table firstTable = document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())
|
||||
.get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(8);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(1);
|
||||
Table secondTable = document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())
|
||||
.get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(8);
|
||||
assertThat(secondTable.getRowCount()).isEqualTo(6);
|
||||
// Wrap every cell of the first table's only row in a singleton list so it can be
// compared with the per-cell header lists of the second table.
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
|
||||
.get(0)
|
||||
.stream()
|
||||
.map(Collections::singletonList)
|
||||
.collect(Collectors.toList());
|
||||
// Every row of the second table must reference exactly those header cells, column by column.
assertThat(secondTable.getRows().stream()
|
||||
.allMatch(row -> row.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.collect(Collectors.toList())
|
||||
.equals(firstTableHeaderCells)))
|
||||
.isTrue();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user