Pull request #117: Fixed several table extraction problems

Merge in RED/redaction-service from tableExtractionImprovement to master

* commit 'a101b98a400b2a35635f2c5e0e894ac849288c3b':
  Fixed several table extraction problems
This commit is contained in:
Dominique Eiflaender 2021-02-04 15:21:12 +01:00
commit 154e09b843
8 changed files with 59 additions and 27 deletions

View File

@ -54,7 +54,16 @@ public class PDFLinesTextStripper extends PDFTextStripper {
protected PDPage pdpage;
@Getter
private int minCharWidths;
private int minCharWidth;
@Getter
private int maxCharWidth;
@Getter
private int minCharHeight;
@Getter
private int maxCharHeight;
@Getter
private final List<TextPositionSequence> textPositionSequences = new ArrayList<>();
@ -280,8 +289,19 @@ public class PDFLinesTextStripper extends PDFTextStripper {
for (int i = 0; i <= textPositions.size() - 1; i++) {
int charWidth = (int) textPositions.get(i).getWidthDirAdj();
if (charWidth < minCharWidths) {
minCharWidths = charWidth;
if (charWidth < minCharWidth) {
minCharWidth = charWidth;
}
if (charWidth > maxCharWidth) {
maxCharWidth = charWidth;
}
// Track the smallest and largest character height seen so far.
// The height is truncated to an int by the cast, matching how charWidth is handled above.
int charHeight = (int) textPositions.get(i).getHeightDir();
if (charHeight < minCharHeight) {
    minCharHeight = charHeight;
}
// Compare the height (not the width) against the running maximum height;
// comparing charWidth here would update maxCharHeight based on an unrelated measurement.
if (charHeight > maxCharHeight) {
    maxCharHeight = charHeight;
}
if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i)
@ -333,7 +353,10 @@ public class PDFLinesTextStripper extends PDFTextStripper {
@Override
public String getText(PDDocument doc) throws IOException {
minCharWidths = Integer.MAX_VALUE;
minCharWidth = Integer.MAX_VALUE;
maxCharWidth = 0;
minCharHeight = Integer.MAX_VALUE;
maxCharHeight = 0;
textPositionSequences.clear();
imageBounds = new ArrayList<>();
rulings.clear();

View File

@ -20,4 +20,5 @@ public class ParsedElements {
private boolean rotated;
private float minCharWidth;
private float maxCharWidth;
}

View File

@ -157,7 +157,6 @@ public class EntityRedactionService {
List<SectionSearchableTextPair> sectionSearchableTextPairs = new ArrayList<>();
int rowNumber = 0;
for (List<Cell> row : table.getRows()) {
SearchableText searchableRow = new SearchableText();
Map<String, CellValue> tabularData = new HashMap<>();
@ -170,11 +169,14 @@ public class EntityRedactionService {
addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber.intValue());
int cellStart = start;
if(rowNumber != 0) {
if (!cell.isHeaderCell()) {
cell.getHeaderCells().forEach(headerCell -> {
StringBuilder headerBuilder = new StringBuilder();
headerCell.getTextBlocks().forEach(textBlock -> headerBuilder.append(textBlock.getText()));
String headerName = headerBuilder.toString().replaceAll("\n", "").replaceAll(" ", "").replaceAll("-", "");
String headerName = headerBuilder.toString()
.replaceAll("\n", "")
.replaceAll(" ", "")
.replaceAll("-", "");
tabularData.put(headerName, new CellValue(cell.getTextBlocks(), cellStart));
});
}
@ -205,8 +207,8 @@ public class EntityRedactionService {
.build(), searchableRow));
sectionNumber.incrementAndGet();
rowNumber++;
}
return sectionSearchableTextPairs;
}

View File

@ -61,13 +61,13 @@ public class PdfSegmentationService {
.rulings(stripper.getRulings())
.sequences(stripper.getTextPositionSequences())
.imageBounds(stripper.getImageBounds())
.minCharWidth(stripper.getMinCharWidths())
.minCharWidth(stripper.getMinCharWidth())
.maxCharWidth(stripper.getMaxCharWidth())
.landscape(isLandscape)
.rotated(isRotated)
.build();
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(parsedElements.getRulings(), parsedElements
.getMinCharWidth());
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(parsedElements.getRulings(), stripper.getMinCharWidth(), stripper.getMaxCharHeight());
Page page = blockificationService.blockify(parsedElements.getSequences(), cleanRulings.getHorizontal(), cleanRulings
.getVertical());

View File

@ -188,6 +188,9 @@ public class SectionsBuilderService {
for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
List<Cell> row = table.getRows().get(i);
if(row.size() == 1){
continue;
}
boolean allNonHeader = true;
for (Cell cell : row) {
if (cell.isHeaderCell()) {

View File

@ -110,6 +110,10 @@ public class Table extends AbstractTextContainer {
// we move from left to right and top to bottom
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
List<Cell> rowCells = rows.get(rowIndex);
if(rowCells.size() == 1){
continue;
}
for (int colIndex = 0; colIndex < rowCells.size(); colIndex++) {
Cell cell = rowCells.get(colIndex);
List<Cell> cellsToTheLeft = rowCells.subList(0, colIndex);

View File

@ -18,9 +18,10 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
@Service
public class RulingCleaningService {
public CleanRulings getCleanRulings(List<Ruling> rulings, float minCharWidth){
public CleanRulings getCleanRulings(List<Ruling> rulings, float minCharWidth, float maxCharHeight) {
if (!rulings.isEmpty()) {
snapPoints(rulings, minCharWidth , minCharWidth);
snapPoints(rulings, minCharWidth, maxCharHeight);
}
List<Ruling> vrs = new ArrayList<>();
@ -39,13 +40,10 @@ public class RulingCleaningService {
}
List<Ruling> horizontalRulingLines = collapseOrientedRulings(hrs);
return CleanRulings
.builder()
.vertical(verticalRulingLines)
.horizontal(horizontalRulingLines)
.build();
return CleanRulings.builder().vertical(verticalRulingLines).horizontal(horizontalRulingLines).build();
}
public void snapPoints(List<? extends Line2D.Float> rulings, float xThreshold, float yThreshold) {
// collect points and keep a Line -> p1,p2 map
@ -122,12 +120,14 @@ public class RulingCleaningService {
/**
 * Collapses the given rulings using an expand amount of 1.
 *
 * <p>Convenience overload that delegates to
 * {@code collapseOrientedRulings(List, int)}.
 *
 * @param lines the rulings to collapse
 * @return the result of the two-argument overload
 */
private List<Ruling> collapseOrientedRulings(List<Ruling> lines) {
    final int defaultExpandAmount = 1;
    return collapseOrientedRulings(lines, defaultExpandAmount);
}
private List<Ruling> collapseOrientedRulings(List<Ruling> lines, int expandAmount) {
ArrayList<Ruling> rv = new ArrayList<>();
lines.sort((a, b) -> {
final float diff = a.getPosition() - b.getPosition();
@ -141,25 +141,24 @@ public class RulingCleaningService {
final float lastStart = last.getStart();
final float lastEnd = last.getEnd();
final boolean lastFlipped = lastStart > lastEnd;
final boolean lastFlipped = lastStart > lastEnd;
final boolean nextFlipped = next_line.getStart() > next_line.getEnd();
boolean differentDirections = nextFlipped != lastFlipped;
float nextS = differentDirections ? next_line.getEnd() : next_line.getStart();
float nextS = differentDirections ? next_line.getEnd() : next_line.getStart();
float nextE = differentDirections ? next_line.getStart() : next_line.getEnd();
final float newStart = lastFlipped ? Math.max(nextS, lastStart) : Math.min(nextS, lastStart);
final float newEnd = lastFlipped ? Math.min(nextE, lastEnd) : Math.max(nextE, lastEnd);
final float newEnd = lastFlipped ? Math.min(nextE, lastEnd) : Math.max(nextE, lastEnd);
last.setStartEnd(newStart, newEnd);
assert !last.oblique();
}
else if (next_line.length() == 0) {
} else if (next_line.length() == 0) {
continue;
}
else {
} else {
rv.add(next_line);
}
}
return rv;
}
}

View File

@ -441,7 +441,7 @@ public class RedactionIntegrationTest {
System.out.println("redactionTest");
long start = System.currentTimeMillis();
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");
AnalyzeRequest request = AnalyzeRequest.builder()
.ruleSetId(TEST_RULESET_ID)
@ -590,7 +590,7 @@ public class RedactionIntegrationTest {
public void htmlTablesTest() throws IOException {
System.out.println("htmlTablesTest");
ClassPathResource pdfFileResource = new ClassPathResource("files/Fludioxonil/52 Fludioxonil_RAR_07_Volume_3CA_B-5_2018-02-21.pdf");
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");
RedactionRequest request = RedactionRequest.builder()
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))