Pull request #117: Fixed several table extraction problems
Merge in RED/redaction-service from tableExtractionImprovement to master * commit 'a101b98a400b2a35635f2c5e0e894ac849288c3b': Fixed several table extraction problems
This commit is contained in:
commit
154e09b843
@ -54,7 +54,16 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
protected PDPage pdpage;
|
||||
|
||||
@Getter
|
||||
private int minCharWidths;
|
||||
private int minCharWidth;
|
||||
|
||||
@Getter
|
||||
private int maxCharWidth;
|
||||
|
||||
@Getter
|
||||
private int minCharHeight;
|
||||
|
||||
@Getter
|
||||
private int maxCharHeight;
|
||||
|
||||
@Getter
|
||||
private final List<TextPositionSequence> textPositionSequences = new ArrayList<>();
|
||||
@ -280,8 +289,19 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
for (int i = 0; i <= textPositions.size() - 1; i++) {
|
||||
|
||||
int charWidth = (int) textPositions.get(i).getWidthDirAdj();
|
||||
if (charWidth < minCharWidths) {
|
||||
minCharWidths = charWidth;
|
||||
if (charWidth < minCharWidth) {
|
||||
minCharWidth = charWidth;
|
||||
}
|
||||
if (charWidth > maxCharWidth) {
|
||||
maxCharWidth = charWidth;
|
||||
}
|
||||
|
||||
// Track the smallest and largest glyph height seen so far.
// The height is truncated to an int, matching how charWidth is handled above.
int charHeight = (int) textPositions.get(i).getHeightDir();
if (charHeight < minCharHeight) {
    minCharHeight = charHeight;
}
// Compare the height (not the width) against the running maximum height;
// the previous guard tested charWidth, so maxCharHeight was updated based on
// glyph widths and could end up smaller or larger than the real maximum.
if (charHeight > maxCharHeight) {
    maxCharHeight = charHeight;
}
|
||||
|
||||
if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i)
|
||||
@ -333,7 +353,10 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
@Override
|
||||
public String getText(PDDocument doc) throws IOException {
|
||||
|
||||
minCharWidths = Integer.MAX_VALUE;
|
||||
minCharWidth = Integer.MAX_VALUE;
|
||||
maxCharWidth = 0;
|
||||
minCharHeight = Integer.MAX_VALUE;
|
||||
maxCharHeight = 0;
|
||||
textPositionSequences.clear();
|
||||
imageBounds = new ArrayList<>();
|
||||
rulings.clear();
|
||||
|
||||
@ -20,4 +20,5 @@ public class ParsedElements {
|
||||
private boolean rotated;
|
||||
|
||||
private float minCharWidth;
|
||||
private float maxCharWidth;
|
||||
}
|
||||
|
||||
@ -157,7 +157,6 @@ public class EntityRedactionService {
|
||||
|
||||
List<SectionSearchableTextPair> sectionSearchableTextPairs = new ArrayList<>();
|
||||
|
||||
int rowNumber = 0;
|
||||
for (List<Cell> row : table.getRows()) {
|
||||
SearchableText searchableRow = new SearchableText();
|
||||
Map<String, CellValue> tabularData = new HashMap<>();
|
||||
@ -170,11 +169,14 @@ public class EntityRedactionService {
|
||||
addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber.intValue());
|
||||
int cellStart = start;
|
||||
|
||||
if(rowNumber != 0) {
|
||||
if (!cell.isHeaderCell()) {
|
||||
cell.getHeaderCells().forEach(headerCell -> {
|
||||
StringBuilder headerBuilder = new StringBuilder();
|
||||
headerCell.getTextBlocks().forEach(textBlock -> headerBuilder.append(textBlock.getText()));
|
||||
String headerName = headerBuilder.toString().replaceAll("\n", "").replaceAll(" ", "").replaceAll("-", "");
|
||||
String headerName = headerBuilder.toString()
|
||||
.replaceAll("\n", "")
|
||||
.replaceAll(" ", "")
|
||||
.replaceAll("-", "");
|
||||
tabularData.put(headerName, new CellValue(cell.getTextBlocks(), cellStart));
|
||||
});
|
||||
}
|
||||
@ -205,8 +207,8 @@ public class EntityRedactionService {
|
||||
.build(), searchableRow));
|
||||
|
||||
sectionNumber.incrementAndGet();
|
||||
rowNumber++;
|
||||
}
|
||||
|
||||
return sectionSearchableTextPairs;
|
||||
}
|
||||
|
||||
|
||||
@ -61,13 +61,13 @@ public class PdfSegmentationService {
|
||||
.rulings(stripper.getRulings())
|
||||
.sequences(stripper.getTextPositionSequences())
|
||||
.imageBounds(stripper.getImageBounds())
|
||||
.minCharWidth(stripper.getMinCharWidths())
|
||||
.minCharWidth(stripper.getMinCharWidth())
|
||||
.maxCharWidth(stripper.getMaxCharWidth())
|
||||
.landscape(isLandscape)
|
||||
.rotated(isRotated)
|
||||
.build();
|
||||
|
||||
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(parsedElements.getRulings(), parsedElements
|
||||
.getMinCharWidth());
|
||||
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(parsedElements.getRulings(), stripper.getMinCharWidth(), stripper.getMaxCharHeight());
|
||||
|
||||
Page page = blockificationService.blockify(parsedElements.getSequences(), cleanRulings.getHorizontal(), cleanRulings
|
||||
.getVertical());
|
||||
|
||||
@ -188,6 +188,9 @@ public class SectionsBuilderService {
|
||||
|
||||
for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
|
||||
List<Cell> row = table.getRows().get(i);
|
||||
if(row.size() == 1){
|
||||
continue;
|
||||
}
|
||||
boolean allNonHeader = true;
|
||||
for (Cell cell : row) {
|
||||
if (cell.isHeaderCell()) {
|
||||
|
||||
@ -110,6 +110,10 @@ public class Table extends AbstractTextContainer {
|
||||
// we move from left to right and top to bottom
|
||||
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
|
||||
List<Cell> rowCells = rows.get(rowIndex);
|
||||
if(rowCells.size() == 1){
|
||||
continue;
|
||||
}
|
||||
|
||||
for (int colIndex = 0; colIndex < rowCells.size(); colIndex++) {
|
||||
Cell cell = rowCells.get(colIndex);
|
||||
List<Cell> cellsToTheLeft = rowCells.subList(0, colIndex);
|
||||
|
||||
@ -18,9 +18,10 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
|
||||
@Service
|
||||
public class RulingCleaningService {
|
||||
|
||||
public CleanRulings getCleanRulings(List<Ruling> rulings, float minCharWidth){
|
||||
public CleanRulings getCleanRulings(List<Ruling> rulings, float minCharWidth, float maxCharHeight) {
|
||||
|
||||
if (!rulings.isEmpty()) {
|
||||
snapPoints(rulings, minCharWidth , minCharWidth);
|
||||
snapPoints(rulings, minCharWidth, maxCharHeight);
|
||||
}
|
||||
|
||||
List<Ruling> vrs = new ArrayList<>();
|
||||
@ -39,13 +40,10 @@ public class RulingCleaningService {
|
||||
}
|
||||
List<Ruling> horizontalRulingLines = collapseOrientedRulings(hrs);
|
||||
|
||||
return CleanRulings
|
||||
.builder()
|
||||
.vertical(verticalRulingLines)
|
||||
.horizontal(horizontalRulingLines)
|
||||
.build();
|
||||
return CleanRulings.builder().vertical(verticalRulingLines).horizontal(horizontalRulingLines).build();
|
||||
}
|
||||
|
||||
|
||||
public void snapPoints(List<? extends Line2D.Float> rulings, float xThreshold, float yThreshold) {
|
||||
|
||||
// collect points and keep a Line -> p1,p2 map
|
||||
@ -122,12 +120,14 @@ public class RulingCleaningService {
|
||||
|
||||
|
||||
private List<Ruling> collapseOrientedRulings(List<Ruling> lines) {
|
||||
|
||||
int COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT = 1;
|
||||
return collapseOrientedRulings(lines, COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT);
|
||||
}
|
||||
|
||||
|
||||
private List<Ruling> collapseOrientedRulings(List<Ruling> lines, int expandAmount) {
|
||||
|
||||
ArrayList<Ruling> rv = new ArrayList<>();
|
||||
lines.sort((a, b) -> {
|
||||
final float diff = a.getPosition() - b.getPosition();
|
||||
@ -141,25 +141,24 @@ public class RulingCleaningService {
|
||||
final float lastStart = last.getStart();
|
||||
final float lastEnd = last.getEnd();
|
||||
|
||||
final boolean lastFlipped = lastStart > lastEnd;
|
||||
final boolean lastFlipped = lastStart > lastEnd;
|
||||
final boolean nextFlipped = next_line.getStart() > next_line.getEnd();
|
||||
|
||||
boolean differentDirections = nextFlipped != lastFlipped;
|
||||
float nextS = differentDirections ? next_line.getEnd() : next_line.getStart();
|
||||
float nextS = differentDirections ? next_line.getEnd() : next_line.getStart();
|
||||
float nextE = differentDirections ? next_line.getStart() : next_line.getEnd();
|
||||
|
||||
final float newStart = lastFlipped ? Math.max(nextS, lastStart) : Math.min(nextS, lastStart);
|
||||
final float newEnd = lastFlipped ? Math.min(nextE, lastEnd) : Math.max(nextE, lastEnd);
|
||||
final float newEnd = lastFlipped ? Math.min(nextE, lastEnd) : Math.max(nextE, lastEnd);
|
||||
last.setStartEnd(newStart, newEnd);
|
||||
assert !last.oblique();
|
||||
}
|
||||
else if (next_line.length() == 0) {
|
||||
} else if (next_line.length() == 0) {
|
||||
continue;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
rv.add(next_line);
|
||||
}
|
||||
}
|
||||
return rv;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -441,7 +441,7 @@ public class RedactionIntegrationTest {
|
||||
|
||||
System.out.println("redactionTest");
|
||||
long start = System.currentTimeMillis();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");
|
||||
|
||||
AnalyzeRequest request = AnalyzeRequest.builder()
|
||||
.ruleSetId(TEST_RULESET_ID)
|
||||
@ -590,7 +590,7 @@ public class RedactionIntegrationTest {
|
||||
public void htmlTablesTest() throws IOException {
|
||||
|
||||
System.out.println("htmlTablesTest");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Fludioxonil/52 Fludioxonil_RAR_07_Volume_3CA_B-5_2018-02-21.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");
|
||||
|
||||
RedactionRequest request = RedactionRequest.builder()
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user