Fixed table extraction problems

This commit is contained in:
Dominique Eifländer 2021-02-03 14:34:29 +01:00
parent 7898f6a30f
commit fc2ac03691
4 changed files with 9 additions and 20 deletions

View File

@@ -54,10 +54,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
protected PDPage pdpage;
@Getter
private int maxCharWidths;
@Getter
private int maxCharHeight;
private int minCharWidths;
@Getter
private final List<TextPositionSequence> textPositionSequences = new ArrayList<>();
@@ -282,14 +279,9 @@ public class PDFLinesTextStripper extends PDFTextStripper {
int startIndex = 0;
for (int i = 0; i <= textPositions.size() - 1; i++) {
int charHeight = (int) textPositions.get(i).getHeightDir();
if (charHeight > maxCharHeight) {
maxCharHeight = charHeight;
}
int charWidth = (int) textPositions.get(i).getWidthDirAdj();
if (charWidth > maxCharWidths) {
maxCharWidths = charWidth;
if (charWidth < minCharWidths) {
minCharWidths = charWidth;
}
if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i)
@@ -341,8 +333,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
@Override
public String getText(PDDocument doc) throws IOException {
maxCharWidths = 0;
maxCharWidths = 0;
minCharWidths = Integer.MAX_VALUE;
textPositionSequences.clear();
imageBounds = new ArrayList<>();
rulings.clear();

View File

@@ -19,6 +19,5 @@ public class ParsedElements {
private boolean landscape;
private boolean rotated;
private float maxCharWidth;
private float maxCharHeight;
private float minCharWidth;
}

View File

@@ -61,14 +61,13 @@ public class PdfSegmentationService {
.rulings(stripper.getRulings())
.sequences(stripper.getTextPositionSequences())
.imageBounds(stripper.getImageBounds())
.maxCharWidth(stripper.getMaxCharWidths())
.maxCharHeight(stripper.getMaxCharWidths())
.minCharWidth(stripper.getMinCharWidths())
.landscape(isLandscape)
.rotated(isRotated)
.build();
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(parsedElements.getRulings(), parsedElements
.getMaxCharWidth(), parsedElements.getMaxCharHeight());
.getMinCharWidth());
Page page = blockificationService.blockify(parsedElements.getSequences(), cleanRulings.getHorizontal(), cleanRulings
.getVertical());

View File

@@ -18,9 +18,9 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
@Service
public class RulingCleaningService {
public CleanRulings getCleanRulings(List<Ruling> rulings, float maxCharWidth, float maxCharHeight){
public CleanRulings getCleanRulings(List<Ruling> rulings, float minCharWidth){
if (!rulings.isEmpty()) {
snapPoints(rulings, maxCharWidth , maxCharHeight);
snapPoints(rulings, minCharWidth , minCharWidth);
}
List<Ruling> vrs = new ArrayList<>();