Pull request #116: Fixed table extraction problems
Merge in RED/redaction-service from TableExtractionProbemFix to master

* commit 'fc2ac03691871a8a1f3c8a29756e3a60e63e16bf':
  Fixed table extraction problems
This commit is contained in:
commit
76ecfdabd3
@ -54,10 +54,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
protected PDPage pdpage;
|
||||
|
||||
@Getter
|
||||
private int maxCharWidths;
|
||||
|
||||
@Getter
|
||||
private int maxCharHeight;
|
||||
private int minCharWidths;
|
||||
|
||||
@Getter
|
||||
private final List<TextPositionSequence> textPositionSequences = new ArrayList<>();
|
||||
@ -282,14 +279,9 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
int startIndex = 0;
|
||||
for (int i = 0; i <= textPositions.size() - 1; i++) {
|
||||
|
||||
int charHeight = (int) textPositions.get(i).getHeightDir();
|
||||
if (charHeight > maxCharHeight) {
|
||||
maxCharHeight = charHeight;
|
||||
}
|
||||
|
||||
int charWidth = (int) textPositions.get(i).getWidthDirAdj();
|
||||
if (charWidth > maxCharWidths) {
|
||||
maxCharWidths = charWidth;
|
||||
if (charWidth < minCharWidths) {
|
||||
minCharWidths = charWidth;
|
||||
}
|
||||
|
||||
if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i)
|
||||
@ -341,8 +333,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
@Override
|
||||
public String getText(PDDocument doc) throws IOException {
|
||||
|
||||
maxCharWidths = 0;
|
||||
maxCharWidths = 0;
|
||||
minCharWidths = Integer.MAX_VALUE;
|
||||
textPositionSequences.clear();
|
||||
imageBounds = new ArrayList<>();
|
||||
rulings.clear();
|
||||
|
||||
@ -19,6 +19,5 @@ public class ParsedElements {
|
||||
private boolean landscape;
|
||||
private boolean rotated;
|
||||
|
||||
private float maxCharWidth;
|
||||
private float maxCharHeight;
|
||||
private float minCharWidth;
|
||||
}
|
||||
|
||||
@ -61,14 +61,13 @@ public class PdfSegmentationService {
|
||||
.rulings(stripper.getRulings())
|
||||
.sequences(stripper.getTextPositionSequences())
|
||||
.imageBounds(stripper.getImageBounds())
|
||||
.maxCharWidth(stripper.getMaxCharWidths())
|
||||
.maxCharHeight(stripper.getMaxCharWidths())
|
||||
.minCharWidth(stripper.getMinCharWidths())
|
||||
.landscape(isLandscape)
|
||||
.rotated(isRotated)
|
||||
.build();
|
||||
|
||||
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(parsedElements.getRulings(), parsedElements
|
||||
.getMaxCharWidth(), parsedElements.getMaxCharHeight());
|
||||
.getMinCharWidth());
|
||||
|
||||
Page page = blockificationService.blockify(parsedElements.getSequences(), cleanRulings.getHorizontal(), cleanRulings
|
||||
.getVertical());
|
||||
|
||||
@ -18,9 +18,9 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
|
||||
@Service
|
||||
public class RulingCleaningService {
|
||||
|
||||
public CleanRulings getCleanRulings(List<Ruling> rulings, float maxCharWidth, float maxCharHeight){
|
||||
public CleanRulings getCleanRulings(List<Ruling> rulings, float minCharWidth){
|
||||
if (!rulings.isEmpty()) {
|
||||
snapPoints(rulings, maxCharWidth , maxCharHeight);
|
||||
snapPoints(rulings, minCharWidth , minCharWidth);
|
||||
}
|
||||
|
||||
List<Ruling> vrs = new ArrayList<>();
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user