TAAS-41: TAAS Document Structure
* added more testFiles * hacked a workaround for CMMException
This commit is contained in:
parent
f08c4ced43
commit
7f0aa32d1b
@ -1,5 +1,6 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.parsing;
|
||||
|
||||
import java.awt.color.CMMException;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
@ -31,6 +32,7 @@ import org.apache.pdfbox.cos.COSBase;
|
||||
import org.apache.pdfbox.cos.COSNumber;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling;
|
||||
@ -39,6 +41,7 @@ import com.knecon.fforesight.service.layoutparser.processor.classification.model
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Getter
|
||||
@ -194,8 +197,8 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
private void addVisibleRulings(List<Ruling> path, boolean stroke) throws IOException {
|
||||
|
||||
try {
|
||||
if (stroke && !getGraphicsState().getStrokingColor().isPattern() && getGraphicsState().getStrokingColor()
|
||||
.toRGB() == 0 || !stroke && !getGraphicsState().getNonStrokingColor().isPattern() && getGraphicsState().getNonStrokingColor().toRGB() == 0) {
|
||||
if (stroke && !getGraphicsState().getStrokingColor().isPattern() && isBlack(getGraphicsState().getStrokingColor()) || //
|
||||
!stroke && !getGraphicsState().getNonStrokingColor().isPattern() && isBlack(getGraphicsState().getNonStrokingColor())) {
|
||||
rulings.addAll(path);
|
||||
}
|
||||
} catch (UnsupportedOperationException e) {
|
||||
@ -206,6 +209,21 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private boolean isBlack(PDColor color) {
|
||||
|
||||
try {
|
||||
return color.toRGB() == 0;
|
||||
} catch (CMMException e) {
|
||||
// see https://github.com/haraldk/TwelveMonkeys/issues/124 or https://issues.apache.org/jira/browse/PDFBOX-3531
|
||||
// This is a quick and dirt hack
|
||||
// Happens for file 216.pdf
|
||||
log.debug(e.getMessage());
|
||||
return color.getComponents()[0] == 0 && color.getComponents()[1] == 0 && color.getComponents()[2] == 0 && color.getComponents()[1] == 1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void writeString(String text, List<TextPosition> textPositions, boolean isParagraphStart) throws IOException {
|
||||
|
||||
@ -247,7 +265,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
|
||||
.getUnicode()
|
||||
.equals("\t")))) {
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() -1 && isParagraphStart));
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
|
||||
}
|
||||
startIndex = i;
|
||||
}
|
||||
@ -257,7 +275,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
|
||||
.getUnicode()
|
||||
.equals("\t")))) {
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() -1 && isParagraphStart));
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
|
||||
}
|
||||
startIndex = i;
|
||||
}
|
||||
@ -277,7 +295,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
textPositionSequences.get(textPositionSequences.size() - 1).add(t);
|
||||
}
|
||||
} else {
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() -1 && isParagraphStart));
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
|
||||
}
|
||||
}
|
||||
startIndex = i + 1;
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user