TAAS-41: TAAS Document Structure

* added more testFiles
* hacked a workaround for CMMException
This commit is contained in:
Kilian Schuettler 2023-06-15 15:01:36 +02:00 committed by Timo Bejan
parent f08c4ced43
commit 7f0aa32d1b
7 changed files with 23 additions and 5 deletions

View File

@ -1,5 +1,6 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.parsing;
import java.awt.color.CMMException;
import java.awt.geom.Point2D;
import java.io.IOException;
import java.util.ArrayList;
@ -31,6 +32,7 @@ import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSNumber;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
import org.apache.pdfbox.text.TextPosition;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling;
@ -39,6 +41,7 @@ import com.knecon.fforesight.service.layoutparser.processor.classification.model
import lombok.Getter;
import lombok.Setter;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Getter
@ -194,8 +197,8 @@ public class PDFLinesTextStripper extends PDFTextStripper {
private void addVisibleRulings(List<Ruling> path, boolean stroke) throws IOException {
try {
if (stroke && !getGraphicsState().getStrokingColor().isPattern() && getGraphicsState().getStrokingColor()
.toRGB() == 0 || !stroke && !getGraphicsState().getNonStrokingColor().isPattern() && getGraphicsState().getNonStrokingColor().toRGB() == 0) {
if (stroke && !getGraphicsState().getStrokingColor().isPattern() && isBlack(getGraphicsState().getStrokingColor()) || //
!stroke && !getGraphicsState().getNonStrokingColor().isPattern() && isBlack(getGraphicsState().getNonStrokingColor())) {
rulings.addAll(path);
}
} catch (UnsupportedOperationException e) {
@ -206,6 +209,21 @@ public class PDFLinesTextStripper extends PDFTextStripper {
}
/**
 * Checks whether the given color is black.
 * <p>
 * The color is first converted via {@code toRGB()} and compared against 0. If that conversion
 * fails with a {@link CMMException}, the raw color components are inspected instead.
 *
 * @param color the color to test
 * @return {@code true} if the RGB conversion yields 0, or, when the conversion fails with a
 * {@link CMMException}, if the color has at least four components with values (0, 0, 0, 1);
 * {@code false} otherwise
 */
@SneakyThrows
private boolean isBlack(PDColor color) {

    try {
        return color.toRGB() == 0;
    } catch (CMMException e) {
        // see https://github.com/haraldk/TwelveMonkeys/issues/124 or https://issues.apache.org/jira/browse/PDFBOX-3531
        // This is a quick and dirty hack
        // Happens for file 216.pdf
        log.debug(e.getMessage());
        // Fetch the components once instead of calling getComponents() for every comparison.
        float[] components = color.getComponents();
        // NOTE(review): assumes a four-component color here is CMYK, where black is (0, 0, 0, 1) - confirm.
        // The previous check compared component [1] against both 0 and 1 and therefore could never be true.
        // Colors with fewer than four components are treated as not black instead of risking an
        // ArrayIndexOutOfBoundsException.
        return components.length >= 4 //
                && components[0] == 0 //
                && components[1] == 0 //
                && components[2] == 0 //
                && components[3] == 1;
    }
}
@Override
public void writeString(String text, List<TextPosition> textPositions, boolean isParagraphStart) throws IOException {
@ -247,7 +265,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
.getUnicode()
.equals("\t")))) {
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() -1 && isParagraphStart));
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
}
startIndex = i;
}
@ -257,7 +275,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
.getUnicode()
.equals("\t")))) {
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() -1 && isParagraphStart));
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
}
startIndex = i;
}
@ -277,7 +295,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
textPositionSequences.get(textPositionSequences.size() - 1).add(t);
}
} else {
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() -1 && isParagraphStart));
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
}
}
startIndex = i + 1;