diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFLinesTextStripper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFLinesTextStripper.java index 07d648c..e6bee7e 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFLinesTextStripper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFLinesTextStripper.java @@ -1,5 +1,6 @@ package com.knecon.fforesight.service.layoutparser.processor.classification.parsing; +import java.awt.color.CMMException; import java.awt.geom.Point2D; import java.io.IOException; import java.util.ArrayList; @@ -31,6 +32,7 @@ import org.apache.pdfbox.cos.COSBase; import org.apache.pdfbox.cos.COSNumber; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.graphics.color.PDColor; import org.apache.pdfbox.text.TextPosition; import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling; @@ -39,6 +41,7 @@ import com.knecon.fforesight.service.layoutparser.processor.classification.model import lombok.Getter; import lombok.Setter; +import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; @Getter @@ -194,8 +197,8 @@ public class PDFLinesTextStripper extends PDFTextStripper { private void addVisibleRulings(List path, boolean stroke) throws IOException { try { - if (stroke && !getGraphicsState().getStrokingColor().isPattern() && getGraphicsState().getStrokingColor() - .toRGB() == 0 || !stroke && !getGraphicsState().getNonStrokingColor().isPattern() && getGraphicsState().getNonStrokingColor().toRGB() == 0) { + if (stroke && 
!getGraphicsState().getStrokingColor().isPattern() && isBlack(getGraphicsState().getStrokingColor()) || // + !stroke && !getGraphicsState().getNonStrokingColor().isPattern() && isBlack(getGraphicsState().getNonStrokingColor())) { rulings.addAll(path); } } catch (UnsupportedOperationException e) { @@ -206,6 +209,21 @@ public class PDFLinesTextStripper extends PDFTextStripper { } + @SneakyThrows + private boolean isBlack(PDColor color) { + + try { + return color.toRGB() == 0; + } catch (CMMException e) { + // see https://github.com/haraldk/TwelveMonkeys/issues/124 or https://issues.apache.org/jira/browse/PDFBOX-3531 + // This is a quick and dirty hack: toRGB() failed, so fall back to comparing the raw components against (0, 0, 0, 1); presumably CMYK black - TODO confirm. Any other component count is reported as not black. + // Happens for file 216.pdf + log.debug(e.getMessage()); + return color.getComponents().length == 4 && color.getComponents()[0] == 0 && color.getComponents()[1] == 0 && color.getComponents()[2] == 0 && color.getComponents()[3] == 1; + } + } + + @Override public void writeString(String text, List textPositions, boolean isParagraphStart) throws IOException { @@ -247,7 +265,7 @@ public class PDFLinesTextStripper extends PDFTextStripper { if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0) .getUnicode() .equals("\t")))) { - textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() -1 && isParagraphStart)); + textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart)); } startIndex = i; } @@ -257,7 +275,7 @@ public class PDFLinesTextStripper extends PDFTextStripper { if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0) .getUnicode() .equals("\t")))) { - textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() -1 && isParagraphStart)); + textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == 
textPositions.size() - 1 && isParagraphStart)); } startIndex = i; } @@ -277,7 +295,7 @@ public class PDFLinesTextStripper extends PDFTextStripper { textPositionSequences.get(textPositionSequences.size() - 1).add(t); } } else { - textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() -1 && isParagraphStart)); + textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart)); } } startIndex = i + 1; diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/211.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/211.pdf new file mode 100644 index 0000000..2f84f92 Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/211.pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/216.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/216.pdf new file mode 100644 index 0000000..f3e4a5c Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/216.pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/Plenarprotokoll 1 (keine Druchsache!) (1).pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/Plenarprotokoll 1 (keine Druchsache!) (1).pdf new file mode 100644 index 0000000..564320d Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/Plenarprotokoll 1 (keine Druchsache!) 
(1).pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/Plenarprotokoll 1 Page 6.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/Plenarprotokoll 1 Page 6.pdf new file mode 100644 index 0000000..530d2c7 Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/Plenarprotokoll 1 Page 6.pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/Plenarprotokoll 1 page 12.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/Plenarprotokoll 1 page 12.pdf new file mode 100644 index 0000000..d44ad6f Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/Plenarprotokoll 1 page 12.pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/SYNGENTA_EFSA_sanitisation_GFL_v1_moreSections.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SYNGENTA_EFSA_sanitisation_GFL_v1_moreSections.pdf new file mode 100644 index 0000000..40218a6 Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SYNGENTA_EFSA_sanitisation_GFL_v1_moreSections.pdf differ