Merge branch 'RED-8826' into 'main'

Red 8826

See merge request fforesight/layout-parser!138
This commit is contained in:
Dominique Eifländer 2024-04-23 13:12:51 +02:00
commit 58acbab85f
13 changed files with 913 additions and 19 deletions

View File

@ -27,6 +27,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
@ -51,6 +52,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.classificat
import com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.GraphicExtractorService;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
@ -90,6 +92,7 @@ public class LayoutParsingPipeline {
ObservationRegistry observationRegistry;
VisualLayoutParsingAdapter visualLayoutParsingAdapter;
ClarifyndClassificationService clarifyndClassificationService;
GraphicExtractorService graphicExtractorService;
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
@ -256,9 +259,21 @@ public class LayoutParsingPipeline {
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
var graphics = graphicExtractorService.extractPathElementGraphics(originDocument,
pdPage,
pageNumber,
cleanRulings,
stripper.getTextPositionSequences(),
emptyTableCells,
false);
pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>())
.addAll(graphics.stream()
.map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), ImageType.GRAPHIC, false, stripper.getPageNumber()))
.toList());
ClassificationPage classificationPage = switch (layoutParsingType) {
case REDACT_MANAGER_OLD ->
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells);
case REDACT_MANAGER_OLD -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells);
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, true);
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false);

View File

@ -9,7 +9,8 @@ public enum ImageType {
SIGNATURE_VISUAL,
OTHER,
OCR;
OCR,
GRAPHIC;
public static ImageType fromString(String imageType) {
@ -19,6 +20,7 @@ public enum ImageType {
case "formula" -> ImageType.FORMULA;
case "signature" -> ImageType.SIGNATURE;
case "ocr" -> ImageType.OCR;
case "graphic" -> ImageType.GRAPHIC;
default -> ImageType.OTHER;
};
}

View File

@ -9,10 +9,10 @@ import java.util.Map;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import lombok.RequiredArgsConstructor;
@ -20,8 +20,7 @@ import lombok.RequiredArgsConstructor;
@RequiredArgsConstructor
public class ImageServiceResponseAdapter {
public Map<Integer, List<ClassifiedImage>> buildClassifiedImagesPerPage(ImageServiceResponse imageServiceResponse ) {
public Map<Integer, List<ClassifiedImage>> buildClassifiedImagesPerPage(ImageServiceResponse imageServiceResponse) {
Map<Integer, List<ClassifiedImage>> images = new HashMap<>();
imageServiceResponse.getData().forEach(imageMetadata -> {

View File

@ -3,14 +3,15 @@ package com.knecon.fforesight.service.layoutparser.processor.services.classifica
import java.util.List;
import java.util.regex.Pattern;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
import lombok.RequiredArgsConstructor;
@ -21,7 +22,6 @@ import lombok.extern.slf4j.Slf4j;
@RequiredArgsConstructor
public class RedactManagerClassificationService {
public void classifyDocument(ClassificationDocument document) {
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
@ -52,14 +52,25 @@ public class RedactManagerClassificationService {
textBlock.setClassification(PageBlockType.OTHER);
return;
}
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
if (page.getImages()
.stream()
.filter(image -> image.getImageType().equals(ImageType.GRAPHIC))
.anyMatch(graphic -> graphic.getPosition().intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()))) {
textBlock.setClassification(PageBlockType.PARAGRAPH);
return;
}
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || PositionUtils.isOverBodyTextFrame(bodyTextFrame,
textBlock,
page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
.getMostPopular())) {
textBlock.setClassification(PageBlockType.HEADER);
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) || PositionUtils.isUnderBodyTextFrame(bodyTextFrame,
textBlock,
page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
.getMostPopular())) {
textBlock.setClassification(PageBlockType.FOOTER);
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()

View File

@ -8,10 +8,10 @@ import java.util.List;
import java.util.Locale;
import java.util.Objects;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
import lombok.experimental.UtilityClass;
@ -110,6 +110,7 @@ public class SearchTextWithTextPositionFactory {
return context.stringIdx - context.lastHyphenIdx < MAX_HYPHEN_LINEBREAK_DISTANCE;
}
private static List<Boundary> mergeToBoundaries(List<Integer> integers) {
if (integers.isEmpty()) {
@ -125,8 +126,9 @@ public class SearchTextWithTextPositionFactory {
}
end = current + 1;
}
if (boundaries.isEmpty())
if (boundaries.isEmpty()) {
boundaries.add(new Boundary(start, end));
}
return boundaries;
}
@ -138,6 +140,7 @@ public class SearchTextWithTextPositionFactory {
}
}
private boolean isLineBreak(RedTextPosition currentTextPosition, RedTextPosition previousTextPosition) {
return Objects.equals(currentTextPosition.getUnicode(), "\n") || isDeltaYLargerThanTextHeight(currentTextPosition, previousTextPosition);
@ -177,7 +180,7 @@ public class SearchTextWithTextPositionFactory {
}
private Rectangle2D mapRedTextPositionToInitialUserSpace(RedTextPosition textPosition, TextPositionSequence sequence) {
public Rectangle2D mapRedTextPositionToInitialUserSpace(RedTextPosition textPosition, TextPositionSequence sequence) {
float textHeight = sequence.getTextHeight() + HEIGHT_PADDING;
Rectangle2D rectangle2D = new Rectangle2D.Double(textPosition.getXDirAdj(),

View File

@ -0,0 +1,162 @@
package com.knecon.fforesight.service.layoutparser.processor.services.graphics;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.List;
import java.util.Optional;
public class Box {
public double x1;
public double y1;
public double x2;
public double y2;
public Box(double x1, double y1, double x2, double y2) {
this.x1 = x1;
this.y1 = y1;
this.x2 = x2;
this.y2 = y2;
}
public Box(Rectangle2D rectangle2D) {
this.x1 = rectangle2D.getMinX();
this.y1 = rectangle2D.getMinY();
this.x2 = rectangle2D.getMaxX();
this.y2 = rectangle2D.getMaxY();
}
public double width() {
return x2 - x1;
}
public double height() {
return y2 - y1;
}
public double xCenter() {
return (x2 + x1) / 2;
}
public double yCenter() {
return (y2 + y1) / 2;
}
public double area() {
return width() * height();
}
public Box scale(double scale) {
return new Box(x1 * scale, y1 * scale, x2 * scale, y2 * scale);
}
public boolean horizontallyAligned(Box other, double tol) {
return !(other.x1 - tol > x2 || other.x2 + tol < x1);
}
public double yDistanceTo(Box other) {
return Math.min(Math.abs(other.y1 - y2), Math.abs(y2 - other.y1));
}
public boolean intersects(Box other, double tol) {
return !((x2 < other.x1 - tol) || (x1 > other.x2 + tol) || (y2 < other.y1 - tol) || (y1 > other.y2 + tol));
}
public boolean intersectsAndOver(Box other, double tol) {
return (!((x2 < other.x1 - tol) || (x1 > other.x2 + tol) || (y2 < other.y1 - tol) || (y1 > other.y2 + tol))) && other.y1 > y1;
}
public boolean intersectsCenter(Box other, double tol) {
return !((x2 < other.xCenter() - tol) || (x1 > other.xCenter() + tol) || (y2 < other.yCenter() - tol) || (y1 > other.yCenter() + tol));
}
public Optional<Box> intersectRegion(Box other, double tol) {
if (!intersects(other, tol)) {
return Optional.empty();
} else {
var overlapX1 = Math.max(x1, other.x1);
var overlapY1 = Math.max(y1, other.y1);
var overlapX2 = Math.min(x2, other.x2);
var overlapY2 = Math.min(y2, other.y2);
return Optional.of(new Box(overlapX1, overlapY1, overlapX2, overlapY2));
}
}
public double intersectArea(Box other, double tol) {
return intersectRegion(other, tol).map(Box::area).orElse(0d);
}
public boolean intersectsAny(List<Box> others, double tol) {
return others.stream().anyMatch(other -> intersects(other, tol));
}
public boolean intersectsAnyAndOver(List<Box> others, double tol) {
return others.stream().anyMatch(other -> intersectsAndOver(other, tol));
}
public boolean intersectsCenter(List<Box> others, double tol) {
return others.stream().anyMatch(other -> intersectsCenter(other, tol));
}
public boolean contains(Box other, double tol) {
return (x1 <= other.x1 + tol) && (y1 <= other.y1 + tol) && (x2 >= other.x2 - tol) && (y2 >= other.y2 - tol);
}
public Box container(Box other) {
var minX = Math.min(x1, other.x1);
var minY = Math.min(y1, other.y1);
var maxX = Math.max(x2, other.x2);
var maxY = Math.max(y2, other.y2);
return new Box(minX, minY, maxX, maxY);
}
public Box transform(AffineTransform affineTransform) {
Point2D point = affineTransform.transform(new Point2D.Double(x1, y1), null);
Point2D point2 = affineTransform.transform(new Point2D.Double(x2, y2), null);
return new Box(Math.min(point.getX(), point2.getX()), Math.min(point.getY(), point2.getY()), Math.max(point.getX(), point2.getX()), Math.max(point.getY(), point2.getY()));
}
}

View File

@ -0,0 +1,51 @@
package com.knecon.fforesight.service.layoutparser.processor.services.graphics;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Queue;
import java.util.Set;
/**
 * FIFO queue that refuses to hold the same element twice at the same time.
 * <p>
 * An element may be enqueued again once it has been dequeued.
 *
 * @param <T> element type; membership is decided by {@code equals}/{@code hashCode}
 */
public class DistinctQueue<T> {

    private final Queue<T> pending = new LinkedList<>();
    private final Set<T> members = new HashSet<>();


    public DistinctQueue() {

    }


    /**
     * Appends the element unless an equal element is currently waiting in the queue.
     *
     * @param element element to append
     */
    public void enqueue(T element) {

        // Set.add reports whether the element was new, so one lookup is enough.
        if (members.add(element)) {
            pending.add(element);
        }
    }


    /**
     * Removes and returns the oldest element.
     *
     * @return the head of the queue, or {@code null} if the queue is empty
     */
    public T dequeue() {

        T head = pending.poll();
        if (head == null) {
            return null;
        }
        members.remove(head);
        return head;
    }


    /** @return {@code true} if no element is waiting */
    public boolean isEmpty() {

        return pending.isEmpty();
    }


    /** @return number of waiting elements */
    public int size() {

        return pending.size();
    }

    // Other methods as needed
}

View File

@ -0,0 +1,172 @@
package com.knecon.fforesight.service.layoutparser.processor.services.graphics;
import java.awt.geom.AffineTransform;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
import java.awt.image.DataBufferByte;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.springframework.stereotype.Service;
import lombok.SneakyThrows;
/**
 * Finds graphics on a page by rendering it to a gray-scale image and collecting the bounding boxes
 * of connected groups of dark pixels.
 */
@Service
public class FindGraphicsRaster {

    // Pixels that are lighter than this threshold are ignored
    private final static int THRESHOLD = 240;
    // DPI to render the image at, in practice sub-72 seems to risk pixels being lost
    private final static int DPI = 72;


    /**
     * Renders the page and returns the bounding boxes of all connected dark regions.
     *
     * @param doc             document containing the page
     * @param remove          areas that are blanked out before the search; they are mapped into
     *                        image space with the inverse of the image transform
     * @param pageInformation page geometry; {@code number()} is 1-based
     * @return bounding boxes with a non-zero area, mapped through the image transform
     */
    @SneakyThrows
    public List<Box> findCCBoundingBoxes(PDDocument doc, List<Rectangle2D> remove, PageInformation pageInformation) {

        var renderer = new PDFRenderer(doc);
        var img = renderer.renderImageWithDPI(pageInformation.number() - 1, DPI, ImageType.GRAY);
        var imageCtm = getImageCTM(pageInformation, img.getWidth());
        // NOTE: integer division - the rescale factor is 1 for the current DPI of 72.
        return findCCBoundingBoxes(img, remove, THRESHOLD, DPI / 72, imageCtm);
    }


    /**
     * Reads the pixel values, overwrites the {@code remove} areas with the threshold value (so they
     * no longer count as dark) and delegates to the flood fill.
     */
    @SneakyThrows
    private List<Box> findCCBoundingBoxes(BufferedImage image, List<Rectangle2D> remove, int grayScaleTresh, int rescale, AffineTransform imageCTM) {

        var inverseCTM = imageCTM.createInverse();

        var h = image.getHeight();
        var w = image.getWidth();
        var pixels = new int[w * h];
        image.getRaster().getPixels(0, 0, w, h, pixels);

        for (Rectangle2D rect : remove) {
            Rectangle2D box = inverseCTM.createTransformedShape(rect).getBounds2D();
            // Clamp to the image: areas reaching beyond the page would otherwise index past the
            // array (row == h) or spill into the first column of the next row (column == w).
            int firstRow = Math.max(0, (int) Math.floor(box.getMinY() / rescale));
            int lastRow = (int) Math.min(Math.ceil(box.getMaxY() / rescale), h - 1);
            int firstColumn = Math.max(0, (int) Math.floor(box.getMinX() / rescale));
            int lastColumn = (int) Math.min(Math.ceil(box.getMaxX() / rescale), w - 1);
            for (int y = firstRow; y <= lastRow; y++) {
                for (int x = firstColumn; x <= lastColumn; x++) {
                    pixels[w * y + x] = grayScaleTresh;
                }
            }
        }

        return findCCBoundingBoxes(pixels, w, h, grayScaleTresh, rescale, imageCTM);
    }


    /**
     * Builds a {@code TYPE_BYTE_GRAY} image from a row-major pixel array. Not called from within
     * this class; presumably a debugging aid for inspecting the masked pixel matrix.
     *
     * @param matrix row-major pixel values, one entry per pixel
     * @param width  image width in pixels
     * @param height image height in pixels
     * @return the image holding the low byte of every matrix entry
     */
    public static BufferedImage createImageFromMatrix(int[] matrix, int width, int height) {

        BufferedImage image = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_GRAY);
        byte[] pixelData = ((DataBufferByte) image.getRaster().getDataBuffer()).getData();

        for (int y = 0; y < height; y++) {
            for (int x = 0; x < width; x++) {
                int index = y * width + x;
                int pixel = matrix[index]; // Assuming each element in the matrix represents a pixel color
                pixelData[index] = (byte) pixel;
            }
        }

        return image;
    }


    /**
     * Flood fill over the 4-neighbourhood. Every pixel darker than {@code pixThreshold} starts a
     * fill that tracks the min/max row and column it reaches; visited pixels are overwritten with
     * the threshold so they are not visited twice.
     */
    private List<Box> findCCBoundingBoxes(int[] pixels, int w, int h, int pixThreshold, int rescale, AffineTransform imageCTM) {

        DistinctQueue<Integer> pixelsToExplore = new DistinctQueue<>();
        var boundingBoxes = new ArrayList<Box>();

        for (int y = 0; y < h; y++) {
            for (int x = 0; x < w; x++) {
                var pixelIndex = x + y * w;
                if (pixels[pixelIndex] < pixThreshold) {
                    var minX = x;
                    var maxX = x;
                    var minY = y;
                    var maxY = y;

                    pixelsToExplore.enqueue(pixelIndex);
                    while (!pixelsToExplore.isEmpty()) {
                        int currentPixel = pixelsToExplore.dequeue();

                        // Neighbour in the previous row; every pixel from index w onwards has one.
                        if (currentPixel >= w) {
                            var lowerPixel = currentPixel - w;
                            if (pixels[lowerPixel] < pixThreshold) {
                                pixelsToExplore.enqueue(lowerPixel);
                                minY = Math.min(minY, lowerPixel / w);
                            }
                        }

                        // Neighbour in the next row.
                        if (currentPixel < pixels.length - w) {
                            var upperPixel = currentPixel + w;
                            if (pixels[upperPixel] < pixThreshold) {
                                pixelsToExplore.enqueue(upperPixel);
                                maxY = Math.max(maxY, upperPixel / w);
                            }
                        }

                        // Left neighbour, unless we are in the first column.
                        if (currentPixel % w != 0) {
                            var leftPixel = currentPixel - 1;
                            if (pixels[leftPixel] < pixThreshold) {
                                pixelsToExplore.enqueue(leftPixel);
                                minX = Math.min(minX, leftPixel % w);
                            }
                        }

                        // Right neighbour, unless we are in the last column.
                        if ((currentPixel + 1) % w != 0) {
                            var rightPixel = currentPixel + 1;
                            if (pixels[rightPixel] < pixThreshold) {
                                // Enqueue the neighbour itself; enqueuing rightPixel + 1 skipped a
                                // pixel and could leave the row or the array.
                                pixelsToExplore.enqueue(rightPixel);
                                maxX = Math.max(maxX, rightPixel % w);
                            }
                        }

                        // Set the current pixel to white so we don't visit it again.
                        pixels[currentPixel] = pixThreshold;
                    }

                    boundingBoxes.add(new Box(minX * rescale, minY * rescale, maxX * rescale, maxY * rescale));
                }
            }
        }

        return boundingBoxes.stream().filter(box -> box.area() > 0).map(box -> box.transform(imageCTM)).collect(Collectors.toList());
    }


    /**
     * Builds the transform that maps coordinates of the rendered image to page coordinates.
     *
     * @param pageInformation page geometry and rotation
     * @param imageWidth      width of the rendered image in pixels
     * @return rotation * mirror * scaling, i.e. scaling is applied to a point first
     */
    public AffineTransform getImageCTM(PageInformation pageInformation, int imageWidth) {

        double scalingFactor = calculateScalingFactor(pageInformation, imageWidth);
        AffineTransform imageToCropBoxScaling = new AffineTransform(scalingFactor, 0, 0, scalingFactor, -pageInformation.minX(), -pageInformation.minY());

        AffineTransform mirrorMatrix = new AffineTransform(1, 0, 0, -1, 0, pageInformation.height());

        AffineTransform rotationMatrix = switch (pageInformation.rotationDegrees()) {
            case 90 -> new AffineTransform(0, 1, -1, 0, pageInformation.height(), 0);
            case 180 -> new AffineTransform(-1, 0, 0, -1, pageInformation.width(), pageInformation.height());
            case 270 -> new AffineTransform(0, -1, 1, 0, pageInformation.width() - pageInformation.height(), pageInformation.height()); // results from 90 + 180 rotations
            default -> new AffineTransform();
        };

        // matrix multiplication is performed from right to left, so the order is reversed.
        // scaling -> mirror -> rotation
        AffineTransform resultMatrix = new AffineTransform();

        resultMatrix.concatenate(rotationMatrix);
        resultMatrix.concatenate(mirrorMatrix);
        resultMatrix.concatenate(imageToCropBoxScaling);
        return resultMatrix;
    }


    /**
     * @return page units per image pixel, using the page height as the horizontal extent for
     * pages rotated by 90 or 270 degrees
     */
    private double calculateScalingFactor(PageInformation pageInformation, int imageWidth) {

        // PDFBox always returns page height and width based on rotation
        double pageWidth;
        if (pageInformation.rotationDegrees() == 90 || pageInformation.rotationDegrees() == 270) {
            pageWidth = pageInformation.height();
        } else {
            pageWidth = pageInformation.width();
        }

        return pageWidth / imageWidth;
    }

}

View File

@ -0,0 +1,247 @@
package com.knecon.fforesight.service.layoutparser.processor.services.graphics;
import java.awt.Rectangle;
import java.awt.color.CMMException;
import java.awt.geom.GeneralPath;
import java.awt.geom.Point2D;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.pdfbox.contentstream.PDFGraphicsStreamEngine;
import org.apache.pdfbox.contentstream.PDFStreamEngine;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.contentstream.operator.OperatorProcessor;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
/**
 * Content-stream engine that records the bounding rectangles of painted paths and shading fills
 * on a single page. Text-showing and font operators are replaced by no-ops, images are ignored.
 */
@Slf4j
public class GraphicBBDetector extends PDFGraphicsStreamEngine {

    // Operators silenced when white graphics are NOT ignored: colour / colour-space selection plus "d".
    private static final List<String> COLOR_STATE_OPERATORS = List.of("d", "k", "K", "g", "G", "CS", "cs", "RG", "rg", "sc", "SC", "scn", "SCN");
    // Text and font operators, always silenced.
    private static final List<String> TEXT_OPERATORS = List.of("Tf", "Tj", "TJ", "T*", "'", "\"");

    private int clipWindingRule = -1;
    private final GeneralPath linePath = new GeneralPath();
    private final List<Rectangle> bounds = new ArrayList<>();
    private final PDColor emptyPattern = new PDColor(new float[3], null);


    /**
     * @param page        page whose content stream is analysed
     * @param ignoreWhite when {@code false}, the colour-state operators are replaced by no-ops so
     *                    the graphics state colours never change while the page is processed
     */
    public GraphicBBDetector(PDPage page, boolean ignoreWhite) {

        super(page);

        if (!ignoreWhite) {
            for (String operatorName : COLOR_STATE_OPERATORS) {
                addOperator(new NullOp(operatorName, this));
            }
        }

        // Ignore text and font ops:
        for (String operatorName : TEXT_OPERATORS) {
            addOperator(new NullOp(operatorName, this));
        }
    }


    /**
     * Processes the page and converts every recorded rectangle into a {@link Box}.
     *
     * @return boxes with a non-zero area, in the order they were painted
     * @throws IOException if the content stream cannot be processed
     */
    public List<Box> findGraphicBB() throws IOException {

        processPage(getPage());
        return bounds.stream().map(r -> new Box(r.x, r.y, r.x + r.width, r.y + r.height)).filter(box -> box.area() > 0).collect(Collectors.toList());
    }


    @Override
    public void appendRectangle(Point2D p0, Point2D p1, Point2D p2, Point2D p3) {

        linePath.moveTo((float) p0.getX(), (float) p0.getY());
        linePath.lineTo((float) p1.getX(), (float) p1.getY());
        linePath.lineTo((float) p2.getX(), (float) p2.getY());
        linePath.lineTo((float) p3.getX(), (float) p3.getY());
        linePath.closePath();
    }


    @Override
    public void drawImage(PDImage pdImage) {
        // Do nothing
    }


    @Override
    public void clip(int windingRule) {

        // Only remembered here; the clip is applied when the path ends (see endPath).
        clipWindingRule = windingRule;
    }


    @Override
    public void moveTo(float x, float y) {

        linePath.moveTo(x, y);
    }


    @Override
    public void lineTo(float x, float y) {

        linePath.lineTo(x, y);
    }


    @Override
    public void curveTo(float x1, float y1, float x2, float y2, float x3, float y3) {

        linePath.curveTo(x1, y1, x2, y2, x3, y3);
    }


    @Override
    public Point2D getCurrentPoint() {

        return linePath.getCurrentPoint();
    }


    @Override
    public void closePath() {

        linePath.closePath();
    }


    @Override
    public void endPath() {

        if (clipWindingRule != -1) {
            linePath.setWindingRule(clipWindingRule);
            getGraphicsState().intersectClippingPath(linePath);
            clipWindingRule = -1;
        }
        linePath.reset();
    }


    @Override
    public void strokePath() throws IOException {

        addLinePath(true, false);
        linePath.reset();
    }


    @Override
    public void fillPath(int windingRule) throws IOException {

        linePath.setWindingRule(windingRule);
        addLinePath(false, true);
        linePath.reset();
    }


    @Override
    public void fillAndStrokePath(int windingRule) throws IOException {

        linePath.setWindingRule(windingRule);
        addLinePath(true, true);
        linePath.reset();
    }


    @Override
    public void shadingFill(COSName shadingName) {

        // A shading fill paints the whole clipping area, so that area is the graphic's bound.
        var newBound = getGraphicsState().getCurrentClippingPath().getBounds();
        if (newBound.getWidth() > 0 && newBound.getHeight() > 0) {
            bounds.add(newBound);
        }
    }


    /**
     * Records the visible part of the current path (path bounds intersected with the clip bounds)
     * if the colour used for painting is a non-pattern colour that {@link #isBlack(PDColor)} accepts.
     *
     * @param stroke whether the path is stroked; selects the stroking colour, otherwise the
     *               non-stroking colour is used
     * @param fill   whether the path is filled.
     *               NOTE(review): not consulted - for fill-and-stroke only the stroking colour is
     *               checked; confirm that this is intended.
     */
    private void addLinePath(boolean stroke, boolean fill) throws IOException {

        var newBound = getGraphicsState().getCurrentClippingPath().getBounds().intersection(linePath.getBounds());
        if (newBound.getWidth() <= 0 || newBound.getHeight() <= 0) {
            return;
        }

        PDColor paintColor = stroke ? getGraphicsState().getStrokingColor() : getGraphicsState().getNonStrokingColor();
        if (!paintColor.isPattern() && isBlack(paintColor)) {
            bounds.add(newBound);
        }

        // Disabled alternative that skips white graphics instead of keeping only black ones:
        // var skipWhiteGraphic = ignoreWhite && (!stroke || isWhite(getGraphicsState().getStrokingColor())) && (!fill || isWhite(getGraphicsState().getNonStrokingColor()));
        // if (!skipWhiteGraphic) {
        // bounds.add(newBound);
        // }
    }


    /**
     * @return {@code true} if the colour converts to RGB value 0; if the conversion fails with a
     * {@link CMMException}, {@code true} if all raw colour components are 0
     */
    @SneakyThrows
    private boolean isBlack(PDColor color) {

        try {
            return color.toRGB() == 0;
        } catch (CMMException e) {
            // see https://github.com/haraldk/TwelveMonkeys/issues/124 or https://issues.apache.org/jira/browse/PDFBOX-3531
            // This is a quick and dirty hack
            // Happens for file 216.pdf
            log.debug(e.getMessage());
            for (var component : color.getComponents()) {
                if (component != 0) {
                    return false;
                }
            }
            return true;
        }
    }


    /**
     * @return {@code true} for a non-pattern colour with RGB value 0xFFFFFF, or a colour equal to
     * the empty pattern. Currently only referenced by the disabled code in {@code addLinePath}.
     */
    private boolean isWhite(PDColor color) throws IOException {

        return !color.isPattern() && color.toRGB() == 16777215 || color.equals(emptyPattern);
    }


    /**
     * Operator processor that swallows the operator it is registered for.
     * Static because it needs no access to the enclosing detector instance.
     */
    private static final class NullOp extends OperatorProcessor {

        private final String name;


        private NullOp(String name, PDFStreamEngine context) {

            super(context);
            this.name = name;
        }


        @Override
        public void process(Operator operator, List<COSBase> operands) {
            // Do nothing.
        }


        @Override
        public String getName() {

            return name;
        }

    }

}

View File

@ -0,0 +1,107 @@
package com.knecon.fforesight.service.layoutparser.processor.services.graphics;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
/**
 * Extracts the bounding boxes of vector graphics on a page, leaving out path elements that
 * belong to tables, underlines or strike-throughs, and clustering what remains.
 */
@Service
@RequiredArgsConstructor
public class GraphicExtractorService {

    // Tolerance used when discarding graphics that touch table lines, underlines or strike-throughs.
    private static final double EXCLUSION_TOLERANCE = 4;
    // Tolerance within which neighbouring graphic boxes are merged into one cluster.
    private static final double CLUSTER_TOLERANCE = 14;
    // Tolerances for matching horizontal rulings to text.
    private static final double UNDERLINE_TOLERANCE = 6;
    private static final double STRIKE_THROUGH_TOLERANCE = 2;
    // Margin added around every text sequence before it is masked out of the raster image.
    private static final int CHARACTER_MARGIN = 2;
    // Clusters must exceed all of these to be reported.
    private static final int MIN_CLUSTER_AREA = 500;
    private static final int MIN_CLUSTER_HEIGHT = 50;
    private static final int MIN_CLUSTER_WIDTH = 50;

    private final GraphicsClusteringService graphicsClusteringService;
    private final FindGraphicsRaster findGraphicsRaster;


    /**
     * Detects graphics on the page and returns the clusters that are large enough.
     *
     * @param pdDocument            document the page belongs to (only used for the raster search)
     * @param pdPage                page to analyse
     * @param pageNumber            page number handed to {@link PageInformation#fromPDPage}
     * @param cleanRulings          rulings of the page; the horizontal ones are matched against text
     * @param textPositionSequences text of the page
     * @param emptyTableCells       table cells whose borders must not count as graphics
     * @param graphicsRaster        whether to additionally search the rendered page image
     * @return clustered graphic boxes exceeding the minimum area, height and width
     */
    @SneakyThrows
    public List<Box> extractPathElementGraphics(PDDocument pdDocument,
                                                PDPage pdPage,
                                                int pageNumber,
                                                CleanRulings cleanRulings,
                                                List<TextPositionSequence> textPositionSequences,
                                                List<Cell> emptyTableCells,
                                                boolean graphicsRaster) {

        List<Box> characterBBoxes = getCharacterBBoxes(textPositionSequences);
        List<Box> tableLineBBoxes = getLineBBoxesFromTableCells(emptyTableCells);
        List<Box> underLineBBoxes = getUnderlineBBoxes(cleanRulings, characterBBoxes);
        List<Box> strikeThroughBBoxes = getStrikeThroughBBoxes(cleanRulings, characterBBoxes);

        List<Box> graphicBBoxes = new GraphicBBDetector(pdPage, true).findGraphicBB();

        if (graphicsRaster) {
            // This should only be used if ocr was performed, it is currently in an early stage and needs to be improved.
            List<Rectangle2D> paddedCharacterAreas = new ArrayList<>();
            for (Box character : characterBBoxes) {
                paddedCharacterAreas.add(new Rectangle2D.Double(character.x1 - CHARACTER_MARGIN,
                                                                character.y1 - CHARACTER_MARGIN,
                                                                character.width() + 2 * CHARACTER_MARGIN,
                                                                character.height() + 2 * CHARACTER_MARGIN));
            }
            graphicBBoxes.addAll(findGraphicsRaster.findCCBoundingBoxes(pdDocument, paddedCharacterAreas, PageInformation.fromPDPage(pageNumber, pdPage)));
        }

        List<Box> filteredGraphicBBoxes = new ArrayList<>();
        for (Box candidate : graphicBBoxes) {
            boolean belongsToTextOrTable = candidate.intersectsAny(tableLineBBoxes, EXCLUSION_TOLERANCE)
                                           || candidate.intersectsAny(underLineBBoxes, EXCLUSION_TOLERANCE)
                                           || candidate.intersectsAny(strikeThroughBBoxes, EXCLUSION_TOLERANCE);
            if (!belongsToTextOrTable) {
                filteredGraphicBBoxes.add(candidate);
            }
        }

        return graphicsClusteringService.getClusters(filteredGraphicBBoxes, CLUSTER_TOLERANCE)
                .stream()
                .filter(cluster -> cluster.area() > MIN_CLUSTER_AREA && cluster.height() > MIN_CLUSTER_HEIGHT && cluster.width() > MIN_CLUSTER_WIDTH)
                .toList();
    }


    /** Returns one box per text sequence, enclosing all of its text positions. */
    private List<Box> getCharacterBBoxes(List<TextPositionSequence> textPositionSequences) {

        List<Box> sequenceBoxes = new ArrayList<>();
        for (TextPositionSequence sequence : textPositionSequences) {
            var sequenceBounds = sequence.getTextPositions()
                    .stream()
                    .map(textPosition -> SearchTextWithTextPositionFactory.mapRedTextPositionToInitialUserSpace(textPosition, sequence))
                    .collect(RectangleTransformations.collectBBox());
            sequenceBoxes.add(new Box(sequenceBounds));
        }
        return sequenceBoxes;
    }


    /** Returns four 2-unit-thick boxes per cell, one centred on each of its edges. */
    private List<Box> getLineBBoxesFromTableCells(List<Cell> emptyTableCells) {

        List<Box> expandedTableLines = new ArrayList<>();
        for (Cell cell : emptyTableCells) {
            // The two horizontal edges of the cell.
            expandedTableLines.add(new Box(new Rectangle2D.Double(cell.x, cell.y - 1, cell.width, 2)));
            expandedTableLines.add(new Box(new Rectangle2D.Double(cell.x, cell.y + cell.height - 1, cell.width, 2)));
            // The two vertical edges of the cell.
            expandedTableLines.add(new Box(new Rectangle2D.Double(cell.x - 1, cell.y, 2, cell.height)));
            expandedTableLines.add(new Box(new Rectangle2D.Double(cell.x + cell.width - 1, cell.y, 2, cell.height)));
        }
        return expandedTableLines;
    }


    /** Returns the horizontal rulings for which some text box satisfies {@code intersectsAndOver}. */
    private List<Box> getUnderlineBBoxes(CleanRulings cleanRulings, List<Box> characterBBoxes) {

        List<Box> underlines = new ArrayList<>();
        for (var ruling : cleanRulings.getHorizontal()) {
            Box rulingBox = new Box(ruling.x1, ruling.y1, ruling.x2, ruling.y2);
            if (rulingBox.intersectsAnyAndOver(characterBBoxes, UNDERLINE_TOLERANCE)) {
                underlines.add(rulingBox);
            }
        }
        return underlines;
    }


    /** Returns the horizontal rulings that pass through the centre of some text box. */
    private List<Box> getStrikeThroughBBoxes(CleanRulings cleanRulings, List<Box> characterBBoxes) {

        List<Box> strikeThroughs = new ArrayList<>();
        for (var ruling : cleanRulings.getHorizontal()) {
            Box rulingBox = new Box(ruling.x1, ruling.y1, ruling.x2, ruling.y2);
            if (rulingBox.intersectsCenter(characterBBoxes, STRIKE_THROUGH_TOLERANCE)) {
                strikeThroughs.add(rulingBox);
            }
        }
        return strikeThroughs;
    }

}

View File

@ -0,0 +1,83 @@
package com.knecon.fforesight.service.layoutparser.processor.services.graphics;
import java.util.ArrayList;
import java.util.List;
import org.springframework.stereotype.Service;
/**
 * Merges boxes that lie within a tolerance of each other into common enclosing boxes.
 */
@Service
public class GraphicsClusteringService {

    /**
     * Repeatedly replaces a box and everything intersecting it by their common hull until no two
     * boxes intersect any more.
     *
     * @param boxes boxes to cluster; returned as-is when nothing needs merging
     * @param tol   tolerance handed to {@link Box#intersects(Box, double)}
     * @return the clustered boxes
     */
    public List<Box> getClusters(List<Box> boxes, double tol) {

        if (boxes.isEmpty()) {
            return boxes;
        }

        List<Box> working = boxes;
        boolean mergedSomething;
        do {
            mergedSomething = false;

            // "visited" holds the pivot at index 0 followed by boxes that were pivots before it;
            // "remaining" holds the boxes the pivot still has to be compared with.
            List<Box> visited = List.of(working.get(0));
            List<Box> remaining = working.subList(1, working.size());

            while (!mergedSomething && !remaining.isEmpty()) {
                Box pivot = visited.get(0);

                List<Box> touching = new ArrayList<>();
                List<Box> apart = new ArrayList<>();
                for (Box candidate : remaining) {
                    if (candidate.intersects(pivot, tol)) {
                        touching.add(candidate);
                    } else {
                        apart.add(candidate);
                    }
                }

                if (touching.isEmpty()) {
                    // Nothing touches the pivot: the next remaining box becomes the new pivot.
                    List<Box> nextVisited = new ArrayList<>();
                    nextVisited.add(remaining.get(0));
                    nextVisited.addAll(visited);
                    visited = nextVisited;
                    remaining = remaining.subList(1, remaining.size());
                    continue;
                }

                // Fuse the pivot with everything touching it and start over with the merged box
                // first, because the larger box may now touch boxes that were checked earlier.
                List<Box> group = new ArrayList<>();
                group.add(pivot);
                group.addAll(touching);

                List<Box> next = new ArrayList<>();
                next.add(merge(group));
                next.addAll(visited.subList(1, visited.size()));
                next.addAll(apart);

                working = next;
                mergedSomething = true;
            }
        } while (mergedSomething);

        return working;
    }


    /**
     * @param boxes boxes to enclose
     * @return the smallest box containing all given boxes; for an empty list a box whose minimum
     * corner is +infinity and whose maximum corner is -infinity
     */
    public Box merge(List<Box> boxes) {

        Box hull = new Box(Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY, Double.NEGATIVE_INFINITY);
        for (Box box : boxes) {
            hull = hull.container(box);
        }
        return hull;
    }

}

View File

@ -0,0 +1,42 @@
package com.knecon.fforesight.service.layoutparser.processor.services.graphics;
import java.awt.geom.Rectangle2D;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
/**
 * Geometry of a single page.
 *
 * @param mediabox        the page's media box
 * @param number          page number as supplied by the caller
 * @param rotationDegrees page rotation in degrees
 */
public record PageInformation(Rectangle2D mediabox, int number, int rotationDegrees) {

    /**
     * Reads media box and rotation from a PDFBox page.
     *
     * @param pageNum page number to store
     * @param page    page to read from
     * @return the page information
     */
    public static PageInformation fromPDPage(int pageNum, PDPage page) {

        PDRectangle source = page.getMediaBox();
        Rectangle2D bounds = new Rectangle2D.Double(source.getLowerLeftX(), source.getLowerLeftY(), source.getWidth(), source.getHeight());
        int rotation = page.getRotation();
        return new PageInformation(bounds, pageNum, rotation);
    }


    /** @return height of the media box */
    public double height() {

        return mediabox().getHeight();
    }


    /** @return width of the media box */
    public double width() {

        return mediabox().getWidth();
    }


    /** @return x coordinate of the media box origin */
    public double minX() {

        return mediabox().getX();
    }


    /** @return y coordinate of the media box origin */
    public double minY() {

        return mediabox().getY();
    }

}

View File

@ -27,7 +27,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
@SneakyThrows
public void testViewerDocument() {
String fileName = "files/new/ScrambledTextAfterSorting.pdf";
String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
var documentFile = new ClassPathResource(fileName).getFile();