diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index 6323b44..c2ac5ad 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -27,6 +27,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; @@ -51,6 +52,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.classificat import com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService; import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; +import com.knecon.fforesight.service.layoutparser.processor.services.graphics.GraphicExtractorService; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper; import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper; @@ -90,6 +92,7 @@ public class LayoutParsingPipeline { ObservationRegistry observationRegistry; VisualLayoutParsingAdapter visualLayoutParsingAdapter; ClarifyndClassificationService clarifyndClassificationService; + GraphicExtractorService graphicExtractorService; public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException { @@ -256,9 +259,21 @@ public class LayoutParsingPipeline { List emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical()); + var graphics = graphicExtractorService.extractPathElementGraphics(originDocument, + pdPage, + pageNumber, + cleanRulings, + stripper.getTextPositionSequences(), + emptyTableCells, + false); + + pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>()) + .addAll(graphics.stream() + .map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), ImageType.GRAPHICS, false, stripper.getPageNumber())) + .toList()); + ClassificationPage classificationPage = switch (layoutParsingType) { - case REDACT_MANAGER_OLD -> - redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells); + case REDACT_MANAGER_OLD -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells); case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, true); case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/ImageType.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/ImageType.java index 24cb5c8..b43fec2 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/ImageType.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/ImageType.java @@ -9,7 +9,8 @@ public enum ImageType { SIGNATURE_VISUAL, OTHER, - OCR; + OCR, + GRAPHICS; public static ImageType fromString(String imageType) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/adapter/ImageServiceResponseAdapter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/adapter/ImageServiceResponseAdapter.java index 55c345c..5b1a61d 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/adapter/ImageServiceResponseAdapter.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/adapter/ImageServiceResponseAdapter.java @@ -9,10 +9,10 @@ import java.util.Map; import org.springframework.stereotype.Service; -import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; -import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType; +import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; import lombok.RequiredArgsConstructor; @@ -20,8 +20,7 @@ import lombok.RequiredArgsConstructor; @RequiredArgsConstructor public class ImageServiceResponseAdapter { - - public Map> buildClassifiedImagesPerPage(ImageServiceResponse imageServiceResponse ) { + public Map> buildClassifiedImagesPerPage(ImageServiceResponse imageServiceResponse) { Map> images = new HashMap<>(); imageServiceResponse.getData().forEach(imageMetadata -> { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java index 7ebc737..c10cbee 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java @@ -8,10 +8,10 @@ import java.util.List; import java.util.Locale; import java.util.Objects; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary; import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary; import lombok.experimental.UtilityClass; @@ -110,6 +110,7 @@ public class SearchTextWithTextPositionFactory { return context.stringIdx - context.lastHyphenIdx < MAX_HYPHEN_LINEBREAK_DISTANCE; } + private static List mergeToBoundaries(List integers) { if (integers.isEmpty()) { @@ -125,8 +126,9 @@ public class SearchTextWithTextPositionFactory { } end = current + 1; } - if (boundaries.isEmpty()) + if (boundaries.isEmpty()) { boundaries.add(new Boundary(start, end)); + } return boundaries; } @@ -138,6 +140,7 @@ public class SearchTextWithTextPositionFactory { } } + private boolean isLineBreak(RedTextPosition currentTextPosition, RedTextPosition previousTextPosition) { return Objects.equals(currentTextPosition.getUnicode(), "\n") || isDeltaYLargerThanTextHeight(currentTextPosition, previousTextPosition); @@ -177,7 +180,7 @@ public class SearchTextWithTextPositionFactory { } - private Rectangle2D mapRedTextPositionToInitialUserSpace(RedTextPosition textPosition, TextPositionSequence sequence) { + public Rectangle2D mapRedTextPositionToInitialUserSpace(RedTextPosition textPosition, TextPositionSequence sequence) { float textHeight = sequence.getTextHeight() + HEIGHT_PADDING; Rectangle2D rectangle2D = new Rectangle2D.Double(textPosition.getXDirAdj(), diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/Box.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/Box.java new file mode 100644 index 0000000..3fc9f82 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/Box.java @@ -0,0 +1,162 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.graphics; + +import java.awt.geom.AffineTransform; +import java.awt.geom.Point2D; +import java.awt.geom.Rectangle2D; +import java.util.List; +import java.util.Optional; + +public class Box { + + public double x1; + public double y1; + public double x2; + public double y2; + + + public Box(double x1, double y1, double x2, double y2) { + + this.x1 = x1; + this.y1 = y1; + this.x2 = x2; + this.y2 = y2; + } + + + public Box(Rectangle2D rectangle2D) { + + this.x1 = rectangle2D.getMinX(); + this.y1 = rectangle2D.getMinY(); + this.x2 = rectangle2D.getMaxX(); + this.y2 = rectangle2D.getMaxY(); + } + + + public double width() { + + return x2 - x1; + } + + + public double height() { + + return y2 - y1; + } + + + public double xCenter() { + + return (x2 + x1) / 2; + } + + + public double yCenter() { + + return (y2 + y1) / 2; + } + + + public double area() { + + return width() * height(); + } + + + public Box scale(double scale) { + + return new Box(x1 * scale, y1 * scale, x2 * scale, y2 * scale); + } + + + public boolean horizontallyAligned(Box other, double tol) { + + return !(other.x1 - tol > x2 || other.x2 + tol < x1); + } + + + public double yDistanceTo(Box other) { + + return Math.min(Math.abs(other.y1 - y2), Math.abs(y2 - other.y1)); + } + + + public boolean intersects(Box other, double tol) { + + return !((x2 < other.x1 - tol) || (x1 > other.x2 + tol) || (y2 < other.y1 - tol) || (y1 > other.y2 + tol)); + } + + + public boolean intersectsAndOver(Box other, double tol) { + + return (!((x2 < other.x1 - tol) || (x1 > other.x2 + tol) || (y2 < other.y1 - tol) || (y1 > other.y2 + tol))) && other.y1 > y1; + } + + + public boolean intersectsCenter(Box other, double tol) { + + return !((x2 < other.xCenter() - tol) || (x1 > other.xCenter() + tol) || (y2 < other.yCenter() - tol) || (y1 > other.yCenter() + tol)); + } + + + public Optional intersectRegion(Box other, double tol) { + + if (!intersects(other, tol)) { + return Optional.empty(); + } else { + var overlapX1 = Math.max(x1, other.x1); + var overlapY1 = Math.max(y1, other.y1); + var overlapX2 = Math.min(x2, other.x2); + var overlapY2 = Math.min(y2, other.y2); + return Optional.of(new Box(overlapX1, overlapY1, overlapX2, overlapY2)); + } + } + + + public double intersectArea(Box other, double tol) { + + return intersectRegion(other, tol).map(Box::area).orElse(0d); + } + + + public boolean intersectsAny(List others, double tol) { + + return others.stream().anyMatch(other -> intersects(other, tol)); + } + + + public boolean intersectsAnyAndOver(List others, double tol) { + + return others.stream().anyMatch(other -> intersectsAndOver(other, tol)); + } + + + public boolean intersectsCenter(List others, double tol) { + + return others.stream().anyMatch(other -> intersectsCenter(other, tol)); + } + + + public boolean contains(Box other, double tol) { + + return (x1 <= other.x1 + tol) && (y1 <= other.y1 + tol) && (x2 >= other.x2 - tol) && (y2 >= other.y2 - tol); + } + + + public Box container(Box other) { + + var minX = Math.min(x1, other.x1); + var minY = Math.min(y1, other.y1); + var maxX = Math.max(x2, other.x2); + var maxY = Math.max(y2, other.y2); + return new Box(minX, minY, maxX, maxY); + } + + + public Box transform(AffineTransform affineTransform) { + + Point2D point = affineTransform.transform(new Point2D.Double(x1, y1), null); + Point2D point2 = affineTransform.transform(new Point2D.Double(x2, y2), null); + return new Box(Math.min(point.getX(), point2.getX()), Math.min(point.getY(), point2.getY()), Math.max(point.getX(), point2.getX()), Math.max(point.getY(), point2.getY())); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/DistinctQueue.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/DistinctQueue.java new file mode 100644 index 0000000..b7844b7 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/DistinctQueue.java @@ -0,0 +1,51 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.graphics; + +import java.util.HashSet; +import java.util.LinkedList; +import java.util.Queue; +import java.util.Set; + +public class DistinctQueue { + + private Queue queue; + private Set set; + + + public DistinctQueue() { + + queue = new LinkedList<>(); + set = new HashSet<>(); + } + + + public void enqueue(T element) { + + if (!set.contains(element)) { + queue.add(element); + set.add(element); + } + } + + + public T dequeue() { + + T element = queue.poll(); + if (element != null) { + set.remove(element); + } + return element; + } + + + public boolean isEmpty() { + + return queue.isEmpty(); + } + + + public int size() { + + return queue.size(); + } + // Other methods as needed +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/FindGraphicsRaster.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/FindGraphicsRaster.java new file mode 100644 index 0000000..1a42abb --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/FindGraphicsRaster.java @@ -0,0 +1,172 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.graphics; + +import java.awt.geom.AffineTransform; +import java.awt.geom.Rectangle2D; +import java.awt.image.BufferedImage; +import java.awt.image.DataBufferByte; +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Collectors; + +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.rendering.ImageType; +import org.apache.pdfbox.rendering.PDFRenderer; +import org.springframework.stereotype.Service; + +import lombok.SneakyThrows; + +@Service +public class FindGraphicsRaster { + + // Pixels that are lighter then this threshold are ignored + private final static int THRESHOLD = 240; + + // DPI to render the image at, in practice sub-72 seems to risk pixels being lost + private final static int DPI = 72; + + + @SneakyThrows + public List findCCBoundingBoxes(PDDocument doc, List remove, PageInformation pageInformation) { + + var renderer = new PDFRenderer(doc); + var img = renderer.renderImageWithDPI(pageInformation.number() - 1, DPI, ImageType.GRAY); + var imageCtm = getImageCTM(pageInformation, img.getWidth()); + return findCCBoundingBoxes(img, remove, THRESHOLD, DPI / 72, imageCtm); + } + + + @SneakyThrows + private List findCCBoundingBoxes(BufferedImage image, List remove, int grayScaleTresh, int rescale, AffineTransform imageCTM) { + + var inverseCTM = imageCTM.createInverse(); + + var h = image.getHeight(); + var w = image.getWidth(); + var pixels = new int[w * h]; + image.getRaster().getPixels(0, 0, w, h, pixels); + remove.stream().map(rect -> inverseCTM.createTransformedShape(rect).getBounds2D()).forEach(box -> { + for (int y = (int) Math.floor(box.getMinY() / rescale); y <= (int) Math.min(Math.ceil(box.getMaxY() / rescale), h); y++) { + for (int x = (int) Math.floor(box.getMinX() / rescale); x <= (int) Math.min(Math.ceil(box.getMaxX() / rescale), w); x++) { + pixels[w * y + x] = grayScaleTresh; + } + } + }); + +// var image2 = createImageFromMatrix(pixels, w, h); + + return findCCBoundingBoxes(pixels, w, h, grayScaleTresh, rescale, imageCTM); + } + + + public static BufferedImage createImageFromMatrix(int[] matrix, int width, int height) { + + BufferedImage image = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_GRAY); + + byte[] pixelData = ((DataBufferByte) image.getRaster().getDataBuffer()).getData(); + + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + int index = y * width + x; + int pixel = matrix[index]; // Assuming each element in the matrix represents a pixel color + pixelData[index] = (byte) pixel; + } + } + + return image; + } + + + private List findCCBoundingBoxes(int[] pixels, int w, int h, int pixThreshold, int rescale, AffineTransform imageCTM) { + + DistinctQueue pixelsToExplore = new DistinctQueue<>(); + var boundingBoxes = new ArrayList(); + + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + var pixelIndex = x + y * w; + if (pixels[pixelIndex] < pixThreshold) { + var minX = x; + var maxX = x; + var minY = y; + var maxY = y; + pixelsToExplore.enqueue(pixelIndex); + while (!pixelsToExplore.isEmpty()) { + var currentPixel = pixelsToExplore.dequeue(); + if (currentPixel > w) { + var lowerPixel = currentPixel - w; + if (pixels[lowerPixel] < pixThreshold) { + pixelsToExplore.enqueue(currentPixel - w); + minY = Math.min(minY, lowerPixel / w); + } + } + if (currentPixel < pixels.length - w) { + var upperPixel = currentPixel + w; + if (pixels[upperPixel] < pixThreshold) { + pixelsToExplore.enqueue(upperPixel); + maxY = Math.max(maxY, upperPixel / w); + } + } + if (currentPixel % w != 0) { + var leftPixel = currentPixel - 1; + if (pixels[leftPixel] < pixThreshold) { + pixelsToExplore.enqueue(leftPixel); + minX = Math.min(minX, leftPixel % w); + } + } + if ((currentPixel + 1) % w != 0) { + var rightPixel = currentPixel + 1; + if (pixels[rightPixel] < pixThreshold) { + pixelsToExplore.enqueue(rightPixel + 1); + maxX = Math.max(maxX, rightPixel % w); + } + } + // Set the current pixel to white so we don't visit it again. + pixels[currentPixel] = pixThreshold; + } + boundingBoxes.add(new Box(minX * rescale, minY * rescale, maxX * rescale, maxY * rescale)); + } + } + } + return boundingBoxes.stream().filter(box -> box.area() > 0).map(box -> box.transform(imageCTM)).collect(Collectors.toList()); + } + + + public AffineTransform getImageCTM(PageInformation pageInformation, int imageWidth) { + + double scalingFactor = calculateScalingFactor(pageInformation, imageWidth); + AffineTransform imageToCropBoxScaling = new AffineTransform(scalingFactor, 0, 0, scalingFactor, -pageInformation.minX(), -pageInformation.minY()); + + AffineTransform mirrorMatrix = new AffineTransform(1, 0, 0, -1, 0, pageInformation.height()); + + AffineTransform rotationMatrix = switch (pageInformation.rotationDegrees()) { + case 90 -> new AffineTransform(0, 1, -1, 0, pageInformation.height(), 0); + case 180 -> new AffineTransform(-1, 0, 0, -1, pageInformation.width(), pageInformation.height()); + case 270 -> new AffineTransform(0, -1, 1, 0, pageInformation.width() - pageInformation.height(), pageInformation.height()); // results from 90 + 180 rotations + default -> new AffineTransform(); + }; + + // matrix multiplication is performed from right to left, so the order is reversed. + // scaling -> mirror -> rotation + AffineTransform resultMatrix = new AffineTransform(); + + resultMatrix.concatenate(rotationMatrix); + resultMatrix.concatenate(mirrorMatrix); + resultMatrix.concatenate(imageToCropBoxScaling); + return resultMatrix; + } + + + private double calculateScalingFactor(PageInformation pageInformation, int imageWidth) { + + // PDFBox always returns page height and width based on rotation + double pageWidth; + if (pageInformation.rotationDegrees() == 90 || pageInformation.rotationDegrees() == 270) { + pageWidth = pageInformation.height(); + } else { + pageWidth = pageInformation.width(); + } + + return pageWidth / imageWidth; + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/GraphicBBDetector.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/GraphicBBDetector.java new file mode 100644 index 0000000..f9fbbdb --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/GraphicBBDetector.java @@ -0,0 +1,247 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.graphics; + +import java.awt.Rectangle; +import java.awt.color.CMMException; +import java.awt.geom.GeneralPath; +import java.awt.geom.Point2D; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Collectors; + +import org.apache.pdfbox.contentstream.PDFGraphicsStreamEngine; +import org.apache.pdfbox.contentstream.PDFStreamEngine; +import org.apache.pdfbox.contentstream.operator.Operator; +import org.apache.pdfbox.contentstream.operator.OperatorProcessor; +import org.apache.pdfbox.cos.COSBase; +import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.graphics.color.PDColor; +import org.apache.pdfbox.pdmodel.graphics.image.PDImage; + +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +public class GraphicBBDetector extends PDFGraphicsStreamEngine { + + private int clipWindingRule = -1; + private final GeneralPath linePath = new GeneralPath(); + private final List bounds = new ArrayList<>(); + private final PDColor emptyPattern = new PDColor(new float[3], null); + + + public GraphicBBDetector(PDPage page, boolean ignoreWhite) { + + super(page); + + if (!ignoreWhite) { + addOperator(new NullOp("d", this)); + addOperator(new NullOp("k", this)); + addOperator(new NullOp("K", this)); + addOperator(new NullOp("g", this)); + addOperator(new NullOp("G", this)); + addOperator(new NullOp("CS", this)); + addOperator(new NullOp("cs", this)); + addOperator(new NullOp("RG", this)); + addOperator(new NullOp("rg", this)); + addOperator(new NullOp("sc", this)); + addOperator(new NullOp("SC", this)); + addOperator(new NullOp("scn", this)); + addOperator(new NullOp("SCN", this)); + } + + // Ignore text and font ops: + addOperator(new NullOp("Tf", this)); + addOperator(new NullOp("Tj", this)); + addOperator(new NullOp("TJ", this)); + addOperator(new NullOp("T*", this)); + addOperator(new NullOp("'", this)); + addOperator(new NullOp("\"", this)); + } + + + public List findGraphicBB() throws IOException { + + processPage(getPage()); + return bounds.stream().map(r -> new Box(r.x, r.y, r.x + r.width, r.y + r.height)).filter(box -> box.area() > 0).collect(Collectors.toList()); + } + + + @Override + public void appendRectangle(Point2D p0, Point2D p1, Point2D p2, Point2D p3) { + + linePath.moveTo((float) p0.getX(), (float) p0.getY()); + linePath.lineTo((float) p1.getX(), (float) p1.getY()); + linePath.lineTo((float) p2.getX(), (float) p2.getY()); + linePath.lineTo((float) p3.getX(), (float) p3.getY()); + linePath.closePath(); + } + + + @Override + public void drawImage(PDImage pdImage) { + // Do nothing + } + + + @Override + public void clip(int windingRule) { + + clipWindingRule = windingRule; + } + + + @Override + public void moveTo(float x, float y) { + + linePath.moveTo(x, y); + } + + + @Override + public void lineTo(float x, float y) { + + linePath.lineTo(x, y); + } + + + @Override + public void curveTo(float x1, float y1, float x2, float y2, float x3, float y3) { + + linePath.curveTo(x1, y1, x2, y2, x3, y3); + } + + + @Override + public Point2D getCurrentPoint() { + + return linePath.getCurrentPoint(); + } + + + @Override + public void closePath() { + + linePath.closePath(); + } + + + @Override + public void endPath() { + + if (clipWindingRule != -1) { + linePath.setWindingRule(clipWindingRule); + getGraphicsState().intersectClippingPath(linePath); + clipWindingRule = -1; + } + linePath.reset(); + } + + + @Override + public void strokePath() throws IOException { + + addLinePath(true, false); + linePath.reset(); + } + + + @Override + public void fillPath(int windingRule) throws IOException { + + linePath.setWindingRule(windingRule); + addLinePath(false, true); + linePath.reset(); + } + + + @Override + public void fillAndStrokePath(int windingRule) throws IOException { + + linePath.setWindingRule(windingRule); + addLinePath(true, true); + linePath.reset(); + } + + + @Override + public void shadingFill(COSName shadingName) { + + var newBound = getGraphicsState().getCurrentClippingPath().getBounds(); + if (newBound.getWidth() > 0 && newBound.getHeight() > 0) { + bounds.add(newBound); + } + } + + + private void addLinePath(boolean stroke, boolean fill) throws IOException { + + var newBound = getGraphicsState().getCurrentClippingPath().getBounds().intersection(linePath.getBounds()); + if (newBound.getWidth() > 0 && newBound.getHeight() > 0) { + + if (stroke && !getGraphicsState().getStrokingColor().isPattern() && isBlack(getGraphicsState().getStrokingColor()) || // + !stroke && !getGraphicsState().getNonStrokingColor().isPattern() && isBlack(getGraphicsState().getNonStrokingColor())) { + bounds.add(newBound); + } + +// var skipWhiteGraphic = ignoreWhite && (!stroke || isWhite(getGraphicsState().getStrokingColor())) && (!fill || isWhite(getGraphicsState().getNonStrokingColor())); +// if (!skipWhiteGraphic) { +// bounds.add(newBound); +// } + } + } + + + @SneakyThrows + private boolean isBlack(PDColor color) { + + try { + return color.toRGB() == 0; + } catch (CMMException e) { + // see https://github.com/haraldk/TwelveMonkeys/issues/124 or https://issues.apache.org/jira/browse/PDFBOX-3531 + // This is a quick and dirt hack + // Happens for file 216.pdf + log.debug(e.getMessage()); + var result = true; + for (var component : color.getComponents()) { + result = result && component == 0; + } + return result; + } + } + + + private boolean isWhite(PDColor color) throws IOException { + + return !color.isPattern() && color.toRGB() == 16777215 || color.equals(emptyPattern); + } + + + private class NullOp extends OperatorProcessor { + + private final String name; + + + public NullOp(String name, PDFStreamEngine context) { + + super(context); + this.name = name; + } + + + @Override + public void process(Operator operator, List operands) { + // Do nothing. + } + + + @Override + public String getName() { + + return name; + } + + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/GraphicExtractorService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/GraphicExtractorService.java new file mode 100644 index 0000000..32faf37 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/GraphicExtractorService.java @@ -0,0 +1,106 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.graphics; + +import java.awt.geom.Rectangle2D; +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Collectors; + +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; +import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; + +import lombok.RequiredArgsConstructor; +import lombok.SneakyThrows; + +@Service +@RequiredArgsConstructor +public class GraphicExtractorService { + + private final GraphicsClusteringService graphicsClusteringService; + private final FindGraphicsRaster findGraphicsRaster; + + + @SneakyThrows + public List extractPathElementGraphics(PDDocument pdDocument, + PDPage pdPage, + int pageNumber, + CleanRulings cleanRulings, + List textPositionSequences, + List emptyTableCells, + boolean graphicsRaster) { + + var characterBBoxes = getCharacterBBoxes(textPositionSequences); + var tableLineBBoxes = getLineBBoxesFromTableCells(emptyTableCells); + var underLineBBoxes = getUnderlineBBoxes(cleanRulings, characterBBoxes); + var strikeThroughBBoxes = getStrikeThroughBBoxes(cleanRulings, characterBBoxes); + + GraphicBBDetector graphicBBDetector = new GraphicBBDetector(pdPage, true); + var graphicBBoxes = graphicBBDetector.findGraphicBB(); + + if (graphicsRaster) { + graphicBBoxes.addAll(findGraphicsRaster.findCCBoundingBoxes(pdDocument, + characterBBoxes.stream().map(box -> new Rectangle2D.Double(box.x1 - 2, box.y1 - 2, box.width() + 4, box.height() + 4)).collect(Collectors.toList()), + PageInformation.fromPDPage(pageNumber, pdPage))); + } + + var filteredGraphicBBoxes = graphicBBoxes.stream() + .filter(box -> !box.intersectsAny(tableLineBBoxes, 4)) + .filter(box -> !box.intersectsAny(underLineBBoxes, 4)) + .filter(box -> !box.intersectsAny(strikeThroughBBoxes, 4)) + .collect(Collectors.toList()); + + var clusters = graphicsClusteringService.getClusters(filteredGraphicBBoxes, 14); + + return clusters.stream().filter(box -> box.area() > 500 && box.height() > 50 && box.width() > 50).toList(); + } + + + private List getCharacterBBoxes(List textPositionSequences) { + + return textPositionSequences.stream() + .map(pos -> pos.getTextPositions() + .stream() + .map(tp -> SearchTextWithTextPositionFactory.mapRedTextPositionToInitialUserSpace(tp, pos)) + .collect(RectangleTransformations.collectBBox())) + .map(Box::new) + .collect(Collectors.toList()); + } + + + private List getLineBBoxesFromTableCells(List emptyTableCells) { + + List expandedTableLines = new ArrayList<>(); + + emptyTableCells.forEach(cell -> { + expandedTableLines.add(new Box(new Rectangle2D.Double(cell.x, cell.y - 1, cell.width, 2))); + expandedTableLines.add(new Box(new Rectangle2D.Double(cell.x, cell.y + cell.height - 1, cell.width, 2))); + expandedTableLines.add(new Box(new Rectangle2D.Double(cell.x - 1, cell.y, 2, cell.height))); + expandedTableLines.add(new Box(new Rectangle2D.Double(cell.x + cell.width - 1, cell.y, 2, cell.height))); + }); + + return expandedTableLines; + } + + + private List getUnderlineBBoxes(CleanRulings cleanRulings, List characterBBoxes) { + + return cleanRulings.getHorizontal() + .stream() + .map(h -> new Box(h.x1, h.y1, h.x2, h.y2)) + .filter(box -> box.intersectsAnyAndOver(characterBBoxes, 6)) + .collect(Collectors.toList()); + } + + + private List getStrikeThroughBBoxes(CleanRulings cleanRulings, List characterBBoxes) { + + return cleanRulings.getHorizontal().stream().map(h -> new Box(h.x1, h.y1, h.x2, h.y2)).filter(box -> box.intersectsCenter(characterBBoxes, 2)).collect(Collectors.toList()); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/GraphicsClusteringService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/GraphicsClusteringService.java new file mode 100644 index 0000000..c2e375c --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/GraphicsClusteringService.java @@ -0,0 +1,83 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.graphics; + +import java.util.ArrayList; +import java.util.List; + +import org.springframework.stereotype.Service; + +@Service +public class GraphicsClusteringService { + + public List getClusters(List boxes, double tol) { + + if (boxes.isEmpty()) { + return boxes; + } else { + // We iteratively pick a Box that intersects at least one other box and replace the + // intersecting box with a Box containing them + var currentBoxes = boxes; + var foundIntersectingBoxes = true; + while (foundIntersectingBoxes) { + foundIntersectingBoxes = false; + + // The box we are going to check to see if there are any intersecting boxes, followed by + // any boxes that we have already check + var checked = List.of(currentBoxes.get(0)); + var unchecked = currentBoxes.subList(1, currentBoxes.size()); + + while (!foundIntersectingBoxes && !unchecked.isEmpty()) { + + List intersects = new ArrayList<>(); + List nonIntersects = new ArrayList<>(); + + for (Box uncheck : unchecked) { + if (uncheck.intersects(checked.get(0), tol)) { + intersects.add(uncheck); + } else { + nonIntersects.add(uncheck); + } + } + + if (!intersects.isEmpty()) { + List combinedIntersecting = new ArrayList<>(); + combinedIntersecting.add(checked.get(0)); + combinedIntersecting.addAll(intersects); + var newBox = merge(combinedIntersecting); + + List newCurrentBoxes = new ArrayList<>(); + newCurrentBoxes.add(newBox); + newCurrentBoxes.addAll(checked.subList(1, checked.size())); + newCurrentBoxes.addAll(nonIntersects); + currentBoxes = newCurrentBoxes; + foundIntersectingBoxes = true; // Exit this loop and re-enter the outer loop + } else { + List newChecked = new ArrayList<>(); + newChecked.add(unchecked.get(0)); + newChecked.addAll(checked); + checked = newChecked; + unchecked = unchecked.subList(1, unchecked.size()); + } + } + } + return currentBoxes; + } + } + + + public Box merge(List boxes) { + + double minX = Double.POSITIVE_INFINITY; + double minY = Double.POSITIVE_INFINITY; + double maxX = Double.NEGATIVE_INFINITY; + double maxY = Double.NEGATIVE_INFINITY; + + for (Box box : boxes) { + minX = Math.min(minX, box.x1); + minY = Math.min(minY, box.y1); + maxX = Math.max(maxX, box.x2); + maxY = Math.max(maxY, box.y2); + } + return new Box(minX, minY, maxX, maxY); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/PageInformation.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/PageInformation.java new file mode 100644 index 0000000..09c6f8a --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/PageInformation.java @@ -0,0 +1,42 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.graphics; + +import java.awt.geom.Rectangle2D; + +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.common.PDRectangle; + +public record PageInformation(Rectangle2D mediabox, int number, int rotationDegrees) { + + public static PageInformation fromPDPage(int pageNum, PDPage page) { + + PDRectangle mediaBox = page.getMediaBox(); + return new PageInformation(new Rectangle2D.Double(mediaBox.getLowerLeftX(), mediaBox.getLowerLeftY(), mediaBox.getWidth(), mediaBox.getHeight()), + pageNum, + page.getRotation()); + } + + + public double height() { + + return mediabox.getHeight(); + } + + + public double width() { + + return mediabox.getWidth(); + } + + + public double minX() { + + return mediabox.getX(); + } + + + public double minY() { + + return mediabox.getY(); + } + +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index 580961e..ad2ca14 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -27,7 +27,7 @@ public class ViewerDocumentTest extends BuildDocumentTest { @SneakyThrows public void testViewerDocument() { - String fileName = "files/new/ScrambledTextAfterSorting.pdf"; + String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; var documentFile = new ClassPathResource(fileName).getFile();