diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageContents.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageContents.java index f9d00ef..c02de5c 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageContents.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageContents.java @@ -19,4 +19,5 @@ public class PageContents { Rectangle2D cropBox; Rectangle2D mediaBox; List rulings; + List positions; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageContentExtractor.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageContentExtractor.java index dde3b94..7e77815 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageContentExtractor.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageContentExtractor.java @@ -39,6 +39,8 @@ public class PageContentExtractor { stripper.setEndPage(pageNumber); stripper.setPdpage(pdPage); stripper.getText(pdDocument); + var positions = stripper.getExactPositions(); + Map> sortedTextPositionSequencesPerDir = stripper.getTextPositionSequences() .stream() @@ -49,7 +51,7 @@ public class PageContentExtractor { textPositionSequencesPerPage.add(new PageContents(sortedTextPositionSequences, RectangleTransformations.toRectangle2D(pdPage.getCropBox()), RectangleTransformations.toRectangle2D(pdPage.getMediaBox()), - stripper.getRulings())); + stripper.getRulings(), positions)); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/LegacyPDFStreamEngine.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/LegacyPDFStreamEngine.java index c1a2923..d78d605 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/LegacyPDFStreamEngine.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/LegacyPDFStreamEngine.java @@ -16,13 +16,19 @@ */ package com.knecon.fforesight.service.layoutparser.processor.services.parsing; +import java.awt.geom.AffineTransform; +import java.awt.geom.Area; +import java.awt.geom.Rectangle2D; import java.io.IOException; import java.io.InputStream; +import java.util.LinkedList; +import java.util.List; import java.util.Map; import java.util.WeakHashMap; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.fontbox.FontBoxFont; import org.apache.fontbox.ttf.GlyphData; import org.apache.fontbox.ttf.TrueTypeFont; import org.apache.fontbox.util.BoundingBox; @@ -53,12 +59,14 @@ import org.apache.pdfbox.cos.COSDictionary; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.apache.pdfbox.pdmodel.font.PDCIDFont; +import org.apache.pdfbox.pdmodel.font.PDCIDFontType0; import org.apache.pdfbox.pdmodel.font.PDCIDFontType2; import org.apache.pdfbox.pdmodel.font.PDFont; import org.apache.pdfbox.pdmodel.font.PDFontDescriptor; import org.apache.pdfbox.pdmodel.font.PDSimpleFont; import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont; import org.apache.pdfbox.pdmodel.font.PDType0Font; +import org.apache.pdfbox.pdmodel.font.PDType1Font; import org.apache.pdfbox.pdmodel.font.PDType3Font; import org.apache.pdfbox.pdmodel.font.encoding.GlyphList; import org.apache.pdfbox.pdmodel.graphics.state.PDGraphicsState; @@ -66,6 +74,8 @@ import org.apache.pdfbox.text.TextPosition; import org.apache.pdfbox.util.Matrix; import org.apache.pdfbox.util.Vector; +import lombok.Getter; + /** * LEGACY text calculations which are known to be incorrect but are depended on by PDFTextStripper. *

@@ -87,6 +97,9 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine { private final GlyphList glyphList; private final Map fontHeightMap = new WeakHashMap(); + @Getter + private List exactPositions = new LinkedList<>(); + /** * Constructor. @@ -164,6 +177,51 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine { float horizontalScaling = state.getTextState().getHorizontalScaling() / 100f; Matrix textMatrix = getTextMatrix(); + TrueTypeFont ttf = null; + if (font instanceof PDTrueTypeFont) { + ttf = ((PDTrueTypeFont) font).getTrueTypeFont(); + } else if (font instanceof PDType0Font) { + PDCIDFont cidFont = ((PDType0Font) font).getDescendantFont(); + if (cidFont instanceof PDCIDFontType2) { + ttf = ((PDCIDFontType2) cidFont).getTrueTypeFont(); + } + } else if (font instanceof PDType1Font) { + FontBoxFont fontBoxFont = ((PDType1Font) font).getFontBoxFont(); + if (fontBoxFont instanceof TrueTypeFont) { + ttf = (TrueTypeFont) fontBoxFont; + } else { + System.out.println("What do?"); + } + } + + if (ttf != null) { + Integer glyphId = null; + if (font instanceof PDTrueTypeFont) { + glyphId = ((PDTrueTypeFont) font).codeToGID(code); + } else if (font instanceof PDType0Font) { + glyphId = ((PDType0Font) font).codeToGID(code); + } else if (font instanceof PDType1Font) { + FontBoxFont fontBoxFont = ((PDType1Font) font).getFontBoxFont(); + if (fontBoxFont instanceof TrueTypeFont) { + glyphId = ((TrueTypeFont) fontBoxFont).getUnicodeCmapLookup().getGlyphId(code); + } + } + + if (glyphId != null) { + GlyphData glyph = ttf.getGlyph().getGlyph(glyphId); + if (glyph != null) { + BoundingBox boundingBox = glyph.getBoundingBox(); + Rectangle2D rect = new Rectangle2D.Double(boundingBox.getLowerLeftX(), boundingBox.getLowerLeftY(), boundingBox.getWidth(), boundingBox.getHeight()); + + Area area = new Area(rect); + AffineTransform affineTransform = textRenderingMatrix.createAffineTransform(); + float factor = 1f / ttf.getUnitsPerEm(); + affineTransform.scale(factor, factor); + exactPositions.add(area.createTransformedArea(affineTransform).getBounds2D()); + } + } + } + float displacementX = displacement.getX(); // the sorting algorithm is based on the width of the character. As the displacement // for vertical characters doesn't provide any suitable value for it, we have to @@ -171,45 +229,11 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine { if (font.isVertical()) { displacementX = font.getWidth(code) / 1000; // there may be an additional scaling factor for true type fonts - TrueTypeFont ttf = null; - if (font instanceof PDTrueTypeFont) { - ttf = ((PDTrueTypeFont) font).getTrueTypeFont(); - } else if (font instanceof PDType0Font) { - PDCIDFont cidFont = ((PDType0Font) font).getDescendantFont(); - if (cidFont instanceof PDCIDFontType2) { - ttf = ((PDCIDFontType2) cidFont).getTrueTypeFont(); - } - } if (ttf != null && ttf.getUnitsPerEm() != 1000) { displacementX *= 1000f / ttf.getUnitsPerEm(); } } - GlyphData glyph = null; - TrueTypeFont ttf = null; - Float actualGlyphMinX = null; - Float actualGlyphMaxX = null; - Float actualGlyphWidth = null; - if (font instanceof PDTrueTypeFont) { - ttf = ((PDTrueTypeFont) font).getTrueTypeFont(); - } else if (font instanceof PDType0Font) { - PDType0Font type0Font = (PDType0Font) font; - PDCIDFont cidFont = type0Font.getDescendantFont(); - if (cidFont instanceof PDCIDFontType2) { - ttf = ((PDCIDFontType2) cidFont).getTrueTypeFont(); - int glyphId = type0Font.codeToGID(code); - glyph = ttf.getGlyph().getGlyph(glyphId); - if (glyph != null && glyph.getBoundingBox() != null) { - var lowerX = glyph.getBoundingBox().getLowerLeftX() * (fontSize / ttf.getUnitsPerEm()); - var upperX = glyph.getBoundingBox().getUpperRightX() * (fontSize / ttf.getUnitsPerEm()); - actualGlyphMinX = Math.min(lowerX, upperX); - actualGlyphMaxX = Math.max(lowerX, upperX); - actualGlyphWidth = actualGlyphMaxX - actualGlyphMinX; - } - - } - } - // // legacy calculations which were previously in PDFStreamEngine // @@ -227,11 +251,11 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine { // (modified) text rendering matrix Matrix nextTextRenderingMatrix = td.multiply(textMatrix).multiply(ctm); // text space -> device space - float nextX = nextTextRenderingMatrix.getTranslateX(); - float nextY = nextTextRenderingMatrix.getTranslateY(); + float endX = nextTextRenderingMatrix.getTranslateX(); + float endY = nextTextRenderingMatrix.getTranslateY(); // (modified) width and height calculations - float dxDisplay = nextX - textRenderingMatrix.getTranslateX(); + float dxDisplay = endX - textRenderingMatrix.getTranslateX(); Float fontHeight = fontHeightMap.get(font.getCOSObject()); if (fontHeight == null) { fontHeight = computeFontHeight(font); @@ -249,14 +273,6 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine { // Text or Disp to represent if the values are in text or disp units (no glyph units are // saved). - if(actualGlyphMinX != null) { - var oldDxDisplay = dxDisplay; - dxDisplay = actualGlyphWidth; - var diff = Math.abs(oldDxDisplay - dxDisplay); - //textRenderingMatrix.setValue(2,0, textRenderingMatrix.getTranslateX() + diff/2); - nextX -= diff; - } - float glyphSpaceToTextSpaceFactor = 1 / 1000f; if (font instanceof PDType3Font) { glyphSpaceToTextSpaceFactor = font.getFontMatrix().getScaleX(); @@ -305,56 +321,56 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine { translatedTextRenderingMatrix = textRenderingMatrix; } else { translatedTextRenderingMatrix = Matrix.concatenate(translateMatrix, textRenderingMatrix); - nextX -= pageSize.getLowerLeftX(); - nextY -= pageSize.getLowerLeftY(); + endX -= pageSize.getLowerLeftX(); + endY -= pageSize.getLowerLeftY(); } // This is a hack for unicode letter with 2 chars e.g. RA see unicodeProblem.pdf if (unicodeMapping.length() == 2) { processTextPosition(new TextPosition(pageRotation, - pageSize.getWidth(), - pageSize.getHeight(), - translatedTextRenderingMatrix, - nextX, - nextY, - Math.abs(dyDisplay), - dxDisplay, - Math.abs(spaceWidthDisplay), - Character.toString(unicodeMapping.charAt(0)), - new int[]{code}, - font, - fontSize, - (int) (fontSize * textMatrix.getScalingFactorX()))); + pageSize.getWidth(), + pageSize.getHeight(), + translatedTextRenderingMatrix, + endX, + endY, + Math.abs(dyDisplay), + dxDisplay, + Math.abs(spaceWidthDisplay), + Character.toString(unicodeMapping.charAt(0)), + new int[]{code}, + font, + fontSize, + (int) (fontSize * textMatrix.getScalingFactorX()))); processTextPosition(new TextPosition(pageRotation, - pageSize.getWidth(), - pageSize.getHeight(), - translatedTextRenderingMatrix, - nextX, - nextY, - Math.abs(dyDisplay), - dxDisplay, - Math.abs(spaceWidthDisplay), - Character.toString(unicodeMapping.charAt(1)), - new int[]{code}, - font, - fontSize, - (int) (fontSize * textMatrix.getScalingFactorX()))); + pageSize.getWidth(), + pageSize.getHeight(), + translatedTextRenderingMatrix, + endX, + endY, + Math.abs(dyDisplay), + dxDisplay, + Math.abs(spaceWidthDisplay), + Character.toString(unicodeMapping.charAt(1)), + new int[]{code}, + font, + fontSize, + (int) (fontSize * textMatrix.getScalingFactorX()))); } else { processTextPosition(new TextPosition(pageRotation, - pageSize.getWidth(), - pageSize.getHeight(), - translatedTextRenderingMatrix, - nextX, - nextY, - Math.abs(dyDisplay), - dxDisplay, - Math.abs(spaceWidthDisplay), - unicodeMapping, - new int[]{code}, - font, - fontSize, - (int) (fontSize * textMatrix.getScalingFactorX()))); + pageSize.getWidth(), + pageSize.getHeight(), + translatedTextRenderingMatrix, + endX, + endY, + Math.abs(dyDisplay), + dxDisplay, + Math.abs(spaceWidthDisplay), + unicodeMapping, + new int[]{code}, + font, + fontSize, + (int) (fontSize * textMatrix.getScalingFactorX()))); } } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageContentExtractorTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageContentExtractorTest.java index 50ac9e5..a2c9afc 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageContentExtractorTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageContentExtractorTest.java @@ -5,6 +5,12 @@ import java.awt.geom.Rectangle2D; import java.nio.file.Path; import java.util.List; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.PDPageContentStream; +import org.apache.pdfbox.pdmodel.font.PDFont; +import org.apache.pdfbox.pdmodel.font.PDType1Font; +import org.apache.pdfbox.pdmodel.font.Standard14Fonts; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; @@ -24,18 +30,22 @@ class PageContentExtractorTest { @SneakyThrows public void testTextPositionSequenceExtraction() { - String fileName = "files/CLEAN-II-4.2.2.01_家畜残留分析法.pdf"; + //String fileName = "files/CLEAN-II-4.2.2.01_家畜残留分析法.pdf"; + //String fileName = "files/BASF/2013-1110704.pdf"; + //String fileName = "files/ImportRedactionTestFile_highlighted.pdf"; + String fileName = "files/HelloWorldHelvetica.pdf"; var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TEXT_POSITION_SEQUENCES.pdf").toString(); List textPositionPerPage = PageContentExtractor.getSortedPageContents(fileName); - PdfDraw.drawRectanglesPerPageNumberedByLine(fileName, + PdfDraw.drawRectanglesPerPage(fileName, textPositionPerPage.stream() - .map(t -> t.getSortedTextPositionSequences() - .stream().flatMap(sequence -> sequence.getTextPositions().stream().map(textPosition -> mapRedTextPositionToInitialUserSpace(textPosition, sequence))) + //.map(t -> t.getSortedTextPositionSequences() + .map(t -> t.getPositions()) + // .stream().flatMap(sequence -> sequence.getTextPositions().stream().map(textPosition -> mapRedTextPositionToInitialUserSpace(textPosition, sequence))) //.map(textPositionSequence -> (Rectangle2D) new Rectangle2D.Double(textPositionSequence.getMaxXDirAdj(), textPositionSequence.getMaxYDirAdj(), textPositionSequence.getWidth(), textPositionSequence.getHeight())) - .map(List::of) - .toList()) + //.map(List::of) + //.toList()) .toList(), tmpFileName); } @@ -66,4 +76,42 @@ class PageContentExtractorTest { return transform.createTransformedShape(rectangle2D).getBounds2D(); } + @Test + @SneakyThrows + public void generatePDF() { + + // Create a new PDF document + PDDocument document = new PDDocument(); + + // Create a blank page + PDPage page = new PDPage(); + document.addPage(page); + + // Load the Helvetica font + PDFont font = new PDType1Font(Standard14Fonts.FontName.HELVETICA); + + // Start a content stream to write text + PDPageContentStream contentStream = new PDPageContentStream(document, page); + contentStream.beginText(); + + // Set font and font size + contentStream.setFont(font, 12); + + // Set text position + contentStream.newLineAtOffset(50, 700); + + // Write the text + contentStream.showText("Hello World in Helvetica!"); + + // Finish writing text + contentStream.endText(); + contentStream.close(); + + // Save the PDF + document.save("/tmp/MyPDF.pdf"); + document.close(); + + System.out.println("PDF created successfully!"); + } + } \ No newline at end of file