tests
This commit is contained in:
parent
ec035aca2f
commit
7d6caabbfb
@ -19,4 +19,5 @@ public class PageContents {
|
||||
Rectangle2D cropBox;
|
||||
Rectangle2D mediaBox;
|
||||
List<Ruling> rulings;
|
||||
List<Rectangle2D> positions;
|
||||
}
|
||||
|
||||
@ -39,6 +39,8 @@ public class PageContentExtractor {
|
||||
stripper.setEndPage(pageNumber);
|
||||
stripper.setPdpage(pdPage);
|
||||
stripper.getText(pdDocument);
|
||||
var positions = stripper.getExactPositions();
|
||||
|
||||
|
||||
Map<Float, List<TextPositionSequence>> sortedTextPositionSequencesPerDir = stripper.getTextPositionSequences()
|
||||
.stream()
|
||||
@ -49,7 +51,7 @@ public class PageContentExtractor {
|
||||
textPositionSequencesPerPage.add(new PageContents(sortedTextPositionSequences,
|
||||
RectangleTransformations.toRectangle2D(pdPage.getCropBox()),
|
||||
RectangleTransformations.toRectangle2D(pdPage.getMediaBox()),
|
||||
stripper.getRulings()));
|
||||
stripper.getRulings(), positions));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -16,13 +16,19 @@
|
||||
*/
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.parsing;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Area;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.WeakHashMap;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.fontbox.FontBoxFont;
|
||||
import org.apache.fontbox.ttf.GlyphData;
|
||||
import org.apache.fontbox.ttf.TrueTypeFont;
|
||||
import org.apache.fontbox.util.BoundingBox;
|
||||
@ -53,12 +59,14 @@ import org.apache.pdfbox.cos.COSDictionary;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.apache.pdfbox.pdmodel.font.PDCIDFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDCIDFontType0;
|
||||
import org.apache.pdfbox.pdmodel.font.PDCIDFontType2;
|
||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDFontDescriptor;
|
||||
import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType0Font;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType1Font;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType3Font;
|
||||
import org.apache.pdfbox.pdmodel.font.encoding.GlyphList;
|
||||
import org.apache.pdfbox.pdmodel.graphics.state.PDGraphicsState;
|
||||
@ -66,6 +74,8 @@ import org.apache.pdfbox.text.TextPosition;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
import org.apache.pdfbox.util.Vector;
|
||||
|
||||
import lombok.Getter;
|
||||
|
||||
/**
|
||||
* LEGACY text calculations which are known to be incorrect but are depended on by PDFTextStripper.
|
||||
* <p>
|
||||
@ -87,6 +97,9 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
|
||||
private final GlyphList glyphList;
|
||||
private final Map<COSDictionary, Float> fontHeightMap = new WeakHashMap<COSDictionary, Float>();
|
||||
|
||||
@Getter
|
||||
private List<Rectangle2D> exactPositions = new LinkedList<>();
|
||||
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
@ -164,6 +177,51 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
|
||||
float horizontalScaling = state.getTextState().getHorizontalScaling() / 100f;
|
||||
Matrix textMatrix = getTextMatrix();
|
||||
|
||||
TrueTypeFont ttf = null;
|
||||
if (font instanceof PDTrueTypeFont) {
|
||||
ttf = ((PDTrueTypeFont) font).getTrueTypeFont();
|
||||
} else if (font instanceof PDType0Font) {
|
||||
PDCIDFont cidFont = ((PDType0Font) font).getDescendantFont();
|
||||
if (cidFont instanceof PDCIDFontType2) {
|
||||
ttf = ((PDCIDFontType2) cidFont).getTrueTypeFont();
|
||||
}
|
||||
} else if (font instanceof PDType1Font) {
|
||||
FontBoxFont fontBoxFont = ((PDType1Font) font).getFontBoxFont();
|
||||
if (fontBoxFont instanceof TrueTypeFont) {
|
||||
ttf = (TrueTypeFont) fontBoxFont;
|
||||
} else {
|
||||
System.out.println("What do?");
|
||||
}
|
||||
}
|
||||
|
||||
if (ttf != null) {
|
||||
Integer glyphId = null;
|
||||
if (font instanceof PDTrueTypeFont) {
|
||||
glyphId = ((PDTrueTypeFont) font).codeToGID(code);
|
||||
} else if (font instanceof PDType0Font) {
|
||||
glyphId = ((PDType0Font) font).codeToGID(code);
|
||||
} else if (font instanceof PDType1Font) {
|
||||
FontBoxFont fontBoxFont = ((PDType1Font) font).getFontBoxFont();
|
||||
if (fontBoxFont instanceof TrueTypeFont) {
|
||||
glyphId = ((TrueTypeFont) fontBoxFont).getUnicodeCmapLookup().getGlyphId(code);
|
||||
}
|
||||
}
|
||||
|
||||
if (glyphId != null) {
|
||||
GlyphData glyph = ttf.getGlyph().getGlyph(glyphId);
|
||||
if (glyph != null) {
|
||||
BoundingBox boundingBox = glyph.getBoundingBox();
|
||||
Rectangle2D rect = new Rectangle2D.Double(boundingBox.getLowerLeftX(), boundingBox.getLowerLeftY(), boundingBox.getWidth(), boundingBox.getHeight());
|
||||
|
||||
Area area = new Area(rect);
|
||||
AffineTransform affineTransform = textRenderingMatrix.createAffineTransform();
|
||||
float factor = 1f / ttf.getUnitsPerEm();
|
||||
affineTransform.scale(factor, factor);
|
||||
exactPositions.add(area.createTransformedArea(affineTransform).getBounds2D());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
float displacementX = displacement.getX();
|
||||
// the sorting algorithm is based on the width of the character. As the displacement
|
||||
// for vertical characters doesn't provide any suitable value for it, we have to
|
||||
@ -171,45 +229,11 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
|
||||
if (font.isVertical()) {
|
||||
displacementX = font.getWidth(code) / 1000;
|
||||
// there may be an additional scaling factor for true type fonts
|
||||
TrueTypeFont ttf = null;
|
||||
if (font instanceof PDTrueTypeFont) {
|
||||
ttf = ((PDTrueTypeFont) font).getTrueTypeFont();
|
||||
} else if (font instanceof PDType0Font) {
|
||||
PDCIDFont cidFont = ((PDType0Font) font).getDescendantFont();
|
||||
if (cidFont instanceof PDCIDFontType2) {
|
||||
ttf = ((PDCIDFontType2) cidFont).getTrueTypeFont();
|
||||
}
|
||||
}
|
||||
if (ttf != null && ttf.getUnitsPerEm() != 1000) {
|
||||
displacementX *= 1000f / ttf.getUnitsPerEm();
|
||||
}
|
||||
}
|
||||
|
||||
GlyphData glyph = null;
|
||||
TrueTypeFont ttf = null;
|
||||
Float actualGlyphMinX = null;
|
||||
Float actualGlyphMaxX = null;
|
||||
Float actualGlyphWidth = null;
|
||||
if (font instanceof PDTrueTypeFont) {
|
||||
ttf = ((PDTrueTypeFont) font).getTrueTypeFont();
|
||||
} else if (font instanceof PDType0Font) {
|
||||
PDType0Font type0Font = (PDType0Font) font;
|
||||
PDCIDFont cidFont = type0Font.getDescendantFont();
|
||||
if (cidFont instanceof PDCIDFontType2) {
|
||||
ttf = ((PDCIDFontType2) cidFont).getTrueTypeFont();
|
||||
int glyphId = type0Font.codeToGID(code);
|
||||
glyph = ttf.getGlyph().getGlyph(glyphId);
|
||||
if (glyph != null && glyph.getBoundingBox() != null) {
|
||||
var lowerX = glyph.getBoundingBox().getLowerLeftX() * (fontSize / ttf.getUnitsPerEm());
|
||||
var upperX = glyph.getBoundingBox().getUpperRightX() * (fontSize / ttf.getUnitsPerEm());
|
||||
actualGlyphMinX = Math.min(lowerX, upperX);
|
||||
actualGlyphMaxX = Math.max(lowerX, upperX);
|
||||
actualGlyphWidth = actualGlyphMaxX - actualGlyphMinX;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// legacy calculations which were previously in PDFStreamEngine
|
||||
//
|
||||
@ -227,11 +251,11 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
|
||||
|
||||
// (modified) text rendering matrix
|
||||
Matrix nextTextRenderingMatrix = td.multiply(textMatrix).multiply(ctm); // text space -> device space
|
||||
float nextX = nextTextRenderingMatrix.getTranslateX();
|
||||
float nextY = nextTextRenderingMatrix.getTranslateY();
|
||||
float endX = nextTextRenderingMatrix.getTranslateX();
|
||||
float endY = nextTextRenderingMatrix.getTranslateY();
|
||||
|
||||
// (modified) width and height calculations
|
||||
float dxDisplay = nextX - textRenderingMatrix.getTranslateX();
|
||||
float dxDisplay = endX - textRenderingMatrix.getTranslateX();
|
||||
Float fontHeight = fontHeightMap.get(font.getCOSObject());
|
||||
if (fontHeight == null) {
|
||||
fontHeight = computeFontHeight(font);
|
||||
@ -249,14 +273,6 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
|
||||
// Text or Disp to represent if the values are in text or disp units (no glyph units are
|
||||
// saved).
|
||||
|
||||
if(actualGlyphMinX != null) {
|
||||
var oldDxDisplay = dxDisplay;
|
||||
dxDisplay = actualGlyphWidth;
|
||||
var diff = Math.abs(oldDxDisplay - dxDisplay);
|
||||
//textRenderingMatrix.setValue(2,0, textRenderingMatrix.getTranslateX() + diff/2);
|
||||
nextX -= diff;
|
||||
}
|
||||
|
||||
float glyphSpaceToTextSpaceFactor = 1 / 1000f;
|
||||
if (font instanceof PDType3Font) {
|
||||
glyphSpaceToTextSpaceFactor = font.getFontMatrix().getScaleX();
|
||||
@ -305,56 +321,56 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
|
||||
translatedTextRenderingMatrix = textRenderingMatrix;
|
||||
} else {
|
||||
translatedTextRenderingMatrix = Matrix.concatenate(translateMatrix, textRenderingMatrix);
|
||||
nextX -= pageSize.getLowerLeftX();
|
||||
nextY -= pageSize.getLowerLeftY();
|
||||
endX -= pageSize.getLowerLeftX();
|
||||
endY -= pageSize.getLowerLeftY();
|
||||
}
|
||||
|
||||
// This is a hack for unicode letter with 2 chars e.g. RA see unicodeProblem.pdf
|
||||
if (unicodeMapping.length() == 2) {
|
||||
processTextPosition(new TextPosition(pageRotation,
|
||||
pageSize.getWidth(),
|
||||
pageSize.getHeight(),
|
||||
translatedTextRenderingMatrix,
|
||||
nextX,
|
||||
nextY,
|
||||
Math.abs(dyDisplay),
|
||||
dxDisplay,
|
||||
Math.abs(spaceWidthDisplay),
|
||||
Character.toString(unicodeMapping.charAt(0)),
|
||||
new int[]{code},
|
||||
font,
|
||||
fontSize,
|
||||
(int) (fontSize * textMatrix.getScalingFactorX())));
|
||||
pageSize.getWidth(),
|
||||
pageSize.getHeight(),
|
||||
translatedTextRenderingMatrix,
|
||||
endX,
|
||||
endY,
|
||||
Math.abs(dyDisplay),
|
||||
dxDisplay,
|
||||
Math.abs(spaceWidthDisplay),
|
||||
Character.toString(unicodeMapping.charAt(0)),
|
||||
new int[]{code},
|
||||
font,
|
||||
fontSize,
|
||||
(int) (fontSize * textMatrix.getScalingFactorX())));
|
||||
processTextPosition(new TextPosition(pageRotation,
|
||||
pageSize.getWidth(),
|
||||
pageSize.getHeight(),
|
||||
translatedTextRenderingMatrix,
|
||||
nextX,
|
||||
nextY,
|
||||
Math.abs(dyDisplay),
|
||||
dxDisplay,
|
||||
Math.abs(spaceWidthDisplay),
|
||||
Character.toString(unicodeMapping.charAt(1)),
|
||||
new int[]{code},
|
||||
font,
|
||||
fontSize,
|
||||
(int) (fontSize * textMatrix.getScalingFactorX())));
|
||||
pageSize.getWidth(),
|
||||
pageSize.getHeight(),
|
||||
translatedTextRenderingMatrix,
|
||||
endX,
|
||||
endY,
|
||||
Math.abs(dyDisplay),
|
||||
dxDisplay,
|
||||
Math.abs(spaceWidthDisplay),
|
||||
Character.toString(unicodeMapping.charAt(1)),
|
||||
new int[]{code},
|
||||
font,
|
||||
fontSize,
|
||||
(int) (fontSize * textMatrix.getScalingFactorX())));
|
||||
} else {
|
||||
|
||||
processTextPosition(new TextPosition(pageRotation,
|
||||
pageSize.getWidth(),
|
||||
pageSize.getHeight(),
|
||||
translatedTextRenderingMatrix,
|
||||
nextX,
|
||||
nextY,
|
||||
Math.abs(dyDisplay),
|
||||
dxDisplay,
|
||||
Math.abs(spaceWidthDisplay),
|
||||
unicodeMapping,
|
||||
new int[]{code},
|
||||
font,
|
||||
fontSize,
|
||||
(int) (fontSize * textMatrix.getScalingFactorX())));
|
||||
pageSize.getWidth(),
|
||||
pageSize.getHeight(),
|
||||
translatedTextRenderingMatrix,
|
||||
endX,
|
||||
endY,
|
||||
Math.abs(dyDisplay),
|
||||
dxDisplay,
|
||||
Math.abs(spaceWidthDisplay),
|
||||
unicodeMapping,
|
||||
new int[]{code},
|
||||
font,
|
||||
fontSize,
|
||||
(int) (fontSize * textMatrix.getScalingFactorX())));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -5,6 +5,12 @@ import java.awt.geom.Rectangle2D;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDPageContentStream;
|
||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType1Font;
|
||||
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
@ -24,18 +30,22 @@ class PageContentExtractorTest {
|
||||
@SneakyThrows
|
||||
public void testTextPositionSequenceExtraction() {
|
||||
|
||||
String fileName = "files/CLEAN-II-4.2.2.01_家畜残留分析法.pdf";
|
||||
//String fileName = "files/CLEAN-II-4.2.2.01_家畜残留分析法.pdf";
|
||||
//String fileName = "files/BASF/2013-1110704.pdf";
|
||||
//String fileName = "files/ImportRedactionTestFile_highlighted.pdf";
|
||||
String fileName = "files/HelloWorldHelvetica.pdf";
|
||||
var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TEXT_POSITION_SEQUENCES.pdf").toString();
|
||||
|
||||
List<PageContents> textPositionPerPage = PageContentExtractor.getSortedPageContents(fileName);
|
||||
|
||||
PdfDraw.drawRectanglesPerPageNumberedByLine(fileName,
|
||||
PdfDraw.drawRectanglesPerPage(fileName,
|
||||
textPositionPerPage.stream()
|
||||
.map(t -> t.getSortedTextPositionSequences()
|
||||
.stream().flatMap(sequence -> sequence.getTextPositions().stream().map(textPosition -> mapRedTextPositionToInitialUserSpace(textPosition, sequence)))
|
||||
//.map(t -> t.getSortedTextPositionSequences()
|
||||
.map(t -> t.getPositions())
|
||||
// .stream().flatMap(sequence -> sequence.getTextPositions().stream().map(textPosition -> mapRedTextPositionToInitialUserSpace(textPosition, sequence)))
|
||||
//.map(textPositionSequence -> (Rectangle2D) new Rectangle2D.Double(textPositionSequence.getMaxXDirAdj(), textPositionSequence.getMaxYDirAdj(), textPositionSequence.getWidth(), textPositionSequence.getHeight()))
|
||||
.map(List::of)
|
||||
.toList())
|
||||
//.map(List::of)
|
||||
//.toList())
|
||||
.toList(), tmpFileName);
|
||||
}
|
||||
|
||||
@ -66,4 +76,42 @@ class PageContentExtractorTest {
|
||||
return transform.createTransformedShape(rectangle2D).getBounds2D();
|
||||
}
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void generatePDF() {
|
||||
|
||||
// Create a new PDF document
|
||||
PDDocument document = new PDDocument();
|
||||
|
||||
// Create a blank page
|
||||
PDPage page = new PDPage();
|
||||
document.addPage(page);
|
||||
|
||||
// Load the Helvetica font
|
||||
PDFont font = new PDType1Font(Standard14Fonts.FontName.HELVETICA);
|
||||
|
||||
// Start a content stream to write text
|
||||
PDPageContentStream contentStream = new PDPageContentStream(document, page);
|
||||
contentStream.beginText();
|
||||
|
||||
// Set font and font size
|
||||
contentStream.setFont(font, 12);
|
||||
|
||||
// Set text position
|
||||
contentStream.newLineAtOffset(50, 700);
|
||||
|
||||
// Write the text
|
||||
contentStream.showText("Hello World in Helvetica!");
|
||||
|
||||
// Finish writing text
|
||||
contentStream.endText();
|
||||
contentStream.close();
|
||||
|
||||
// Save the PDF
|
||||
document.save("/tmp/MyPDF.pdf");
|
||||
document.close();
|
||||
|
||||
System.out.println("PDF created successfully!");
|
||||
}
|
||||
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user