Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7d6caabbfb | ||
|
|
ec035aca2f |
@ -19,4 +19,5 @@ public class PageContents {
|
|||||||
Rectangle2D cropBox;
|
Rectangle2D cropBox;
|
||||||
Rectangle2D mediaBox;
|
Rectangle2D mediaBox;
|
||||||
List<Ruling> rulings;
|
List<Ruling> rulings;
|
||||||
|
List<Rectangle2D> positions;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -39,6 +39,8 @@ public class PageContentExtractor {
|
|||||||
stripper.setEndPage(pageNumber);
|
stripper.setEndPage(pageNumber);
|
||||||
stripper.setPdpage(pdPage);
|
stripper.setPdpage(pdPage);
|
||||||
stripper.getText(pdDocument);
|
stripper.getText(pdDocument);
|
||||||
|
var positions = stripper.getExactPositions();
|
||||||
|
|
||||||
|
|
||||||
Map<Float, List<TextPositionSequence>> sortedTextPositionSequencesPerDir = stripper.getTextPositionSequences()
|
Map<Float, List<TextPositionSequence>> sortedTextPositionSequencesPerDir = stripper.getTextPositionSequences()
|
||||||
.stream()
|
.stream()
|
||||||
@ -49,7 +51,7 @@ public class PageContentExtractor {
|
|||||||
textPositionSequencesPerPage.add(new PageContents(sortedTextPositionSequences,
|
textPositionSequencesPerPage.add(new PageContents(sortedTextPositionSequences,
|
||||||
RectangleTransformations.toRectangle2D(pdPage.getCropBox()),
|
RectangleTransformations.toRectangle2D(pdPage.getCropBox()),
|
||||||
RectangleTransformations.toRectangle2D(pdPage.getMediaBox()),
|
RectangleTransformations.toRectangle2D(pdPage.getMediaBox()),
|
||||||
stripper.getRulings()));
|
stripper.getRulings(), positions));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -1,6 +1,5 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
|
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
|
||||||
|
|
||||||
|
|
||||||
// TODO: figure out, why this fails the build
|
// TODO: figure out, why this fails the build
|
||||||
// import static com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory.HEIGHT_PADDING;
|
// import static com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory.HEIGHT_PADDING;
|
||||||
|
|
||||||
@ -11,6 +10,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
||||||
|
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
@ -83,13 +83,13 @@ public class TaasBlockificationService {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
Matcher listIdentifierPattern = listIdentifier.matcher(currentTextBlock.getText());
|
Matcher listIdentifierPattern = listIdentifier.matcher(currentTextBlock.getText());
|
||||||
boolean isListIdentifier = listIdentifierPattern.find();
|
boolean isListIdentifier = listIdentifierPattern.find();
|
||||||
|
|
||||||
boolean yGap = Math.abs(currentTextBlock.getPdfMaxY() - previousTextBlock.getPdfMinY()) < previousTextBlock.getMostPopularWordHeight() * Y_GAP_SPLIT_HEIGHT_MODIFIER;
|
boolean yGap = Math.abs(currentTextBlock.getPdfMaxY() - previousTextBlock.getPdfMinY()) < previousTextBlock.getMostPopularWordHeight() * Y_GAP_SPLIT_HEIGHT_MODIFIER;
|
||||||
|
|
||||||
boolean sameFont = previousTextBlock.getMostPopularWordFont().equals(currentTextBlock.getMostPopularWordFont()) && previousTextBlock.getMostPopularWordFontSize() == currentTextBlock.getMostPopularWordFontSize();
|
boolean sameFont = previousTextBlock.getMostPopularWordFont()
|
||||||
|
.equals(currentTextBlock.getMostPopularWordFont()) && previousTextBlock.getMostPopularWordFontSize() == currentTextBlock.getMostPopularWordFontSize();
|
||||||
// boolean yGap = previousTextBlock != null && currentTextBlock.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * Y_GAP_SPLIT_HEIGHT_MODIFIER;
|
// boolean yGap = previousTextBlock != null && currentTextBlock.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * Y_GAP_SPLIT_HEIGHT_MODIFIER;
|
||||||
|
|
||||||
boolean alignsXRight = Math.abs(currentTextBlock.getPdfMaxX() - previousTextBlock.getPdfMaxX()) < X_ALIGNMENT_THRESHOLD;
|
boolean alignsXRight = Math.abs(currentTextBlock.getPdfMaxX() - previousTextBlock.getPdfMaxX()) < X_ALIGNMENT_THRESHOLD;
|
||||||
@ -119,8 +119,10 @@ public class TaasBlockificationService {
|
|||||||
}
|
}
|
||||||
alreadyMerged.add(textPageBlock);
|
alreadyMerged.add(textPageBlock);
|
||||||
textBlocksToMerge.add(Stream.concat(Stream.of(textPageBlock),
|
textBlocksToMerge.add(Stream.concat(Stream.of(textPageBlock),
|
||||||
textPageBlocks.stream().filter(textPageBlock2 -> textPageBlock.almostIntersects(textPageBlock2, INTERSECTS_Y_THRESHOLD, 0) && !alreadyMerged.contains(textPageBlock2)).peek(alreadyMerged::add))
|
textPageBlocks.stream()
|
||||||
.toList());
|
.filter(textPageBlock2 -> textPageBlock.almostIntersects(textPageBlock2, INTERSECTS_Y_THRESHOLD, 0) && !alreadyMerged.contains(textPageBlock2))
|
||||||
|
.peek(alreadyMerged::add))//
|
||||||
|
.toList());
|
||||||
}
|
}
|
||||||
return textBlocksToMerge.stream().map(TextPageBlock::merge).toList();
|
return textBlocksToMerge.stream().map(TextPageBlock::merge).toList();
|
||||||
}
|
}
|
||||||
@ -163,8 +165,7 @@ public class TaasBlockificationService {
|
|||||||
while (itty.hasNext()) {
|
while (itty.hasNext()) {
|
||||||
TextPageBlock block = (TextPageBlock) itty.next();
|
TextPageBlock block = (TextPageBlock) itty.next();
|
||||||
|
|
||||||
if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(
|
if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(),
|
||||||
block.getMaxY(),
|
|
||||||
previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation()
|
previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation()
|
||||||
.equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) {
|
.equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) {
|
||||||
previous.add(block);
|
previous.add(block);
|
||||||
@ -189,7 +190,6 @@ public class TaasBlockificationService {
|
|||||||
TextPositionSequence prev = null;
|
TextPositionSequence prev = null;
|
||||||
// TODO: make static final constant
|
// TODO: make static final constant
|
||||||
|
|
||||||
|
|
||||||
boolean wasSplitted = false;
|
boolean wasSplitted = false;
|
||||||
Float splitX1 = null;
|
Float splitX1 = null;
|
||||||
for (TextPositionSequence word : textPositions) {
|
for (TextPositionSequence word : textPositions) {
|
||||||
|
|||||||
@ -16,13 +16,20 @@
|
|||||||
*/
|
*/
|
||||||
package com.knecon.fforesight.service.layoutparser.processor.services.parsing;
|
package com.knecon.fforesight.service.layoutparser.processor.services.parsing;
|
||||||
|
|
||||||
|
import java.awt.geom.AffineTransform;
|
||||||
|
import java.awt.geom.Area;
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.WeakHashMap;
|
import java.util.WeakHashMap;
|
||||||
|
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
import org.apache.fontbox.FontBoxFont;
|
||||||
|
import org.apache.fontbox.ttf.GlyphData;
|
||||||
import org.apache.fontbox.ttf.TrueTypeFont;
|
import org.apache.fontbox.ttf.TrueTypeFont;
|
||||||
import org.apache.fontbox.util.BoundingBox;
|
import org.apache.fontbox.util.BoundingBox;
|
||||||
import org.apache.pdfbox.contentstream.PDFStreamEngine;
|
import org.apache.pdfbox.contentstream.PDFStreamEngine;
|
||||||
@ -52,12 +59,14 @@ import org.apache.pdfbox.cos.COSDictionary;
|
|||||||
import org.apache.pdfbox.pdmodel.PDPage;
|
import org.apache.pdfbox.pdmodel.PDPage;
|
||||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||||
import org.apache.pdfbox.pdmodel.font.PDCIDFont;
|
import org.apache.pdfbox.pdmodel.font.PDCIDFont;
|
||||||
|
import org.apache.pdfbox.pdmodel.font.PDCIDFontType0;
|
||||||
import org.apache.pdfbox.pdmodel.font.PDCIDFontType2;
|
import org.apache.pdfbox.pdmodel.font.PDCIDFontType2;
|
||||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||||
import org.apache.pdfbox.pdmodel.font.PDFontDescriptor;
|
import org.apache.pdfbox.pdmodel.font.PDFontDescriptor;
|
||||||
import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
|
import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
|
||||||
import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont;
|
import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont;
|
||||||
import org.apache.pdfbox.pdmodel.font.PDType0Font;
|
import org.apache.pdfbox.pdmodel.font.PDType0Font;
|
||||||
|
import org.apache.pdfbox.pdmodel.font.PDType1Font;
|
||||||
import org.apache.pdfbox.pdmodel.font.PDType3Font;
|
import org.apache.pdfbox.pdmodel.font.PDType3Font;
|
||||||
import org.apache.pdfbox.pdmodel.font.encoding.GlyphList;
|
import org.apache.pdfbox.pdmodel.font.encoding.GlyphList;
|
||||||
import org.apache.pdfbox.pdmodel.graphics.state.PDGraphicsState;
|
import org.apache.pdfbox.pdmodel.graphics.state.PDGraphicsState;
|
||||||
@ -65,6 +74,8 @@ import org.apache.pdfbox.text.TextPosition;
|
|||||||
import org.apache.pdfbox.util.Matrix;
|
import org.apache.pdfbox.util.Matrix;
|
||||||
import org.apache.pdfbox.util.Vector;
|
import org.apache.pdfbox.util.Vector;
|
||||||
|
|
||||||
|
import lombok.Getter;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* LEGACY text calculations which are known to be incorrect but are depended on by PDFTextStripper.
|
* LEGACY text calculations which are known to be incorrect but are depended on by PDFTextStripper.
|
||||||
* <p>
|
* <p>
|
||||||
@ -86,6 +97,9 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
|
|||||||
private final GlyphList glyphList;
|
private final GlyphList glyphList;
|
||||||
private final Map<COSDictionary, Float> fontHeightMap = new WeakHashMap<COSDictionary, Float>();
|
private final Map<COSDictionary, Float> fontHeightMap = new WeakHashMap<COSDictionary, Float>();
|
||||||
|
|
||||||
|
@Getter
|
||||||
|
private List<Rectangle2D> exactPositions = new LinkedList<>();
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Constructor.
|
* Constructor.
|
||||||
@ -163,6 +177,51 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
|
|||||||
float horizontalScaling = state.getTextState().getHorizontalScaling() / 100f;
|
float horizontalScaling = state.getTextState().getHorizontalScaling() / 100f;
|
||||||
Matrix textMatrix = getTextMatrix();
|
Matrix textMatrix = getTextMatrix();
|
||||||
|
|
||||||
|
TrueTypeFont ttf = null;
|
||||||
|
if (font instanceof PDTrueTypeFont) {
|
||||||
|
ttf = ((PDTrueTypeFont) font).getTrueTypeFont();
|
||||||
|
} else if (font instanceof PDType0Font) {
|
||||||
|
PDCIDFont cidFont = ((PDType0Font) font).getDescendantFont();
|
||||||
|
if (cidFont instanceof PDCIDFontType2) {
|
||||||
|
ttf = ((PDCIDFontType2) cidFont).getTrueTypeFont();
|
||||||
|
}
|
||||||
|
} else if (font instanceof PDType1Font) {
|
||||||
|
FontBoxFont fontBoxFont = ((PDType1Font) font).getFontBoxFont();
|
||||||
|
if (fontBoxFont instanceof TrueTypeFont) {
|
||||||
|
ttf = (TrueTypeFont) fontBoxFont;
|
||||||
|
} else {
|
||||||
|
System.out.println("What do?");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ttf != null) {
|
||||||
|
Integer glyphId = null;
|
||||||
|
if (font instanceof PDTrueTypeFont) {
|
||||||
|
glyphId = ((PDTrueTypeFont) font).codeToGID(code);
|
||||||
|
} else if (font instanceof PDType0Font) {
|
||||||
|
glyphId = ((PDType0Font) font).codeToGID(code);
|
||||||
|
} else if (font instanceof PDType1Font) {
|
||||||
|
FontBoxFont fontBoxFont = ((PDType1Font) font).getFontBoxFont();
|
||||||
|
if (fontBoxFont instanceof TrueTypeFont) {
|
||||||
|
glyphId = ((TrueTypeFont) fontBoxFont).getUnicodeCmapLookup().getGlyphId(code);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (glyphId != null) {
|
||||||
|
GlyphData glyph = ttf.getGlyph().getGlyph(glyphId);
|
||||||
|
if (glyph != null) {
|
||||||
|
BoundingBox boundingBox = glyph.getBoundingBox();
|
||||||
|
Rectangle2D rect = new Rectangle2D.Double(boundingBox.getLowerLeftX(), boundingBox.getLowerLeftY(), boundingBox.getWidth(), boundingBox.getHeight());
|
||||||
|
|
||||||
|
Area area = new Area(rect);
|
||||||
|
AffineTransform affineTransform = textRenderingMatrix.createAffineTransform();
|
||||||
|
float factor = 1f / ttf.getUnitsPerEm();
|
||||||
|
affineTransform.scale(factor, factor);
|
||||||
|
exactPositions.add(area.createTransformedArea(affineTransform).getBounds2D());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
float displacementX = displacement.getX();
|
float displacementX = displacement.getX();
|
||||||
// the sorting algorithm is based on the width of the character. As the displacement
|
// the sorting algorithm is based on the width of the character. As the displacement
|
||||||
// for vertical characters doesn't provide any suitable value for it, we have to
|
// for vertical characters doesn't provide any suitable value for it, we have to
|
||||||
@ -170,15 +229,6 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
|
|||||||
if (font.isVertical()) {
|
if (font.isVertical()) {
|
||||||
displacementX = font.getWidth(code) / 1000;
|
displacementX = font.getWidth(code) / 1000;
|
||||||
// there may be an additional scaling factor for true type fonts
|
// there may be an additional scaling factor for true type fonts
|
||||||
TrueTypeFont ttf = null;
|
|
||||||
if (font instanceof PDTrueTypeFont) {
|
|
||||||
ttf = ((PDTrueTypeFont) font).getTrueTypeFont();
|
|
||||||
} else if (font instanceof PDType0Font) {
|
|
||||||
PDCIDFont cidFont = ((PDType0Font) font).getDescendantFont();
|
|
||||||
if (cidFont instanceof PDCIDFontType2) {
|
|
||||||
ttf = ((PDCIDFontType2) cidFont).getTrueTypeFont();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (ttf != null && ttf.getUnitsPerEm() != 1000) {
|
if (ttf != null && ttf.getUnitsPerEm() != 1000) {
|
||||||
displacementX *= 1000f / ttf.getUnitsPerEm();
|
displacementX *= 1000f / ttf.getUnitsPerEm();
|
||||||
}
|
}
|
||||||
@ -201,11 +251,11 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
|
|||||||
|
|
||||||
// (modified) text rendering matrix
|
// (modified) text rendering matrix
|
||||||
Matrix nextTextRenderingMatrix = td.multiply(textMatrix).multiply(ctm); // text space -> device space
|
Matrix nextTextRenderingMatrix = td.multiply(textMatrix).multiply(ctm); // text space -> device space
|
||||||
float nextX = nextTextRenderingMatrix.getTranslateX();
|
float endX = nextTextRenderingMatrix.getTranslateX();
|
||||||
float nextY = nextTextRenderingMatrix.getTranslateY();
|
float endY = nextTextRenderingMatrix.getTranslateY();
|
||||||
|
|
||||||
// (modified) width and height calculations
|
// (modified) width and height calculations
|
||||||
float dxDisplay = nextX - textRenderingMatrix.getTranslateX();
|
float dxDisplay = endX - textRenderingMatrix.getTranslateX();
|
||||||
Float fontHeight = fontHeightMap.get(font.getCOSObject());
|
Float fontHeight = fontHeightMap.get(font.getCOSObject());
|
||||||
if (fontHeight == null) {
|
if (fontHeight == null) {
|
||||||
fontHeight = computeFontHeight(font);
|
fontHeight = computeFontHeight(font);
|
||||||
@ -271,56 +321,56 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
|
|||||||
translatedTextRenderingMatrix = textRenderingMatrix;
|
translatedTextRenderingMatrix = textRenderingMatrix;
|
||||||
} else {
|
} else {
|
||||||
translatedTextRenderingMatrix = Matrix.concatenate(translateMatrix, textRenderingMatrix);
|
translatedTextRenderingMatrix = Matrix.concatenate(translateMatrix, textRenderingMatrix);
|
||||||
nextX -= pageSize.getLowerLeftX();
|
endX -= pageSize.getLowerLeftX();
|
||||||
nextY -= pageSize.getLowerLeftY();
|
endY -= pageSize.getLowerLeftY();
|
||||||
}
|
}
|
||||||
|
|
||||||
// This is a hack for unicode letter with 2 chars e.g. RA see unicodeProblem.pdf
|
// This is a hack for unicode letter with 2 chars e.g. RA see unicodeProblem.pdf
|
||||||
if (unicodeMapping.length() == 2) {
|
if (unicodeMapping.length() == 2) {
|
||||||
processTextPosition(new TextPosition(pageRotation,
|
processTextPosition(new TextPosition(pageRotation,
|
||||||
pageSize.getWidth(),
|
pageSize.getWidth(),
|
||||||
pageSize.getHeight(),
|
pageSize.getHeight(),
|
||||||
translatedTextRenderingMatrix,
|
translatedTextRenderingMatrix,
|
||||||
nextX,
|
endX,
|
||||||
nextY,
|
endY,
|
||||||
Math.abs(dyDisplay),
|
Math.abs(dyDisplay),
|
||||||
dxDisplay,
|
dxDisplay,
|
||||||
Math.abs(spaceWidthDisplay),
|
Math.abs(spaceWidthDisplay),
|
||||||
Character.toString(unicodeMapping.charAt(0)),
|
Character.toString(unicodeMapping.charAt(0)),
|
||||||
new int[]{code},
|
new int[]{code},
|
||||||
font,
|
font,
|
||||||
fontSize,
|
fontSize,
|
||||||
(int) (fontSize * textMatrix.getScalingFactorX())));
|
(int) (fontSize * textMatrix.getScalingFactorX())));
|
||||||
processTextPosition(new TextPosition(pageRotation,
|
processTextPosition(new TextPosition(pageRotation,
|
||||||
pageSize.getWidth(),
|
pageSize.getWidth(),
|
||||||
pageSize.getHeight(),
|
pageSize.getHeight(),
|
||||||
translatedTextRenderingMatrix,
|
translatedTextRenderingMatrix,
|
||||||
nextX,
|
endX,
|
||||||
nextY,
|
endY,
|
||||||
Math.abs(dyDisplay),
|
Math.abs(dyDisplay),
|
||||||
dxDisplay,
|
dxDisplay,
|
||||||
Math.abs(spaceWidthDisplay),
|
Math.abs(spaceWidthDisplay),
|
||||||
Character.toString(unicodeMapping.charAt(1)),
|
Character.toString(unicodeMapping.charAt(1)),
|
||||||
new int[]{code},
|
new int[]{code},
|
||||||
font,
|
font,
|
||||||
fontSize,
|
fontSize,
|
||||||
(int) (fontSize * textMatrix.getScalingFactorX())));
|
(int) (fontSize * textMatrix.getScalingFactorX())));
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
processTextPosition(new TextPosition(pageRotation,
|
processTextPosition(new TextPosition(pageRotation,
|
||||||
pageSize.getWidth(),
|
pageSize.getWidth(),
|
||||||
pageSize.getHeight(),
|
pageSize.getHeight(),
|
||||||
translatedTextRenderingMatrix,
|
translatedTextRenderingMatrix,
|
||||||
nextX,
|
endX,
|
||||||
nextY,
|
endY,
|
||||||
Math.abs(dyDisplay),
|
Math.abs(dyDisplay),
|
||||||
dxDisplay,
|
dxDisplay,
|
||||||
Math.abs(spaceWidthDisplay),
|
Math.abs(spaceWidthDisplay),
|
||||||
unicodeMapping,
|
unicodeMapping,
|
||||||
new int[]{code},
|
new int[]{code},
|
||||||
font,
|
font,
|
||||||
fontSize,
|
fontSize,
|
||||||
(int) (fontSize * textMatrix.getScalingFactorX())));
|
(int) (fontSize * textMatrix.getScalingFactorX())));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -1,12 +1,22 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.server.services;
|
package com.knecon.fforesight.service.layoutparser.server.services;
|
||||||
|
|
||||||
|
import java.awt.geom.AffineTransform;
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
|
import org.apache.pdfbox.pdmodel.PDPage;
|
||||||
|
import org.apache.pdfbox.pdmodel.PDPageContentStream;
|
||||||
|
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||||
|
import org.apache.pdfbox.pdmodel.font.PDType1Font;
|
||||||
|
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
|
||||||
import org.junit.jupiter.api.Disabled;
|
import org.junit.jupiter.api.Disabled;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||||
@ -20,21 +30,88 @@ class PageContentExtractorTest {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void testTextPositionSequenceExtraction() {
|
public void testTextPositionSequenceExtraction() {
|
||||||
|
|
||||||
String fileName = "files/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (4).pdf";
|
//String fileName = "files/CLEAN-II-4.2.2.01_家畜残留分析法.pdf";
|
||||||
|
//String fileName = "files/BASF/2013-1110704.pdf";
|
||||||
|
//String fileName = "files/ImportRedactionTestFile_highlighted.pdf";
|
||||||
|
String fileName = "files/HelloWorldHelvetica.pdf";
|
||||||
var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TEXT_POSITION_SEQUENCES.pdf").toString();
|
var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TEXT_POSITION_SEQUENCES.pdf").toString();
|
||||||
|
|
||||||
List<PageContents> textPositionPerPage = PageContentExtractor.getSortedPageContents(fileName);
|
List<PageContents> textPositionPerPage = PageContentExtractor.getSortedPageContents(fileName);
|
||||||
|
|
||||||
PdfDraw.drawRectanglesPerPageNumberedByLine(fileName,
|
PdfDraw.drawRectanglesPerPage(fileName,
|
||||||
textPositionPerPage.stream()
|
textPositionPerPage.stream()
|
||||||
.map(t -> t.getSortedTextPositionSequences()
|
//.map(t -> t.getSortedTextPositionSequences()
|
||||||
.stream()
|
.map(t -> t.getPositions())
|
||||||
.map(TextPositionSequence::getRectangle)
|
// .stream().flatMap(sequence -> sequence.getTextPositions().stream().map(textPosition -> mapRedTextPositionToInitialUserSpace(textPosition, sequence)))
|
||||||
.map(RectangleTransformations::toRectangle2D)
|
|
||||||
//.map(textPositionSequence -> (Rectangle2D) new Rectangle2D.Double(textPositionSequence.getMaxXDirAdj(), textPositionSequence.getMaxYDirAdj(), textPositionSequence.getWidth(), textPositionSequence.getHeight()))
|
//.map(textPositionSequence -> (Rectangle2D) new Rectangle2D.Double(textPositionSequence.getMaxXDirAdj(), textPositionSequence.getMaxYDirAdj(), textPositionSequence.getWidth(), textPositionSequence.getHeight()))
|
||||||
.map(List::of)
|
//.map(List::of)
|
||||||
.toList())
|
//.toList())
|
||||||
.toList(), tmpFileName);
|
.toList(), tmpFileName);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public final int HEIGHT_PADDING = 2;
|
||||||
|
private Rectangle2D mapRedTextPositionToInitialUserSpace(RedTextPosition textPosition, TextPositionSequence sequence) {
|
||||||
|
|
||||||
|
float textHeight = sequence.getTextHeight() + HEIGHT_PADDING;
|
||||||
|
Rectangle2D rectangle2D = new Rectangle2D.Double(textPosition.getXDirAdj(),
|
||||||
|
textPosition.getYDirAdj() - textHeight,
|
||||||
|
textPosition.getWidthDirAdj(),
|
||||||
|
textHeight + HEIGHT_PADDING);
|
||||||
|
|
||||||
|
AffineTransform transform = new AffineTransform();
|
||||||
|
|
||||||
|
if (sequence.getDir() == TextDirection.ZERO || sequence.getDir() == TextDirection.HALF_CIRCLE) {
|
||||||
|
transform.rotate(sequence.getDir().getRadians(), sequence.getPageWidth() / 2f, sequence.getPageHeight() / 2f);
|
||||||
|
transform.translate(0f, sequence.getPageHeight());
|
||||||
|
} else if (sequence.getDir() == TextDirection.QUARTER_CIRCLE) {
|
||||||
|
transform.rotate(sequence.getDir().getRadians(), sequence.getPageWidth() / 2f, sequence.getPageWidth() / 2f);
|
||||||
|
transform.translate(0f, sequence.getPageWidth());
|
||||||
|
} else {
|
||||||
|
transform.rotate(sequence.getDir().getRadians(), sequence.getPageHeight() / 2f, sequence.getPageHeight() / 2f);
|
||||||
|
transform.translate(0f, sequence.getPageWidth());
|
||||||
|
}
|
||||||
|
transform.scale(1., -1.);
|
||||||
|
|
||||||
|
return transform.createTransformedShape(rectangle2D).getBounds2D();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@SneakyThrows
|
||||||
|
public void generatePDF() {
|
||||||
|
|
||||||
|
// Create a new PDF document
|
||||||
|
PDDocument document = new PDDocument();
|
||||||
|
|
||||||
|
// Create a blank page
|
||||||
|
PDPage page = new PDPage();
|
||||||
|
document.addPage(page);
|
||||||
|
|
||||||
|
// Load the Helvetica font
|
||||||
|
PDFont font = new PDType1Font(Standard14Fonts.FontName.HELVETICA);
|
||||||
|
|
||||||
|
// Start a content stream to write text
|
||||||
|
PDPageContentStream contentStream = new PDPageContentStream(document, page);
|
||||||
|
contentStream.beginText();
|
||||||
|
|
||||||
|
// Set font and font size
|
||||||
|
contentStream.setFont(font, 12);
|
||||||
|
|
||||||
|
// Set text position
|
||||||
|
contentStream.newLineAtOffset(50, 700);
|
||||||
|
|
||||||
|
// Write the text
|
||||||
|
contentStream.showText("Hello World in Helvetica!");
|
||||||
|
|
||||||
|
// Finish writing text
|
||||||
|
contentStream.endText();
|
||||||
|
contentStream.close();
|
||||||
|
|
||||||
|
// Save the PDF
|
||||||
|
document.save("/tmp/MyPDF.pdf");
|
||||||
|
document.close();
|
||||||
|
|
||||||
|
System.out.println("PDF created successfully!");
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -64,11 +64,11 @@ public class PdfDraw {
|
|||||||
var rectanglesInLine = rectanglesOnPage.get(lineNumber);
|
var rectanglesInLine = rectanglesOnPage.get(lineNumber);
|
||||||
PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, rectanglesInLine, PdfVisualisationUtility.Options.builder().stroke(true).build());
|
PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, rectanglesInLine, PdfVisualisationUtility.Options.builder().stroke(true).build());
|
||||||
double y = Math.min(rectanglesInLine.get(0).getMinY(), rectanglesInLine.get(0).getMaxY());
|
double y = Math.min(rectanglesInLine.get(0).getMinY(), rectanglesInLine.get(0).getMaxY());
|
||||||
PdfVisualisationUtility.drawText(String.format("%d", lineNumber),
|
/**PdfVisualisationUtility.drawText(String.format("%d", lineNumber),
|
||||||
pdDocument,
|
pdDocument,
|
||||||
new Point2D.Double(rectanglesInLine.get(0).getX() - (5 + (5 * countNumberOfDigits(lineNumber))), y + 2),
|
new Point2D.Double(rectanglesInLine.get(0).getX() - (5 + (5 * countNumberOfDigits(lineNumber))), y + 2),
|
||||||
pageNumber,
|
pageNumber,
|
||||||
PdfVisualisationUtility.Options.builder().stroke(true).build());
|
PdfVisualisationUtility.Options.builder().stroke(true).build());**/
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
pdDocument.save(out);
|
pdDocument.save(out);
|
||||||
@ -252,12 +252,12 @@ public class PdfDraw {
|
|||||||
rectangle2D = RectangleTransformations.pad(rectangle2D, 10, 10);
|
rectangle2D = RectangleTransformations.pad(rectangle2D, 10, 10);
|
||||||
}
|
}
|
||||||
drawRectangle2DList(document, page.getNumber(), List.of(rectangle2D), options);
|
drawRectangle2DList(document, page.getNumber(), List.of(rectangle2D), options);
|
||||||
drawText(buildString(entry),
|
/**drawText(buildString(entry),
|
||||||
document,
|
document,
|
||||||
new Point2D.Double(rectangle2D.getMinX(), rectangle2D.getMaxY() + 2),
|
new Point2D.Double(rectangle2D.getMinX(), rectangle2D.getMaxY() + 2),
|
||||||
page.getNumber(),
|
page.getNumber(),
|
||||||
options,
|
options,
|
||||||
entry.getType() == NodeType.TABLE_CELL);
|
entry.getType() == NodeType.TABLE_CELL);**/
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user