tests
This commit is contained in:
parent
3c9049dc8a
commit
ec035aca2f
@ -1,6 +1,5 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
|
||||
|
||||
|
||||
// TODO: figure out why this fails the build
|
||||
// import static com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory.HEIGHT_PADDING;
|
||||
|
||||
@ -11,6 +10,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.util.*;
|
||||
@ -83,13 +83,13 @@ public class TaasBlockificationService {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
Matcher listIdentifierPattern = listIdentifier.matcher(currentTextBlock.getText());
|
||||
boolean isListIdentifier = listIdentifierPattern.find();
|
||||
|
||||
boolean yGap = Math.abs(currentTextBlock.getPdfMaxY() - previousTextBlock.getPdfMinY()) < previousTextBlock.getMostPopularWordHeight() * Y_GAP_SPLIT_HEIGHT_MODIFIER;
|
||||
|
||||
boolean sameFont = previousTextBlock.getMostPopularWordFont().equals(currentTextBlock.getMostPopularWordFont()) && previousTextBlock.getMostPopularWordFontSize() == currentTextBlock.getMostPopularWordFontSize();
|
||||
boolean sameFont = previousTextBlock.getMostPopularWordFont()
|
||||
.equals(currentTextBlock.getMostPopularWordFont()) && previousTextBlock.getMostPopularWordFontSize() == currentTextBlock.getMostPopularWordFontSize();
|
||||
// boolean yGap = previousTextBlock != null && currentTextBlock.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * Y_GAP_SPLIT_HEIGHT_MODIFIER;
|
||||
|
||||
boolean alignsXRight = Math.abs(currentTextBlock.getPdfMaxX() - previousTextBlock.getPdfMaxX()) < X_ALIGNMENT_THRESHOLD;
|
||||
@ -119,8 +119,10 @@ public class TaasBlockificationService {
|
||||
}
|
||||
alreadyMerged.add(textPageBlock);
|
||||
textBlocksToMerge.add(Stream.concat(Stream.of(textPageBlock),
|
||||
textPageBlocks.stream().filter(textPageBlock2 -> textPageBlock.almostIntersects(textPageBlock2, INTERSECTS_Y_THRESHOLD, 0) && !alreadyMerged.contains(textPageBlock2)).peek(alreadyMerged::add))
|
||||
.toList());
|
||||
textPageBlocks.stream()
|
||||
.filter(textPageBlock2 -> textPageBlock.almostIntersects(textPageBlock2, INTERSECTS_Y_THRESHOLD, 0) && !alreadyMerged.contains(textPageBlock2))
|
||||
.peek(alreadyMerged::add))//
|
||||
.toList());
|
||||
}
|
||||
return textBlocksToMerge.stream().map(TextPageBlock::merge).toList();
|
||||
}
|
||||
@ -163,8 +165,7 @@ public class TaasBlockificationService {
|
||||
while (itty.hasNext()) {
|
||||
TextPageBlock block = (TextPageBlock) itty.next();
|
||||
|
||||
if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(
|
||||
block.getMaxY(),
|
||||
if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(),
|
||||
previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation()
|
||||
.equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) {
|
||||
previous.add(block);
|
||||
@ -189,7 +190,6 @@ public class TaasBlockificationService {
|
||||
TextPositionSequence prev = null;
|
||||
// TODO: make static final constant
|
||||
|
||||
|
||||
boolean wasSplitted = false;
|
||||
Float splitX1 = null;
|
||||
for (TextPositionSequence word : textPositions) {
|
||||
|
||||
@ -23,6 +23,7 @@ import java.util.WeakHashMap;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.fontbox.ttf.GlyphData;
|
||||
import org.apache.fontbox.ttf.TrueTypeFont;
|
||||
import org.apache.fontbox.util.BoundingBox;
|
||||
import org.apache.pdfbox.contentstream.PDFStreamEngine;
|
||||
@ -184,6 +185,31 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
|
||||
}
|
||||
}
|
||||
|
||||
GlyphData glyph = null;
|
||||
TrueTypeFont ttf = null;
|
||||
Float actualGlyphMinX = null;
|
||||
Float actualGlyphMaxX = null;
|
||||
Float actualGlyphWidth = null;
|
||||
if (font instanceof PDTrueTypeFont) {
|
||||
ttf = ((PDTrueTypeFont) font).getTrueTypeFont();
|
||||
} else if (font instanceof PDType0Font) {
|
||||
PDType0Font type0Font = (PDType0Font) font;
|
||||
PDCIDFont cidFont = type0Font.getDescendantFont();
|
||||
if (cidFont instanceof PDCIDFontType2) {
|
||||
ttf = ((PDCIDFontType2) cidFont).getTrueTypeFont();
|
||||
int glyphId = type0Font.codeToGID(code);
|
||||
glyph = ttf.getGlyph().getGlyph(glyphId);
|
||||
if (glyph != null && glyph.getBoundingBox() != null) {
|
||||
var lowerX = glyph.getBoundingBox().getLowerLeftX() * (fontSize / ttf.getUnitsPerEm());
|
||||
var upperX = glyph.getBoundingBox().getUpperRightX() * (fontSize / ttf.getUnitsPerEm());
|
||||
actualGlyphMinX = Math.min(lowerX, upperX);
|
||||
actualGlyphMaxX = Math.max(lowerX, upperX);
|
||||
actualGlyphWidth = actualGlyphMaxX - actualGlyphMinX;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// legacy calculations which were previously in PDFStreamEngine
|
||||
//
|
||||
@ -223,6 +249,14 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
|
||||
// Text or Disp to represent if the values are in text or disp units (no glyph units are
|
||||
// saved).
|
||||
|
||||
if(actualGlyphMinX != null) {
|
||||
var oldDxDisplay = dxDisplay;
|
||||
dxDisplay = actualGlyphWidth;
|
||||
var diff = Math.abs(oldDxDisplay - dxDisplay);
|
||||
//textRenderingMatrix.setValue(2,0, textRenderingMatrix.getTranslateX() + diff/2);
|
||||
nextX -= diff;
|
||||
}
|
||||
|
||||
float glyphSpaceToTextSpaceFactor = 1 / 1000f;
|
||||
if (font instanceof PDType3Font) {
|
||||
glyphSpaceToTextSpaceFactor = font.getFontMatrix().getScaleX();
|
||||
|
||||
@ -1,5 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.services;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
|
||||
@ -7,6 +9,8 @@ import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||
@ -20,7 +24,7 @@ class PageContentExtractorTest {
|
||||
@SneakyThrows
|
||||
public void testTextPositionSequenceExtraction() {
|
||||
|
||||
String fileName = "files/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (4).pdf";
|
||||
String fileName = "files/CLEAN-II-4.2.2.01_家畜残留分析法.pdf";
|
||||
var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TEXT_POSITION_SEQUENCES.pdf").toString();
|
||||
|
||||
List<PageContents> textPositionPerPage = PageContentExtractor.getSortedPageContents(fileName);
|
||||
@ -28,13 +32,38 @@ class PageContentExtractorTest {
|
||||
PdfDraw.drawRectanglesPerPageNumberedByLine(fileName,
|
||||
textPositionPerPage.stream()
|
||||
.map(t -> t.getSortedTextPositionSequences()
|
||||
.stream()
|
||||
.map(TextPositionSequence::getRectangle)
|
||||
.map(RectangleTransformations::toRectangle2D)
|
||||
.stream().flatMap(sequence -> sequence.getTextPositions().stream().map(textPosition -> mapRedTextPositionToInitialUserSpace(textPosition, sequence)))
|
||||
//.map(textPositionSequence -> (Rectangle2D) new Rectangle2D.Double(textPositionSequence.getMaxXDirAdj(), textPositionSequence.getMaxYDirAdj(), textPositionSequence.getWidth(), textPositionSequence.getHeight()))
|
||||
.map(List::of)
|
||||
.toList())
|
||||
.toList(), tmpFileName);
|
||||
}
|
||||
|
||||
|
||||
public final int HEIGHT_PADDING = 2;
|
||||
/**
 * Builds a padded rectangle for a single text position and returns the bounding box of that
 * rectangle after applying a direction-dependent affine transform.
 *
 * <p>The rectangle is built from the direction-adjusted x, y and width of {@code textPosition}
 * and from the text height of {@code sequence}. The transform is a rotation by the radians of
 * {@code sequence.getDir()}, a vertical translation and a y-axis flip.
 * NOTE(review): judging by the method name, the result is presumably in the PDF page's initial
 * user space - confirm against the drawing code that consumes it.
 *
 * @param textPosition the glyph position supplying the direction-adjusted x, y and width
 * @param sequence the sequence supplying text height, text direction and page width/height
 * @return the bounds of the transformed rectangle
 */
private Rectangle2D mapRedTextPositionToInitialUserSpace(RedTextPosition textPosition, TextPositionSequence sequence) {
|
||||
|
||||
// textHeight already contains one HEIGHT_PADDING; the rectangle height below adds a second one,
// so the box starts (text height + HEIGHT_PADDING) before yDirAdj and is (text height + 2 * HEIGHT_PADDING) tall.
float textHeight = sequence.getTextHeight() + HEIGHT_PADDING;
|
||||
Rectangle2D rectangle2D = new Rectangle2D.Double(textPosition.getXDirAdj(),
|
||||
textPosition.getYDirAdj() - textHeight,
|
||||
textPosition.getWidthDirAdj(),
|
||||
textHeight + HEIGHT_PADDING);
|
||||
|
||||
// AffineTransform concatenates each call onto the existing transform, so a point is affected by
// the calls in reverse order: first the y-flip (scale), then the translation, then the rotation.
AffineTransform transform = new AffineTransform();
|
||||
|
||||
// ZERO and HALF_CIRCLE: rotate around (pageWidth / 2, pageHeight / 2) and shift by the page height.
if (sequence.getDir() == TextDirection.ZERO || sequence.getDir() == TextDirection.HALF_CIRCLE) {
|
||||
transform.rotate(sequence.getDir().getRadians(), sequence.getPageWidth() / 2f, sequence.getPageHeight() / 2f);
|
||||
transform.translate(0f, sequence.getPageHeight());
|
||||
// QUARTER_CIRCLE: the rotation anchor uses pageWidth / 2 for both coordinates; shift by the page width.
} else if (sequence.getDir() == TextDirection.QUARTER_CIRCLE) {
|
||||
transform.rotate(sequence.getDir().getRadians(), sequence.getPageWidth() / 2f, sequence.getPageWidth() / 2f);
|
||||
transform.translate(0f, sequence.getPageWidth());
|
||||
// Any other direction: the rotation anchor uses pageHeight / 2 for both coordinates; shift by the page width.
} else {
|
||||
transform.rotate(sequence.getDir().getRadians(), sequence.getPageHeight() / 2f, sequence.getPageHeight() / 2f);
|
||||
transform.translate(0f, sequence.getPageWidth());
|
||||
}
|
||||
// Mirror the y-axis.
transform.scale(1., -1.);
|
||||
|
||||
// The transformed shape may no longer be axis-aligned, so return its bounding rectangle.
return transform.createTransformedShape(rectangle2D).getBounds2D();
|
||||
}
|
||||
|
||||
}
|
||||
@ -64,11 +64,11 @@ public class PdfDraw {
|
||||
var rectanglesInLine = rectanglesOnPage.get(lineNumber);
|
||||
PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, rectanglesInLine, PdfVisualisationUtility.Options.builder().stroke(true).build());
|
||||
double y = Math.min(rectanglesInLine.get(0).getMinY(), rectanglesInLine.get(0).getMaxY());
|
||||
PdfVisualisationUtility.drawText(String.format("%d", lineNumber),
|
||||
/**PdfVisualisationUtility.drawText(String.format("%d", lineNumber),
|
||||
pdDocument,
|
||||
new Point2D.Double(rectanglesInLine.get(0).getX() - (5 + (5 * countNumberOfDigits(lineNumber))), y + 2),
|
||||
pageNumber,
|
||||
PdfVisualisationUtility.Options.builder().stroke(true).build());
|
||||
PdfVisualisationUtility.Options.builder().stroke(true).build());**/
|
||||
}
|
||||
}
|
||||
pdDocument.save(out);
|
||||
@ -252,12 +252,12 @@ public class PdfDraw {
|
||||
rectangle2D = RectangleTransformations.pad(rectangle2D, 10, 10);
|
||||
}
|
||||
drawRectangle2DList(document, page.getNumber(), List.of(rectangle2D), options);
|
||||
drawText(buildString(entry),
|
||||
/**drawText(buildString(entry),
|
||||
document,
|
||||
new Point2D.Double(rectangle2D.getMinX(), rectangle2D.getMaxY() + 2),
|
||||
page.getNumber(),
|
||||
options,
|
||||
entry.getType() == NodeType.TABLE_CELL);
|
||||
entry.getType() == NodeType.TABLE_CELL);**/
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user