From ec035aca2ff6a9cb0d70a18273aadb8ea356c2de Mon Sep 17 00:00:00 2001 From: maverickstuder Date: Tue, 13 Feb 2024 18:04:22 +0100 Subject: [PATCH] tests --- .../TaasBlockificationService.java | 16 ++++---- .../parsing/LegacyPDFStreamEngine.java | 34 +++++++++++++++++ .../services/PageContentExtractorTest.java | 37 +++++++++++++++++-- .../server/utils/visualizations/PdfDraw.java | 8 ++-- 4 files changed, 79 insertions(+), 16 deletions(-) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/TaasBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/TaasBlockificationService.java index 287d2ba..7231971 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/TaasBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/TaasBlockificationService.java @@ -1,6 +1,5 @@ package com.knecon.fforesight.service.layoutparser.processor.services.blockification; - // TODO: figure out, why this fails the build // import static com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory.HEIGHT_PADDING; @@ -11,6 +10,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil; + import org.springframework.stereotype.Service; import java.util.*; @@ -83,13 +83,13 @@ public class TaasBlockificationService { continue; } - Matcher listIdentifierPattern = listIdentifier.matcher(currentTextBlock.getText()); boolean isListIdentifier = listIdentifierPattern.find(); boolean yGap = Math.abs(currentTextBlock.getPdfMaxY() - previousTextBlock.getPdfMinY()) < previousTextBlock.getMostPopularWordHeight() * Y_GAP_SPLIT_HEIGHT_MODIFIER; - boolean sameFont = previousTextBlock.getMostPopularWordFont().equals(currentTextBlock.getMostPopularWordFont()) && previousTextBlock.getMostPopularWordFontSize() == currentTextBlock.getMostPopularWordFontSize(); + boolean sameFont = previousTextBlock.getMostPopularWordFont() + .equals(currentTextBlock.getMostPopularWordFont()) && previousTextBlock.getMostPopularWordFontSize() == currentTextBlock.getMostPopularWordFontSize(); // boolean yGap = previousTextBlock != null && currentTextBlock.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * Y_GAP_SPLIT_HEIGHT_MODIFIER; boolean alignsXRight = Math.abs(currentTextBlock.getPdfMaxX() - previousTextBlock.getPdfMaxX()) < X_ALIGNMENT_THRESHOLD; @@ -119,8 +119,10 @@ public class TaasBlockificationService { } alreadyMerged.add(textPageBlock); textBlocksToMerge.add(Stream.concat(Stream.of(textPageBlock), - textPageBlocks.stream().filter(textPageBlock2 -> textPageBlock.almostIntersects(textPageBlock2, INTERSECTS_Y_THRESHOLD, 0) && !alreadyMerged.contains(textPageBlock2)).peek(alreadyMerged::add)) - .toList()); + textPageBlocks.stream() + .filter(textPageBlock2 -> textPageBlock.almostIntersects(textPageBlock2, INTERSECTS_Y_THRESHOLD, 0) && !alreadyMerged.contains(textPageBlock2)) + .peek(alreadyMerged::add))// + .toList()); } return textBlocksToMerge.stream().map(TextPageBlock::merge).toList(); } @@ -163,8 +165,7 @@ public class TaasBlockificationService { while (itty.hasNext()) { TextPageBlock block = (TextPageBlock) itty.next(); - if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold( - block.getMaxY(), + if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation() .equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) { previous.add(block); @@ -189,7 +190,6 @@ public class TaasBlockificationService { TextPositionSequence prev = null; // TODO: make static final constant - boolean wasSplitted = false; Float splitX1 = null; for (TextPositionSequence word : textPositions) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/LegacyPDFStreamEngine.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/LegacyPDFStreamEngine.java index ff2e665..c1a2923 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/LegacyPDFStreamEngine.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/LegacyPDFStreamEngine.java @@ -23,6 +23,7 @@ import java.util.WeakHashMap; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.fontbox.ttf.GlyphData; import org.apache.fontbox.ttf.TrueTypeFont; import org.apache.fontbox.util.BoundingBox; import org.apache.pdfbox.contentstream.PDFStreamEngine; @@ -184,6 +185,31 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine { } } + GlyphData glyph = null; + TrueTypeFont ttf = null; + Float actualGlyphMinX = null; + Float actualGlyphMaxX = null; + Float actualGlyphWidth = null; + if (font instanceof PDTrueTypeFont) { + ttf = ((PDTrueTypeFont) font).getTrueTypeFont(); + } else if (font instanceof PDType0Font) { + PDType0Font type0Font = (PDType0Font) font; + PDCIDFont cidFont = type0Font.getDescendantFont(); + if (cidFont instanceof PDCIDFontType2) { + ttf = ((PDCIDFontType2) cidFont).getTrueTypeFont(); + int glyphId = type0Font.codeToGID(code); + glyph = ttf.getGlyph().getGlyph(glyphId); + if (glyph != null && glyph.getBoundingBox() != null) { + var lowerX = glyph.getBoundingBox().getLowerLeftX() * (fontSize / ttf.getUnitsPerEm()); + var upperX = glyph.getBoundingBox().getUpperRightX() * (fontSize / ttf.getUnitsPerEm()); + actualGlyphMinX = Math.min(lowerX, upperX); + actualGlyphMaxX = Math.max(lowerX, upperX); + actualGlyphWidth = actualGlyphMaxX - actualGlyphMinX; + } + + } + } + // // legacy calculations which were previously in PDFStreamEngine // @@ -223,6 +249,14 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine { // Text or Disp to represent if the values are in text or disp units (no glyph units are // saved). + if(actualGlyphMinX != null) { + var oldDxDisplay = dxDisplay; + dxDisplay = actualGlyphWidth; + var diff = Math.abs(oldDxDisplay - dxDisplay); + //textRenderingMatrix.setValue(2,0, textRenderingMatrix.getTranslateX() + diff/2); + nextX -= diff; + } + float glyphSpaceToTextSpaceFactor = 1 / 1000f; if (font instanceof PDType3Font) { glyphSpaceToTextSpaceFactor = font.getFontMatrix().getScaleX(); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageContentExtractorTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageContentExtractorTest.java index cbaa195..50ac9e5 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageContentExtractorTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageContentExtractorTest.java @@ -1,5 +1,7 @@ package com.knecon.fforesight.service.layoutparser.server.services; +import java.awt.geom.AffineTransform; +import java.awt.geom.Rectangle2D; import java.nio.file.Path; import java.util.List; @@ -7,6 +9,8 @@ import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; +import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor; @@ -20,7 +24,7 @@ class PageContentExtractorTest { @SneakyThrows public void testTextPositionSequenceExtraction() { - String fileName = "files/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (4).pdf"; + String fileName = "files/CLEAN-II-4.2.2.01_家畜残留分析法.pdf"; var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TEXT_POSITION_SEQUENCES.pdf").toString(); List textPositionPerPage = PageContentExtractor.getSortedPageContents(fileName); @@ -28,13 +32,38 @@ class PageContentExtractorTest { PdfDraw.drawRectanglesPerPageNumberedByLine(fileName, textPositionPerPage.stream() .map(t -> t.getSortedTextPositionSequences() - .stream() - .map(TextPositionSequence::getRectangle) - .map(RectangleTransformations::toRectangle2D) + .stream().flatMap(sequence -> sequence.getTextPositions().stream().map(textPosition -> mapRedTextPositionToInitialUserSpace(textPosition, sequence))) //.map(textPositionSequence -> (Rectangle2D) new Rectangle2D.Double(textPositionSequence.getMaxXDirAdj(), textPositionSequence.getMaxYDirAdj(), textPositionSequence.getWidth(), textPositionSequence.getHeight())) .map(List::of) .toList()) .toList(), tmpFileName); } + + public final int HEIGHT_PADDING = 2; + private Rectangle2D mapRedTextPositionToInitialUserSpace(RedTextPosition textPosition, TextPositionSequence sequence) { + + float textHeight = sequence.getTextHeight() + HEIGHT_PADDING; + Rectangle2D rectangle2D = new Rectangle2D.Double(textPosition.getXDirAdj(), + textPosition.getYDirAdj() - textHeight, + textPosition.getWidthDirAdj(), + textHeight + HEIGHT_PADDING); + + AffineTransform transform = new AffineTransform(); + + if (sequence.getDir() == TextDirection.ZERO || sequence.getDir() == TextDirection.HALF_CIRCLE) { + transform.rotate(sequence.getDir().getRadians(), sequence.getPageWidth() / 2f, sequence.getPageHeight() / 2f); + transform.translate(0f, sequence.getPageHeight()); + } else if (sequence.getDir() == TextDirection.QUARTER_CIRCLE) { + transform.rotate(sequence.getDir().getRadians(), sequence.getPageWidth() / 2f, sequence.getPageWidth() / 2f); + transform.translate(0f, sequence.getPageWidth()); + } else { + transform.rotate(sequence.getDir().getRadians(), sequence.getPageHeight() / 2f, sequence.getPageHeight() / 2f); + transform.translate(0f, sequence.getPageWidth()); + } + transform.scale(1., -1.); + + return transform.createTransformedShape(rectangle2D).getBounds2D(); + } + } \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/visualizations/PdfDraw.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/visualizations/PdfDraw.java index 5576017..7132881 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/visualizations/PdfDraw.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/visualizations/PdfDraw.java @@ -64,11 +64,11 @@ public class PdfDraw { var rectanglesInLine = rectanglesOnPage.get(lineNumber); PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, rectanglesInLine, PdfVisualisationUtility.Options.builder().stroke(true).build()); double y = Math.min(rectanglesInLine.get(0).getMinY(), rectanglesInLine.get(0).getMaxY()); - PdfVisualisationUtility.drawText(String.format("%d", lineNumber), + /**PdfVisualisationUtility.drawText(String.format("%d", lineNumber), pdDocument, new Point2D.Double(rectanglesInLine.get(0).getX() - (5 + (5 * countNumberOfDigits(lineNumber))), y + 2), pageNumber, - PdfVisualisationUtility.Options.builder().stroke(true).build()); + PdfVisualisationUtility.Options.builder().stroke(true).build());**/ } } pdDocument.save(out); @@ -252,12 +252,12 @@ public class PdfDraw { rectangle2D = RectangleTransformations.pad(rectangle2D, 10, 10); } drawRectangle2DList(document, page.getNumber(), List.of(rectangle2D), options); - drawText(buildString(entry), + /**drawText(buildString(entry), document, new Point2D.Double(rectangle2D.getMinX(), rectangle2D.getMaxY() + 2), page.getNumber(), options, - entry.getType() == NodeType.TABLE_CELL); + entry.getType() == NodeType.TABLE_CELL);**/ } }