diff --git a/src/core/evaluator.js b/src/core/evaluator.js index 2baf53a40..5fc89afc6 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -2458,6 +2458,7 @@ class PartialEvaluator { height: 0, vertical: false, prevTransform: null, + prevTextRise: 0, textAdvanceScale: 0, spaceInFlowMin: 0, spaceInFlowMax: 0, @@ -2906,7 +2907,19 @@ class PartialEvaluator { return true; } - if (Math.abs(advanceY) > textContentItem.height) { + // Compensate for a textRise change (e.g. superscript/subscript dropping + // back to baseline): textRise is baked into posY/lastPosY via tsm[5] in + // getCurrentTextTransform(), scaled by the Y component of the CTM×TM + // product, which equals currentTransform[3] / textState.fontSize. + // Without this correction a superscript whose textRise exceeds the line + // height triggers a spurious EOL when the rise returns to 0. + const textRiseDelta = textState.textRise - textContentItem.prevTextRise; + const advanceYCorrected = + textRiseDelta === 0 + ? advanceY + : advanceY - + (currentTransform[3] / textState.fontSize) * textRiseDelta; + if (Math.abs(advanceYCorrected) > textContentItem.height) { appendEOL(); return true; } @@ -3068,6 +3081,7 @@ class PartialEvaluator { if (scaledDim) { // Save the position of the last visible character. textChunk.prevTransform = getCurrentTextTransform(); + textChunk.prevTextRise = textState.textRise; } const glyphUnicode = glyph.unicode; diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index 1870764b4..452344cf1 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -890,3 +890,4 @@ !acroform_calculation_order.pdf !extractPages_null_in_array.pdf !issue20930.pdf +!text_rise_eol_bug.pdf diff --git a/test/pdfs/text_rise_eol_bug.pdf b/test/pdfs/text_rise_eol_bug.pdf new file mode 100644 index 000000000..4c277fe15 --- /dev/null +++ b/test/pdfs/text_rise_eol_bug.pdf @@ -0,0 +1,46 @@ +%PDF-1.7 +% +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +2 0 obj +<< /Type /Pages /Kids [3 0 R] /Count 1 >> +endobj +3 0 obj +<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] + /Contents 4 0 R + /Resources << /Font << /F1 5 0 R >> >> +>> +endobj +4 0 obj +<< /Length 113 >> +stream +BT +/F1 12 Tf +100 700 Td +(E = mc) Tj +12 Ts +/F1 8 Tf +(2) Tj +0 Ts +/F1 12 Tf +( is the mass-energy equivalence.) Tj +ET +endstream +endobj +5 0 obj +<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >> +endobj +xref +0 6 +0000000000 65535 f +0000000015 00000 n +0000000064 00000 n +0000000121 00000 n +0000000253 00000 n +0000000417 00000 n +trailer +<< /Size 6 /Root 1 0 R >> +startxref +487 +%%EOF diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index 6d3055d01..965fd4b63 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -3985,6 +3985,27 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`) await loadingTask.destroy(); }); + it("gets text content without spurious EOL after a superscript (text_rise_eol_bug.pdf)", async function () { + const loadingTask = getDocument( + buildGetDocumentParams("text_rise_eol_bug.pdf") + ); + const pdfDoc = await loadingTask.promise; + const pdfPage = await pdfDoc.getPage(1); + const { items } = await pdfPage.getTextContent({ + disableNormalization: true, + }); + + // No item should carry a hasEOL flag between the superscript and the + // text that follows it. + expect(items.every(i => !i.hasEOL)).toEqual(true); + + // Full sentence must be reconstructable without a newline. + const text = mergeText(items); + expect(text).toEqual("E = mc2 is the mass-energy equivalence."); + + await loadingTask.destroy(); + }); + it("gets text content with a specific view box", async function () { const loadingTask = getDocument(buildGetDocumentParams("issue16316.pdf")); const pdfDoc = await loadingTask.promise;