diff --git a/src/core/evaluator.js b/src/core/evaluator.js index 6a2ba2986..490aca14d 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -2538,7 +2538,7 @@ class PartialEvaluator { const preprocessor = new EvaluatorPreprocessor(stream, xref, stateManager); - let textState; + let textState, currentTextState; function pushWhitespace({ width = 0, @@ -2800,7 +2800,9 @@ class PartialEvaluator { // When the total height of the current chunk is negative // then we're writing from bottom to top. - const textOrientation = Math.sign(textContentItem.height); + const textOrientation = Math.sign( + textContentItem.height || textContentItem.totalHeight + ); if (advanceY < textOrientation * textContentItem.negativeSpaceMax) { if ( Math.abs(advanceX) > @@ -2864,7 +2866,9 @@ class PartialEvaluator { // When the total width of the current chunk is negative // then we're writing from right to left. - const textOrientation = Math.sign(textContentItem.width); + const textOrientation = Math.sign( + textContentItem.width || textContentItem.totalWidth + ); if (advanceX < textOrientation * textContentItem.negativeSpaceMax) { if ( Math.abs(advanceY) > @@ -2922,6 +2926,15 @@ class PartialEvaluator { } function buildTextContentItem({ chars, extraSpacing }) { + if ( + currentTextState !== textState && + (currentTextState.fontName !== textState.fontName || + currentTextState.fontSize !== textState.fontSize) + ) { + flushTextContentItem(); + currentTextState = textState.clone(); + } + const font = textState.font; if (!chars) { // Just move according to the space we have. @@ -3177,8 +3190,8 @@ class PartialEvaluator { break; } - const previousState = textState; textState = stateManager.state; + currentTextState ||= textState.clone(); const fn = operation.fn; args = operation.args; @@ -3195,7 +3208,6 @@ class PartialEvaluator { break; } - flushTextContentItem(); textState.fontName = fontNameArg; textState.fontSize = fontSizeArg; next(handleSetFont(fontNameArg, null)); @@ -3552,14 +3564,10 @@ class PartialEvaluator { } break; case OPS.restore: - if ( - previousState && - (previousState.font !== textState.font || - previousState.fontSize !== textState.fontSize || - previousState.fontName !== textState.fontName) - ) { - flushTextContentItem(); - } + stateManager.restore(); + break; + case OPS.save: + stateManager.save(); break; } // switch if (textContent.items.length >= (sink?.desiredSize ?? 1)) { @@ -5083,7 +5091,7 @@ class TextState { } clone() { - const clone = Object.create(this); + const clone = Object.assign(Object.create(this), this); clone.textMatrix = this.textMatrix.slice(); clone.textLineMatrix = this.textLineMatrix.slice(); clone.fontMatrix = this.fontMatrix.slice(); diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index f95ab95a7..7d924f753 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -871,3 +871,4 @@ !page_with_number.pdf !page_with_number_and_link.pdf !Brotli-Prototype-FileA.pdf +!bug2013793.pdf diff --git a/test/pdfs/bug2013793.pdf b/test/pdfs/bug2013793.pdf new file mode 100644 index 000000000..39878f5c7 Binary files /dev/null and b/test/pdfs/bug2013793.pdf differ diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index 1d619f2cb..9829ae9d3 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -4069,6 +4069,29 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`) await loadingTask.destroy(); }); + it("gets text content with some fake font changes (bug 2013793)", async function () { + const loadingTask = getDocument(buildGetDocumentParams("bug2013793.pdf")); + const pdfDoc = await loadingTask.promise; + const pdfPage = await pdfDoc.getPage(1); + const { items } = await pdfPage.getTextContent({ + disableNormalization: true, + }); + const text = mergeText(items); + expect(text) + .toEqual(`This is a great deal of nothing. The purpose is to help in identifying a bug when the PDF +is read by Firefox. I want to know whether any of the two words in this paragraph run +together. If they do, I will file a bug report. The problem seems to occur somewhere +between the 240th and 260th character in the paragraph. I should have written that much +by now. So, here’s to squashing bugs. +This is a great deal of nothing. The purpose is to help in identifying a bug when the +PDF is read by Firefox. I want to know whether any of the two words in this +paragraph run together. If they do, I will file a bug report. The problem seems to +occur somewhere between the 240th and 260th character in the paragraph. I should +have written that much by now. So, here’s to squashing bugs.`); + + await loadingTask.destroy(); + }); + it("gets empty structure tree", async function () { const tree = await page.getStructTree();