Compare commits

...

5 Commits

Author SHA1 Message Date
Tim van der Meij
f302323c7e
Merge pull request #20627 from Snuffleupagus/ChunkedStream-onReceiveData-rm-copy
Improve progress reporting in `ChunkedStreamManager`, and prevent unnecessary data copy in `ChunkedStream.prototype.onReceiveData`
2026-02-05 21:27:42 +01:00
Tim van der Meij
a0f3528053
Merge pull request #20624 from calixteman/bug2013793
Flush the text content chunk only on real font changes (bug 2013793)
2026-02-05 21:14:49 +01:00
Jonas Jenwald
b3cd042ded Prevent unnecessary data copy in ChunkedStream.prototype.onReceiveData
This method is only invoked via `ChunkedStreamManager.prototype.sendRequest`, which currently returns data in `Uint8Array` format (since it potentially combines multiple `ArrayBuffer`s).
Hence we end up doing a short-lived, but still completely unnecessary, data copy[1] in `ChunkedStream.prototype.onReceiveData` when handling range requests. In practice this is unlikely to be a big problem by default, given that streaming is used and the (low) value of the `rangeChunkSize` API-option. (However, in custom PDF.js deployments it might affect things more.)

Given that no data copy is better than a short lived one, let's fix this small oversight and add non-production `assert`s to keep it working as intended.
This way we also improve consistency, since all other streaming and range request methods (see e.g. `BasePDFStream` and related code) only return `ArrayBuffer` data.

---
[1] Remember that `new Uint8Array(arrayBuffer)` only creates a view of the underlying `arrayBuffer`, whereas `new Uint8Array(typedArray)` actually creates a copy of the `typedArray`.
2026-02-05 16:16:36 +01:00
Jonas Jenwald
01deb085f8 Improve progress reporting in the ChunkedStreamManager
Currently there's two small bugs, which have existed around a decade, in the `loaded` property that's sent via the "DocProgress" message from the `ChunkedStreamManager.prototype.onReceiveData` method.

 - When the entire PDF has loaded the `loaded` property can become larger than the `total` property, which obviously doesn't make sense.
   This happens whenever the size of the PDF is *not* a multiple of the `rangeChunkSize` API-option, which is a very common situation.

 - When streaming is being used, the `loaded` property can become smaller than the actually loaded amount of data.
   This happens whenever the size of a streamed chunk is *not* a multiple of the `rangeChunkSize` API-option, which is a common situation.
2026-02-05 16:04:45 +01:00
calixteman
22b97d1741
Flush the text content chunk only on real font changes (bug 2013793) 2026-02-03 23:11:31 +01:00
5 changed files with 65 additions and 17 deletions

View File

@ -14,7 +14,7 @@
*/
import { arrayBuffersToBytes, MissingDataException } from "./core_utils.js";
import { assert } from "../shared/util.js";
import { assert, MathClamp } from "../shared/util.js";
import { Stream } from "./stream.js";
class ChunkedStream extends Stream {
@ -70,6 +70,12 @@ class ChunkedStream extends Stream {
throw new Error(`Bad end offset: ${end}`);
}
if (typeof PDFJSDev === "undefined" || PDFJSDev.test("TESTING")) {
assert(
chunk instanceof ArrayBuffer,
"onReceiveData - expected an ArrayBuffer."
);
}
this.bytes.set(new Uint8Array(chunk), begin);
const beginChunk = Math.floor(begin / chunkSize);
const endChunk = Math.floor((end - 1) / chunkSize) + 1;
@ -85,6 +91,12 @@ class ChunkedStream extends Stream {
let position = this.progressiveDataLength;
const beginChunk = Math.floor(position / this.chunkSize);
if (typeof PDFJSDev === "undefined" || PDFJSDev.test("TESTING")) {
assert(
data instanceof ArrayBuffer,
"onReceiveProgressiveData - expected an ArrayBuffer."
);
}
this.bytes.set(new Uint8Array(data), position);
position += data.byteLength;
this.progressiveDataLength = position;
@ -310,7 +322,7 @@ class ChunkedStreamManager {
if (this.aborted) {
return; // Ignoring any data after abort.
}
this.onReceiveData({ chunk: data, begin });
this.onReceiveData({ chunk: data.buffer, begin });
});
}
@ -511,7 +523,11 @@ class ChunkedStreamManager {
}
this.msgHandler.send("DocProgress", {
loaded: stream.numChunksLoaded * chunkSize,
loaded: MathClamp(
stream.numChunksLoaded * chunkSize,
stream.progressiveDataLength,
length
),
total: length,
});
}

View File

@ -2538,7 +2538,7 @@ class PartialEvaluator {
const preprocessor = new EvaluatorPreprocessor(stream, xref, stateManager);
let textState;
let textState, currentTextState;
function pushWhitespace({
width = 0,
@ -2800,7 +2800,9 @@ class PartialEvaluator {
// When the total height of the current chunk is negative
// then we're writing from bottom to top.
const textOrientation = Math.sign(textContentItem.height);
const textOrientation = Math.sign(
textContentItem.height || textContentItem.totalHeight
);
if (advanceY < textOrientation * textContentItem.negativeSpaceMax) {
if (
Math.abs(advanceX) >
@ -2864,7 +2866,9 @@ class PartialEvaluator {
// When the total width of the current chunk is negative
// then we're writing from right to left.
const textOrientation = Math.sign(textContentItem.width);
const textOrientation = Math.sign(
textContentItem.width || textContentItem.totalWidth
);
if (advanceX < textOrientation * textContentItem.negativeSpaceMax) {
if (
Math.abs(advanceY) >
@ -2922,6 +2926,15 @@ class PartialEvaluator {
}
function buildTextContentItem({ chars, extraSpacing }) {
if (
currentTextState !== textState &&
(currentTextState.fontName !== textState.fontName ||
currentTextState.fontSize !== textState.fontSize)
) {
flushTextContentItem();
currentTextState = textState.clone();
}
const font = textState.font;
if (!chars) {
// Just move according to the space we have.
@ -3177,8 +3190,8 @@ class PartialEvaluator {
break;
}
const previousState = textState;
textState = stateManager.state;
currentTextState ||= textState.clone();
const fn = operation.fn;
args = operation.args;
@ -3195,7 +3208,6 @@ class PartialEvaluator {
break;
}
flushTextContentItem();
textState.fontName = fontNameArg;
textState.fontSize = fontSizeArg;
next(handleSetFont(fontNameArg, null));
@ -3552,14 +3564,10 @@ class PartialEvaluator {
}
break;
case OPS.restore:
if (
previousState &&
(previousState.font !== textState.font ||
previousState.fontSize !== textState.fontSize ||
previousState.fontName !== textState.fontName)
) {
flushTextContentItem();
}
stateManager.restore();
break;
case OPS.save:
stateManager.save();
break;
} // switch
if (textContent.items.length >= (sink?.desiredSize ?? 1)) {
@ -5083,7 +5091,7 @@ class TextState {
}
clone() {
const clone = Object.create(this);
const clone = Object.assign(Object.create(this), this);
clone.textMatrix = this.textMatrix.slice();
clone.textLineMatrix = this.textLineMatrix.slice();
clone.fontMatrix = this.fontMatrix.slice();

View File

@ -871,3 +871,4 @@
!page_with_number.pdf
!page_with_number_and_link.pdf
!Brotli-Prototype-FileA.pdf
!bug2013793.pdf

BIN
test/pdfs/bug2013793.pdf Normal file

Binary file not shown.

View File

@ -4069,6 +4069,29 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
await loadingTask.destroy();
});
it("gets text content with some fake font changes (bug 2013793)", async function () {
const loadingTask = getDocument(buildGetDocumentParams("bug2013793.pdf"));
const pdfDoc = await loadingTask.promise;
const pdfPage = await pdfDoc.getPage(1);
const { items } = await pdfPage.getTextContent({
disableNormalization: true,
});
const text = mergeText(items);
expect(text)
.toEqual(`This is a great deal of nothing. The purpose is to help in identifying a bug when the PDF
is read by Firefox. I want to know whether any of the two words in this paragraph run
together. If they do, I will file a bug report. The problem seems to occur somewhere
between the 240th and 260th character in the paragraph. I should have written that much
by now. So, heres to squashing bugs.
This is a great deal of nothing. The purpose is to help in identifying a bug when the
PDF is read by Firefox. I want to know whether any of the two words in this
paragraph run together. If they do, I will file a bug report. The problem seems to
occur somewhere between the 240th and 260th character in the paragraph. I should
have written that much by now. So, heres to squashing bugs.`);
await loadingTask.destroy();
});
it("gets empty structure tree", async function () {
const tree = await page.getStructTree();