mirror of
https://github.com/mozilla/pdf.js.git
synced 2026-06-24 17:05:47 +02:00
Improve the sink handling in getTextContent for Highlight annotations (PR 20019 follow-up)
Currently there's a couple of issues related to the `sink` handling: - The `Page.prototype.extractTextContent` method is invoked with options that it doesn't actually use; note1ddf6449ac/src/core/document.js (L669-L676)- When parsing "nested" textContent, i.e. /Form /XObjects, we end up wrongly treating repeated /XObjects as empty for the annotations use-case since `enqueue` is never invoked; note1ddf6449ac/src/core/evaluator.js (L3439)and1ddf6449ac/src/core/evaluator.js (L3449-L3451)- The `getTextContent` method might become ever so slightly slower by having to defer parsing at every step, given the "bad" fallback value when comparing with the `TEXT_CONTENT_CHUNK_SIZE` constant (in the API), note1ddf6449ac/src/display/api.js (L1705)and1ddf6449ac/src/core/evaluator.js (L3566)- Having the `sink` now be effectively optional, in the `getTextContent` method, does complicate the code slightly overall. To address these things this patch ensures that a `sink` will always be available, by re-using the `sinkWrapper` structure from the "nested" textContent case, and with reasonable default values.
This commit is contained in:
parent
1ddf6449ac
commit
22871eef23
@ -789,8 +789,6 @@ class Page {
|
||||
includeMarkedContent: false,
|
||||
disableNormalization: false,
|
||||
sink: null,
|
||||
viewBox: this.view,
|
||||
lang: null,
|
||||
intersector,
|
||||
}).then(() => {
|
||||
intersector.setText();
|
||||
|
||||
@ -78,6 +78,7 @@ import {
|
||||
LocalTilingPatternCache,
|
||||
RegionalImageCache,
|
||||
} from "./image_utils.js";
|
||||
import { parseMarkedContentProps, textSinkWrapper } from "./evaluator_utils.js";
|
||||
import { BaseStream } from "./base_stream.js";
|
||||
import { bidi } from "./bidi.js";
|
||||
import { ColorSpace } from "./colorspace.js";
|
||||
@ -87,7 +88,6 @@ import { getGlyphsUnicode } from "./glyphlist.js";
|
||||
import { getMetrics } from "./metrics.js";
|
||||
import { getUnicodeForGlyph } from "./unicode.js";
|
||||
import { MurmurHash3_64 } from "../shared/murmurhash3.js";
|
||||
import { parseMarkedContentProps } from "./evaluator_utils.js";
|
||||
import { PDFImage } from "./image.js";
|
||||
import { Stream } from "./stream.js";
|
||||
import { stringToPDFString } from "./string_utils.js";
|
||||
@ -2380,6 +2380,7 @@ class PartialEvaluator {
|
||||
stream = new Stream(bytes, 0, bytes.length, stream.dict);
|
||||
}
|
||||
}
|
||||
sink ??= textSinkWrapper(null);
|
||||
|
||||
const objId = stream.dict?.objId;
|
||||
const seenRefs = new RefSet(prevRefs);
|
||||
@ -3153,7 +3154,7 @@ class PartialEvaluator {
|
||||
if (batch && length < TEXT_CHUNK_BATCH_SIZE) {
|
||||
return;
|
||||
}
|
||||
sink?.enqueue(textContent, length);
|
||||
sink.enqueue(textContent, length);
|
||||
textContent.items = [];
|
||||
textContent.styles = Object.create(null);
|
||||
}
|
||||
@ -3163,7 +3164,7 @@ class PartialEvaluator {
|
||||
return new Promise(function promiseBody(resolve, reject) {
|
||||
const next = function (promise) {
|
||||
enqueueChunk(/* batch = */ true);
|
||||
Promise.all([promise, sink?.ready]).then(function () {
|
||||
Promise.all([promise, sink.ready]).then(function () {
|
||||
try {
|
||||
promiseBody(resolve, reject);
|
||||
} catch (ex) {
|
||||
@ -3409,22 +3410,7 @@ class PartialEvaluator {
|
||||
// Enqueue the `textContent` chunk before parsing the /Form
|
||||
// XObject.
|
||||
enqueueChunk();
|
||||
const sinkWrapper = {
|
||||
enqueueInvoked: false,
|
||||
|
||||
enqueue(chunk, size) {
|
||||
this.enqueueInvoked = true;
|
||||
sink.enqueue(chunk, size);
|
||||
},
|
||||
|
||||
get desiredSize() {
|
||||
return sink.desiredSize ?? 0;
|
||||
},
|
||||
|
||||
get ready() {
|
||||
return sink.ready;
|
||||
},
|
||||
};
|
||||
const sinkWrapper = textSinkWrapper(sink);
|
||||
|
||||
self
|
||||
.getTextContent({
|
||||
@ -3436,7 +3422,7 @@ class PartialEvaluator {
|
||||
: resources,
|
||||
stateManager: xObjStateManager,
|
||||
includeMarkedContent,
|
||||
sink: sink && sinkWrapper,
|
||||
sink: sinkWrapper,
|
||||
seenStyles,
|
||||
viewBox,
|
||||
lang,
|
||||
@ -3563,7 +3549,7 @@ class PartialEvaluator {
|
||||
}
|
||||
break;
|
||||
} // switch
|
||||
if (textContent.items.length >= (sink?.desiredSize ?? 1)) {
|
||||
if (textContent.items.length >= sink.desiredSize) {
|
||||
// Wait for ready, if we reach highWaterMark.
|
||||
stop = true;
|
||||
break;
|
||||
|
||||
@ -16,6 +16,28 @@
|
||||
import { Dict, Name, Ref } from "./primitives.js";
|
||||
import { FormatError, warn } from "../shared/util.js";
|
||||
|
||||
function textSinkWrapper(sink) {
|
||||
const TEXT_CONTENT_CHUNK_SIZE = 100; // Same as in `src/display/api.js`.
|
||||
const resolved = sink ? null : Promise.resolve();
|
||||
|
||||
return {
|
||||
enqueueInvoked: false,
|
||||
|
||||
enqueue(chunk, size) {
|
||||
this.enqueueInvoked = true;
|
||||
sink?.enqueue(chunk, size);
|
||||
},
|
||||
|
||||
get desiredSize() {
|
||||
return sink?.desiredSize ?? TEXT_CONTENT_CHUNK_SIZE;
|
||||
},
|
||||
|
||||
get ready() {
|
||||
return sink?.ready ?? resolved;
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
function _parseVisibilityExpression(
|
||||
xref,
|
||||
array,
|
||||
@ -120,4 +142,4 @@ function parseMarkedContentProps(xref, contentProperties, resources) {
|
||||
return null;
|
||||
}
|
||||
|
||||
export { parseMarkedContentProps };
|
||||
export { parseMarkedContentProps, textSinkWrapper };
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user