mirror of
https://github.com/mozilla/pdf.js.git
synced 2026-02-08 00:21:11 +01:00
Flush the text content chunk only on real font changes (bug 2013793)
This commit is contained in:
parent
1c12b07726
commit
22b97d1741
@ -2538,7 +2538,7 @@ class PartialEvaluator {
|
||||
|
||||
const preprocessor = new EvaluatorPreprocessor(stream, xref, stateManager);
|
||||
|
||||
let textState;
|
||||
let textState, currentTextState;
|
||||
|
||||
function pushWhitespace({
|
||||
width = 0,
|
||||
@ -2800,7 +2800,9 @@ class PartialEvaluator {
|
||||
|
||||
// When the total height of the current chunk is negative
|
||||
// then we're writing from bottom to top.
|
||||
const textOrientation = Math.sign(textContentItem.height);
|
||||
const textOrientation = Math.sign(
|
||||
textContentItem.height || textContentItem.totalHeight
|
||||
);
|
||||
if (advanceY < textOrientation * textContentItem.negativeSpaceMax) {
|
||||
if (
|
||||
Math.abs(advanceX) >
|
||||
@ -2864,7 +2866,9 @@ class PartialEvaluator {
|
||||
|
||||
// When the total width of the current chunk is negative
|
||||
// then we're writing from right to left.
|
||||
const textOrientation = Math.sign(textContentItem.width);
|
||||
const textOrientation = Math.sign(
|
||||
textContentItem.width || textContentItem.totalWidth
|
||||
);
|
||||
if (advanceX < textOrientation * textContentItem.negativeSpaceMax) {
|
||||
if (
|
||||
Math.abs(advanceY) >
|
||||
@ -2922,6 +2926,15 @@ class PartialEvaluator {
|
||||
}
|
||||
|
||||
function buildTextContentItem({ chars, extraSpacing }) {
|
||||
if (
|
||||
currentTextState !== textState &&
|
||||
(currentTextState.fontName !== textState.fontName ||
|
||||
currentTextState.fontSize !== textState.fontSize)
|
||||
) {
|
||||
flushTextContentItem();
|
||||
currentTextState = textState.clone();
|
||||
}
|
||||
|
||||
const font = textState.font;
|
||||
if (!chars) {
|
||||
// Just move according to the space we have.
|
||||
@ -3177,8 +3190,8 @@ class PartialEvaluator {
|
||||
break;
|
||||
}
|
||||
|
||||
const previousState = textState;
|
||||
textState = stateManager.state;
|
||||
currentTextState ||= textState.clone();
|
||||
const fn = operation.fn;
|
||||
args = operation.args;
|
||||
|
||||
@ -3195,7 +3208,6 @@ class PartialEvaluator {
|
||||
break;
|
||||
}
|
||||
|
||||
flushTextContentItem();
|
||||
textState.fontName = fontNameArg;
|
||||
textState.fontSize = fontSizeArg;
|
||||
next(handleSetFont(fontNameArg, null));
|
||||
@ -3552,14 +3564,10 @@ class PartialEvaluator {
|
||||
}
|
||||
break;
|
||||
case OPS.restore:
|
||||
if (
|
||||
previousState &&
|
||||
(previousState.font !== textState.font ||
|
||||
previousState.fontSize !== textState.fontSize ||
|
||||
previousState.fontName !== textState.fontName)
|
||||
) {
|
||||
flushTextContentItem();
|
||||
}
|
||||
stateManager.restore();
|
||||
break;
|
||||
case OPS.save:
|
||||
stateManager.save();
|
||||
break;
|
||||
} // switch
|
||||
if (textContent.items.length >= (sink?.desiredSize ?? 1)) {
|
||||
@ -5083,7 +5091,7 @@ class TextState {
|
||||
}
|
||||
|
||||
clone() {
|
||||
const clone = Object.create(this);
|
||||
const clone = Object.assign(Object.create(this), this);
|
||||
clone.textMatrix = this.textMatrix.slice();
|
||||
clone.textLineMatrix = this.textLineMatrix.slice();
|
||||
clone.fontMatrix = this.fontMatrix.slice();
|
||||
|
||||
1
test/pdfs/.gitignore
vendored
1
test/pdfs/.gitignore
vendored
@ -871,3 +871,4 @@
|
||||
!page_with_number.pdf
|
||||
!page_with_number_and_link.pdf
|
||||
!Brotli-Prototype-FileA.pdf
|
||||
!bug2013793.pdf
|
||||
|
||||
BIN
test/pdfs/bug2013793.pdf
Normal file
BIN
test/pdfs/bug2013793.pdf
Normal file
Binary file not shown.
@ -4069,6 +4069,29 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
|
||||
await loadingTask.destroy();
|
||||
});
|
||||
|
||||
it("gets text content with some fake font changes (bug 2013793)", async function () {
|
||||
const loadingTask = getDocument(buildGetDocumentParams("bug2013793.pdf"));
|
||||
const pdfDoc = await loadingTask.promise;
|
||||
const pdfPage = await pdfDoc.getPage(1);
|
||||
const { items } = await pdfPage.getTextContent({
|
||||
disableNormalization: true,
|
||||
});
|
||||
const text = mergeText(items);
|
||||
expect(text)
|
||||
.toEqual(`This is a great deal of nothing. The purpose is to help in identifying a bug when the PDF
|
||||
is read by Firefox. I want to know whether any of the two words in this paragraph run
|
||||
together. If they do, I will file a bug report. The problem seems to occur somewhere
|
||||
between the 240th and 260th character in the paragraph. I should have written that much
|
||||
by now. So, here’s to squashing bugs.
|
||||
This is a great deal of nothing. The purpose is to help in identifying a bug when the
|
||||
PDF is read by Firefox. I want to know whether any of the two words in this
|
||||
paragraph run together. If they do, I will file a bug report. The problem seems to
|
||||
occur somewhere between the 240th and 260th character in the paragraph. I should
|
||||
have written that much by now. So, here’s to squashing bugs.`);
|
||||
|
||||
await loadingTask.destroy();
|
||||
});
|
||||
|
||||
it("gets empty structure tree", async function () {
|
||||
const tree = await page.getStructTree();
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user