From 0aee1d5382b99a033ca08aba7afe321081ad2f5f Mon Sep 17 00:00:00 2001 From: Yarchik Date: Thu, 2 Jul 2026 14:03:49 +0100 Subject: [PATCH] Do not drop the character after U+FFFE or U+FFFF in encodeToXmlString encodeToXmlString skips surrogate pairs with the guard `char > 0xd7ff && (char < 0xe000 || char > 0xfffd)` and then does `i++` to step over the low surrogate. That predicate is also true for U+FFFE and U+FFFF, which are single UTF-16 code units, not surrogate pairs. The `i++` then skips the character that follows them, so it is silently dropped. For example, encodeToXmlString of U+FFFF followed by "A" returned "&#xFFFF;" instead of "&#xFFFF;A". The function serializes XML text nodes and attribute values in xml_parser.js and xfa_object.js, so this corrupts round-tripped XML and XFA content. The correct test for a surrogate pair is `char > 0xffff`, since codePointAt returns a value at or above 0x10000 only for a real pair. This preserves the existing behavior for emoji and the U+FFFD boundary, and stops dropping the character after U+FFFE and U+FFFF. A lone surrogate (U+D800 to U+DFFF) also matched the old guard and no longer matches the new one, so the character that follows it is now kept as well.
--- src/core/core_utils.js | 2 +- test/unit/core_utils_spec.js | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/core/core_utils.js b/src/core/core_utils.js index 5556c9e15..bbc7ef735 100644 --- a/src/core/core_utils.js +++ b/src/core/core_utils.js @@ -533,7 +533,7 @@ function encodeToXmlString(str) { buffer.push(str.substring(start, i)); } buffer.push(`&#x${char.toString(16).toUpperCase()};`); - if (char > 0xd7ff && (char < 0xe000 || char > 0xfffd)) { + if (char > 0xffff) { // char is represented by two u16 i++; } diff --git a/test/unit/core_utils_spec.js b/test/unit/core_utils_spec.js index a9c6bd8f0..597f2cfbf 100644 --- a/test/unit/core_utils_spec.js +++ b/test/unit/core_utils_spec.js @@ -312,6 +312,11 @@ describe("core_utils", function () { const str = "hello world"; expect(encodeToXmlString(str)).toEqual(str); }); + + it("should keep the character after U+FFFE or U+FFFF", function () { + expect(encodeToXmlString("￿A")).toEqual("&#xFFFF;A"); + expect(encodeToXmlString("￾B")).toEqual("&#xFFFE;B"); + }); }); describe("validateCSSFont", function () {