mirror of
https://github.com/mozilla/pdf.js.git
synced 2026-06-22 16:05:56 +02:00
Merge pull request #20976 from calixteman/bidi_tests
Add the bidi tests coming from BidiTest.txt and BidiCharacterTest.txt
This commit is contained in:
commit
484518614d
@ -15,6 +15,9 @@
|
|||||||
|
|
||||||
import { warn } from "../shared/util.js";
|
import { warn } from "../shared/util.js";
|
||||||
|
|
||||||
|
// Implements a subset of the Unicode Bidirectional Algorithm (UBA).
|
||||||
|
// Specification: https://www.unicode.org/reports/tr9/tr9-48.html
|
||||||
|
|
||||||
// Character types for symbols from 0000 to 00FF.
|
// Character types for symbols from 0000 to 00FF.
|
||||||
// Source: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
|
// Source: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
|
||||||
// prettier-ignore
|
// prettier-ignore
|
||||||
@ -297,25 +300,42 @@ function bidi(str, startLevel = -1, vertical = false) {
|
|||||||
text if the text on both sides has the same direction. European and Arabic
|
text if the text on both sides has the same direction. European and Arabic
|
||||||
numbers are treated as though they were R. Start-of-level-run (sor) and
|
numbers are treated as though they were R. Start-of-level-run (sor) and
|
||||||
end-of-level-run (eor) are used at level run boundaries.
|
end-of-level-run (eor) are used at level run boundaries.
|
||||||
|
See https://www.unicode.org/reports/tr9/tr9-48.html#N1
|
||||||
*/
|
*/
|
||||||
for (i = 0; i < strLength; ++i) {
|
for (i = 0; i < strLength; ++i) {
|
||||||
if (types[i] === "ON") {
|
if (types[i] === "ON") {
|
||||||
const end = findUnequal(types, i + 1, "ON");
|
const end = findUnequal(types, i + 1, "ON");
|
||||||
|
|
||||||
|
// Scan left past non-strong types to find the nearest strong context
|
||||||
|
// (L, R, EN, or AN), falling back to sor at the level-run boundary.
|
||||||
let before = sor;
|
let before = sor;
|
||||||
if (i > 0) {
|
for (let j = i - 1; j >= 0; j--) {
|
||||||
before = types[i - 1];
|
const tt = types[j];
|
||||||
|
if (tt === "L") {
|
||||||
|
before = "L";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (tt === "R" || tt === "EN" || tt === "AN") {
|
||||||
|
before = "R";
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Scan right past non-strong types to find the nearest strong context,
|
||||||
|
// falling back to eor at the level-run boundary.
|
||||||
let after = eor;
|
let after = eor;
|
||||||
if (end + 1 < strLength) {
|
for (let j = end; j < strLength; j++) {
|
||||||
after = types[end + 1];
|
const tt = types[j];
|
||||||
}
|
if (tt === "L") {
|
||||||
if (before !== "L") {
|
after = "L";
|
||||||
before = "R";
|
break;
|
||||||
}
|
}
|
||||||
if (after !== "L") {
|
if (tt === "R" || tt === "EN" || tt === "AN") {
|
||||||
after = "R";
|
after = "R";
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (before === after) {
|
if (before === after) {
|
||||||
types.fill(before, i, end);
|
types.fill(before, i, end);
|
||||||
}
|
}
|
||||||
|
|||||||
96465
test/bidi/BidiCharacterTest.txt
Normal file
96465
test/bidi/BidiCharacterTest.txt
Normal file
File diff suppressed because it is too large
Load Diff
497591
test/bidi/BidiTest.txt
Normal file
497591
test/bidi/BidiTest.txt
Normal file
File diff suppressed because it is too large
Load Diff
@ -14,6 +14,70 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
import { bidi } from "../../src/core/bidi.js";
|
import { bidi } from "../../src/core/bidi.js";
|
||||||
|
import { isNodeJS } from "../../src/shared/util.js";
|
||||||
|
|
||||||
|
// Directory prefix for the Unicode bidi test data files: "./test/bidi/" when
// `isNodeJS` is set, "../bidi/" otherwise. `readTestFile` appends the file name
// and, outside Node.js, resolves the result against `window.location`.
const BIDI_TEST_DATA_PATH = isNodeJS ? "./test/bidi/" : "../bidi/";
|
||||||
|
|
||||||
|
/**
 * Load one of the Unicode bidi conformance data files as text.
 *
 * In Node.js the file is read through `fs.promises.readFile`; otherwise it is
 * fetched from a URL resolved against `window.location`.
 *
 * @param {string} filename - File name, appended to `BIDI_TEST_DATA_PATH`.
 * @returns {Promise<string>} The contents of the file.
 * @throws {Error} Outside Node.js, when the response status is not in the
 *   2xx range. Without this check the body of an error page would be handed
 *   to the parsers as if it were test data.
 */
async function readTestFile(filename) {
  const path = BIDI_TEST_DATA_PATH + filename;
  if (isNodeJS) {
    const fs = process.getBuiltinModule("fs");
    return fs.promises.readFile(path, "utf8");
  }
  const response = await fetch(new URL(path, window.location));
  if (!response.ok) {
    throw new Error(
      `Failed to fetch "${path}": ${response.status} ${response.statusText}`
    );
  }
  return response.text();
}
|
||||||
|
|
||||||
|
// Unicode Bidirectional Algorithm tests.
|
||||||
|
// Specification: https://www.unicode.org/reports/tr9/tr9-48.html
|
||||||
|
// Test data: https://www.unicode.org/Public/UCD/latest/ucd/BidiTest.txt
|
||||||
|
// https://www.unicode.org/Public/UCD/latest/ucd/BidiCharacterTest.txt
|
||||||
|
|
||||||
|
// Explicit directional formatting characters: embeddings, overrides, isolates
// and their terminators. bidi.js skips rules X1-X10, so every test case that
// contains one of these code points is excluded.
const EMBEDDING_CODEPOINTS = new Set([
  0x202a, // LRE
  0x202b, // RLE
  0x202c, // PDF
  0x202d, // LRO
  0x202e, // RLO
  0x2066, // LRI
  0x2067, // RLI
  0x2068, // FSI
  0x2069, // PDI
]);

// Bidi paired bracket code points that require the N0 rule (not implemented).
// The angle brackets are identified by code point rather than by glyph: the
// two pairs look alike, and U+2329/U+232A are canonically equivalent to
// U+3008/U+3009, so a glyph in a comment does not survive normalization.
// NOTE(review): Unicode's BidiBrackets.txt lists more pairs than these;
// confirm that none of the others occur in the test data being run.
const BRACKET_CODEPOINTS = new Set([
  0x0028, // (
  0x0029, // )
  0x005b, // [
  0x005d, // ]
  0x007b, // {
  0x007d, // }
  0x2329, // U+2329 LEFT-POINTING ANGLE BRACKET
  0x232a, // U+232A RIGHT-POINTING ANGLE BRACKET
  0x3008, // U+3008 LEFT ANGLE BRACKET (CJK)
  0x3009, // U+3009 RIGHT ANGLE BRACKET (CJK)
]);

// Bidi classes not handled by bidi.js:
//  - Embeddings/overrides/isolates: X1-X10 rules are skipped.
//  - BN: boundary neutrals, treated as invisible by X9 (skipped).
//  - S/B: segment/paragraph separators, level reset by L1 (skipped).
const UNSUPPORTED_BIDI_CLASSES = new Set([
  "LRE",
  "RLE",
  "LRO",
  "RLO",
  "PDF",
  "LRI",
  "RLI",
  "FSI",
  "PDI",
  "BN",
  "S",
  "B",
]);
|
||||||
|
|
||||||
describe("bidi", function () {
|
describe("bidi", function () {
|
||||||
it(
|
it(
|
||||||
@ -68,4 +132,174 @@ describe("bidi", function () {
|
|||||||
expect(bidiText.dir).toEqual("rtl");
|
expect(bidiText.dir).toEqual("rtl");
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
|
|
||||||
|
it("should reorder characters correctly per BidiCharacterTest.txt", async function () {
  // Field 1 value requesting auto-detection of the paragraph direction
  // (P2/P3). bidi.js uses a 30%-threshold instead, so those cases are skipped.
  const AUTO_DIRECTION = 2;

  const tokenize = text => text.trim().split(/\s+/);
  const isUnsupportedCodePoint = cp =>
    EMBEDDING_CODEPOINTS.has(cp) || BRACKET_CODEPOINTS.has(cp);

  // Converts one line of the data file into a runnable case. Returns `null`
  // for blank lines, comments, malformed lines and for cases that depend on
  // rules bidi.js does not implement.
  const parseLine = rawLine => {
    const entry = rawLine.trim();
    if (entry === "" || entry.startsWith("#")) {
      return null;
    }

    const fields = entry.split(";");
    if (fields.length < 5) {
      return null;
    }
    // Fields: 0 = code points, 1 = paragraph direction (0 = LTR, 1 = RTL,
    // 2 = auto-LTR), 3 = resolved levels, 4 = visual order (left to right).
    const [codePointField, directionField, , levelsField, orderField] = fields;

    const direction = parseInt(directionField.trim(), 10);
    if (direction === AUTO_DIRECTION) {
      return null;
    }

    // An "x" level marks a character removed by rule X9, which bidi.js does
    // not implement.
    if (tokenize(levelsField).includes("x")) {
      return null;
    }

    const codePoints = tokenize(codePointField).map(hex => parseInt(hex, 16));
    // Embedding/isolate controls need X1-X10 and paired brackets need N0;
    // neither is implemented.
    if (codePoints.some(cp => isUnsupportedCodePoint(cp))) {
      return null;
    }

    const input = String.fromCodePoint(...codePoints);
    // Iterate by code point so that surrogate pairs stay together.
    const chars = Array.from(input);

    const orderText = orderField.trim();
    const visualOrder =
      orderText === ""
        ? []
        : orderText.split(/\s+/).map(token => Number(token));
    const expected = visualOrder.map(position => chars[position]).join("");

    return { input, direction, expected };
  };

  const data = await readTestFile("BidiCharacterTest.txt");

  const mismatchedLines = [];
  let checked = 0;
  let lineNumber = 0;

  for (const rawLine of data.split("\n")) {
    lineNumber++;

    const testCase = parseLine(rawLine);
    if (testCase === null) {
      continue;
    }

    // The paragraph direction doubles as the start level: 0 (LTR) or 1 (RTL).
    const { str: actual } = bidi(testCase.input, testCase.direction, false);

    checked++;
    if (actual !== testCase.expected) {
      mismatchedLines.push(lineNumber);
    }
  }

  expect(checked).toBeGreaterThan(0);
  expect(mismatchedLines).toEqual([]);
});
|
||||||
|
|
||||||
|
it("should reorder characters correctly per BidiTest.txt", async function () {
  const LEVELS_TAG = "@Levels:";
  const REORDER_TAG = "@Reorder:";

  // Classes that make bidi.js reorder at all: without an R/AL/AN character it
  // returns the input unchanged, regardless of paragraph direction.
  const RTL_CLASSES = ["R", "AL", "AN"];

  // Pairs of [bit in the line's bitset, start level handed to bidi()].
  // Bit value 2 is explicit LTR and 4 is explicit RTL. Bit value 1 (auto-LTR)
  // is left out because bidi.js uses a threshold instead of P2/P3.
  const DIRECTIONS = [
    [2, 0],
    [4, 1],
  ];

  // One representative character, recognized by bidi.js, per bidi class.
  const representativeChar = {
    L: "a", // U+0061
    R: "\u05D0", // Hebrew Alef
    AL: "\u0627", // Arabic Alef
    AN: "\u0660", // Arabic-Indic Digit Zero
    EN: "1", // U+0031
    ES: "+", // U+002B
    ET: "#", // U+0023
    CS: ",", // U+002C
    NSM: "\u0610", // Arabic combining mark (NSM in arabicTypes)
    B: "\n", // U+000A paragraph separator
    S: "\t", // U+0009 segment separator
    WS: " ", // U+0020
    ON: "!", // U+0021
  };

  const data = await readTestFile("BidiTest.txt");

  // State set by the most recent @Levels / @Reorder directives; it applies to
  // every data line until the next directive replaces it.
  let levels = null;
  let reorder = null;

  const mismatchedLines = [];
  let checked = 0;
  let lineNumber = 0;

  for (const rawLine of data.split("\n")) {
    lineNumber++;

    const entry = rawLine.trim();
    if (entry === "" || entry.startsWith("#")) {
      continue;
    }

    if (entry.startsWith("@")) {
      if (entry.startsWith(LEVELS_TAG)) {
        levels = entry.slice(LEVELS_TAG.length).trim().split(/\s+/);
      } else if (entry.startsWith(REORDER_TAG)) {
        const reorderText = entry.slice(REORDER_TAG.length).trim();
        reorder =
          reorderText === ""
            ? []
            : reorderText.split(/\s+/).map(token => Number(token));
      }
      // Any other @ directive is ignored (forward compatibility).
      continue;
    }

    if (levels === null || reorder === null) {
      continue;
    }

    // An "x" level marks a character removed by rule X9 (omitted by bidi.js).
    if (levels.includes("x")) {
      continue;
    }

    // Data lines look like "<classes>; <bitset>"; split at the first ";".
    const separator = entry.indexOf(";");
    if (separator === -1) {
      continue;
    }
    const classes = entry.slice(0, separator).trim().split(/\s+/);
    const bitset = parseInt(entry.slice(separator + 1).trim(), 10);

    if (classes.some(name => UNSUPPORTED_BIDI_CLASSES.has(name))) {
      continue;
    }
    if (!classes.some(name => RTL_CLASSES.includes(name))) {
      continue;
    }

    const chars = classes.map(name => representativeChar[name]);
    if (chars.some(ch => ch === undefined)) {
      continue; // Unknown class, skip.
    }

    const input = chars.join("");
    const expected = reorder.map(position => chars[position]).join("");

    for (const [mask, startLevel] of DIRECTIONS) {
      if (bitset & mask) {
        checked++;
        if (bidi(input, startLevel, false).str !== expected) {
          mismatchedLines.push(lineNumber);
        }
      }
    }
  }

  expect(checked).toBeGreaterThan(0);
  expect(mismatchedLines).toEqual([]);
});
|
||||||
});
|
});
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user