Merge pull request #20976 from calixteman/bidi_tests

Add the bidi tests coming from BidiTest.txt and BidiCharacterTest.txt
This commit is contained in:
calixteman 2026-03-26 21:36:18 +01:00 committed by GitHub
commit 484518614d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 594320 additions and 10 deletions

View File

@ -15,6 +15,9 @@
import { warn } from "../shared/util.js"; import { warn } from "../shared/util.js";
// Implements a subset of the Unicode Bidirectional Algorithm (UBA).
// Specification: https://www.unicode.org/reports/tr9/tr9-48.html
// Character types for symbols from 0000 to 00FF. // Character types for symbols from 0000 to 00FF.
// Source: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt // Source: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
// prettier-ignore // prettier-ignore
@ -297,25 +300,42 @@ function bidi(str, startLevel = -1, vertical = false) {
text if the text on both sides has the same direction. European and Arabic text if the text on both sides has the same direction. European and Arabic
numbers are treated as though they were R. Start-of-level-run (sor) and numbers are treated as though they were R. Start-of-level-run (sor) and
end-of-level-run (eor) are used at level run boundaries. end-of-level-run (eor) are used at level run boundaries.
See https://www.unicode.org/reports/tr9/tr9-48.html#N1
*/ */
for (i = 0; i < strLength; ++i) { for (i = 0; i < strLength; ++i) {
if (types[i] === "ON") { if (types[i] === "ON") {
const end = findUnequal(types, i + 1, "ON"); const end = findUnequal(types, i + 1, "ON");
// Scan left past non-strong types to find the nearest strong context
// (L, R, EN, or AN), falling back to sor at the level-run boundary.
let before = sor; let before = sor;
if (i > 0) { for (let j = i - 1; j >= 0; j--) {
before = types[i - 1]; const tt = types[j];
if (tt === "L") {
before = "L";
break;
}
if (tt === "R" || tt === "EN" || tt === "AN") {
before = "R";
break;
}
} }
// Scan right past non-strong types to find the nearest strong context,
// falling back to eor at the level-run boundary.
let after = eor; let after = eor;
if (end + 1 < strLength) { for (let j = end; j < strLength; j++) {
after = types[end + 1]; const tt = types[j];
} if (tt === "L") {
if (before !== "L") { after = "L";
before = "R"; break;
} }
if (after !== "L") { if (tt === "R" || tt === "EN" || tt === "AN") {
after = "R"; after = "R";
break;
}
} }
if (before === after) { if (before === after) {
types.fill(before, i, end); types.fill(before, i, end);
} }

File diff suppressed because it is too large Load Diff

497591
test/bidi/BidiTest.txt Normal file

File diff suppressed because it is too large Load Diff

View File

@ -14,6 +14,70 @@
*/ */
import { bidi } from "../../src/core/bidi.js"; import { bidi } from "../../src/core/bidi.js";
import { isNodeJS } from "../../src/shared/util.js";
// Location of the Unicode bidi conformance data, relative to the working
// directory in Node.js and relative to the test page URL in browsers.
const BIDI_TEST_DATA_PATH = isNodeJS ? "./test/bidi/" : "../bidi/";

/**
 * Read one of the bidi conformance data files as text.
 *
 * In Node.js the file is read from disk as UTF-8; otherwise it is fetched
 * relative to the current page location.
 *
 * @param {string} filename - Name of a file inside the bidi test-data folder.
 * @returns {Promise<string>} The file contents.
 * @throws {Error} When the browser fetch completes with a non-OK status, so
 *   that an HTTP error page is never parsed as if it were test data.
 */
async function readTestFile(filename) {
  const path = BIDI_TEST_DATA_PATH + filename;
  if (isNodeJS) {
    const fs = process.getBuiltinModule("fs");
    return fs.promises.readFile(path, "utf8");
  }
  const response = await fetch(new URL(path, window.location));
  if (!response.ok) {
    throw new Error(
      `Failed to load "${path}": ${response.status} ${response.statusText}`
    );
  }
  return response.text();
}
// Unicode Bidirectional Algorithm tests.
// Specification: https://www.unicode.org/reports/tr9/tr9-48.html
// Test data: https://www.unicode.org/Public/UCD/latest/ucd/BidiTest.txt
// https://www.unicode.org/Public/UCD/latest/ucd/BidiCharacterTest.txt
// Explicit directional formatting characters (embeddings, overrides and
// isolates). bidi.js skips rules X1-X10, so these code points are not handled.
const EMBEDDING_CODEPOINTS = new Set([
  // U+202A..U+202E: LRE, RLE, PDF, LRO, RLO
  0x202a, 0x202b, 0x202c, 0x202d, 0x202e,
  // U+2066..U+2069: LRI, RLI, FSI, PDI
  0x2066, 0x2067, 0x2068, 0x2069,
]);
// Paired-bracket code points. Resolving these requires rule N0, which
// bidi.js does not implement.
const BRACKET_CODEPOINTS = new Set([
  // ASCII: PARENTHESIS, SQUARE BRACKET, CURLY BRACKET (left, right).
  0x0028, 0x0029,
  0x005b, 0x005d,
  0x007b, 0x007d,
  // U+2329 LEFT-POINTING ANGLE BRACKET, U+232A RIGHT-POINTING ANGLE BRACKET.
  0x2329, 0x232a,
  // U+3008 LEFT ANGLE BRACKET, U+3009 RIGHT ANGLE BRACKET.
  0x3008, 0x3009,
]);
// Bidi classes that bidi.js does not handle.
const UNSUPPORTED_BIDI_CLASSES = new Set([
  // Explicit embeddings, overrides and their terminator (rules X1-X10 are
  // skipped).
  "LRE", "RLE", "LRO", "RLO", "PDF",
  // Isolate initiators and their terminator (rules X1-X10 are skipped).
  "LRI", "RLI", "FSI", "PDI",
  // Boundary neutrals, treated as invisible by rule X9 (skipped).
  "BN",
  // Segment and paragraph separators, whose level is reset by rule L1
  // (skipped).
  "S", "B",
]);
describe("bidi", function () { describe("bidi", function () {
it( it(
@ -68,4 +132,174 @@ describe("bidi", function () {
expect(bidiText.dir).toEqual("rtl"); expect(bidiText.dir).toEqual("rtl");
} }
); );
it("should reorder characters correctly per BidiCharacterTest.txt", async function () {
  // Data lines are ";"-separated. The fields used here are:
  //   0: code points (hex), 1: paragraph direction,
  //   3: resolved levels,   4: visual order (indices, left to right).
  const WHITESPACE = /\s+/;
  // Embedding/isolate controls need X1-X10 and paired brackets need N0;
  // bidi.js implements neither, so such cases are not exercised.
  const isUnsupportedCodePoint = cp =>
    EMBEDDING_CODEPOINTS.has(cp) || BRACKET_CODEPOINTS.has(cp);

  const data = await readTestFile("BidiCharacterTest.txt");
  const mismatches = [];
  let executed = 0;

  data.split("\n").forEach((rawLine, index) => {
    const entry = rawLine.trim();
    // Blank lines and "#" comments carry no test case.
    if (entry === "" || entry.startsWith("#")) {
      return;
    }
    const fields = entry.split(";");
    if (fields.length < 5) {
      return;
    }
    const [codePointField, directionField, , levelsField, orderField] = fields;

    // Direction: 0 = LTR, 1 = RTL, 2 = auto-LTR (P2/P3). Auto-LTR is skipped
    // because bidi.js uses a 30%-threshold instead of P2/P3.
    const paragraphDir = parseInt(directionField.trim(), 10);
    if (paragraphDir === 2) {
      return;
    }
    // An "x" level marks a character removed by rule X9, which bidi.js does
    // not implement.
    if (levelsField.trim().split(WHITESPACE).includes("x")) {
      return;
    }

    const codePoints = codePointField
      .trim()
      .split(WHITESPACE)
      .map(hex => parseInt(hex, 16));
    if (codePoints.some(isUnsupportedCodePoint)) {
      return;
    }

    const input = String.fromCodePoint(...codePoints);
    // Split by code point so that surrogate pairs stay together.
    const glyphs = Array.from(input);
    const orderText = orderField.trim();
    const visualOrder = orderText ? orderText.split(WHITESPACE).map(Number) : [];
    const expected = visualOrder.map(position => glyphs[position]).join("");

    // The paragraph direction doubles as the start level (0 = LTR, 1 = RTL).
    const { str: actual } = bidi(input, paragraphDir, false);
    executed++;
    if (actual !== expected) {
      // Report 1-based line numbers of the data file.
      mismatches.push(index + 1);
    }
  });

  // Guard against a silently empty or unparsable data file.
  expect(executed).toBeGreaterThan(0);
  expect(mismatches).toEqual([]);
});
it("should reorder characters correctly per BidiTest.txt", async function () {
  const LEVELS_DIRECTIVE = "@Levels:";
  const REORDER_DIRECTIVE = "@Reorder:";
  const WHITESPACE = /\s+/;
  const RTL_CLASSES = ["R", "AL", "AN"];
  // [bitset mask, bidi() start level] pairs for the explicit paragraph
  // directions: mask 2 = LTR, mask 4 = RTL. Mask 1 (auto-LTR) is left out
  // because bidi.js uses a threshold instead of P2/P3.
  const EXPLICIT_DIRECTIONS = [
    [2, 0],
    [4, 1],
  ];
  // One representative character, recognized by bidi.js, per bidi class.
  const BIDI_CLASS_TO_CHAR = {
    L: "a", // U+0061
    R: "\u05D0", // Hebrew Alef
    AL: "\u0627", // Arabic Alef
    AN: "\u0660", // Arabic-Indic Digit Zero
    EN: "1", // U+0031
    ES: "+", // U+002B
    ET: "#", // U+0023
    CS: ",", // U+002C
    NSM: "\u0610", // Arabic combining mark (NSM in arabicTypes)
    B: "\n", // U+000A paragraph separator
    S: "\t", // U+0009 segment separator
    WS: " ", // U+0020
    ON: "!", // U+0021
  };

  const data = await readTestFile("BidiTest.txt");
  // The most recent @Levels / @Reorder directives apply to every data line
  // that follows them.
  let levels = null;
  let reorder = null;
  const mismatches = [];
  let executed = 0;

  data.split("\n").forEach((rawLine, index) => {
    const entry = rawLine.trim();
    if (entry === "" || entry.startsWith("#")) {
      return;
    }
    if (entry.startsWith(LEVELS_DIRECTIVE)) {
      levels = entry.slice(LEVELS_DIRECTIVE.length).trim().split(WHITESPACE);
      return;
    }
    if (entry.startsWith(REORDER_DIRECTIVE)) {
      const indices = entry.slice(REORDER_DIRECTIVE.length).trim();
      reorder = indices ? indices.split(WHITESPACE).map(Number) : [];
      return;
    }
    // Any other "@" directive is ignored (forward compatibility).
    if (entry.startsWith("@")) {
      return;
    }
    // Data lines need both directives, and cases where rule X9 removes
    // characters ("x" levels) are skipped since bidi.js omits X9.
    if (!levels || !reorder || levels.includes("x")) {
      return;
    }

    const separator = entry.indexOf(";");
    if (separator === -1) {
      return;
    }
    const classes = entry.slice(0, separator).trim().split(WHITESPACE);
    const bitset = parseInt(entry.slice(separator + 1).trim(), 10);

    if (classes.some(name => UNSUPPORTED_BIDI_CLASSES.has(name))) {
      return;
    }
    // Without any R/AL/AN character bidi.js returns the input unchanged,
    // regardless of paragraph direction, so such cases are skipped.
    if (!classes.some(name => RTL_CLASSES.includes(name))) {
      return;
    }
    const chars = classes.map(name => BIDI_CLASS_TO_CHAR[name]);
    if (chars.includes(undefined)) {
      return; // Unknown class, skip.
    }

    const input = chars.join("");
    const expected = reorder.map(position => chars[position]).join("");
    for (const [mask, startLevel] of EXPLICIT_DIRECTIONS) {
      if ((bitset & mask) === 0) {
        continue;
      }
      executed++;
      if (bidi(input, startLevel, false).str !== expected) {
        // Report 1-based line numbers of the data file.
        mismatches.push(index + 1);
      }
    }
  });

  // Guard against a silently empty or unparsable data file.
  expect(executed).toBeGreaterThan(0);
  expect(mismatches).toEqual([]);
});
}); });