Merge pull request #20976 from calixteman/bidi_tests

Add the bidi tests coming from BidiTest.txt and BidiCharacterTest.txt
This commit is contained in:
calixteman 2026-03-26 21:36:18 +01:00 committed by GitHub
commit 484518614d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 594320 additions and 10 deletions

View File

@ -15,6 +15,9 @@
import { warn } from "../shared/util.js";
// Implements a subset of the Unicode Bidirectional Algorithm (UBA).
// Specification: https://www.unicode.org/reports/tr9/tr9-48.html
// Character types for symbols from 0000 to 00FF.
// Source: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
// prettier-ignore
@ -297,25 +300,42 @@ function bidi(str, startLevel = -1, vertical = false) {
text if the text on both sides has the same direction. European and Arabic
numbers are treated as though they were R. Start-of-level-run (sor) and
end-of-level-run (eor) are used at level run boundaries.
See https://www.unicode.org/reports/tr9/tr9-48.html#N1
*/
for (i = 0; i < strLength; ++i) {
if (types[i] === "ON") {
const end = findUnequal(types, i + 1, "ON");
// Scan left past non-strong types to find the nearest strong context
// (L, R, EN, or AN), falling back to sor at the level-run boundary.
let before = sor;
if (i > 0) {
before = types[i - 1];
for (let j = i - 1; j >= 0; j--) {
const tt = types[j];
if (tt === "L") {
before = "L";
break;
}
if (tt === "R" || tt === "EN" || tt === "AN") {
before = "R";
break;
}
}
// Scan right past non-strong types to find the nearest strong context,
// falling back to eor at the level-run boundary.
let after = eor;
if (end + 1 < strLength) {
after = types[end + 1];
}
if (before !== "L") {
before = "R";
}
if (after !== "L") {
after = "R";
for (let j = end; j < strLength; j++) {
const tt = types[j];
if (tt === "L") {
after = "L";
break;
}
if (tt === "R" || tt === "EN" || tt === "AN") {
after = "R";
break;
}
}
if (before === after) {
types.fill(before, i, end);
}

File diff suppressed because it is too large Load Diff

497591
test/bidi/BidiTest.txt Normal file

File diff suppressed because it is too large Load Diff

View File

@ -14,6 +14,70 @@
*/
import { bidi } from "../../src/core/bidi.js";
import { isNodeJS } from "../../src/shared/util.js";
// Location of the Unicode bidi conformance data files. The Node.js value is
// handed to `fs.promises.readFile`; the browser value is resolved against
// `window.location`.
const BIDI_TEST_DATA_PATH = isNodeJS ? "./test/bidi/" : "../bidi/";

/**
 * Load one of the bidi conformance data files as text.
 *
 * @param {string} filename - Name of a file located in `BIDI_TEST_DATA_PATH`.
 * @returns {Promise<string>} The contents of the file.
 * @throws {Error} In browsers, when the server answers with a non-2xx status.
 */
async function readTestFile(filename) {
  const path = BIDI_TEST_DATA_PATH + filename;
  if (isNodeJS) {
    const fs = process.getBuiltinModule("fs");
    return fs.promises.readFile(path, "utf8");
  }
  const response = await fetch(new URL(path, window.location));
  // `fetch` only rejects on network errors; without this check the body of a
  // 404/500 error page would be parsed as if it were test data.
  if (!response.ok) {
    throw new Error(
      `Failed to load "${path}": ${response.status} ${response.statusText}`
    );
  }
  return response.text();
}
// Unicode Bidirectional Algorithm tests.
// Specification: https://www.unicode.org/reports/tr9/tr9-48.html
// Test data: https://www.unicode.org/Public/UCD/latest/ucd/BidiTest.txt
// https://www.unicode.org/Public/UCD/latest/ucd/BidiCharacterTest.txt
// Explicit directional formatting characters. bidi.js skips rules X1-X10, so
// any test case containing one of these code points is not checked.
const EMBEDDING_CODEPOINTS = new Set([
  // Embeddings and overrides, with their terminator.
  0x202a, // LRE - LEFT-TO-RIGHT EMBEDDING
  0x202b, // RLE - RIGHT-TO-LEFT EMBEDDING
  0x202c, // PDF - POP DIRECTIONAL FORMATTING
  0x202d, // LRO - LEFT-TO-RIGHT OVERRIDE
  0x202e, // RLO - RIGHT-TO-LEFT OVERRIDE
  // Isolates, with their terminator.
  0x2066, // LRI - LEFT-TO-RIGHT ISOLATE
  0x2067, // RLI - RIGHT-TO-LEFT ISOLATE
  0x2068, // FSI - FIRST STRONG ISOLATE
  0x2069, // PDI - POP DIRECTIONAL ISOLATE
]);
// Paired brackets whose resolution needs rule N0, which bidi.js does not
// implement; test cases containing any of these code points are not checked.
const BRACKET_CODEPOINTS = new Set([
  0x0028, // LEFT PARENTHESIS
  0x0029, // RIGHT PARENTHESIS
  0x005b, // LEFT SQUARE BRACKET
  0x005d, // RIGHT SQUARE BRACKET
  0x007b, // LEFT CURLY BRACKET
  0x007d, // RIGHT CURLY BRACKET
  0x2329, // LEFT-POINTING ANGLE BRACKET
  0x232a, // RIGHT-POINTING ANGLE BRACKET
  0x3008, // LEFT ANGLE BRACKET
  0x3009, // RIGHT ANGLE BRACKET
]);
// Bidi classes that bidi.js has no handling for; BidiTest.txt cases using any
// of them are not checked.
// prettier-ignore
const UNSUPPORTED_BIDI_CLASSES = new Set([
  // Explicit embeddings and overrides (rules X1-X10 are skipped).
  "LRE", "RLE", "LRO", "RLO", "PDF",
  // Isolates (rules X1-X10 are skipped).
  "LRI", "RLI", "FSI", "PDI",
  // Boundary neutrals, treated as invisible by rule X9 (skipped).
  "BN",
  // Segment and paragraph separators, whose level is reset by rule L1
  // (skipped).
  "S", "B",
]);
describe("bidi", function () {
it(
@ -68,4 +132,174 @@ describe("bidi", function () {
expect(bidiText.dir).toEqual("rtl");
}
);
it("should reorder characters correctly per BidiCharacterTest.txt", async function () {
  // Split a whitespace-separated field into its tokens.
  const tokenize = field => field.trim().split(/\s+/);

  // Convert one line of BidiCharacterTest.txt into
  // `{ text, startLevel, expected }`, or return `null` when the line is not a
  // test case that bidi.js can be checked against.
  const parseCase = rawLine => {
    const entry = rawLine.trim();
    // Blank lines and comments carry no test case.
    if (entry === "" || entry.startsWith("#")) {
      return null;
    }
    const fields = entry.split(";");
    if (fields.length < 5) {
      return null;
    }

    // Field 1 is the paragraph direction: 0 = LTR, 1 = RTL, 2 = auto-LTR
    // (P2/P3). bidi.js relies on a 30%-threshold rather than P2/P3, hence
    // auto-LTR cases are left out.
    const startLevel = parseInt(fields[1].trim(), 10);
    if (startLevel === 2) {
      return null;
    }

    // Field 3 holds the resolved levels, where "x" flags a character removed
    // by rule X9; bidi.js does not implement X9.
    if (tokenize(fields[3]).includes("x")) {
      return null;
    }

    // Field 0 holds the input as hexadecimal code points. Cases containing
    // embedding/isolate controls (X1-X10 not implemented) or paired brackets
    // (N0 not implemented) are left out.
    const codePoints = tokenize(fields[0]).map(hex => parseInt(hex, 16));
    const isUnsupported = cp =>
      EMBEDDING_CODEPOINTS.has(cp) || BRACKET_CODEPOINTS.has(cp);
    if (codePoints.some(isUnsupported)) {
      return null;
    }

    const text = String.fromCodePoint(...codePoints);
    // Iterate by code point so that surrogate pairs stay together.
    const glyphs = Array.from(text);

    // Field 4 lists the logical indices in visual (left to right) order.
    const visualOrder = fields[4].trim();
    const indices =
      visualOrder === "" ? [] : visualOrder.split(/\s+/).map(Number);

    return {
      text,
      startLevel,
      expected: indices.map(i => glyphs[i]).join(""),
    };
  };

  const data = await readTestFile("BidiCharacterTest.txt");
  const mismatches = [];
  let checked = 0;

  data.split("\n").forEach((rawLine, index) => {
    const testCase = parseCase(rawLine);
    if (testCase === null) {
      return;
    }
    // A paragraph direction of 0 maps to startLevel 0 (LTR), 1 to
    // startLevel 1 (RTL).
    const outcome = bidi(testCase.text, testCase.startLevel, false);
    checked++;
    if (outcome.str !== testCase.expected) {
      // Report 1-based line numbers of the data file.
      mismatches.push(index + 1);
    }
  });

  expect(checked).toBeGreaterThan(0);
  expect(mismatches).toEqual([]);
});
it("should reorder characters correctly per BidiTest.txt", async function () {
  const data = await readTestFile("BidiTest.txt");

  const LEVELS_DIRECTIVE = "@Levels:";
  const REORDER_DIRECTIVE = "@Reorder:";

  // `[bitset mask, startLevel given to bidi()]` for the explicit paragraph
  // directions. Mask 1 (auto-LTR) is deliberately absent: bidi.js uses a
  // threshold instead of P2/P3.
  const EXPLICIT_DIRECTIONS = [
    [2, 0], // LTR paragraph
    [4, 1], // RTL paragraph
  ];

  // One representative character, recognized by bidi.js, for each bidi class.
  const sampleCharByClass = {
    L: "a", // U+0061
    R: "\u05D0", // Hebrew Alef
    AL: "\u0627", // Arabic Alef
    AN: "\u0660", // Arabic-Indic Digit Zero
    EN: "1", // U+0031
    ES: "+", // U+002B
    ET: "#", // U+0023
    CS: ",", // U+002C
    NSM: "\u0610", // Arabic combining mark (NSM in arabicTypes)
    B: "\n", // U+000A paragraph separator
    S: "\t", // U+0009 segment separator
    WS: " ", // U+0020
    ON: "!", // U+0021
  };
  const isRtlClass = c => c === "R" || c === "AL" || c === "AN";

  // State carried over from the most recent @Levels/@Reorder directives.
  let levels = null;
  let reorder = null;
  const mismatches = [];
  let checked = 0;

  const rows = data.split("\n");
  for (let index = 0; index < rows.length; index++) {
    const row = rows[index].trim();
    // Blank lines and comments.
    if (row === "" || row.startsWith("#")) {
      continue;
    }
    if (row.startsWith(LEVELS_DIRECTIVE)) {
      levels = row.slice(LEVELS_DIRECTIVE.length).trim().split(/\s+/);
      continue;
    }
    if (row.startsWith(REORDER_DIRECTIVE)) {
      const indices = row.slice(REORDER_DIRECTIVE.length).trim();
      reorder = indices === "" ? [] : indices.split(/\s+/).map(Number);
      continue;
    }
    // Leave out, in this order: any other @ directive (forward
    // compatibility), data lines seen before both directives, and cases where
    // rule X9 removes characters (bidi.js omits X9).
    if (
      row.startsWith("@") ||
      levels === null ||
      reorder === null ||
      levels.includes("x")
    ) {
      continue;
    }

    // A data line reads "<bidi classes>; <bitset>".
    const separator = row.indexOf(";");
    if (separator < 0) {
      continue;
    }
    const classes = row.slice(0, separator).trim().split(/\s+/);
    const bitset = parseInt(row.slice(separator + 1).trim(), 10);

    // Cases involving bidi classes that bidi.js does not support.
    if (classes.some(c => UNSUPPORTED_BIDI_CLASSES.has(c))) {
      continue;
    }
    // Without any R/AL/AN character bidi.js returns its input unchanged,
    // whatever the paragraph direction, so such cases are left out.
    if (classes.every(c => !isRtlClass(c))) {
      continue;
    }
    const chars = classes.map(c => sampleCharByClass[c]);
    // A class without a representative character is unknown here.
    if (chars.some(ch => ch === undefined)) {
      continue;
    }

    const input = chars.join("");
    const expected = reorder.map(i => chars[i]).join("");

    for (const [mask, startLevel] of EXPLICIT_DIRECTIONS) {
      if ((bitset & mask) === 0) {
        continue;
      }
      checked++;
      if (bidi(input, startLevel, false).str !== expected) {
        // Report 1-based line numbers of the data file.
        mismatches.push(index + 1);
      }
    }
  }

  expect(checked).toBeGreaterThan(0);
  expect(mismatches).toEqual([]);
});
});