author | André Bargull <andre.bargull@gmail.com> |
Fri, 11 Oct 2019 19:25:32 +0000 | |
changeset 497319 | 7e272f3c9fa48620151dbdd970802be7d3c0271e |
parent 497318 | f22fdbd968ed04fedd47c406db2678fa5ae1205e |
child 497320 | e7dbd9ac5b7ddbc9fbad5386f1814fbce72be383 |
push id | 36682 |
push user | ncsoregi@mozilla.com |
push date | Sat, 12 Oct 2019 09:52:03 +0000 |
treeherder | mozilla-central@06ea2371f897 [default view] [failures only] |
perfherder | [talos] [build metrics] [platform microbench] (compared to previous push) |
reviewers | jwalden |
bugs | 1570370 |
milestone | 71.0a1 |
first release with | nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
|
last release without | nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
|
--- a/js/src/builtin/String.js +++ b/js/src/builtin/String.js @@ -614,17 +614,17 @@ function String_toLocaleLowerCase() { // argument) first. var locales = arguments.length > 0 ? arguments[0] : undefined; var requestedLocale; if (locales === undefined) { // Steps 3, 6. requestedLocale = undefined; } else if (typeof locales === "string") { // Steps 3, 5. - requestedLocale = ValidateAndCanonicalizeLanguageTag(locales); + requestedLocale = intl_ValidateAndCanonicalizeLanguageTag(locales, false); } else { // Step 3. var requestedLocales = CanonicalizeLocaleList(locales); // Steps 4-6. requestedLocale = requestedLocales.length > 0 ? requestedLocales[0] : undefined; } @@ -655,17 +655,17 @@ function String_toLocaleUpperCase() { // argument) first. var locales = arguments.length > 0 ? arguments[0] : undefined; var requestedLocale; if (locales === undefined) { // Steps 3, 6. requestedLocale = undefined; } else if (typeof locales === "string") { // Steps 3, 5. - requestedLocale = ValidateAndCanonicalizeLanguageTag(locales); + requestedLocale = intl_ValidateAndCanonicalizeLanguageTag(locales, false); } else { // Step 3. var requestedLocales = CanonicalizeLocaleList(locales); // Steps 4-6. requestedLocale = requestedLocales.length > 0 ? requestedLocales[0] : undefined; }
--- a/js/src/builtin/intl/CommonFunctions.js +++ b/js/src/builtin/intl/CommonFunctions.js @@ -2,21 +2,20 @@ * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ /* Portions Copyright Norbert Lindenberg 2011-2012. */ #ifdef DEBUG #define assertIsValidAndCanonicalLanguageTag(locale, desc) \ do { \ - let localeObj = parseLanguageTag(locale); \ - assert(localeObj !== null, \ + let canonical = intl_TryValidateAndCanonicalizeLanguageTag(locale); \ + assert(canonical !== null, \ `${desc} is a structurally valid language tag`); \ - CanonicalizeLanguageTagObject(localeObj); \ - assert(StringFromLanguageTagObject(localeObj) === locale, \ + assert(canonical === locale, \ `${desc} is a canonicalized language tag`); \ } while (false) #else #define assertIsValidAndCanonicalLanguageTag(locale, desc) ; // Elided assertion. #endif /** * Returns the start index of a "Unicode locale extension sequence", which the @@ -114,1099 +113,30 @@ function getUnicodeExtensions(locale) { var start = startOfUnicodeExtensions(locale); assert(start >= 0, "start of Unicode extension sequence not found"); var end = endOfUnicodeExtensions(locale, start); return Substring(locale, start, end - start); } -// The three possible token type bits. Expressed as #defines to avoid -// extra named lookups in the interpreter/jits. -#define NONE 0b00 -#define ALPHA 0b01 -#define DIGIT 0b10 - -// Constants for code units used below. -#define HYPHEN 0x2D -#define DIGIT_ZERO 0x30 -#define DIGIT_NINE 0x39 -#define UPPER_A 0x41 -#define UPPER_Z 0x5A -#define LOWER_A 0x61 -#define LOWER_T 0x74 -#define LOWER_U 0x75 -#define LOWER_X 0x78 -#define LOWER_Z 0x7A - -// The requirement to use callFunction() for method calls makes the parser -// harder to read. Use macros for the rescue. - -// Reads the next token. -#define NEXT_TOKEN_OR_RETURN_NULL(ts) \ - if (!callFunction(ts.nextToken, ts)) \ - return null; - -#ifdef DEBUG -#define NEXT_TOKEN_OR_ASSERT(ts) \ - if (!callFunction(ts.nextToken, ts)) \ - assert(false, "unexpected invalid subtag"); -#else -#define NEXT_TOKEN_OR_ASSERT(ts) \ - callFunction(ts.nextToken, ts); -#endif - -// Assigns the current subtag part transformed to lower-case to the target. -#define SUBTAG_VAR_OR_RETURN_NULL(ts, target) \ - { \ - target = Substring(ts.localeLowercase, ts.tokenStart, ts.tokenLength); \ - NEXT_TOKEN_OR_RETURN_NULL(ts); \ - } - -// Assigns the current subtag part transformed to lower-case to the target. -#define SUBTAG_VAR_OR_ASSERT(ts, target) \ - { \ - target = Substring(ts.localeLowercase, ts.tokenStart, ts.tokenLength); \ - NEXT_TOKEN_OR_ASSERT(ts) \ - } - -/** - * Tokenizer for Unicode BCP 47 locale identifiers. - */ -function BCP47TokenStream(locale) { - this.locale = locale; - - // Locale identifiers are compared and processed case-insensitively, so - // technically it's not necessary to adjust case. But for easier processing, - // and because the canonical form for most subtags is lower case, we start - // with lower case for all. - // - // Note that the tokenizer function keeps using the original input string - // to properly detect non-ASCII characters. The lower-case string can't be - // used to detect those characters, because some non-ASCII characters - // lower-case map into ASCII characters, e.g. U+212A (KELVIN SIGN) lower- - // case maps to U+006B (LATIN SMALL LETTER K). - this.localeLowercase = callFunction(std_String_toLowerCase, locale); - - // Current parse index in |locale|. - this.index = 0; - - // The current token type, its start index, and its length. - this.token = NONE; - this.tokenStart = 0; - this.tokenLength = 0; - - assert(std_String_fromCharCode(HYPHEN) === "-" && - std_String_fromCharCode(DIGIT_ZERO) === "0" && - std_String_fromCharCode(DIGIT_NINE) === "9" && - std_String_fromCharCode(UPPER_A) === "A" && - std_String_fromCharCode(UPPER_Z) === "Z" && - std_String_fromCharCode(LOWER_A) === "a" && - std_String_fromCharCode(LOWER_T) === "t" && - std_String_fromCharCode(LOWER_U) === "u" && - std_String_fromCharCode(LOWER_X) === "x" && - std_String_fromCharCode(LOWER_Z) === "z", - "code unit constants should match the expected characters"); -} - -MakeConstructible(BCP47TokenStream, { - __proto__: null, - - // Reads the next token, returns |false| if an illegal character was found, - // otherwise returns |true|. - // - // eslint-disable-next-line object-shorthand - nextToken: function() { - var type = NONE; - var {index, locale} = this; - for (var i = index; i < locale.length; i++) { - // UTS 35, section 3.1. - // alpha = [A-Z a-z] ; - // digit = [0-9] ; - var c = callFunction(std_String_charCodeAt, locale, i); - if ((UPPER_A <= c && c <= UPPER_Z) || (LOWER_A <= c && c <= LOWER_Z)) - type |= ALPHA; - else if (DIGIT_ZERO <= c && c <= DIGIT_NINE) - type |= DIGIT; - else if (c === HYPHEN && i > index && i + 1 < locale.length) - break; - else - return false; - } - - this.token = type; - this.tokenStart = index; - this.tokenLength = i - index; - this.index = i + 1; - return true; - }, - - // Returns true if the character at the requested index within the current - // token is a digit. - // - // eslint-disable-next-line object-shorthand - isDigitAt: function(index) { - assert(0 <= index && index < this.tokenLength, - "must be an index into the current token"); - var c = callFunction(std_String_charCodeAt, this.localeLowercase, this.tokenStart + index); - assert(!(c <= DIGIT_NINE) || c >= DIGIT_ZERO, - "token-start-code-unit <= '9' implies token-start-code-unit is in '0'..'9' " + - "and because all digits are sorted before any letters"); - return c <= DIGIT_NINE; - }, - - // Returns the code unit of the first character at the current token - // position. Always returns the lower-case form of an alphabetical - // character. - // - // eslint-disable-next-line object-shorthand - singletonKey: function() { - assert(this.tokenLength === 1, "token is not a singleton"); - var c = callFunction(std_String_charCodeAt, this.localeLowercase, this.tokenStart); - assert((DIGIT_ZERO <= c && c <= DIGIT_NINE) || (LOWER_A <= c && c <= LOWER_Z), - "unexpected code unit"); - return c; - }, - - // eslint-disable-next-line object-shorthand - singletonValue: function() { - var singletonStart = this.tokenStart; - var min = callFunction(this.singletonKey, this) === LOWER_X ? 1 : 2; - - NEXT_TOKEN_OR_RETURN_NULL(this); - - // At least one non-singleton subtag must be present. - if (!(min <= this.tokenLength && this.tokenLength <= 8)) - return null; - do { - NEXT_TOKEN_OR_RETURN_NULL(this); - } while (min <= this.tokenLength && this.tokenLength <= 8); - - return callFunction(this.singletonValueAt, this, singletonStart); - }, - - // eslint-disable-next-line object-shorthand - singletonValueAt: function(start) { - // Singletons must be followed by a non-singleton subtag, "en-a-b" is not allowed. - var length = this.tokenStart - 1 - start; - if (length <= 2) - return null; - return Substring(this.localeLowercase, start, length); - } -}); - -/* eslint-disable complexity */ -/** - * Parser for Unicode BCP 47 locale identifiers. - * - * Returns null if |locale| can't be parsed as a `unicode_locale_id`. If the - * input is a grandfathered language tag, it is directly canonicalized to its - * modern form. The returned object has the following structure: - * - * { - * language: `unicode_language_subtag`, - * script: `unicode_script_subtag` / undefined, - * region: `unicode_region_subtag` / undefined, - * variants: array of `unicode_variant_subtag`, - * extensions: array of `extensions`, - * privateuse: `pu_extensions` / undefined, - * } - * - * All locale identifier subtags are returned in their normalized case: - * - * var langtag = parseLanguageTag("en-latn-us"); - * assertEq("en", langtag.language); - * assertEq("Latn", langtag.script); - * assertEq("US", langtag.region); - * - * Spec: https://unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers - */ -function parseLanguageTag(locale) { - assert(typeof locale === "string", "locale is a string"); - - // unicode_locale_id = unicode_language_id - // extensions* - // pu_extensions? ; - var ts = new BCP47TokenStream(locale); - NEXT_TOKEN_OR_RETURN_NULL(ts); - - var language, script, region, privateuse; - var variants = []; - var extensions = []; - - // unicode_language_id = unicode_language_subtag - // (sep unicode_script_subtag)? - // (sep unicode_region_subtag)? - // (sep unicode_variant_subtag)* ; - // - // sep = "-" - // - // Note: Unicode CLDR locale identifier backward compatibility extensions - // removed from `unicode_language_id`. - - // unicode_language_subtag = alpha{2,3} | alpha{5,8} ; - if (ts.token !== ALPHA || ts.tokenLength === 1 || ts.tokenLength === 4 || ts.tokenLength > 8) { - // Four character language subtags are not allowed in Unicode BCP 47 - // locale identifiers. Also see the comparison to Unicode CLDR locale - // identifiers in <https://unicode.org/reports/tr35/#BCP_47_Conformance>. - return null; - } - assert((2 <= ts.tokenLength && ts.tokenLength <= 3) || - (5 <= ts.tokenLength && ts.tokenLength <= 8), - "language subtags have 2-3 or 5-8 letters"); - - SUBTAG_VAR_OR_RETURN_NULL(ts, language); - - // unicode_script_subtag = alpha{4} ; - if (ts.tokenLength === 4 && ts.token === ALPHA) { - SUBTAG_VAR_OR_RETURN_NULL(ts, script); - - // The first character of a script code needs to be capitalized. - // "hans" -> "Hans" - script = callFunction(std_String_toUpperCase, script[0]) + - Substring(script, 1, script.length - 1); - } - - // unicode_region_subtag = (alpha{2} | digit{3}) ; - if ((ts.tokenLength === 2 && ts.token === ALPHA) || - (ts.tokenLength === 3 && ts.token === DIGIT)) - { - SUBTAG_VAR_OR_RETURN_NULL(ts, region); - - // Region codes need to be in upper-case. "bu" -> "BU" - region = callFunction(std_String_toUpperCase, region); - } - - // unicode_variant_subtag = (alphanum{5,8} - // | digit alphanum{3}) ; - // - // alphanum = [0-9 A-Z a-z] ; - while ((5 <= ts.tokenLength && ts.tokenLength <= 8) || - (ts.tokenLength === 4 && callFunction(ts.isDigitAt, ts, 0))) - { - // Locale identifiers are case insensitive (UTS 35, section 3.2). - // All seen variants are compared ignoring case differences by - // using the lower-case form. This allows to properly detect and - // reject variant repetitions with differing case, e.g. - // "en-variant-Variant". - var variant; - SUBTAG_VAR_OR_RETURN_NULL(ts, variant); - - // Reject the Locale identifier if a duplicate variant was found. - // - // This linear-time verification step means the whole variant - // subtag checking is potentially quadratic, but we're okay doing - // that because language tags are unlikely to be deliberately - // pathological. - if (callFunction(ArrayIndexOf, variants, variant) !== -1) - return null; - _DefineDataProperty(variants, variants.length, variant); - } - - // extensions = unicode_locale_extensions - // | transformed_extensions - // | other_extensions ; - // - // unicode_locale_extensions = sep [uU] - // ((sep keyword)+ - // |(sep attribute)+ (sep keyword)*) ; - // - // transformed_extensions = sep [tT] - // ((sep tlang (sep tfield)*) - // |(sep tfield)+) ; - // - // other_extensions = [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ; - // - // keyword = key (sep type)? ; - // - // key = alphanum alpha ; - // - // type = alphanum{3,8} (sep alphanum{3,8})* ; - // - // attribute = alphanum{3,8} ; - // - // tlang = unicode_language_subtag - // (sep unicode_script_subtag)? - // (sep unicode_region_subtag)? - // (sep unicode_variant_subtag)* ; - // - // tfield = tkey tvalue; - // - // tkey = alpha digit ; - // - // tvalue = (sep alphanum{3,8})+ ; - var seenSingletons = []; - while (ts.tokenLength === 1) { - var singleton = callFunction(ts.singletonKey, ts); - if (singleton === LOWER_X) - break; - - // Locale identifiers are case insensitive (UTS 35, section 3.2). - // Ensure |singletonKey()| does not return the code unit of an - // upper-case character, so we can properly detect and reject - // singletons with different case, e.g. "en-u-foo-U-foo". - assert(!(UPPER_A <= singleton && singleton <= UPPER_Z), - "unexpected upper-case code unit"); - - // Reject the input if a duplicate singleton was found. - // - // Similar to the variant validation step this check is O(n**2), - // but given that there are only 35 possible singletons the - // quadratic runtime is negligible. - if (callFunction(ArrayIndexOf, seenSingletons, singleton) !== -1) - return null; - _DefineDataProperty(seenSingletons, seenSingletons.length, singleton); - - var extension; - if (singleton === LOWER_U) { - var extensionStart = ts.tokenStart; - NEXT_TOKEN_OR_RETURN_NULL(ts); - - while (2 <= ts.tokenLength && ts.tokenLength <= 8) { - // `key` doesn't allow a digit as its second character. - if (ts.tokenLength === 2 && callFunction(ts.isDigitAt, ts, 1)) - return null; - NEXT_TOKEN_OR_RETURN_NULL(ts); - } - extension = callFunction(ts.singletonValueAt, ts, extensionStart); - } else if (singleton === LOWER_T) { - var extensionStart = ts.tokenStart; - NEXT_TOKEN_OR_RETURN_NULL(ts); - - // `tfield` starts with `tkey`, which in turn is `alpha digit`, so - // an alpha-only token must be a `tlang`. - if (ts.token === ALPHA) { - // `unicode_language_subtag` - if (ts.tokenLength === 1 || ts.tokenLength === 4 || ts.tokenLength > 8) - return null; - NEXT_TOKEN_OR_RETURN_NULL(ts); - - // `unicode_script_subtag` (optional) - if (ts.tokenLength === 4 && ts.token === ALPHA) { - NEXT_TOKEN_OR_RETURN_NULL(ts); - } - - // `unicode_region_subtag` (optional) - if ((ts.tokenLength === 2 && ts.token === ALPHA) || - (ts.tokenLength === 3 && ts.token === DIGIT)) - { - NEXT_TOKEN_OR_RETURN_NULL(ts); - } - - // `unicode_variant_subtag` (optional) - while ((5 <= ts.tokenLength && ts.tokenLength <= 8) || - (ts.tokenLength === 4 && callFunction(ts.isDigitAt, ts, 0))) - { - NEXT_TOKEN_OR_RETURN_NULL(ts); - } - } - - // Trailing `tfield` subtags. - while (ts.tokenLength === 2) { - // `tkey` is `alpha digit`. - if (callFunction(ts.isDigitAt, ts, 0) || - !callFunction(ts.isDigitAt, ts, 1)) - { - return null; - } - NEXT_TOKEN_OR_RETURN_NULL(ts); - - // `tfield` requires at least one `tvalue`. - if (!(3 <= ts.tokenLength && ts.tokenLength <= 8)) - return null; - do { - NEXT_TOKEN_OR_RETURN_NULL(ts); - } while (3 <= ts.tokenLength && ts.tokenLength <= 8); - } - extension = callFunction(ts.singletonValueAt, ts, extensionStart); - } else { - extension = callFunction(ts.singletonValue, ts); - } - if (!extension) - return null; - - _DefineDataProperty(extensions, extensions.length, extension); - } - - // Trailing pu_extensions component of the unicode_locale_id production. - // - // pu_extensions = sep [xX] (sep alphanum{1,8})+ ; - if (ts.tokenLength === 1 && callFunction(ts.singletonKey, ts) === LOWER_X) { - privateuse = callFunction(ts.singletonValue, ts); - if (!privateuse) - return null; - } - - // Reject the input if it couldn't be parsed completely. - if (ts.token !== NONE) - return null; - - var tagObj = { - language, - script, - region, - variants, - extensions, - privateuse, - }; - - // Handle grandfathered tags right away, so we don't need to have extra - // paths for grandfathered tags later on. - // - // grandfathered = "art-lojban" ; non-redundant tags registered - // / "cel-gaulish" ; during the RFC 3066 era - // / "zh-guoyu" ; these tags match the 'langtag' - // / "zh-hakka" ; production, but their subtags - // / "zh-xiang" ; are not extended language - // ; or variant subtags: their meaning - // ; is defined by their registration - // ; and all of these are deprecated - // ; in favor of a more modern - // ; subtag or sequence of subtags - if (hasOwn(ts.localeLowercase, grandfatheredMappings)) - updateGrandfatheredMappings(tagObj); - - // Return if the complete input was successfully parsed. - return tagObj; -} - -/** - * Return the locale and fields components of the given valid Transform - * extension subtag. - */ -function TransformExtensionComponents(extension) { - assert(typeof extension === "string", "extension is a String value"); - assert(callFunction(std_String_startsWith, extension, "t-"), - "extension starts with 't-'"); - - var ts = new BCP47TokenStream(Substring(extension, 2, extension.length - 2)); - NEXT_TOKEN_OR_ASSERT(ts); - - // `tfield` starts with `tkey`, which in turn is `alpha digit`, so - // an alpha-only token must be a `tlang`. - var localeObj; - if (ts.token === ALPHA) { - // `unicode_language_subtag` - assert((2 <= ts.tokenLength && ts.tokenLength <= 3) || - (5 <= ts.tokenLength && ts.tokenLength <= 8), - "language subtags have 2-3 or 5-8 letters"); - - var language; - SUBTAG_VAR_OR_ASSERT(ts, language); - - // unicode_script_subtag = alpha{4} ; - var script; - if (ts.tokenLength === 4 && ts.token === ALPHA) { - SUBTAG_VAR_OR_ASSERT(ts, script); - - // The first character of a script code needs to be capitalized. - // "hans" -> "Hans" - script = callFunction(std_String_toUpperCase, script[0]) + - Substring(script, 1, script.length - 1); - } - - // unicode_region_subtag = (alpha{2} | digit{3}) ; - var region; - if ((ts.tokenLength === 2 && ts.token === ALPHA) || - (ts.tokenLength === 3 && ts.token === DIGIT)) - { - SUBTAG_VAR_OR_ASSERT(ts, region); - - // Region codes need to be in upper-case. "bu" -> "BU" - region = callFunction(std_String_toUpperCase, region); - } - - // unicode_variant_subtag = (alphanum{5,8} - // | digit alphanum{3}) ; - // - // alphanum = [0-9 A-Z a-z] ; - var variants = []; - while ((5 <= ts.tokenLength && ts.tokenLength <= 8) || - (ts.tokenLength === 4 && callFunction(ts.isDigitAt, ts, 0))) - { - var variant; - SUBTAG_VAR_OR_ASSERT(ts, variant); - - _DefineDataProperty(variants, variants.length, variant); - } - - localeObj = { - language, - script, - region, - variants, - extensions: [], - privateuse: undefined, - }; - } - - // Trailing `tfield` subtags. (Any other trailing subtags are an error, - // because we're guaranteed to only see a valid tranform extension here.) - var fields = []; - while (ts.tokenLength === 2) { - // `tkey` is `alpha digit`. - assert(!callFunction(ts.isDigitAt, ts, 0) && callFunction(ts.isDigitAt, ts, 1), - "unexpected invalid tkey subtag"); - - var key; - SUBTAG_VAR_OR_ASSERT(ts, key); - - // `tfield` requires at least one `tvalue`. - assert(3 <= ts.tokenLength && ts.tokenLength <= 8, - "unexpected invalid tvalue subtag"); - - var value; - SUBTAG_VAR_OR_ASSERT(ts, value); - - while (3 <= ts.tokenLength && ts.tokenLength <= 8) { - var part; - SUBTAG_VAR_OR_ASSERT(ts, part); - value += "-" + part; - } - - _DefineDataProperty(fields, fields.length, {key, value}); - } - - assert(ts.token === NONE, - "unexpected trailing characters in promised-to-be-valid transform extension"); - - return {locale: localeObj, fields}; -} -/* eslint-enable complexity */ - -#undef NONE -#undef ALPHA -#undef DIGIT - -#undef HYPHEN -#undef DIGIT_ZERO -#undef DIGIT_NINE -#undef UPPER_A -#undef UPPER_Z -#undef LOWER_A -#undef LOWER_T -#undef LOWER_U -#undef LOWER_X -#undef LOWER_Z - -#undef SUBTAG_VAR_OR_ASSERT -#undef SUBTAG_VAR_OR_RETURN_NULL -#undef NEXT_TOKEN_OR_ASSERT -#undef NEXT_TOKEN_OR_RETURN_NULL - -/** - * Verifies that the given string is a well-formed BCP 47 language tag - * with no duplicate variant or singleton subtags. - * - * Spec: ECMAScript Internationalization API Specification, 6.2.2. - */ -function IsStructurallyValidLanguageTag(locale) { - return parseLanguageTag(locale) !== null; -} - -/** - * Canonicalizes the given structurally valid Unicode BCP 47 locale identifier, - * including regularized case of subtags. For example, the language tag - * Zh-haNS-bu-variant2-Variant1-u-ca-chinese-t-Zh-laTN-x-PRIVATE, where - * - * Zh ; 2*3ALPHA - * -haNS ; ["-" script] - * -bu ; ["-" region] - * -variant2 ; *("-" variant) - * -Variant1 - * -u-ca-chinese ; *("-" extension) - * -t-Zh-laTN - * -x-PRIVATE ; ["-" privateuse] - * - * becomes zh-Hans-MM-variant1-variant2-t-zh-latn-u-ca-chinese-x-private - * - * UTS 35 specifies two different canonicalization algorithms. There's one to - * canonicalize BCP 47 language tags and other one to canonicalize Unicode - * locale identifiers. The latter one wasn't present when ECMA-402 was changed - * to use Unicode BCP 47 locale identifiers instead of BCP 47 language tags, so - * ECMA-402 currently only uses the former to canonicalize Unicode BCP 47 locale - * identifiers. - * - * Spec: ECMAScript Internationalization API Specification, 6.2.3. - * Spec: https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers - * Spec: https://unicode.org/reports/tr35/#BCP_47_Language_Tag_Conversion - */ -function CanonicalizeLanguageTagObject(localeObj) { - assert(IsObject(localeObj), "CanonicalizeLanguageTagObject"); - - // Per UTS 35, 3.3.1, the very first step is to canonicalize the syntax by - // normalizing the case and ordering all subtags. The canonical syntax form - // itself is specified in UTS 35, 3.2.1. - - // The parser already normalized the case for all subtags. - -#ifdef DEBUG - function IsLowerCase(s) { - return s === callFunction(std_String_toLowerCase, s); - } - function IsUpperCase(s) { - return s === callFunction(std_String_toUpperCase, s); - } - function IsTitleCase(s) { - assert(s.length > 0, "unexpected empy string"); - var r = callFunction(std_String_toUpperCase, s[0]) + - callFunction(std_String_toLowerCase, Substring(s, 1, s.length - 1)); - return s === r; - } -#endif - - // 1. Any script subtag is in title case. - assert(localeObj.script === undefined || IsTitleCase(localeObj.script), - "If present, script subtag is in title case"); - - // 2. Any region subtag is in uppercase. - assert(localeObj.region === undefined || IsUpperCase(localeObj.region), - "If present, region subtag is in upper case"); - - // 3. All other subtags are in lowercase. - assert(IsLowerCase(localeObj.language), - "language subtag is in lower case"); - assert(callFunction(ArrayEvery, localeObj.variants, IsLowerCase), - "variant subtags are in lower case"); - assert(callFunction(ArrayEvery, localeObj.extensions, IsLowerCase), - "extension subtags are in lower case"); - assert(localeObj.privateuse === undefined || IsLowerCase(localeObj.privateuse), - "If present, privateuse subtag is in lower case"); - - - // The second step in UTS 35, 3.2.1, is to order all subtags. - - // 1. Any variants are in alphabetical order. - var variants = localeObj.variants; - if (variants.length > 0) { - callFunction(ArraySort, variants); - } - - // 2. Any extensions are in alphabetical order by their singleton. - var extensions = localeObj.extensions; - if (extensions.length > 0) { - // Extension sequences are sorted by their singleton characters. - // "u-ca-chinese-t-zh-latn" -> "t-zh-latn-u-ca-chinese" - callFunction(ArraySort, extensions); - - // The last three bullet points in UTS 35, 3.2.1 apply only to Unicode and Transform - // extensions. - // - // 3. All attributes are sorted in alphabetical order. - // - // 4. All keywords and tfields are sorted by alphabetical order of their - // keys, within their respective extensions. - // - // 5. Any type or tfield value "true" is removed. - - for (var i = 0; i < extensions.length; i++) { - var ext = extensions[i]; - assert(IsLowerCase(ext), - "extension subtags must be in lower-case"); - assert(ext[1] === "-", - "extension subtags start with a singleton"); - - // Canonicalize Unicode locale extension subtag if present. - if (ext[0] === "u") { - var {attributes, keywords} = UnicodeExtensionComponents(ext); - extensions[i] = CanonicalizeUnicodeExtension(attributes, keywords, false); - } - - // Canonicalize Unicode BCP 47 T extension if present. - if (ext[0] === "t") { - var {locale, fields} = TransformExtensionComponents(ext); - extensions[i] = CanonicalizeTransformExtension(locale, fields); - } - } - } - - // The next two steps in 3.3.1 replace deprecated language and region - // subtags with their preferred mappings. - updateLocaleIdMappings(localeObj); - - // The two final steps in 3.3.1, handling irregular grandfathered and - // private-use only language tags, don't apply, because these two forms - // can't occur in Unicode BCP 47 locale identifiers. -} - -/** - * Intl.Locale proposal - * - * UnicodeExtensionComponents( extension ) - * - * Returns the components of |extension| where |extension| is a "Unicode locale - * extension sequence" (ECMA-402, 6.2.1) without the starting separator - * character. - */ -function UnicodeExtensionComponents(extension) { - assert(typeof extension === "string", "extension is a String value"); - - // Step 1. - var attributes = []; - - // Step 2. - var keywords = []; - - // Step 3. - var isKeyword = false; - - // Step 4. - var size = extension.length; - - // Step 5. - // |extension| starts with "u-" instead of "-u-" in our implementation, so - // we need to initialize |k| with 2 instead of 3. - assert(callFunction(std_String_startsWith, extension, "u-"), - "extension starts with 'u-'"); - var k = 2; - - // Step 6. - var key, value; - while (k < size) { - // Step 6.a. - var e = callFunction(std_String_indexOf, extension, "-", k); - - // Step 6.b. - var len = (e < 0 ? size : e) - k; - - // Step 6.c. - var subtag = Substring(extension, k, len); - - // Steps 6.d-e. - if (!isKeyword) { - // Step 6.d. - // NB: Duplicates are handled elsewhere in our implementation. - if (len !== 2) - _DefineDataProperty(attributes, attributes.length, subtag); - } else { - // Steps 6.e.i-ii. - if (len === 2) { - // Step 6.e.i.1. - // NB: Duplicates are handled elsewhere in our implementation. - _DefineDataProperty(keywords, keywords.length, {key, value}); - } else { - // Step 6.e.ii.1. - if (value !== "") - value += "-"; - - // Step 6.e.ii.2. - value += subtag; - } - } - - // Step 6.f. - if (len === 2) { - // Step 6.f.i. - isKeyword = true; - - // Step 6.f.ii. - key = subtag; - - // Step 6.f.iii. - value = ""; - } - - // Step 6.g. - k += len + 1; - } - - // Step 7. - if (isKeyword) { - // Step 7.a. - // NB: Duplicates are handled elsewhere in our implementation. - _DefineDataProperty(keywords, keywords.length, {key, value}); - } - - // Step 8. - return {attributes, keywords}; -} - -/** - * CanonicalizeUnicodeExtension( attributes, keywords ) - * - * Canonical syntax per <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>: - * - * - All attributes and keywords are in lowercase. - * - Note: The parser already converted keywords to lowercase. - * - All attributes are sorted in alphabetical order. - * - All keywords are sorted by alphabetical order of their keys. - * - Any type value "true" is removed. - * - * Canonical form: - * - All keys and types use the canonical form (from the name attribute; - * see Section 3.6.4 U Extension Data Files). - */ -function CanonicalizeUnicodeExtension(attributes, keywords, canonicalForm) { - assert(attributes.length > 0 || keywords.length > 0, - "unexpected empty Unicode locale extension components"); - - // All attributes are sorted in alphabetical order. - if (attributes.length > 1) - callFunction(ArraySort, attributes); - - // All keywords are sorted by alphabetical order of keys. - if (keywords.length > 1) { - function UnicodeKeySort(left, right) { - var leftKey = left.key; - var rightKey = right.key; - assert(leftKey.length === 2, "left key is a Unicode key"); - assert(rightKey.length === 2, "right key is a Unicode key"); - - // Compare both strings using charCodeAt(), because relational - // string comparison always calls into the VM, whereas charCodeAt - // can be inlined by Ion. - var diff = callFunction(std_String_charCodeAt, leftKey, 0) - - callFunction(std_String_charCodeAt, rightKey, 0); - if (diff === 0) { - diff = callFunction(std_String_charCodeAt, leftKey, 1) - - callFunction(std_String_charCodeAt, rightKey, 1); - } - return diff; - } - - callFunction(ArraySort, keywords, UnicodeKeySort); - } - - var extension = "u"; - - // Append all attributes. - for (var i = 0; i < attributes.length; i++) { - var attribute = attributes[i]; - assert(attribute === callFunction(std_String_toLowerCase, attribute), - "Attributes are already canonicalized to lower case"); - - // UnicodeExtensionComponents ignores duplicate attributes. - if (canonicalForm && i > 0 && attributes[i - 1] === attribute) { - continue; - } - - extension += "-" + attributes[i]; - } - - // Append all keywords. - for (var i = 0; i < keywords.length; i++) { - var {key, value} = keywords[i]; - assert(key === callFunction(std_String_toLowerCase, key) && - value === callFunction(std_String_toLowerCase, value), - "Keywords are already canonicalized to lower case"); - - - // UnicodeExtensionComponents ignores duplicate keys. - if (canonicalForm && i > 0 && keywords[i - 1].key === key) { - continue; - } - - extension += "-" + key; - - if (canonicalForm && - hasOwn(key, deprecatedUnicodeExtensionTypes) && - hasOwn(value, deprecatedUnicodeExtensionTypes[key])) - { - value = deprecatedUnicodeExtensionTypes[key][value]; - assert(value === callFunction(std_String_toLowerCase, value), - "Preferred keyword value is already in lower case"); - } - - // Type value "true" is removed. - if (value !== "" && value !== "true") - extension += "-" + value; - } - - return extension; -} - -/** - * CanonicalizeTransformExtension - * - * Canonical form per <https://unicode.org/reports/tr35/#BCP47_T_Extension>: - * - * - These subtags are all in lowercase (that is the canonical casing for these - * subtags), [...]. - * - * And per <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>: - * - * - All keywords and tfields are sorted by alphabetical order of their keys, - * within their respective extensions. - */ -function CanonicalizeTransformExtension(localeObj, fields) { - assert(localeObj !== undefined || fields.length > 0, - "unexpected empty Transform locale extension components"); - - if (fields.length > 0) { - function TransformKeySort(left, right) { - var leftKey = left.key; - var rightKey = right.key; - assert(leftKey.length === 2, "left key is a Transform key"); - assert(rightKey.length === 2, "right key is a Transform key"); - - // Compare both strings using charCodeAt(), because relational - // string comparison always calls into the VM, whereas charCodeAt - // can be inlined by Ion. - var diff = callFunction(std_String_charCodeAt, leftKey, 0) - - callFunction(std_String_charCodeAt, rightKey, 0); - if (diff === 0) { - diff = callFunction(std_String_charCodeAt, leftKey, 1) - - callFunction(std_String_charCodeAt, rightKey, 1); - } - return diff; - } - - callFunction(ArraySort, fields, TransformKeySort); - } - - var extension = "t"; - - // Append the language subtag if present. - if (localeObj !== undefined) { - // [1] is a bit unclear whether or not the `tlang` subtag also needs - // to be canonicalized (and case-adjusted). For now simply append it as - // is and change it to all lower-case. If we switch to [2], the `tlang` - // subtag also needs to be canonicalized according to the same rules as - // `unicode_language_id` subtags are canonicalized. Also see [3]. - // - // [1] https://unicode.org/reports/tr35/#Language_Tag_to_Locale_Identifier - // [2] https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers - // [3] https://github.com/tc39/ecma402/issues/330 - var localeStr = StringFromLanguageTagObject(localeObj); - extension += "-" + callFunction(std_String_toLowerCase, localeStr); - } - - // Append all fields. - for (var i = 0; i < fields.length; i++) { - // UTS 35, 3.2.1 specifies: - // - Any type or tfield value "true" is removed. - // - // But the `tvalue` subtag is mandatory in `tfield: tkey tvalue`, so - // ignore this apparently invalid part of the UTS 35 specification and - // simply append all `tfield` subtags. - var {key, value} = fields[i]; - extension += "-" + key + "-" + value; - } - - return extension; -} - -/** - * Canonicalizes the given structurally valid BCP 47 language tag, including - * regularized case of subtags. For example, the language tag - * Zh-haNS-bu-variant2-Variant1-u-ca-chinese-t-Zh-laTN-x-PRIVATE, where - * - * Zh ; 2*3ALPHA - * -haNS ; ["-" script] - * -bu ; ["-" region] - * -variant2 ; *("-" variant) - * -Variant1 - * -u-ca-chinese ; *("-" extension) - * -t-Zh-laTN - * -x-PRIVATE ; ["-" privateuse] - * - * becomes zh-Hans-MM-variant1-variant2-t-zh-latn-u-ca-chinese-x-private - * - * Spec: ECMAScript Internationalization API Specification, 6.2.3. - */ -function CanonicalizeLanguageTag(locale) { - var localeObj = parseLanguageTag(locale); - assert(localeObj !== null, "CanonicalizeLanguageTag"); - - CanonicalizeLanguageTagObject(localeObj); - - return StringFromLanguageTagObject(localeObj); -} - -/** - * Returns the string representation of the given language tag object. - */ -function StringFromLanguageTagObject(localeObj) { - assert(IsObject(localeObj), "StringFromLanguageTagObject"); - - var { - language, - script, - region, - variants, - extensions, - privateuse, - } = localeObj; - - var canonical = language; - - if (script !== undefined) - canonical += "-" + script; - - if (region !== undefined) - canonical += "-" + region; - - if (variants.length > 0) - canonical += "-" + callFunction(std_Array_join, variants, "-"); - - if (extensions.length > 0) - canonical += "-" + callFunction(std_Array_join, extensions, "-"); - - if (privateuse !== undefined) - canonical += "-" + privateuse; - - return canonical; -} - /** * Returns true if the input contains only ASCII alphabetical characters. */ function IsASCIIAlphaString(s) { assert(typeof s === "string", "IsASCIIAlphaString"); for (var i = 0; i < s.length; i++) { var c = callFunction(std_String_charCodeAt, s, i); if (!((0x41 <= c && c <= 0x5A) || (0x61 <= c && c <= 0x7A))) return false; } return true; } -/** - * Validates and canonicalizes the given language tag. - */ -function ValidateAndCanonicalizeLanguageTag(locale) { - assert(typeof locale === "string", "ValidateAndCanonicalizeLanguageTag"); - - // Handle the common case (a standalone language) first. - // Only the following Unicode BCP 47 locale identifier subset is accepted: - // unicode_locale_id = unicode_language_id - // unicode_language_id = unicode_language_subtag - // unicode_language_subtag = alpha{2,3} - if (locale.length === 2 || locale.length === 3) { - if (!IsASCIIAlphaString(locale)) - ThrowRangeError(JSMSG_INVALID_LANGUAGE_TAG, locale); - assert(IsStructurallyValidLanguageTag(locale), "2*3ALPHA is a valid language tag"); - - // The language subtag is canonicalized to lower case. - locale = callFunction(std_String_toLowerCase, locale); - - // updateLocaleIdMappings may modify tags containing only |language| - // subtags, if the language is in |complexLanguageMappings|, so we need - // to handle that case first. - if (!hasOwn(locale, complexLanguageMappings)) { - // Replace deprecated subtags with their preferred values. - locale = hasOwn(locale, languageMappings) - ? languageMappings[locale] - : locale; - assert(locale === CanonicalizeLanguageTag(locale), "expected same canonicalization"); - - return locale; - } - } - - var localeObj = parseLanguageTag(locale); - if (localeObj === null) - ThrowRangeError(JSMSG_INVALID_LANGUAGE_TAG, locale); - - CanonicalizeLanguageTagObject(localeObj) - - return StringFromLanguageTagObject(localeObj); -} - // The last-ditch locale is used if none of the available locales satisfies a // request. "en-GB" is used based on the assumptions that English is the most // common second language, that both en-GB and en-US are normally available in // an implementation, and that en-GB is more representative of the English used // in other locales. function lastDitchLocale() { // Per bug 1177929, strings don't clone out of self-hosted code as atoms, // breaking IonBuilder::constant. Put this in a function for now. @@ -1242,24 +172,20 @@ var localeCache = { */ function DefaultLocaleIgnoringAvailableLocales() { const runtimeDefaultLocale = RuntimeDefaultLocale(); if (runtimeDefaultLocale === localeCandidateCache.runtimeDefaultLocale) return localeCandidateCache.candidateDefaultLocale; // If we didn't get a cache hit, compute the candidate default locale and // cache it. Fall back on the last-ditch locale when necessary. - var candidate = parseLanguageTag(runtimeDefaultLocale); + var candidate = intl_TryValidateAndCanonicalizeLanguageTag(runtimeDefaultLocale); if (candidate === null) { candidate = lastDitchLocale(); } else { - CanonicalizeLanguageTagObject(candidate); - - candidate = StringFromLanguageTagObject(candidate); - // The default locale must be in [[availableLocales]], and that list // must not contain any locales with Unicode extension sequences, so // remove any present in the candidate. candidate = removeUnicodeExtensions(candidate); if (hasOwn(candidate, oldStyleLanguageTagMappings)) candidate = oldStyleLanguageTagMappings[candidate]; } @@ -1345,22 +271,22 @@ function addSpecialMissingLanguageTags(a * Spec: ECMAScript Internationalization API Specification, 9.2.1. */ function CanonicalizeLocaleList(locales) { // Step 1. if (locales === undefined) return []; // Step 3 (and the remaining steps). - if (typeof locales === "string") - return [ValidateAndCanonicalizeLanguageTag(locales)]; - - var unboxedLocale = LocaleToStringOrNull(locales); - if (unboxedLocale !== null) - return [unboxedLocale]; + var tag = intl_ValidateAndCanonicalizeLanguageTag(locales, false); + if (tag !== null) { + assert(typeof tag === "string", + "intl_ValidateAndCanonicalizeLanguageTag returns a string value"); + return [tag]; + } // Step 2. var seen = []; // Step 4. var O = ToObject(locales); // Step 5. @@ -1376,20 +302,19 @@ function CanonicalizeLocaleList(locales) // Step 7.c.i. var kValue = O[k]; // Step 7.c.ii. if (!(typeof kValue === "string" || IsObject(kValue))) ThrowTypeError(JSMSG_INVALID_LOCALES_ELEMENT); // Steps 7.c.iii-iv. - var unboxedLocale = LocaleToStringOrNull(kValue); - var tag = unboxedLocale !== null - ? unboxedLocale - : ValidateAndCanonicalizeLanguageTag(ToString(kValue)); + var tag = intl_ValidateAndCanonicalizeLanguageTag(kValue, true); + assert(typeof tag === "string", + "ValidateAndCanonicalizeLanguageTag returns a string value"); // Step 7.c.v. if (callFunction(ArrayIndexOf, seen, tag) === -1) _DefineDataProperty(seen, seen.length, tag); } // Step 7.d. k++;
--- a/js/src/builtin/intl/LanguageTag.cpp +++ b/js/src/builtin/intl/LanguageTag.cpp @@ -1588,10 +1588,93 @@ bool ParseStandaloneRegionTag(HandleLine return false; } result.set(str->twoByteRange(nogc)); } result.toUpperCase(); return true; } +template <typename CharT> +static bool IsAsciiLowercaseAlpha(const mozilla::Range<const CharT>& range) { + // Tell the analysis the |std::all_of| function can't GC. + JS::AutoSuppressGCAnalysis nogc; + + const CharT* ptr = range.begin().get(); + size_t length = range.length(); + return std::all_of(ptr, ptr + length, mozilla::IsAsciiLowercaseAlpha<CharT>); +} + +static bool IsAsciiLowercaseAlpha(JSLinearString* str) { + JS::AutoCheckCannotGC nogc; + return str->hasLatin1Chars() ? IsAsciiLowercaseAlpha(str->latin1Range(nogc)) + : IsAsciiLowercaseAlpha(str->twoByteRange(nogc)); +} + +template <typename CharT> +static bool IsAsciiAlpha(const mozilla::Range<const CharT>& range) { + // Tell the analysis the |std::all_of| function can't GC. + JS::AutoSuppressGCAnalysis nogc; + + const CharT* ptr = range.begin().get(); + size_t length = range.length(); + return std::all_of(ptr, ptr + length, mozilla::IsAsciiAlpha<CharT>); +} + +static bool IsAsciiAlpha(JSLinearString* str) { + JS::AutoCheckCannotGC nogc; + return str->hasLatin1Chars() ? IsAsciiAlpha(str->latin1Range(nogc)) + : IsAsciiAlpha(str->twoByteRange(nogc)); +} + +JS::Result<JSString*> ParseStandaloneISO639LanguageTag(JSContext* cx, + HandleLinearString str) { + // ISO-639 language codes contain either two or three characters. + size_t length = str->length(); + if (length != 2 && length != 3) { + return nullptr; + } + + // We can directly the return the input below if it's in the correct case. + bool isLowerCase = IsAsciiLowercaseAlpha(str); + if (!isLowerCase) { + // Must be an ASCII alpha string. + if (!IsAsciiAlpha(str)) { + return nullptr; + } + } + + LanguageSubtag languageTag; + if (str->hasLatin1Chars()) { + JS::AutoCheckCannotGC nogc; + languageTag.set(str->latin1Range(nogc)); + } else { + JS::AutoCheckCannotGC nogc; + languageTag.set(str->twoByteRange(nogc)); + } + + if (!isLowerCase) { + // The language subtag is canonicalized to lower case. + languageTag.toLowerCase(); + } + + // Reject the input if the canonical tag contains more than just a single + // language subtag. + if (LanguageTag::complexLanguageMapping(languageTag)) { + return nullptr; + } + + // Take care to replace deprecated subtags with their preferred values. + JSString* result; + if (LanguageTag::languageMapping(languageTag) || !isLowerCase) { + auto range = languageTag.range(); + result = NewStringCopyN<CanGC>(cx, range.begin().get(), range.length()); + } else { + result = str; + } + if (!result) { + return cx->alreadyReportedOOM(); + } + return result; +} + } // namespace intl } // namespace js
--- a/js/src/builtin/intl/LanguageTag.h +++ b/js/src/builtin/intl/LanguageTag.h @@ -702,13 +702,21 @@ MOZ_MUST_USE bool ParseStandaloneScriptT /** * Parse a string as a standalone |region| tag. If |str| is a standalone region * tag, store it in case-normalized form in |result| and return true. Otherwise * return false. */ MOZ_MUST_USE bool ParseStandaloneRegionTag(JS::Handle<JSLinearString*> str, RegionSubtag& result); +/** + * Parse a string as an ISO-639 language code. Return |nullptr| in the result if + * the input could not be parsed or the canonical form of the resulting language + * tag contains more than a single language subtag. + */ +JS::Result<JSString*> ParseStandaloneISO639LanguageTag( + JSContext* cx, JS::Handle<JSLinearString*> str); + } // namespace intl } // namespace js #endif /* builtin_intl_LanguageTag_h */
--- a/js/src/builtin/intl/Locale.cpp +++ b/js/src/builtin/intl/Locale.cpp @@ -482,16 +482,39 @@ static bool ApplyUnicodeExtensionToTag(J // Insert the new Unicode extension string into the language tag. UniqueChars newExtensionChars(newExtension.extractOrCopyRawBuffer()); if (!newExtensionChars) { return false; } return tag.setUnicodeExtension(std::move(newExtensionChars)); } +static JS::Result<JSString*> LanguageTagFromMaybeWrappedLocale(JSContext* cx, + JSObject* obj) { + if (obj->is<LocaleObject>()) { + return obj->as<LocaleObject>().languageTag(); + } + + JSObject* unwrapped = CheckedUnwrapStatic(obj); + if (!unwrapped) { + ReportAccessDenied(cx); + return cx->alreadyReportedError(); + } + + if (!unwrapped->is<LocaleObject>()) { + return nullptr; + } + + RootedString tagStr(cx, unwrapped->as<LocaleObject>().languageTag()); + if (!cx->compartment()->wrap(cx, &tagStr)) { + return cx->alreadyReportedError(); + } + return tagStr.get(); +} + /** * Intl.Locale( tag[, options] ) */ static bool Locale(JSContext* cx, unsigned argc, Value* vp) { CallArgs args = CallArgsFromVp(argc, vp); // Step 1. if (!ThrowIfNotConstructing(cx, args, "Intl.Locale")) { @@ -499,52 +522,37 @@ static bool Locale(JSContext* cx, unsign } // Steps 2-6 (Inlined 9.1.14, OrdinaryCreateFromConstructor). RootedObject proto(cx); if (!GetPrototypeFromBuiltinConstructor(cx, args, JSProto_Null, &proto)) { return false; } - // Step 7. - if (args.length() == 0 || (!args[0].isString() && !args[0].isObject())) { + // Steps 7-9. + HandleValue tagValue = args.get(0); + JSString* tagStr; + if (tagValue.isObject()) { + JS_TRY_VAR_OR_RETURN_FALSE( + cx, tagStr, + LanguageTagFromMaybeWrappedLocale(cx, &tagValue.toObject())); + if (!tagStr) { + tagStr = ToString(cx, tagValue); + if (!tagStr) { + return false; + } + } + } else if (tagValue.isString()) { + tagStr = tagValue.toString(); + } else { JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, JSMSG_INVALID_LOCALES_ELEMENT); return false; } - // Steps 8-9. - RootedString tagStr(cx); - if (args[0].isObject()) { - JSObject* obj = &args[0].toObject(); - if (obj->is<LocaleObject>()) { - tagStr = obj->as<LocaleObject>().languageTag(); - } else { - JSObject* unwrapped = CheckedUnwrapStatic(obj); - if (!unwrapped) { - ReportAccessDenied(cx); - return false; - } - - if (unwrapped->is<LocaleObject>()) { - tagStr = unwrapped->as<LocaleObject>().languageTag(); - if (!cx->compartment()->wrap(cx, &tagStr)) { - return false; - } - } else { - tagStr = ToString(cx, args[0]); - if (!tagStr) { - return false; - } - } - } - } else { - tagStr = args[0].toString(); - } - RootedLinearString tagLinearStr(cx, tagStr->ensureLinear(cx)); if (!tagLinearStr) { return false; } // ApplyOptionsToTag, steps 2 and 9. LanguageTag tag(cx); if (!LanguageTagParser::parse(cx, tagLinearStr, tag)) { @@ -1280,8 +1288,115 @@ JSObject* js::CreateLocalePrototype(JSCo global->setReservedSlot(LOCALE_PROTO, ObjectValue(*localeProto)); return true; } bool js::AddLocaleConstructor(JSContext* cx, JS::Handle<JSObject*> intl) { return GlobalObject::addLocaleConstructor(cx, intl); } + +bool js::intl_ValidateAndCanonicalizeLanguageTag(JSContext* cx, unsigned argc, + Value* vp) { + CallArgs args = CallArgsFromVp(argc, vp); + MOZ_ASSERT(args.length() == 2); + + HandleValue tagValue = args[0]; + bool applyToString = args[1].toBoolean(); + + if (tagValue.isObject()) { + JSString* tagStr; + JS_TRY_VAR_OR_RETURN_FALSE( + cx, tagStr, + LanguageTagFromMaybeWrappedLocale(cx, &tagValue.toObject())); + if (tagStr) { + args.rval().setString(tagStr); + return true; + } + } + + if (!applyToString && !tagValue.isString()) { + args.rval().setNull(); + return true; + } + + JSString* tagStr = ToString(cx, tagValue); + if (!tagStr) { + return false; + } + + RootedLinearString tagLinearStr(cx, tagStr->ensureLinear(cx)); + if (!tagLinearStr) { + return false; + } + + // Handle the common case (a standalone language) first. + // Only the following Unicode BCP 47 locale identifier subset is accepted: + // unicode_locale_id = unicode_language_id + // unicode_language_id = unicode_language_subtag + // unicode_language_subtag = alpha{2,3} + JSString* language; + JS_TRY_VAR_OR_RETURN_FALSE( + cx, language, intl::ParseStandaloneISO639LanguageTag(cx, tagLinearStr)); + if (language) { + args.rval().setString(language); + return true; + } + + LanguageTag tag(cx); + if (!LanguageTagParser::parse(cx, tagLinearStr, tag)) { + return false; + } + + if (!tag.canonicalize(cx, LanguageTag::UnicodeExtensionCanonicalForm::No)) { + return false; + } + + JSStringBuilder sb(cx); + if (!tag.appendTo(cx, sb)) { + return false; + } + + JSString* resultStr = sb.finishString(); + if (!resultStr) { + return false; + } + args.rval().setString(resultStr); + return true; +} + +bool js::intl_TryValidateAndCanonicalizeLanguageTag(JSContext* cx, + unsigned argc, Value* vp) { + CallArgs args = CallArgsFromVp(argc, vp); + MOZ_ASSERT(args.length() == 1); + + RootedLinearString linear(cx, args[0].toString()->ensureLinear(cx)); + if (!linear) { + return false; + } + + LanguageTag tag(cx); + bool ok; + JS_TRY_VAR_OR_RETURN_FALSE(cx, ok, + LanguageTagParser::tryParse(cx, linear, tag)); + + // The caller handles invalid inputs. + if (!ok) { + args.rval().setNull(); + return true; + } + + if (!tag.canonicalize(cx, LanguageTag::UnicodeExtensionCanonicalForm::No)) { + return false; + } + + JSStringBuilder sb(cx); + if (!tag.appendTo(cx, sb)) { + return false; + } + + JSString* resultStr = sb.finishString(); + if (!resultStr) { + return false; + } + args.rval().setString(resultStr); + return true; +}
--- a/js/src/builtin/intl/Locale.h +++ b/js/src/builtin/intl/Locale.h @@ -44,11 +44,18 @@ class LocaleObject : public NativeObject return getFixedSlot(UNICODE_EXTENSION_SLOT); } }; extern JSObject* CreateLocalePrototype(JSContext* cx, JS::Handle<JSObject*> Intl, JS::Handle<GlobalObject*> global); +extern MOZ_MUST_USE bool intl_ValidateAndCanonicalizeLanguageTag(JSContext* cx, + unsigned argc, + Value* vp); + +extern MOZ_MUST_USE bool intl_TryValidateAndCanonicalizeLanguageTag( + JSContext* cx, unsigned argc, Value* vp); + } // namespace js #endif /* builtin_intl_Locale_h */
--- a/js/src/vm/SelfHosting.cpp +++ b/js/src/vm/SelfHosting.cpp @@ -2075,45 +2075,16 @@ static bool intrinsic_ToNumeric(JSContex MOZ_ASSERT(args.length() == 1); if (!ToNumeric(cx, args[0])) { return false; } args.rval().set(args[0]); return true; } -static bool intrinsic_LocaleToStringOrNull(JSContext* cx, unsigned argc, - Value* vp) { - CallArgs args = CallArgsFromVp(argc, vp); - MOZ_ASSERT(args.length() == 1); - - if (!args[0].isObject()) { - args.rval().setNull(); - return true; - } - - JSObject* unwrapped = CheckedUnwrapStatic(&args[0].toObject()); - if (!unwrapped) { - ReportAccessDenied(cx); - return false; - } - - if (!unwrapped->is<LocaleObject>()) { - args.rval().setNull(); - return true; - } - - RootedString str(cx, unwrapped->as<LocaleObject>().languageTag()); - if (!cx->compartment()->wrap(cx, &str)) { - return false; - } - args.rval().setString(str); - return true; -} - // The self-hosting global isn't initialized with the normal set of builtins. // Instead, individual C++-implemented functions that're required by // self-hosted code are defined as global functions. Accessing these // functions via a content compartment's builtins would be unsafe, because // content script might have changed the builtins' prototypes' members. // Installing the whole set of builtins in the self-hosting compartment, OTOH, // would be wasteful: it increases memory usage and initialization time for // self-hosting compartment. @@ -2479,16 +2450,20 @@ static const JSFunctionSpec intrinsic_fu intl_PluralRules_availableLocales, 0, 0), JS_FN("intl_GetPluralCategories", intl_GetPluralCategories, 1, 0), JS_FN("intl_SelectPluralRule", intl_SelectPluralRule, 2, 0), JS_FN("intl_RelativeTimeFormat_availableLocales", intl_RelativeTimeFormat_availableLocales, 0, 0), JS_FN("intl_FormatRelativeTime", intl_FormatRelativeTime, 4, 0), JS_FN("intl_toLocaleLowerCase", intl_toLocaleLowerCase, 2, 0), JS_FN("intl_toLocaleUpperCase", intl_toLocaleUpperCase, 2, 0), + JS_FN("intl_ValidateAndCanonicalizeLanguageTag", + intl_ValidateAndCanonicalizeLanguageTag, 2, 0), + JS_FN("intl_TryValidateAndCanonicalizeLanguageTag", + intl_TryValidateAndCanonicalizeLanguageTag, 1, 0), JS_INLINABLE_FN("GuardToCollator", intrinsic_GuardToBuiltin<CollatorObject>, 1, 0, IntlGuardToCollator), JS_INLINABLE_FN("GuardToDateTimeFormat", intrinsic_GuardToBuiltin<DateTimeFormatObject>, 1, 0, IntlGuardToDateTimeFormat), JS_INLINABLE_FN("GuardToNumberFormat", intrinsic_GuardToBuiltin<NumberFormatObject>, 1, 0, @@ -2523,18 +2498,16 @@ static const JSFunctionSpec intrinsic_fu JS_FN("GetNumberFormatConstructor", intrinsic_GetBuiltinIntlConstructor< GlobalObject::getOrCreateNumberFormatConstructor>, 0, 0), JS_FN("RuntimeDefaultLocale", intrinsic_RuntimeDefaultLocale, 0, 0), JS_FN("IsRuntimeDefaultLocale", intrinsic_IsRuntimeDefaultLocale, 1, 0), #endif // ENABLE_INTL_API - JS_FN("LocaleToStringOrNull", intrinsic_LocaleToStringOrNull, 1, 0), - JS_FN("GetOwnPropertyDescriptorToArray", GetOwnPropertyDescriptorToArray, 2, 0), JS_INLINABLE_FN("IsRegExpObject", intrinsic_IsInstanceOfBuiltin<RegExpObject>, 1, 0, IsRegExpObject), JS_FN("CallRegExpMethodIfWrapped", CallNonGenericSelfhostedMethod<Is<RegExpObject>>, 2, 0),