Bug 1522070 - Part 5: Update comments to refer to Unicode BCP 47 locale identifiers. r=jwalden
author André Bargull <andre.bargull@gmail.com>
Tue, 09 Apr 2019 09:16:44 +0000
changeset 468536 7b0c2144242cbaadc9ce80e0f5bfe804bf58ff6a
parent 468535 c5a97d3424310716d3a849dfb95f1ec86f7eb783
child 468537 1245a50cc3a0f1ef77e9b38c7c618415434207f8
push id 112733
push user csabou@mozilla.com
push date Tue, 09 Apr 2019 16:30:22 +0000
treeherder mozilla-inbound@e14dba56bbfd [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers jwalden
bugs 1522070
milestone 68.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1522070 - Part 5: Update comments to refer to Unicode BCP 47 locale identifiers. r=jwalden - Strict parsing for "u" and "t" extensions is not yet implemented. - Canonicalisation per UTS 35 is also not yet implemented, so it still refers to BCP 47 tags. Differential Revision: https://phabricator.services.mozilla.com/D23540
js/src/builtin/intl/CommonFunctions.js
--- a/js/src/builtin/intl/CommonFunctions.js
+++ b/js/src/builtin/intl/CommonFunctions.js
@@ -19,18 +19,17 @@
 
 /**
  * Returns the start index of a "Unicode locale extension sequence", which the
  * specification defines as: "any substring of a language tag that starts with
  * a separator '-' and the singleton 'u' and includes the maximum sequence of
  * following non-singleton subtags and their preceding '-' separators."
  *
  * Alternatively, this may be defined as: the components of a language tag that
- * match the extension production in RFC 5646, where the singleton component is
- * "u".
+ * match the `unicode_locale_extensions` production in UTS 35.
  *
  * Spec: ECMAScript Internationalization API Specification, 6.2.1.
  */
 function startOfUnicodeExtensions(locale) {
     assert(typeof locale === "string", "locale is a string");
 
     // Search for "-u-" marking the start of a Unicode extension sequence.
     var start = callFunction(std_String_indexOf, locale, "-u-");
@@ -116,59 +115,50 @@ function getUnicodeExtensions(locale) {
     assert(start >= 0, "start of Unicode extension sequence not found");
     var end = endOfUnicodeExtensions(locale, start);
 
     return Substring(locale, start, end - start);
 }
 
 /* eslint-disable complexity */
 /**
- * Parser for BCP 47 language tags.
+ * Parser for Unicode BCP 47 locale identifiers.
  *
- * ---------------------------------------------------------------------------
- * The following features were removed because the spec was changed to use
- * Unicode BCP 47 locale identifier instead:
- * - extlang subtags
- * - irregular grandfathered language tags.
- * - regular grandfathered language tags with extlang-like subtags.
- * - privateuse-only language tags.
+ * ----------------------------------------------------------------------------
+ * | NB: While transitioning from BCP 47 language tags to Unicode BCP 47      |
+ * | locale identifiers, some parts of this parser may still follow RFC 5646. |
+ * ----------------------------------------------------------------------------
  *
- * The removed features may still be referenced in some comments. This will be
- * cleaned up when everything has been updated to follow the new specification.
- *
- * Ref: https://github.com/tc39/ecma402/pull/289
- * ---------------------------------------------------------------------------
- *
- * Returns null if |locale| can't be parsed as a Language-Tag. If the input is
- * a grandfathered language tag, the object
+ * Returns null if |locale| can't be parsed as a `unicode_locale_id`. If the
+ * input is a grandfathered language tag, the object
  *
  *   {
  *     locale: locale (normalized to canonical form),
  *     grandfathered: true,
  *   }
  *
  * is returned. Otherwise the returned object has the following structure:
  *
  *   {
- *     language: language subtag,
- *     script: script subtag / undefined,
- *     region: region subtag / undefined,
- *     variants: array of variant subtags,
- *     extensions: array of extension subtags,
- *     privateuse: privateuse subtag / undefined,
+ *     language: `unicode_language_subtag`,
+ *     script: `unicode_script_subtag` / undefined,
+ *     region: `unicode_region_subtag` / undefined,
+ *     variants: array of `unicode_variant_subtag`,
+ *     extensions: array of `extensions`,
+ *     privateuse: `pu_extensions` / undefined,
  *   }
  *
- * All language tag subtags are returned in their normalized case:
+ * All locale identifier subtags are returned in their normalized case:
  *
  *   var langtag = parseLanguageTag("en-latn-us");
  *   assertEq("en", langtag.language);
  *   assertEq("Latn", langtag.script);
  *   assertEq("US", langtag.region);
  *
- * Spec: RFC 5646 section 2.1.
+ * Spec: https://unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers
  */
 function parseLanguageTag(locale) {
     assert(typeof locale === "string", "locale is a string");
 
     // Current parse index in |locale|.
     var index = 0;
 
     // The three possible token type bits. Expressed as #defines to avoid
@@ -201,19 +191,19 @@ function parseLanguageTag(locale) {
            std_String_fromCharCode(LOWER_Z) === "z",
            "code unit constants should match the expected characters");
 
     // Reads the next token, returns |false| if an illegal character was
     // found, otherwise returns |true|.
     function nextToken() {
         var type = NONE;
         for (var i = index; i < locale.length; i++) {
-            // RFC 5234 section B.1
-            // ALPHA = %x41-5A / %x61-7A   ; A-Z / a-z
-            // DIGIT = %x30-39             ; 0-9
+            // UTS 35, section 3.1.
+            // alpha = [A-Z a-z] ;
+            // digit = [0-9] ;
             var c = callFunction(std_String_charCodeAt, locale, i);
             if ((UPPER_A <= c && c <= UPPER_Z) || (LOWER_A <= c && c <= LOWER_Z))
                 type |= ALPHA;
             else if (DIGIT_ZERO <= c && c <= DIGIT_NINE)
                 type |= DIGIT;
             else if (c === HYPHEN && i > index && i + 1 < locale.length)
                 break;
             else
@@ -222,17 +212,17 @@ function parseLanguageTag(locale) {
 
         token = type;
         tokenStart = index;
         tokenLength = i - index;
         index = i + 1;
         return true;
     }
 
-    // Language tags are compared and processed case-insensitively, so
+    // Locale identifiers are compared and processed case-insensitively, so
     // technically it's not necessary to adjust case. But for easier processing,
     // and because the canonical form for most subtags is lower case, we start
     // with lower case for all.
     //
     // Note that the tokenizer function keeps using the original input string
     // to properly detect non-ASCII characters. The lower-case string can't be
     // used to detect those characters, because some non-ASCII characters
     // lower-case map into ASCII characters, e.g. U+212A (KELVIN SIGN) lower-
@@ -249,126 +239,155 @@ function parseLanguageTag(locale) {
         return c;
     }
 
     // Returns the current token part transformed to lower-case.
     function tokenStringLower() {
         return Substring(localeLowercase, tokenStart, tokenLength);
     }
 
-    // Language-Tag = langtag           ; normal language tags
-    //              / grandfathered     ; grandfathered tags
+    // unicode_locale_id = unicode_language_id
+    //                     extensions*
+    //                     pu_extensions? ;
     if (!nextToken())
         return null;
 
-    // All Language-Tag productions start with the ALPHA token, have at least
-    // two characters, and contain less-or-equal to eight characters.
-
     var language, script, region, privateuse;
     var variants = [];
     var extensions = [];
 
-    // langtag = language
-    //           ["-" script]
-    //           ["-" region]
-    //           *("-" variant)
-    //           *("-" extension)
-    //           ["-" privateuse]
+    // unicode_language_id = unicode_language_subtag
+    //                       (sep unicode_script_subtag)?
+    //                       (sep unicode_region_subtag)?
+    //                       (sep unicode_variant_subtag)* ;
+    //
+    // sep                 = "-"
+    //
+    // Note: Unicode CLDR locale identifier backward compatibility extensions
+    //       have been removed from `unicode_language_id`.
 
-    // language = 2*3ALPHA          ; shortest ISO 639 code
-    //          / 5*8ALPHA          ; or registered language subtag
+    // unicode_language_subtag = alpha{2,3} | alpha{5,8} ;
     if (token !== ALPHA || tokenLength === 1 || tokenLength === 4 || tokenLength > 8) {
         // Four character language subtags are not allowed in Unicode BCP 47
         // locale identifiers. Also see the comparison to Unicode CLDR locale
         // identifiers in <https://unicode.org/reports/tr35/#BCP_47_Conformance>.
         return null;
     }
     assert((2 <= tokenLength && tokenLength <= 3) ||
            (5 <= tokenLength && tokenLength <= 8),
            "language subtags have 2-3 or 5-8 letters");
 
     language = tokenStringLower();
     if (!nextToken())
         return null;
 
-    // script = 4ALPHA              ; ISO 15924 code
+    // unicode_script_subtag = alpha{4} ;
     if (tokenLength === 4 && token === ALPHA) {
         script = tokenStringLower();
 
         // The first character of a script code needs to be capitalized.
         // "hans" -> "Hans"
         script = callFunction(std_String_toUpperCase, script[0]) +
                  Substring(script, 1, script.length - 1);
 
         if (!nextToken())
             return null;
     }
 
-    // region = 2ALPHA              ; ISO 3166-1 code
-    //        / 3DIGIT              ; UN M.49 code
+    // unicode_region_subtag = (alpha{2} | digit{3}) ;
     if ((tokenLength === 2 && token === ALPHA) || (tokenLength === 3 && token === DIGIT)) {
         region = tokenStringLower();
 
         // Region codes need to be in upper-case. "bu" -> "BU"
         region = callFunction(std_String_toUpperCase, region);
 
         if (!nextToken())
             return null;
     }
 
-    // variant = 5*8alphanum        ; registered variants
-    //         / (DIGIT 3alphanum)
+    // unicode_variant_subtag = (alphanum{5,8}
+    //                        | digit alphanum{3}) ;
     //
-    // RFC 5646 section 2.1
-    // alphanum = (ALPHA / DIGIT)   ; letters and numbers
+    // alphanum               = [0-9 A-Z a-z] ;
     while ((5 <= tokenLength && tokenLength <= 8) ||
            (tokenLength === 4 && tokenStartCodeUnitLower() <= DIGIT_NINE))
     {
         assert(!(tokenStartCodeUnitLower() <= DIGIT_NINE) ||
                tokenStartCodeUnitLower() >= DIGIT_ZERO,
                "token-start-code-unit <= '9' implies token-start-code-unit is in '0'..'9'");
 
-        // Language tags are case insensitive (RFC 5646 section 2.1.1).
+        // Locale identifiers are case insensitive (UTS 35, section 3.2).
         // All seen variants are compared ignoring case differences by
         // using the lower-case form. This allows to properly detect and
         // reject variant repetitions with differing case, e.g.
         // "en-variant-Variant".
         var variant = tokenStringLower();
 
-        // Reject the language tag if a duplicate variant was found.
+        // Reject the locale identifier if a duplicate variant was found.
         //
         // This linear-time verification step means the whole variant
         // subtag checking is potentially quadratic, but we're okay doing
         // that because language tags are unlikely to be deliberately
         // pathological.
         if (callFunction(ArrayIndexOf, variants, variant) !== -1)
             return null;
         _DefineDataProperty(variants, variants.length, variant);
 
         if (!nextToken())
             return null;
     }
 
-    // extension = singleton 1*("-" (2*8alphanum))
-    // singleton = DIGIT            ; 0 - 9
-    //           / %x41-57          ; A - W
-    //           / %x59-5A          ; Y - Z
-    //           / %x61-77          ; a - w
-    //           / %x79-7A          ; y - z
+    // extensions = unicode_locale_extensions
+    //            | transformed_extensions
+    //            | other_extensions ;
+    //
+    // unicode_locale_extensions = sep [uU]
+    //                             ((sep keyword)+
+    //                             |(sep attribute)+ (sep keyword)*) ;
+    //
+    // transformed_extensions = sep [tT]
+    //                          ((sep tlang (sep tfield)*)
+    //                          |(sep tfield)+) ;
+    //
+    // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ;
+    //
+    // keyword = key (sep type)? ;
+    //
+    // key = alphanum alpha ;
+    //
+    // type = alphanum{3,8} (sep alphanum{3,8})* ;
+    //
+    // attribute = alphanum{3,8} ;
+    //
+    // tlang = unicode_language_subtag
+    //         (sep unicode_script_subtag)?
+    //         (sep unicode_region_subtag)?
+    //         (sep unicode_variant_subtag)* ;
+    //
+    // tfield = tkey tvalue;
+    //
+    // tkey = alpha digit ;
+    //
+    // tvalue = (sep alphanum{3,8})+ ;
+    //
+    // Note: unicode_locale_extensions and transformed_extensions are currently
+    //       parsed as other_extensions. That means for example we allow the
+    //       input "en-u-a0" even though "a0" can't be parsed as the `key`
+    //       production.
     var seenSingletons = [];
     while (tokenLength === 1) {
         var extensionStart = tokenStart;
         var singleton = tokenStartCodeUnitLower();
         if (singleton === LOWER_X)
             break;
 
-        // Language tags are case insensitive (RFC 5646 section 2.1.1).
+        // Locale identifiers are case insensitive (UTS 35, section 3.2).
         // Ensure |tokenStartCodeUnitLower()| does not return the code
         // unit of an upper-case character, so we can properly detect and
-        // reject language tags with different case, e.g. "en-u-foo-U-foo".
+        // reject singletons with different case, e.g. "en-u-foo-U-foo".
         assert(!(UPPER_A <= singleton && singleton <= UPPER_Z),
                "unexpected upper-case code unit");
 
         // Reject the input if a duplicate singleton was found.
         //
         // Similar to the variant validation step this check is O(n**2),
         // but given that there are only 35 possible singletons the
         // quadratic runtime is negligible.
@@ -386,19 +405,19 @@ function parseLanguageTag(locale) {
                 return null;
         } while (2 <= tokenLength && tokenLength <= 8);
 
         var extension = Substring(localeLowercase, extensionStart,
                                   (tokenStart - 1 - extensionStart));
         _DefineDataProperty(extensions, extensions.length, extension);
     }
 
-    // Trailing privateuse component of the langtag production.
+    // Trailing pu_extensions component of the unicode_locale_id production.
     //
-    // privateuse = "x" 1*("-" (1*8alphanum))
+    // pu_extensions = sep [xX] (sep alphanum{1,8})+ ;
     if (tokenLength === 1 && tokenStartCodeUnitLower() === LOWER_X) {
         var privateuseStart = tokenStart;
         if (!nextToken())
             return null;
 
         if (!(1 <= tokenLength && tokenLength <= 8))
             return null;
         do {