Bug 1503157 - Make selection of font preferences respect a script subtag in the lang attribute, if present. r=m_kato
author: Jonathan Kew <jkew@mozilla.com>
Mon, 26 Nov 2018 11:31:37 +0000
changeset 504412 6a00129133730106dd9472b091456357f683b904
parent 504411 76207eef2ca82b95660a0c60adb2823da634f0a6
child 504413 485c48227d1550a3cd2afa10b2286ab3d22bef71
push id: 10290
push user: ffxbld-merge
push date: Mon, 03 Dec 2018 16:23:23 +0000
treeherder: mozilla-beta@700bed2445e6 [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: m_kato
bugs: 1503157
milestone: 65.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1503157 - Make selection of font preferences respect a script subtag in the lang attribute, if present. r=m_kato
intl/locale/MozLocale.cpp
intl/locale/MozLocale.h
intl/locale/langGroups.properties
intl/locale/moz.build
intl/locale/nsLanguageAtomService.cpp
testing/web-platform/meta/css/css-text/writing-system/writing-system-font-001.html.ini
--- a/intl/locale/MozLocale.cpp
+++ b/intl/locale/MozLocale.cpp
@@ -129,29 +129,29 @@ Locale::AsString() const
     for (const auto& subTag : mPrivateUse) {
       tag.AppendLiteral("-");
       tag.Append(subTag);
     }
   }
   return tag;
 }
 
-const nsACString&
+const nsCString&
 Locale::GetLanguage() const
 {
   return mLanguage;
 }
 
-const nsACString&
+const nsCString&
 Locale::GetScript() const
 {
   return mScript;
 }
 
-const nsACString&
+const nsCString&
 Locale::GetRegion() const
 {
   return mRegion;
 }
 
 const nsTArray<nsCString>&
 Locale::GetVariants() const
 {
--- a/intl/locale/MozLocale.h
+++ b/intl/locale/MozLocale.h
@@ -62,19 +62,19 @@ class Locale {
      * If the input language tag string is not well-formed, the Locale will be
      * created with its flag `mWellFormed` set to false which will make the Locale never match.
      */
     explicit Locale(const nsACString& aLocale);
     explicit Locale(const char* aLocale)
       : Locale(nsDependentCString(aLocale))
       { };
 
-    const nsACString& GetLanguage() const;
-    const nsACString& GetScript() const;
-    const nsACString& GetRegion() const;
+    const nsCString& GetLanguage() const;
+    const nsCString& GetScript() const;
+    const nsCString& GetRegion() const;
     const nsTArray<nsCString>& GetVariants() const;
 
     /**
      * Returns a `true` if the locale is well-formed, such that the
      * Locale object can validly be matched against others.
      */
     bool IsWellFormed() const {
       return mIsWellFormed;
deleted file mode 100644
--- a/intl/locale/langGroups.properties
+++ /dev/null
@@ -1,240 +0,0 @@
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-# References: http://www.omniglot.com/writing/atoz.htm
-#             http://www.loc.gov/standards/iso639-2/englangn.html
-#             http://www.ethnologue.com
-#             http://www.worldlanguage.com/Languages/
-#             http://www.rosettaproject.org/
-#             http://www.sweb.cz/ls78/diacritics.htm
-#    see also toolkit/locales/en-US/chrome/global/languageNames.properties
-#             and bug 178491
-#
-# Strictly speaking, Avestan did not use Arabic script but Aramaic
-# (arc)/Avestan script.)
-#ae=ar
-
-ab=x-cyrillic
-ach=x-western
-af=x-western
-alg=x-cans
-am=x-ethi
-an=x-western
-ar=ar
-as=x-beng
-ast=x-western
-ay=x-western
-az=x-western
-be=x-cyrillic
-bg=x-cyrillic
-bn=x-beng
-bo=x-tibt
-br=x-western
-bs=x-western
-ca=x-western
-cak=x-western
-ce=x-western
-ch=x-western
-co=x-western
-cr=x-cans
-crh=x-western
-cs=x-western
-csb=x-western
-#cu=x-cyrillic
-cv=x-cyrillic
-cy=x-western
-da=x-western
-de=x-western
-dsb=x-western
-#dv=Thaanna
-dz=x-tibt
-ee=x-western
-el=el
-en=x-western
-eo=x-western
-es=x-western
-et=x-western
-eu=x-western
-fa=ar
-ff=x-western
-fi=x-western
-fj=x-western
-fo=x-western
-fr=x-western
-fy=x-western
-ga=x-western
-gd=x-western
-gl=x-western
-gn=x-western
-#ha=x-western : Latin and Ajami scripts
-gu=x-gujr
-gv=x-western
-haw=x-western
-he=he
-hi=x-devanagari
-hil=x-western
-hr=x-western
-hsb=x-western
-ht=x-western
-hu=x-western
-hy=x-armn
-ia=x-western
-id=x-western
-ie=x-western
-is=x-western
-it=x-western
-iu=x-cans
-ja=ja
-ka=x-geor
-kab=x-western
-kk=x-cyrillic
-kl=x-western
-km=x-khmr
-kn=x-knda
-ko=ko
-kok=x-devanagari
-ks=ar
-# Arabic script is also used for Kurdish
-ku=x-western
-kw=x-western
-#ky=x-cyrillic
-la=x-western
-lb=x-western
-lij=x-western
-ln=x-western
-lt=x-western
-ltg=x-western
-lv=x-western
-mai=x-devanagari
-meh=x-western
-mg=x-western
-mh=x-western
-mi=x-western
-mix=x-western
-mk=x-cyrillic
-ml=x-mlym
-# Mongolian script is also used for Mongolian
-mn=x-cyrillic
-mr=x-devanagari
-ms=x-western
-mt=x-western
-na=x-western
-nb=x-western
-nd=x-western
-ne=x-devanagari
-nl=x-western
-nn=x-western
-no=x-western
-nr=x-western
-nso=x-western
-nv=x-western
-ny=x-western
-oc=x-western
-oj=x-cans
-om=x-western
-or=x-orya
-os=x-cyrillic
-#pa: Punjabi is usually written in Gurmukhi script in India and Arabic script
-#    in Pakistan. We make pa default to Gurmukhi based on comments in bug 248690
-pa-in=x-guru
-pa-pk=ar
-pa=x-guru
-pl=x-western
-ps=ar
-pt=x-western
-qu=x-western
-rm=x-western
-rn=x-western
-ro=x-western
-ru=x-cyrillic
-rw=x-western
-sa=x-devanagari
-sc=x-western
-sd=ar
-# African language (but related with French)
-sg=x-western
-sh=x-western
-si=x-sinh
-sk=x-western
-sl=x-western
-sm=x-western
-so=x-western
-son=x-western
-sq=x-western
-sr=x-cyrillic
-ss=x-western
-st=x-western
-sv=x-western
-sw=x-western
-ta=x-tamil
-te=x-telu
-th=th
-ti=x-ethi
-tig=x-ethi
-tk=x-cyrillic
-#tk=x-western # (The country declared in 1992 to gradually move to Latin script)
-tl=x-western
-tlh=x-western
-tn=x-western
-to=x-western
-tr=x-western
-ts=x-western
-tt=x-western
-uk=x-cyrillic
-ur=ar
-uz=x-western
-ve=x-western
-vi=x-western
-vo=x-western
-wa=x-western
-wo=x-western
-xh=x-western
-yi=he
-yo=x-western
-zam=x-western
-zh-cn=zh-CN
-# XXX : The following five entries are added as a quick fix (bug 251241, bug 1104589).
-# When we have a general solution for ISO 15924 (script codes), the issue has
-# to be revisited.
-zh-hans=zh-CN
-zh-hant=zh-TW
-zh-latn=x-western
-ja-latn=x-western
-ko-latn=x-western
-#
-zh-tw=zh-TW
-zh-hk=zh-HK
-zh=zh-CN
-zh-min-nan=x-western
-zu=x-western
-#
-#==============================
-#
-# mapping mozilla's internal x-* to themselves (see bug 256257)
-x-western=x-western
-x-cyrillic=x-cyrillic
-# el
-# he
-# ar
-# th
-# ja
-# zh-CN
-# ko
-# zh-TW
-x-tamil=x-tamil
-x-devanagari=x-devanagari
-x-unicode=x-unicode
-x-armn=x-armn
-x-geor=x-geor
-x-math=x-math
-# These self-mappings are not necessary unless somebody use them to specify
-# lang in (X)HTML/XML documents, which they shouldn't. (see bug 256257)
-#x-beng=x-beng
-#x-cans=x-cans
-#x-ethi=x-ethi
-#x-guru=x-guru
-#x-gujr=x-gujr
-#x-khmr=x-khmr
-#x-mlym=x-mlym
--- a/intl/locale/moz.build
+++ b/intl/locale/moz.build
@@ -60,17 +60,16 @@ LOCAL_INCLUDES += [
 ]
 
 RESOURCE_FILES += [
     'language.properties',
 ]
 
 prefixes = (
     'encodingsgroups',
-    'langGroups',
 )
 
 for prefix in prefixes:
     input_file = prefix + '.properties'
     header = prefix + '.properties.h'
     GENERATED_FILES += [header]
     props = GENERATED_FILES[header]
     props.script = 'props2arrays.py'
--- a/intl/locale/nsLanguageAtomService.cpp
+++ b/intl/locale/nsLanguageAtomService.cpp
@@ -15,18 +15,78 @@
 
 using namespace mozilla;
 using mozilla::intl::OSPreferences;
 
 static constexpr nsUConvProp encodingsGroups[] = {
 #include "encodingsgroups.properties.h"
 };
 
-static constexpr nsUConvProp kLangGroups[] = {
-#include "langGroups.properties.h"
+// List of mozilla internal x-* tags that map to themselves (see bug 256257)
+static constexpr const char* kLangGroups[] = {
+  // This list must be sorted!
+  "x-armn",
+  "x-cyrillic",
+  "x-devanagari",
+  "x-geor",
+  "x-math",
+  "x-tamil",
+  "x-unicode",
+  "x-western"
+  // These self-mappings are not necessary unless somebody uses them to specify
+  // lang in (X)HTML/XML documents, which they shouldn't. (see bug 256257)
+  // x-beng=x-beng
+  // x-cans=x-cans
+  // x-ethi=x-ethi
+  // x-guru=x-guru
+  // x-gujr=x-gujr
+  // x-khmr=x-khmr
+  // x-mlym=x-mlym
+};
+
+// Map ISO 15924 script codes from BCP47 lang tag to mozilla's langGroups.
+static constexpr struct {
+  const char* mTag;
+  nsAtom*     mAtom;
+} kScriptLangGroup[] =
+{
+  // This list must be sorted by script code!
+  { "Arab", nsGkAtoms::ar },
+  { "Armn", nsGkAtoms::x_armn },
+  { "Beng", nsGkAtoms::x_beng },
+  { "Cans", nsGkAtoms::x_cans },
+  { "Cyrl", nsGkAtoms::x_cyrillic },
+  { "Deva", nsGkAtoms::x_devanagari },
+  { "Ethi", nsGkAtoms::x_ethi },
+  { "Geok", nsGkAtoms::x_geor },
+  { "Geor", nsGkAtoms::x_geor },
+  { "Grek", nsGkAtoms::el },
+  { "Gujr", nsGkAtoms::x_gujr },
+  { "Guru", nsGkAtoms::x_guru },
+  { "Hang", nsGkAtoms::ko },
+  { "Hani", nsGkAtoms::Japanese },
+  { "Hans", nsGkAtoms::Chinese },
+  // Hant is special-cased in code
+  // Hant=zh-HK
+  // Hant=zh-TW
+  { "Hebr", nsGkAtoms::he },
+  { "Hira", nsGkAtoms::Japanese },
+  { "Jpan", nsGkAtoms::Japanese },
+  { "Kana", nsGkAtoms::Japanese },
+  { "Khmr", nsGkAtoms::x_khmr },
+  { "Knda", nsGkAtoms::x_knda },
+  { "Kore", nsGkAtoms::ko },
+  { "Latn", nsGkAtoms::x_western },
+  { "Mlym", nsGkAtoms::x_mlym },
+  { "Orya", nsGkAtoms::x_orya },
+  { "Sinh", nsGkAtoms::x_sinh },
+  { "Taml", nsGkAtoms::x_tamil },
+  { "Telu", nsGkAtoms::x_telu },
+  { "Thai", nsGkAtoms::th },
+  { "Tibt", nsGkAtoms::x_tibt }
 };
 
 // static
 nsLanguageAtomService*
 nsLanguageAtomService::GetService()
 {
   static UniquePtr<nsLanguageAtomService> gLangAtomService;
   if (!gLangAtomService) {
@@ -106,29 +166,53 @@ nsLanguageAtomService::GetLanguageGroup(
 
 already_AddRefed<nsAtom>
 nsLanguageAtomService::GetUncachedLanguageGroup(nsAtom* aLanguage) const
 {
   nsAutoCString langStr;
   aLanguage->ToUTF8String(langStr);
   ToLowerCase(langStr);
 
-  nsAutoCString langGroupStr;
-  nsresult res =
-    nsUConvPropertySearch::SearchPropertyValue(kLangGroups,
-                                               ArrayLength(kLangGroups),
-                                               langStr, langGroupStr);
-  while (NS_FAILED(res)) {
-    int32_t hyphen = langStr.RFindChar('-');
-    if (hyphen <= 0) {
-      langGroupStr.AssignLiteral("x-unicode");
-      break;
+  RefPtr<nsAtom> langGroup;
+  if (langStr[0] == 'x' && langStr[1] == '-') {
+    // Internal x-* langGroup codes map to themselves (see bug 256257)
+    size_t unused;
+    if (BinarySearchIf(kLangGroups, 0, ArrayLength(kLangGroups),
+                       [&langStr](const char* tag) -> int {
+                         return langStr.Compare(tag);
+                       },
+                       &unused)) {
+      langGroup = NS_Atomize(langStr);
+      return langGroup.forget();
     }
-    langStr.Truncate(hyphen);
-    res = nsUConvPropertySearch::SearchPropertyValue(kLangGroups,
-                                                     ArrayLength(kLangGroups),
-                                                     langStr, langGroupStr);
+  } else {
+    // If the lang code can be parsed as BCP47, look up its (likely) script
+    Locale loc(langStr);
+    if (loc.IsWellFormed()) {
+      if (loc.GetScript().IsEmpty()) {
+        loc.AddLikelySubtags();
+      }
+      if (loc.GetScript().EqualsLiteral("Hant")) {
+        if (loc.GetRegion().EqualsLiteral("HK")) {
+          langGroup = nsGkAtoms::HongKongChinese;
+        } else {
+          langGroup = nsGkAtoms::Taiwanese;
+        }
+        return langGroup.forget();
+      } else {
+        size_t foundIndex;
+        const nsCString& script = loc.GetScript();
+        if (BinarySearchIf(kScriptLangGroup, 0, ArrayLength(kScriptLangGroup),
+                           [script](const auto& entry) -> int {
+                             return script.Compare(entry.mTag);
+                           },
+                           &foundIndex)) {
+          langGroup = kScriptLangGroup[foundIndex].mAtom;
+          return langGroup.forget();
+        }
+      }
+    }
   }
 
-  RefPtr<nsAtom> langGroup = NS_Atomize(langGroupStr);
-
+  // Fall back to x-unicode if no match was found
+  langGroup = nsGkAtoms::Unicode;
   return langGroup.forget();
 }
deleted file mode 100644
--- a/testing/web-platform/meta/css/css-text/writing-system/writing-system-font-001.html.ini
+++ /dev/null
@@ -1,2 +0,0 @@
-[writing-system-font-001.html]
-  expected: FAIL