Bug 1370497 - Check ScriptExtensions property of combining marks when available. r=valentin, a=ritu
author Jonathan Kew <jkew@mozilla.com>
Wed, 27 Sep 2017 11:16:35 +0100
changeset 432172 fd0f5f771d44e1edc000fc3c27d0ce991f7b54dc
parent 432171 f82b141026857dab37f9302bd1dd8e8249969217
child 432173 425fc16c98396fe1d54ff92bd6922c22d44e7aca
push id 7896
push user ryanvm@gmail.com
push date Thu, 05 Oct 2017 00:26:16 +0000
treeherder mozilla-beta@8c7645234f86 [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers valentin, ritu
bugs 1370497
milestone 57.0
Bug 1370497 - Check ScriptExtensions property of combining marks when available. r=valentin, a=ritu
netwerk/dns/nsIDNService.cpp
netwerk/test/unit/test_idn_urls.js
--- a/netwerk/dns/nsIDNService.cpp
+++ b/netwerk/dns/nsIDNService.cpp
@@ -21,16 +21,17 @@
 // Currently we use the non-transitional processing option -- see
 // http://unicode.org/reports/tr46/
 // To switch to transitional processing, change the value of this flag
 // and kTransitionalProcessing in netwerk/test/unit/test_idna2008.js to true
 // (revert bug 1218179).
 const bool kIDNA2008_TransitionalProcessing = false;
 
 #include "ICUUtils.h"
+#include "unicode/uscript.h"
 #endif
 
 using namespace mozilla::unicode;
 
 //-----------------------------------------------------------------------------
 // RFC 1034 - 3.1. Name space specifications and terminology
 static const uint32_t kMaxDNSNodeLen = 63;
 // RFC 3490 - 5.   ACE prefix
@@ -895,33 +896,63 @@ bool nsIDNService::isLabelSafe(const nsA
         script != lastScript) {
       if (illegalScriptCombo(script, savedScript)) {
         return false;
       }
       lastScript = script;
     }
 
     // Check for mixed numbering systems
-    if (GetGeneralCategory(ch) ==
-        HB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER) {
+    auto genCat = GetGeneralCategory(ch);
+    if (genCat == HB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER) {
       uint32_t zeroCharacter = ch - GetNumericValue(ch);
       if (savedNumberingSystem == 0) {
         // If we encounter a decimal number, save the zero character from that
         // numbering system.
         savedNumberingSystem = zeroCharacter;
       } else if (zeroCharacter != savedNumberingSystem) {
         return false;
       }
     }
 
-    // Check for consecutive non-spacing marks
-    if (previousChar != 0 &&
-        previousChar == ch &&
-        GetGeneralCategory(ch) == HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK) {
-      return false;
+    if (genCat == HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK) {
+      // Check for consecutive non-spacing marks.
+      if (previousChar != 0 && previousChar == ch) {
+        return false;
+      }
+      // Check for marks whose expected script doesn't match the base script.
+      if (lastScript != Script::INVALID) {
+        const size_t kMaxScripts = 32; // more than ample for current values
+                                       // of ScriptExtensions property
+        UScriptCode scripts[kMaxScripts];
+        UErrorCode errorCode = U_ZERO_ERROR;
+        int nScripts = uscript_getScriptExtensions(ch, scripts, kMaxScripts,
+                                                   &errorCode);
+        MOZ_ASSERT(U_SUCCESS(errorCode), "uscript_getScriptExtensions failed");
+        if (U_FAILURE(errorCode)) {
+          return false;
+        }
+        // nScripts will always be >= 1, because even for undefined characters
+        // uscript_getScriptExtensions will return Script::INVALID.
+        // If the mark just has script=COMMON or INHERITED, we can't check any
+        // more carefully, but if it has specific scriptExtension codes, then
+        // assume those are the only valid scripts to use it with.
+        if (nScripts > 1 ||
+            (Script(scripts[0]) != Script::COMMON &&
+             Script(scripts[0]) != Script::INHERITED)) {
+          while (--nScripts >= 0) {
+            if (Script(scripts[nScripts]) == lastScript) {
+              break;
+            }
+          }
+          if (nScripts == -1) {
+            return false;
+          }
+        }
+      }
     }
 
     // Simplified/Traditional Chinese check temporarily disabled -- bug 857481
 #if 0
 
     // Check for both simplified-only and traditional-only Chinese characters
     HanVariantType hanVariant = GetHanVariant(ch);
     if (hanVariant == HVT_SimplifiedOnly || hanVariant == HVT_TraditionalOnly) {
--- a/netwerk/test/unit/test_idn_urls.js
+++ b/netwerk/test/unit/test_idn_urls.js
@@ -290,16 +290,21 @@ const testcases = [
 
     // Effect of adding valid or invalid subdomains (bug 1399540)
     ["䕮䕵䕶䕱.ascii", "xn--google.ascii",                       false, true,  true],
     ["ascii.䕮䕵䕶䕱", "ascii.xn--google",                       false, true,  true],
     ["中国123.䕮䕵䕶䕱", "xn--123-u68dy61b.xn--google",           false, true,  true],
     ["䕮䕵䕶䕱.中国123", "xn--google.xn--123-u68dy61b",           false, true,  true],
     ["xn--accountlogin.䕮䕵䕶䕱", "xn--accountlogin.xn--google", false, true,  true],
     ["䕮䕵䕶䕱.xn--accountlogin", "xn--google.xn--accountlogin", false, true,  true],
+
+    // Arabic diacritic not allowed in Latin text (bug 1370497)
+    ["goo\u0650gle", "xn--google-yri", false, false, false],
+    // ...but Arabic diacritics are allowed on Arabic text
+    ["العَرَبِي", "xn--mgbc0a5a6cxbzabt", false, true, true],
 ];
 
 const profiles = ["ASCII", "high", "moderate"];
 
 function run_test() {
     var pbi = Cc["@mozilla.org/preferences-service;1"].getService(Ci.nsIPrefBranch);
     var oldProfile = pbi.getCharPref("network.IDN.restriction_profile", "moderate");
     var oldWhitelistCom = pbi.getBoolPref("network.IDN.whitelist.com", false);