Bug 910211 - Guess the fallback encoding from the top-level domain when feasible. r=emk.
authorHenri Sivonen <hsivonen@hsivonen.fi>
Thu, 06 Feb 2014 11:08:01 +0200
changeset 185444 a4e9e8bead92c9d51d4e478a73e8e589263e92ae
parent 185443 84b6f0aba30d13304476993becd8cb89a65526fd
child 185445 246619a2799221016d400aec982cbf02d378c28e
push id474
push userasasaki@mozilla.com
push dateMon, 02 Jun 2014 21:01:02 +0000
treeherdermozilla-release@967f4cf1b31c [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersemk
bugs910211
milestone30.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 910211 - Guess the fallback encoding from the top-level domain when feasible. r=emk.
build/pgo/server-locations.txt
content/html/document/src/nsHTMLDocument.cpp
content/html/document/src/nsHTMLDocument.h
docshell/base/nsDocShell.cpp
dom/encoding/FallbackEncoding.cpp
dom/encoding/FallbackEncoding.h
dom/encoding/Makefile.in
dom/encoding/domainsfallbacks.properties
dom/encoding/moz.build
dom/encoding/nonparticipatingdomains.properties
dom/encoding/test/file_TLD.html
dom/encoding/test/mochitest.ini
dom/encoding/test/test_TLD.html
modules/libpref/src/init/all.js
parser/nsCharsetSource.h
--- a/build/pgo/server-locations.txt
+++ b/build/pgo/server-locations.txt
@@ -200,8 +200,15 @@ http://xn--lve-6lad.w3c-test.org:83
 # HTTPS versions of the above
 https://w3c-test.org:443
 https://www.w3c-test.org:443
 https://www1.w3c-test.org:443
 https://www2.w3c-test.org:443
 https://xn--n8j6ds53lwwkrqhv28a.w3c-test.org:443
 https://xn--lve-6lad.w3c-test.org:443
 http://test.w3.org:80
+
+# Hosts for testing TLD-based fallback encoding
+http://example.tw:80                privileged
+http://example.cn:80                privileged
+http://example.co.jp:80             privileged
+http://example.fi:80                privileged
+
--- a/content/html/document/src/nsHTMLDocument.cpp
+++ b/content/html/document/src/nsHTMLDocument.cpp
@@ -431,16 +431,76 @@ nsHTMLDocument::TryParentCharset(nsIDocS
     }
 
     aCharset.Assign(parentCharset);
     aCharsetSource = kCharsetFromParentFrame;
   }
 }
 
 void
+nsHTMLDocument::TryTLD(int32_t& aCharsetSource, nsACString& aCharset)
+{
+  if (aCharsetSource >= kCharsetFromTopLevelDomain) {
+    return;
+  }
+  if (!FallbackEncoding::sGuessFallbackFromTopLevelDomain) {
+    return;
+  }
+  if (!mDocumentURI) {
+    return;
+  }
+  nsAutoCString host;
+  mDocumentURI->GetAsciiHost(host);
+  if (host.IsEmpty()) {
+    return;
+  }
+  // First let's see if the host is DNS-absolute and ends with a dot and
+  // get rid of that one.
+  if (host.Last() == '.') {
+    host.SetLength(host.Length() - 1);
+    if (host.IsEmpty()) {
+      return;
+    }
+  }
+  // If the host still ends with a dot, it is malformed, so continue only
+  // if the last character is now something other than a dot.
+  if (host.Last() == '.') {
+    return;
+  }
+  int32_t index = host.RFindChar('.');
+  if (index == kNotFound) {
+    // We have an intranet host, Gecko-internal URL or an IPv6 address.
+    return;
+  }
+  // Since the string didn't end with a dot and we found a dot,
+  // there is at least one character between the dot and the end of
+  // the string, so taking the substring below is safe.
+  nsAutoCString tld;
+  ToLowerCase(Substring(host, index + 1, host.Length() - (index + 1)), tld);
+  // Reject generic TLDs and country TLDs that need more research
+  if (!FallbackEncoding::IsParticipatingTopLevelDomain(tld)) {
+    return;
+  }
+  // Check if we have an IPv4 address
+  bool seenNonDigit = false;
+  for (size_t i = 0; i < tld.Length(); ++i) {
+    char c = tld.CharAt(i);
+    if (c < '0' || c > '9') {
+      seenNonDigit = true;
+      break;
+    }
+  }
+  if (!seenNonDigit) {
+    return;
+  }
+  aCharsetSource = kCharsetFromTopLevelDomain;
+  FallbackEncoding::FromTopLevelDomain(tld, aCharset);
+}
+
+void
 nsHTMLDocument::TryFallback(int32_t& aCharsetSource, nsACString& aCharset)
 {
   if (kCharsetFromFallback <= aCharsetSource)
     return;
 
   aCharsetSource = kCharsetFromFallback;
   FallbackEncoding::FromLocale(aCharset);
 }
@@ -656,16 +716,17 @@ nsHTMLDocument::StartDocumentLoad(const 
 
     TryHintCharset(muCV, charsetSource, charset); // XXX mailnews-only
     TryParentCharset(docShell, charsetSource, charset);
 
     if (cachingChan && !urlSpec.IsEmpty()) {
       TryCacheCharset(cachingChan, charsetSource, charset);
     }
 
+    TryTLD(charsetSource, charset);
     TryFallback(charsetSource, charset);
 
     if (wyciwygChannel) {
       // We know for sure that the parser needs to be using UTF16.
       parserCharset = "UTF-16";
       parserCharsetSource = charsetSource < kCharsetFromChannel ?
         kCharsetFromChannel : charsetSource;
         
--- a/content/html/document/src/nsHTMLDocument.h
+++ b/content/html/document/src/nsHTMLDocument.h
@@ -308,16 +308,17 @@ protected:
                             nsIDocShell*  aDocShell,
                             int32_t& aCharsetSource,
                             nsACString& aCharset);
   static void TryCacheCharset(nsICachingChannel* aCachingChannel,
                                 int32_t& aCharsetSource,
                                 nsACString& aCharset);
   void TryParentCharset(nsIDocShell*  aDocShell,
                         int32_t& charsetSource, nsACString& aCharset);
+  void TryTLD(int32_t& aCharsetSource, nsACString& aCharset);
   static void TryFallback(int32_t& aCharsetSource, nsACString& aCharset);
 
   // Override so we can munge the charset on our wyciwyg channel as needed.
   virtual void SetDocumentCharacterSet(const nsACString& aCharSetID) MOZ_OVERRIDE;
 
   // Tracks if we are currently processing any document.write calls (either
   // implicit or explicit). Note that if a write call writes out something which
   // would block the parser, then mWriteLevel will be incorrect until the parser
--- a/docshell/base/nsDocShell.cpp
+++ b/docshell/base/nsDocShell.cpp
@@ -1989,16 +1989,20 @@ nsDocShell::GatherCharsetMenuTelemetry()
   bool isFileURL = false;
   nsIURI* url = doc->GetOriginalURI();
   if (url) {
     url->SchemeIs("file", &isFileURL);
   }
 
   int32_t charsetSource = doc->GetDocumentCharacterSetSource();
   switch (charsetSource) {
+    case kCharsetFromTopLevelDomain:
+      // Unlabeled doc on a domain that we map to a fallback encoding
+      Telemetry::Accumulate(Telemetry::CHARSET_OVERRIDE_SITUATION, 7);
+      break;
     case kCharsetFromFallback:
     case kCharsetFromDocTypeDefault:
     case kCharsetFromCache:
     case kCharsetFromParentFrame:
     case kCharsetFromHintPrevDoc:
       // Changing charset on an unlabeled doc.
       if (isFileURL) {
         Telemetry::Accumulate(Telemetry::CHARSET_OVERRIDE_SITUATION, 0);
--- a/dom/encoding/FallbackEncoding.cpp
+++ b/dom/encoding/FallbackEncoding.cpp
@@ -12,17 +12,26 @@
 
 namespace mozilla {
 namespace dom {
 
 static const char* localesFallbacks[][3] = {
 #include "localesfallbacks.properties.h"
 };
 
+static const char* domainsFallbacks[][3] = {
+#include "domainsfallbacks.properties.h"
+};
+
+static const char* nonParticipatingDomains[][3] = {
+#include "nonparticipatingdomains.properties.h"
+};
+
 FallbackEncoding* FallbackEncoding::sInstance = nullptr;
+bool FallbackEncoding::sGuessFallbackFromTopLevelDomain = true;
 
 FallbackEncoding::FallbackEncoding()
 {
   MOZ_COUNT_CTOR(FallbackEncoding);
   MOZ_ASSERT(!FallbackEncoding::sInstance,
              "Singleton already exists.");
 }
 
@@ -116,21 +125,44 @@ FallbackEncoding::Initialize()
              "Initializing pre-existing fallback cache.");
   FallbackEncoding::sInstance = new FallbackEncoding;
   Preferences::RegisterCallback(FallbackEncoding::PrefChanged,
                                 "intl.charset.fallback.override",
                                 nullptr);
   Preferences::RegisterCallback(FallbackEncoding::PrefChanged,
                                 "general.useragent.locale",
                                 nullptr);
+  Preferences::AddBoolVarCache(&sGuessFallbackFromTopLevelDomain,
+                               "intl.charset.fallback.tld");
 }
 
 void
 FallbackEncoding::Shutdown()
 {
   MOZ_ASSERT(FallbackEncoding::sInstance,
              "Releasing non-existent fallback cache.");
   delete FallbackEncoding::sInstance;
   FallbackEncoding::sInstance = nullptr;
 }
 
+bool
+FallbackEncoding::IsParticipatingTopLevelDomain(const nsACString& aTLD)
+{
+  nsAutoCString dummy;
+  return NS_FAILED(nsUConvPropertySearch::SearchPropertyValue(
+      nonParticipatingDomains,
+      ArrayLength(nonParticipatingDomains),
+      aTLD,
+      dummy));
+}
+
+void
+FallbackEncoding::FromTopLevelDomain(const nsACString& aTLD,
+                                     nsACString& aFallback)
+{
+  if (NS_FAILED(nsUConvPropertySearch::SearchPropertyValue(
+      domainsFallbacks, ArrayLength(domainsFallbacks), aTLD, aFallback))) {
+    aFallback.AssignLiteral("windows-1252");
+  }
+}
+
 } // namespace dom
 } // namespace mozilla
--- a/dom/encoding/FallbackEncoding.h
+++ b/dom/encoding/FallbackEncoding.h
@@ -10,23 +10,45 @@
 namespace mozilla {
 namespace dom {
 
 class FallbackEncoding
 {
 public:
 
   /**
+   * Whether FromTopLevelDomain() should be used.
+   */
+  static bool sGuessFallbackFromTopLevelDomain;
+
+  /**
    * Gets the locale-dependent fallback encoding for legacy HTML and plain
    * text content.
    *
    * @param aFallback the outparam for the fallback encoding
    */
   static void FromLocale(nsACString& aFallback);
 
+  /**
+   * Checks if it is appropriate to call FromTopLevelDomain() for a given TLD.
+   *
+   * @param aTLD the top-level domain (in Punycode)
+   * @return true if OK to call FromTopLevelDomain()
+   */
+  static bool IsParticipatingTopLevelDomain(const nsACString& aTLD);
+
+  /**
+   * Gets a top-level domain-dependent fallback encoding for legacy HTML
+   * and plain text content
+   *
+   * @param aTLD the top-level domain (in Punycode)
+   * @param aFallback the outparam for the fallback encoding
+   */
+  static void FromTopLevelDomain(const nsACString& aTLD, nsACString& aFallback);
+
   // public API ends here!
 
   /**
    * Allocate sInstance used by FromLocale().
    * To be called from nsLayoutStatics only.
    */
   static void Initialize();
 
--- a/dom/encoding/Makefile.in
+++ b/dom/encoding/Makefile.in
@@ -4,8 +4,12 @@
 
 include $(topsrcdir)/config/rules.mk
 
 PROPS2ARRAYS = $(topsrcdir)/intl/locale/src/props2arrays.py
 labelsencodings.properties.h: $(PROPS2ARRAYS) labelsencodings.properties
 	$(PYTHON) $^ $@
 localesfallbacks.properties.h: $(PROPS2ARRAYS) localesfallbacks.properties
 	$(PYTHON) $^ $@
+domainsfallbacks.properties.h: $(PROPS2ARRAYS) domainsfallbacks.properties
+	$(PYTHON) $^ $@
+nonparticipatingdomains.properties.h: $(PROPS2ARRAYS) nonparticipatingdomains.properties
+	$(PYTHON) $^ $@
new file mode 100644
--- /dev/null
+++ b/dom/encoding/domainsfallbacks.properties
@@ -0,0 +1,167 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+# This file contains educated guesses about which top-level domains are
+# likely to host legacy content that assumes a non-windows-1252 encoding.
+# Punycode TLDs are included on the theory that legacy content might appear
+# behind those relatively new TLDs if DNS just points to a legacy server.
+#
+# Domains for which a confident-enough educated guess is missing are
+# listed in nonparticipatingdomains.properties. Domains that are listed 
+# neither there nor here get windows-1252 as the associated fallback.
+#
+# The list below includes Arabic-script TLDs not on IANA list but on the 
+# ICANN list:
+# http://www.icann.org/en/resources/idn/fast-track/string-evaluation-completion
+# Otherwise, the list includes non-windows-1252-affiliated country TLDs from
+# https://data.iana.org/TLD/tlds-alpha-by-domain.txt
+#
+# The guesses are assigned as follows:
+# * If the country has a dominant country-affiliated language and that language
+#   is part of the languages to fallbacks mapping, use the encoding for that
+#   language from that mapping.
+# * Use windows-1256 for countries that have a dominant Arabic-script
+#   language or whose all languages are Arabic-script languages.
+# * Use windows-1251 likewise but for Cyrillic script.
+
+ae=windows-1256
+xn--mgbaam7a8h=windows-1256
+
+af=windows-1256
+
+bg=windows-1251
+
+bh=windows-1256
+
+by=windows-1251
+
+cn=gbk
+xn--fiqs8s=gbk
+# Assume that Traditional Chinese TLD is meant to work if URL input happens to 
+# be in the traditional mode. Expect content to be simplified anyway.
+xn--fiqz9s=gbk
+
+cz=windows-1250
+
+dz=windows-1256
+xn--lgbbat1ad8j=windows-1256
+
+ee=windows-1257
+
+eg=windows-1256
+xn--wgbh1c=windows-1256
+
+gr=ISO-8859-7
+
+hk=Big5-HKSCS
+xn--j6w193g=Big5-HKSCS
+
+hr=windows-1250
+
+hu=ISO-8859-2
+
+iq=windows-1256
+
+ir=windows-1256
+xn--mgba3a4f16a=windows-1256
+
+jo=windows-1256
+xn--mgbayh7gpa=windows-1256
+
+jp=Shift_JIS
+
+kg=windows-1251
+
+kp=EUC-KR
+
+kr=EUC-KR
+xn--3e0b707e=EUC-KR
+
+kw=windows-1256
+
+kz=windows-1251
+xn--80ao21a=windows-1251
+
+lb=windows-1256
+
+lt=windows-1257
+
+lv=windows-1257
+
+ma=windows-1256
+xn--mgbc0a9azcg=windows-1256
+
+mk=windows-1251
+
+mn=windows-1251
+xn--l1acc=windows-1251
+
+mo=Big5
+
+# my
+xn--mgbx4cd0ab=windows-1256
+
+om=windows-1256
+xn--mgb9awbf=windows-1256
+
+#pk
+xn--mgbai9azgqp6j=windows-1256
+
+pl=ISO-8859-2
+
+ps=windows-1256
+xn--ygbi2ammx=windows-1256
+
+qa=windows-1256
+xn--wgbl6a=windows-1256
+
+rs=windows-1251
+xn--90a3ac=windows-1251
+
+ru=windows-1251
+xn--p1ai=windows-1251
+
+sa=windows-1256
+xn--mgberp4a5d4ar=windows-1256
+
+sd=windows-1256
+xn--mgbpl2fh=windows-1256
+
+sg=gbk
+xn--yfro4i67o=gbk
+
+si=ISO-8859-2
+
+sk=windows-1250
+
+su=windows-1251
+
+sy=windows-1256
+xn--mgbtf8fl=windows-1256
+
+th=windows-874
+xn--o3cw4h=windows-874
+
+tj=windows-1251
+
+tn=windows-1256
+xn--pgbs0dh=windows-1256
+
+tr=windows-1254
+
+tw=Big5
+# Assume that the Simplified Chinese TLD is meant to work when URL input
+# happens in the simplified mode. Assume content is traditional anyway.
+xn--kprw13d=Big5
+xn--kpry57d=Big5
+
+ua=windows-1251
+xn--j1amh=windows-1251
+
+uz=windows-1251
+
+vn=windows-1258
+
+ye=windows-1256
+xn--mgb2ddes=windows-1256
--- a/dom/encoding/moz.build
+++ b/dom/encoding/moz.build
@@ -23,11 +23,13 @@ UNIFIED_SOURCES += [
 FAIL_ON_WARNINGS = True
 
 FINAL_LIBRARY = 'gklayout'
 LOCAL_INCLUDES += [
     '/intl/locale/src',
 ]
 
 GENERATED_FILES += [
+    'domainsfallbacks.properties.h',
     'labelsencodings.properties.h',
     'localesfallbacks.properties.h',
+    'nonparticipatingdomains.properties.h',
 ]
new file mode 100644
--- /dev/null
+++ b/dom/encoding/nonparticipatingdomains.properties
@@ -0,0 +1,51 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+# Top-level domains listed here do not participate in TLD-based guessing.
+#
+# We should do Web crawls to see if domains listed here can migrate to
+# domainsfallbacks.properties.
+#
+# The value to the right of the = sign is ignored and serves as a placeholder.
+
+# Generic
+com=windows-1252
+net=windows-1252
+org=windows-1252
+
+# No Firefox localization for Azeri
+az=windows-1254
+
+# windows-1251 or windows-1250?
+ba=???
+
+# ISO-8859-7 or windows-1254?
+cy=???
+
+# Is there enough unlabeled windows-1256 content for a windows-1255 fallback
+# to break too much?
+il=windows-1255
+
+# Out-of-country English use
+ly=windows-1256
+
+# Out-of-country English use
+# md=windows-1250
+
+# Out-of-country English use
+# me=windows-1251
+
+# Malaysia has an Arabic-script TLD; official script is Latin; possibly Chinese-script publications
+my=???
+
+# No Firefox localization for Urdu; potential for minority-language sites
+# relying on windows-1252 hacks.
+pk=windows-1256
+
+# The Romanian localization says windows-1252, even though the Windows legacy
+# differs.
+ro=windows-1250
+
+tm=windows-1250
+
new file mode 100644
--- /dev/null
+++ b/dom/encoding/test/file_TLD.html
@@ -0,0 +1,7 @@
+<!DOCTYPE html>
+<script>
+function report() {
+  window.parent.postMessage(document.characterSet, "*");
+}
+</script>
+<body onload="report();">
--- a/dom/encoding/test/mochitest.ini
+++ b/dom/encoding/test/mochitest.ini
@@ -2,18 +2,20 @@
 support-files =
   file_utf16_be_bom.css
   file_utf16_be_bom.js
   file_utf16_be_bom.xhtml
   file_utf16_le_bom.css
   file_utf16_le_bom.js
   file_utf16_le_bom.xhtml
   file_utf16_le_nobom.xhtml
+  file_TLD.html
   worker_helper.js
 
 [test_BOMEncoding.js]
 [test_TextDecoder.html]
 [test_TextDecoder.js]
 [test_TextEncoder.html]
 [test_TextEncoder.js]
 [test_stringencoding.html]
 [test_submit_euckr.html]
+[test_TLD.html]
 [test_utf16_files.html]
new file mode 100644
--- /dev/null
+++ b/dom/encoding/test/test_TLD.html
@@ -0,0 +1,57 @@
+<!DOCTYPE HTML>
+<html>
+<!--
+https://bugzilla.mozilla.org/show_bug.cgi?id=910211
+-->
+<head>
+  <meta charset="utf-8">
+  <title>Test for Bug 910211</title>
+  <script type="application/javascript" src="/tests/SimpleTest/SimpleTest.js"></script>
+  <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/>
+  <script type="application/javascript">
+
+  /** Test for Bug 910211 **/
+
+  SimpleTest.waitForExplicitFinish();
+
+  var tlds = [
+    {'tld': 'tw', 'encoding': 'Big5'},
+    {'tld': 'cn', 'encoding': 'gbk'},
+    {'tld': 'co.jp', 'encoding': 'Shift_JIS'},
+    {'tld': 'fi', 'encoding': 'windows-1252'},
+  ];
+
+  var iframe = null;
+
+  var current = null;
+
+  function runTest() {
+    iframe = document.getElementsByTagName("iframe")[0];
+    window.addEventListener("message", next);
+    next(null);
+  }
+
+  function next(event) {
+    if (event) {
+      is(event.data, current['encoding'], "Got bad encoding for " + current["tld"]);
+    }
+    current = tlds.shift();
+    if (!current) {
+      SimpleTest.finish();
+      return;
+    }
+    iframe.src = "http://example." + current["tld"] + "/tests/dom/encoding/test/file_TLD.html";
+  }
+
+  </script>
+</head>
+<body onload="runTest();">
+<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=910211">Mozilla Bug 910211</a>
+<p id="display"></p>
+<div id="content" style="display: none">
+<iframe></iframe>
+</div>
+<pre id="test">
+</pre>
+</body>
+</html>
--- a/modules/libpref/src/init/all.js
+++ b/modules/libpref/src/init/all.js
@@ -1387,16 +1387,17 @@ pref("intl.charsetmenu.browser.more4",  
 pref("intl.charsetmenu.browser.more5",      "ISO-8859-6, windows-1256, ISO-8859-8-I, windows-1255, ISO-8859-8, IBM862");
 pref("intl.charsetmenu.mailedit",           "chrome://global/locale/intl.properties");
 pref("intl.charsetmenu.browser.cache",      "");
 pref("intl.charsetmenu.mailview.cache",     "");
 pref("intl.charsetmenu.composer.cache",     "");
 pref("intl.charsetmenu.browser.cache.size", 5);
 pref("intl.charset.detector",               "chrome://global/locale/intl.properties");
 pref("intl.charset.fallback.override",      "");
+pref("intl.charset.fallback.tld",           true);
 pref("intl.ellipsis",                       "chrome://global-platform/locale/intl.properties");
 pref("intl.locale.matchOS",                 false);
 // fallback charset list for Unicode conversion (converting from Unicode)
 // currently used for mail send only to handle symbol characters (e.g Euro, trademark, smartquotes)
 // for ISO-8859-1
 pref("intl.fallbackCharsetList.ISO-8859-1", "windows-1252");
 pref("font.language.group",                 "chrome://global/locale/intl.properties");
 
--- a/parser/nsCharsetSource.h
+++ b/parser/nsCharsetSource.h
@@ -3,23 +3,24 @@
  * You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 #ifndef nsCharsetSource_h_
 #define nsCharsetSource_h_
 
 // note: the value order defines the priority; higher numbers take priority
 #define kCharsetUninitialized           0
 #define kCharsetFromFallback            1
-#define kCharsetFromDocTypeDefault      2 // This and up confident for XHR
-#define kCharsetFromCache               3
-#define kCharsetFromParentFrame         4
-#define kCharsetFromAutoDetection       5
-#define kCharsetFromHintPrevDoc         6
-#define kCharsetFromMetaPrescan         7 // this one and smaller: HTML5 Tentative
-#define kCharsetFromMetaTag             8 // this one and greater: HTML5 Confident
-#define kCharsetFromIrreversibleAutoDetection 9
-#define kCharsetFromChannel            10
-#define kCharsetFromOtherComponent     11
-#define kCharsetFromParentForced       12 // propagates to child frames
-#define kCharsetFromUserForced         13 // propagates to child frames
-#define kCharsetFromByteOrderMark      14
+#define kCharsetFromTopLevelDomain      2
+#define kCharsetFromDocTypeDefault      3 // This and up confident for XHR
+#define kCharsetFromCache               4
+#define kCharsetFromParentFrame         5
+#define kCharsetFromAutoDetection       6
+#define kCharsetFromHintPrevDoc         7
+#define kCharsetFromMetaPrescan         8 // this one and smaller: HTML5 Tentative
+#define kCharsetFromMetaTag             9 // this one and greater: HTML5 Confident
+#define kCharsetFromIrreversibleAutoDetection 10
+#define kCharsetFromChannel            11
+#define kCharsetFromOtherComponent     12
+#define kCharsetFromParentForced       13 // propagates to child frames
+#define kCharsetFromUserForced         14 // propagates to child frames
+#define kCharsetFromByteOrderMark      15
 
 #endif /* nsCharsetSource_h_ */