Bug 801402 - Use FindEncodingForLabel from HTML parser. r=hsivonen
author Masatoshi Kimura <VYV03354@nifty.ne.jp>
Wed, 07 Nov 2012 18:04:22 -0500
changeset 112628 eb7d1fd8a86863057863c4bd2c971a11e90fcdf0
parent 112627 c0af6d983c0ea12ba4d78a6c408cb75fe38282bb
child 112629 91879bfc7042f9f9728349b4e7e8c28f1f94b2bf
push id 23833
push user emorley@mozilla.com
push date Thu, 08 Nov 2012 10:20:57 +0000
treeherder mozilla-central@e0d7b394462b [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers hsivonen
bugs 801402
milestone 19.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 801402 - Use FindEncodingForLabel from HTML parser. r=hsivonen
dom/locales/en-US/chrome/layout/htmlparser.properties
parser/html/nsHtml5MetaScannerCppSupplement.h
parser/html/nsHtml5StreamParser.cpp
parser/htmlparser/src/nsParser.cpp
parser/htmlparser/src/nsScanner.cpp
--- a/dom/locales/en-US/chrome/layout/htmlparser.properties
+++ b/dom/locales/en-US/chrome/layout/htmlparser.properties
@@ -9,17 +9,16 @@ EncNoDeclaration=The character encoding 
 EncLateMetaFrame=The character encoding declaration of the framed HTML document was not found when prescanning the first 1024 bytes of the file. When viewed without the document framing it, the page will reload automatically. The encoding declaration needs to be moved to be within the first 1024 bytes of the file.
 EncLateMeta=The character encoding declaration of the HTML document was not found when prescanning the first 1024 bytes of the file. When viewed in a differently-configured browser, this page will reload automatically. The encoding declaration needs to be moved to be within the first 1024 bytes of the file.
 EncLateMetaReload=The page was reloaded, because the character encoding declaration of the HTML document was not found when prescanning the first 1024 bytes of the file. The encoding declaration needs to be moved to be within the first 1024 bytes of the file.
 EncLateMetaTooLate=The character encoding declaration of document was found too late for it to take effect. The encoding declaration needs to be moved to be within the first 1024 bytes of the file.
 EncMetaUnsupported=An unsupported character encoding was declared for the HTML document using a meta tag. The declaration was ignored.
 EncProtocolUnsupported=An unsupported character encoding was declared on the transfer protocol level. The declaration was ignored.
 EncBomlessUtf16=Detected UTF-16-encoded Basic Latin-only text without a byte order mark and without a transfer protocol-level declaration. Encoding this content in UTF-16 is inefficient and the character encoding should have been declared in any case.
 EncMetaUtf16=A meta tag was used to declare the character encoding as UTF-16. This was interpreted as an UTF-8 declaration instead.
-EncMetaNonRoughSuperset=A meta tag was used to declare a character encoding the does not encode the Basic Latin range roughly like US-ASCII. The declaration was ignored.
 
 # The bulk of the messages below are derived from 
 # http://hg.mozilla.org/projects/htmlparser/file/1f633cef7de7/src/nu/validator/htmlparser/impl/ErrorReportingTokenizer.java
 # which is available under the MIT license.
 
 # Tokenizer errors
 errGarbageAfterLtSlash=Garbage after “</”.
 errLtSlashGt=Saw “</>”. Probable causes: Unescaped “<” (escape as “&lt;”) or mistyped end tag.
--- a/parser/html/nsHtml5MetaScannerCppSupplement.h
+++ b/parser/html/nsHtml5MetaScannerCppSupplement.h
@@ -1,18 +1,20 @@
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
  
 #include "nsICharsetConverterManager.h"
 #include "nsServiceManagerUtils.h"
-#include "nsCharsetAlias.h"
 #include "nsEncoderDecoderUtils.h"
 #include "nsTraceRefcnt.h"
 
+#include "mozilla/dom/EncodingUtils.h"
+
+using mozilla::dom::EncodingUtils;
 
 void
 nsHtml5MetaScanner::sniff(nsHtml5ByteReadable* bytes, nsIUnicodeDecoder** decoder, nsACString& charset)
 {
   readable = bytes;
   stateLoop(stateSave);
   readable = nullptr;
   if (mUnicodeDecoder) {
@@ -43,18 +45,17 @@ nsHtml5MetaScanner::tryCharset(nsString*
     res = convManager->GetUnicodeDecoderRaw(mCharset.get(), getter_AddRefs(mUnicodeDecoder));
     if (NS_FAILED(res)) {
       NS_ERROR("Could not get decoder for UTF-8.");
       return false;
     }
     return true;
   }
   nsAutoCString preferred;
-  res = nsCharsetAlias::GetPreferred(encoding, preferred);
-  if (NS_FAILED(res)) {
+  if (!EncodingUtils::FindEncodingForLabel(encoding, preferred)) {
     return false;
   }
   if (preferred.LowerCaseEqualsLiteral("utf-16") ||
       preferred.LowerCaseEqualsLiteral("utf-16be") ||
       preferred.LowerCaseEqualsLiteral("utf-16le") ||
       preferred.LowerCaseEqualsLiteral("utf-7") ||
       preferred.LowerCaseEqualsLiteral("x-imap4-modified-utf7")) {
     return false;
--- a/parser/html/nsHtml5StreamParser.cpp
+++ b/parser/html/nsHtml5StreamParser.cpp
@@ -1,17 +1,16 @@
 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
 /* vim: set sw=2 ts=2 et tw=79: */
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 #include "nsHtml5StreamParser.h"
 #include "nsICharsetConverterManager.h"
-#include "nsCharsetAlias.h"
 #include "nsServiceManagerUtils.h"
 #include "nsEncoderDecoderUtils.h"
 #include "nsContentUtils.h"
 #include "nsHtml5Tokenizer.h"
 #include "nsIHttpChannel.h"
 #include "nsHtml5Parser.h"
 #include "nsHtml5TreeBuilder.h"
 #include "nsHtml5AtomTable.h"
@@ -21,18 +20,20 @@
 #include "mozilla/Preferences.h"
 #include "nsHtml5Highlighter.h"
 #include "expat_config.h"
 #include "expat.h"
 #include "nsINestedURI.h"
 #include "nsCharsetSource.h"
 #include "nsIWyciwygChannel.h"
 
+#include "mozilla/dom/EncodingUtils.h"
+
 using namespace mozilla;
-
+using mozilla::dom::EncodingUtils;
 
 int32_t nsHtml5StreamParser::sTimerInitialDelay = 120;
 int32_t nsHtml5StreamParser::sTimerSubsequentDelay = 120;
 
 // static
 void
 nsHtml5StreamParser::InitializeStatics()
 {
@@ -1188,84 +1189,52 @@ nsHtml5StreamParser::OnDataAvailable(nsI
     NS_WARNING("Dispatching DataAvailable event failed.");
   }
   return rv;
 }
 
 bool
 nsHtml5StreamParser::PreferredForInternalEncodingDecl(nsACString& aEncoding)
 {
-  nsAutoCString newEncoding(aEncoding);
-  newEncoding.Trim(" \t\r\n\f");
-  if (newEncoding.LowerCaseEqualsLiteral("utf-16") ||
-      newEncoding.LowerCaseEqualsLiteral("utf-16be") ||
-      newEncoding.LowerCaseEqualsLiteral("utf-16le")) {
+  nsAutoCString newEncoding;
+  if (!EncodingUtils::FindEncodingForLabel(aEncoding, newEncoding)) {
+    // the encoding name is bogus
+    mTreeBuilder->MaybeComplainAboutCharset("EncMetaUnsupported",
+                                            true,
+                                            mTokenizer->getLineNumber());
+    return false;
+  }
+
+  if (newEncoding.EqualsLiteral("UTF-16") ||
+      newEncoding.EqualsLiteral("UTF-16BE") ||
+      newEncoding.EqualsLiteral("UTF-16LE")) {
     mTreeBuilder->MaybeComplainAboutCharset("EncMetaUtf16",
                                             true,
                                             mTokenizer->getLineNumber());
     newEncoding.Assign("UTF-8");
   }
 
-  nsresult rv = NS_OK;
-  bool eq;
-  rv = nsCharsetAlias::Equals(newEncoding, mCharset, &eq);
-  if (NS_FAILED(rv)) {
-    // the encoding name is bogus
-    mTreeBuilder->MaybeComplainAboutCharset("EncMetaUnsupported",
-                                            true,
-                                            mTokenizer->getLineNumber());
-    return false;
-  }
-  if (eq) {
+  if (newEncoding.Equals(mCharset)) {
     if (mCharsetSource < kCharsetFromMetaPrescan) {
       if (mInitialEncodingWasFromParentFrame) {
         mTreeBuilder->MaybeComplainAboutCharset("EncLateMetaFrame",
                                                 false,
                                                 mTokenizer->getLineNumber());
       } else {
         mTreeBuilder->MaybeComplainAboutCharset("EncLateMeta",
                                                 false,
                                                 mTokenizer->getLineNumber());
       }
     }
     mCharsetSource = kCharsetFromMetaTag; // become confident
     mFeedChardet = false; // don't feed chardet when confident
     return false;
   }
 
-  // XXX check HTML5 non-IANA aliases here
-
-  nsAutoCString preferred;
-  rv = nsCharsetAlias::GetPreferred(newEncoding, preferred);
-  if (NS_FAILED(rv)) {
-    // This charset has been blacklisted for permitting XSS smuggling.
-    // EncMetaNonRoughSuperset is a reasonable approximation to the
-    // right error message.
-    mTreeBuilder->MaybeComplainAboutCharset("EncMetaNonRoughSuperset",
-                                            true,
-                                            mTokenizer->getLineNumber());
-    return false;
-  }
-
-  // ??? Explicit further blacklist of character sets that are not
-  // "rough supersets" of ASCII.  Some of these are handled above (utf-16),
-  // some by the XSS smuggling blacklist in charsetData.properties,
-  // maybe all of the remainder should also be blacklisted there.
-  if (preferred.LowerCaseEqualsLiteral("utf-16") ||
-      preferred.LowerCaseEqualsLiteral("utf-16be") ||
-      preferred.LowerCaseEqualsLiteral("utf-16le") ||
-      preferred.LowerCaseEqualsLiteral("utf-7") ||
-      preferred.LowerCaseEqualsLiteral("x-imap4-modified-utf7")) {
-    // Not a rough ASCII superset
-    mTreeBuilder->MaybeComplainAboutCharset("EncMetaNonRoughSuperset",
-                                            true,
-                                            mTokenizer->getLineNumber());
-    return false;
-  }
-  aEncoding.Assign(preferred);
+  aEncoding.Assign(newEncoding);
   return true;
 }
 
 bool
 nsHtml5StreamParser::internalEncodingDeclaration(nsString* aEncoding)
 {
   // This code needs to stay in sync with
   // nsHtml5MetaScanner::tryCharset. Unfortunately, the
--- a/parser/htmlparser/src/nsParser.cpp
+++ b/parser/htmlparser/src/nsParser.cpp
@@ -9,17 +9,16 @@
 #include "nsString.h"
 #include "nsCRT.h"
 #include "nsScanner.h"
 #include "plstr.h"
 #include "nsIStringStream.h"
 #include "nsIChannel.h"
 #include "nsICachingChannel.h"
 #include "nsICacheEntryDescriptor.h"
-#include "nsCharsetAlias.h"
 #include "nsICharsetConverterManager.h"
 #include "nsIInputStream.h"
 #include "CNavDTD.h"
 #include "prenv.h"
 #include "prlock.h"
 #include "prcvar.h"
 #include "nsParserCIID.h"
 #include "nsReadableUtils.h"
@@ -38,17 +37,20 @@
 #include "nsXPCOMCIDInternal.h"
 #include "nsMimeTypes.h"
 #include "mozilla/CondVar.h"
 #include "mozilla/Mutex.h"
 #include "nsParserConstants.h"
 #include "nsCharsetSource.h"
 #include "nsContentUtils.h"
 
+#include "mozilla/dom/EncodingUtils.h"
+
 using namespace mozilla;
+using mozilla::dom::EncodingUtils;
 
 #define NS_PARSER_FLAG_PARSER_ENABLED         0x00000002
 #define NS_PARSER_FLAG_OBSERVERS_ENABLED      0x00000004
 #define NS_PARSER_FLAG_PENDING_CONTINUE_EVENT 0x00000008
 #define NS_PARSER_FLAG_FLUSH_TOKENS           0x00000020
 #define NS_PARSER_FLAG_CAN_TOKENIZE           0x00000040
 
 static NS_DEFINE_IID(kISupportsIID, NS_ISUPPORTS_IID);
@@ -1835,18 +1837,17 @@ ParserWriteFunc(nsIInputStream* in,
       // The decoder will swallow the BOM. The UTF-16 will re-sniff for
       // endianness. The value of preferred is now either "UTF-8" or "UTF-16".
       preferred.Assign(maybePrefer);
       source = kCharsetFromByteOrderMark;
     } else if (source < kCharsetFromChannel) {
       nsAutoCString declCharset;
 
       if (ExtractCharsetFromXmlDeclaration(buf, count, declCharset)) {
-        nsresult rv = nsCharsetAlias::GetPreferred(declCharset, maybePrefer);
-        if (NS_SUCCEEDED(rv)) {
+        if (EncodingUtils::FindEncodingForLabel(declCharset, maybePrefer)) {
           preferred.Assign(maybePrefer);
           source = kCharsetFromMetaTag;
         }
       }
     }
 
     pws->mParser->SetDocumentCharset(preferred, source);
     pws->mParser->SetSinkCharset(preferred);
--- a/parser/htmlparser/src/nsScanner.cpp
+++ b/parser/htmlparser/src/nsScanner.cpp
@@ -5,26 +5,29 @@
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 //#define __INCREMENTAL 1
 
 #include "nsScanner.h"
 #include "nsDebug.h"
 #include "nsIServiceManager.h"
 #include "nsICharsetConverterManager.h"
-#include "nsCharsetAlias.h"
 #include "nsReadableUtils.h"
 #include "nsIInputStream.h"
 #include "nsIFile.h"
 #include "nsNetUtil.h"
 #include "nsUTF8Utils.h" // for LossyConvertEncoding
 #include "nsCRT.h"
 #include "nsParser.h"
 #include "nsCharsetSource.h"
 
+#include "mozilla/dom/EncodingUtils.h"
+
+using mozilla::dom::EncodingUtils;
+
 // We replace NUL characters with this character.
 static PRUnichar sInvalid = UCS2_REPLACEMENT_CHAR;
 
 nsReadEndCondition::nsReadEndCondition(const PRUnichar* aTerminateChars) :
   mChars(aTerminateChars), mFilter(PRUnichar(~0)) // All bits set
 {
   // Build filter that will be used to filter out characters with
   // bits that none of the terminal chars have. This works very well
@@ -113,41 +116,38 @@ nsScanner::nsScanner(nsString& aFilename
   SetDocumentCharset(NS_LITERAL_CSTRING("UTF-8"), kCharsetFromDocTypeDefault);
 }
 
 nsresult nsScanner::SetDocumentCharset(const nsACString& aCharset , int32_t aSource)
 {
   if (aSource < mCharsetSource) // priority is lower the the current one , just
     return NS_OK;
 
-  nsresult res = NS_OK;
+  nsCString charsetName;
+  bool valid = EncodingUtils::FindEncodingForLabel(aCharset, charsetName);
+  MOZ_ASSERT(valid, "Should never call with a bogus aCharset.");
   if (!mCharset.IsEmpty())
   {
-    bool same;
-    res = nsCharsetAlias::Equals(aCharset, mCharset, &same);
-    if(NS_SUCCEEDED(res) && same)
+    if (charsetName.Equals(mCharset))
     {
       mCharsetSource = aSource;
       return NS_OK; // no difference, don't change it
     }
   }
 
   // different, need to change it
-  nsCString charsetName;
-  res = nsCharsetAlias::GetPreferred(aCharset, charsetName);
-  MOZ_ASSERT(NS_SUCCEEDED(res), "Should never call with a bogus aCharset.");
 
   mCharset.Assign(charsetName);
 
   mCharsetSource = aSource;
 
   NS_ASSERTION(nsParser::GetCharsetConverterManager(),
                "Must have the charset converter manager!");
 
-  res = nsParser::GetCharsetConverterManager()->
+  nsresult res = nsParser::GetCharsetConverterManager()->
     GetUnicodeDecoderRaw(mCharset.get(), getter_AddRefs(mUnicodeDecoder));
   if (NS_SUCCEEDED(res) && mUnicodeDecoder)
   {
      // We need to detect conversion error of character to support XML
      // encoding error.
      mUnicodeDecoder->SetInputErrorBehavior(nsIUnicodeDecoder::kOnError_Signal);
   }