Bug 796882 - Implement CSS charset handling according to CSS3 Syntax. r=bzbarsky.
author Henri Sivonen <hsivonen@iki.fi>
Thu, 15 Nov 2012 10:47:30 +0200
changeset 113365 b301f9b2e95636e700a2e9e1d65002142b66069e
parent 113364 c6b8170e901381bf352a4bf20b91c4ab6160210b
child 113366 a9500b386854cf11cf9ffed05dc1921386bdb61d
push id 23869
push user emorley@mozilla.com
push date Thu, 15 Nov 2012 16:18:16 +0000
treeherder mozilla-central@a37525d304d9 [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers bzbarsky
bugs 796882
milestone 19.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 796882 - Implement CSS charset handling according to CSS3 Syntax. r=bzbarsky.
layout/reftests/css-charset/test-charset-utf-16-be-no-bom.css
layout/reftests/css-charset/test-charset-utf-16-be-no-bom.html
layout/reftests/css-charset/test-charset-utf-16-le-no-bom.css
layout/reftests/css-charset/test-charset-utf-16-le-no-bom.html
layout/style/Loader.cpp
index 7218603e6a30694772d53f1ca4e1410ffa06d05c..f877df030cec5a8afb5609441c439242008151e8
GIT binary patch
literal 82
zc${N!U`S@jU`S*rVkl-vWhh}#U{GQRWe8z#W6)(VWH4iJVsHhDSTk@jBr)VOq%c$h
bRa66UGLV-8#6=8NKwc40ZwgSA0z)kT3GNKj
--- a/layout/reftests/css-charset/test-charset-utf-16-be-no-bom.html
+++ b/layout/reftests/css-charset/test-charset-utf-16-be-no-bom.html
@@ -1,18 +1,19 @@
 <!DOCTYPE html>
 <html>
   <head>
-    <title>CSS 2.1 Test Suite: @charset</title>
+    <title>CSS 2.1 Test Suite: @charset, modified for CSS3</title>
     <link rel="author" title="Boris Zbarsky" href="mailto:bzbarsky@mit.edu" />
+    <link rel="author" title="Henri Sivonen" href="mailto:hsivonen@iki.fi" />
     <link rel="author" title="Mozilla Corporation" href="http://mozilla.com/" />
     <link rel="help" href="http://www.w3.org/TR/CSS21/syndata.html#charset"/>
     <meta name="flags" content="" />
     <style type="text/css">
-      body { color: red; }
+      body { color: green; }
     </style>
     <link rel="stylesheet" type="text/css" charset="us-ascii"
           href="test-charset-utf-16-be-no-bom.css" />
   </head>
   <body>
     This should be green
   </body>
 </html>
index 0da429cfd5a5dbf1da8e7c00b3c8740ca4f03489..b2a1129c0429b85cf06c09495272bda041a07699
GIT binary patch
literal 82
zc${-!NM^`jNMtBtC}v1yC}B`wP+|yW2w`wz&}A@WFk|pxa0QB3GjK5^G2}C(FjN9n
bR0DA`ke36*MGRIzUJ+1l3Q&~-LoEXU7_<!C
--- a/layout/reftests/css-charset/test-charset-utf-16-le-no-bom.html
+++ b/layout/reftests/css-charset/test-charset-utf-16-le-no-bom.html
@@ -1,18 +1,19 @@
 <!DOCTYPE html>
 <html>
   <head>
-    <title>CSS 2.1 Test Suite: @charset</title>
+    <title>CSS 2.1 Test Suite: @charset, modified for CSS3</title>
     <link rel="author" title="Boris Zbarsky" href="mailto:bzbarsky@mit.edu" />
+    <link rel="author" title="Henri Sivonen" href="mailto:hsivonen@iki.fi" />
     <link rel="author" title="Mozilla Corporation" href="http://mozilla.com/" />
     <link rel="help" href="http://www.w3.org/TR/CSS21/syndata.html#charset"/>
     <meta name="flags" content="" />
     <style type="text/css">
-      body { color: red; }
+      body { color: green; }
     </style>
     <link rel="stylesheet" type="text/css" charset="us-ascii"
           href="test-charset-utf-16-le-no-bom.css" />
   </head>
   <body>
     This should be green
   </body>
 </html>
--- a/layout/style/Loader.cpp
+++ b/layout/style/Loader.cpp
@@ -61,16 +61,18 @@
 #include "nsIDOMStyleSheet.h"
 #include "nsIDOMCSSStyleSheet.h"
 #include "nsError.h"
 
 #include "nsIChannelPolicy.h"
 #include "nsIContentSecurityPolicy.h"
 #include "nsCycleCollectionParticipant.h"
 
+#include "mozilla/dom/EncodingUtils.h"
+using mozilla::dom::EncodingUtils;
 
 /**
  * OVERALL ARCHITECTURE
  *
  * The CSS Loader gets requests to load various sorts of style sheets:
  * inline style from <style> elements, linked style, @import-ed child
  * sheets, non-document sheets.  The loader handles the following tasks:
  *
@@ -605,198 +607,175 @@ Loader::SetPreferredSheet(const nsAStrin
     }
   }
 
   return NS_OK;
 }
 
 static const char kCharsetSym[] = "@charset \"";
 
-static nsresult GetCharsetFromData(const unsigned char* aStyleSheetData,
-                                   uint32_t aDataLength,
-                                   nsACString& aCharset)
+static bool GetCharsetFromData(const char* aStyleSheetData,
+                               uint32_t aDataLength,
+                               nsACString& aCharset)
 {
   aCharset.Truncate();
   if (aDataLength <= sizeof(kCharsetSym) - 1)
-    return NS_ERROR_NOT_AVAILABLE;
-  uint32_t step = 1;
-  uint32_t pos = 0;
-  bool bigEndian = false;
-  // Determine the encoding type.  If we have a BOM, set aCharset to the
-  // charset listed for that BOM in http://www.w3.org/TR/REC-xml#sec-guessing;
-  // that way even if we don't have a valid @charset rule we can use the BOM to
-  // get a reasonable charset.  If we do have an @charset rule, the string from
-  // that will override this fallback setting of aCharset.
-  if (*aStyleSheetData == 0x40 && *(aStyleSheetData+1) == 0x63 /* '@c' */ ) {
-    // 1-byte ASCII-based encoding (ISO-8859-*, UTF-8, etc), no BOM
-    step = 1;
-    pos = 0;
-  }
-  else if (nsContentUtils::CheckForBOM(aStyleSheetData,
-                                       aDataLength, aCharset, &bigEndian)) {
-    if (aCharset.Equals("UTF-8")) {
-      step = 1;
-      pos = 3;
-    }
-    else if (aCharset.Equals("UTF-16")) {
-      step = 2;
-      pos = bigEndian ? 3 : 2;
-    }
-  }
-  else if (aStyleSheetData[0] == 0x00 &&
-           aStyleSheetData[1] == 0x40 &&
-           aStyleSheetData[2] == 0x00 &&
-           aStyleSheetData[3] == 0x63) {
-    // 2-byte big-endian encoding, no BOM
-    step = 2;
-    pos = 1;
-  }
-  else if (aStyleSheetData[0] == 0x40 &&
-           aStyleSheetData[1] == 0x00 &&
-           aStyleSheetData[2] == 0x63 &&
-           aStyleSheetData[3] == 0x00) {
-    // 2-byte little-endian encoding, no BOM
-    step = 2;
-    pos = 0;
-  }
-  else {
-    // no clue what this is
-    return NS_ERROR_UNEXPECTED;
+    return false;
+
+  if (strncmp(aStyleSheetData,
+              kCharsetSym,
+              sizeof(kCharsetSym) - 1)) {
+    return false;
   }
 
-  uint32_t index = 0;
-  while (pos < aDataLength && index < sizeof(kCharsetSym) - 1) {
-    if (aStyleSheetData[pos] != kCharsetSym[index]) {
-      // If we have a guess as to the charset based on the BOM, then
-      // we can just return NS_OK even if there is no valid @charset
-      // rule.
-      return aCharset.IsEmpty() ? NS_ERROR_NOT_AVAILABLE : NS_OK;
+  for (uint32_t i = sizeof(kCharsetSym) - 1; i < aDataLength; ++i) {
+    char c = aStyleSheetData[i];
+    if (c == '"') {
+      ++i;
+      if (i < aDataLength && aStyleSheetData[i] == ';') {
+        return true;
+      }
+      // fail
+      break;
     }
-    ++index;
-    pos += step;
+    aCharset.Append(c);
   }
 
-  nsAutoCString charset;
-  while (pos < aDataLength) {
-    if (aStyleSheetData[pos] == '"') {
-      break;
-    }
-
-    // casting to avoid ambiguities
-    charset.Append(char(aStyleSheetData[pos]));
-    pos += step;
-  }
-
-  // Check for the ending ';'
-  pos += step;
-  if (pos >= aDataLength || aStyleSheetData[pos] != ';') {
-    return aCharset.IsEmpty() ? NS_ERROR_NOT_AVAILABLE : NS_OK;
-  }
-
-  aCharset = charset;
-  return NS_OK;
+  // Did not see end quote or semicolon
+  aCharset.Truncate();
+  return false;
 }
 
 NS_IMETHODIMP
 SheetLoadData::OnDetermineCharset(nsIUnicharStreamLoader* aLoader,
                                   nsISupports* aContext,
                                   nsACString const& aSegment,
                                   nsACString& aCharset)
 {
   NS_PRECONDITION(!mOwningElement || mCharsetHint.IsEmpty(),
                   "Can't have element _and_ charset hint");
 
   LOG_URI("SheetLoadData::OnDetermineCharset for '%s'", mURI);
-  nsCOMPtr<nsIChannel> channel;
-  nsresult result = aLoader->GetChannel(getter_AddRefs(channel));
-  if (NS_FAILED(result))
-    channel = nullptr;
+
+  // The precedence is (per CSS3 Syntax 2012-11-08 ED):
+  // BOM
+  // Channel
+  // @charset rule
+  // charset attribute on the referrer
+  // encoding of the referrer
+  // UTF-8
 
   aCharset.Truncate();
 
-  /*
-   * First determine the charset (if one is indicated)
-   * 1)  Check nsIChannel::contentCharset
-   * 2)  Check @charset rules in the data
-   * 3)  Check "charset" attribute of the <LINK> or <?xml-stylesheet?>
-   *
-   * If all these fail to give us a charset, fall back on our default
-   * (parent sheet charset, document charset or ISO-8859-1 in that order)
-   */
-  if (channel) {
-    channel->GetContentCharset(aCharset);
+  if (nsContentUtils::CheckForBOM((const unsigned char*)aSegment.BeginReading(),
+                                  aSegment.Length(),
+                                  aCharset)) {
+    // aCharset is now either "UTF-16" or "UTF-8".
+    // The UTF-16 decoder will re-sniff and swallow the BOM.
+    // The UTF-8 decoder will swallow the BOM.
+    mCharset.Assign(aCharset);
+#ifdef PR_LOGGING
+    LOG(("  Setting from BOM to: %s", PromiseFlatCString(aCharset).get()));
+#endif
+    return NS_OK;
   }
 
-  result = NS_ERROR_NOT_AVAILABLE;
-
+  nsCOMPtr<nsIChannel> channel;
+  nsAutoCString specified;
+  aLoader->GetChannel(getter_AddRefs(channel));
+  if (channel) {
+    channel->GetContentCharset(specified);
+    if (EncodingUtils::FindEncodingForLabel(specified, aCharset)) {
+      mCharset.Assign(aCharset);
 #ifdef PR_LOGGING
-  if (! aCharset.IsEmpty()) {
-    LOG(("  Setting from HTTP to: %s", PromiseFlatCString(aCharset).get()));
-  }
+      LOG(("  Setting from HTTP to: %s", PromiseFlatCString(aCharset).get()));
 #endif
-
-  if (aCharset.IsEmpty()) {
-    //  We have no charset
-    //  Try @charset rule and BOM
-    result = GetCharsetFromData((const unsigned char*)aSegment.BeginReading(),
-                                aSegment.Length(), aCharset);
-#ifdef PR_LOGGING
-    if (NS_SUCCEEDED(result)) {
-      LOG(("  Setting from @charset rule or BOM: %s",
-           PromiseFlatCString(aCharset).get()));
+      return NS_OK;
     }
-#endif
   }
 
-  if (aCharset.IsEmpty()) {
-    // Now try the charset on the <link> or processing instruction
-    // that loaded us
-    if (mOwningElement) {
-      nsAutoString elementCharset;
-      mOwningElement->GetCharset(elementCharset);
-      LossyCopyUTF16toASCII(elementCharset, aCharset);
+  if (GetCharsetFromData(aSegment.BeginReading(),
+                         aSegment.Length(),
+                         specified)) {
+    if (EncodingUtils::FindEncodingForLabel(specified, aCharset)) {
+      // FindEncodingForLabel currently never returns UTF-16LE but will
+      // probably change to never return UTF-16 instead, so check both here
+      // to avoid relying on the exact behavior.
+      if (aCharset.EqualsLiteral("UTF-16") ||
+          aCharset.EqualsLiteral("UTF-16BE") ||
+          aCharset.EqualsLiteral("UTF-16LE")) {
+        // Be consistent with HTML <meta> handling in face of impossibility.
+        // When the @charset rule itself evidently was not UTF-16-encoded,
+        // it saying UTF-16 has to be a lie.
+        aCharset.AssignLiteral("UTF-8");
+      }
+      mCharset.Assign(aCharset);
 #ifdef PR_LOGGING
-      if (! aCharset.IsEmpty()) {
-        LOG(("  Setting from property on element: %s",
-             PromiseFlatCString(aCharset).get()));
-      }
+      LOG(("  Setting from @charset rule to: %s",
+          PromiseFlatCString(aCharset).get()));
 #endif
-    } else {
-      // If mCharsetHint is empty, that's ok; aCharset is known empty here
-      aCharset = mCharsetHint;
+      return NS_OK;
     }
   }
 
-  if (aCharset.IsEmpty() && mParentData) {
-    aCharset = mParentData->mCharset;
+  // Now try the charset on the <link> or processing instruction
+  // that loaded us
+  if (mOwningElement) {
+    nsAutoString specified16;
+    mOwningElement->GetCharset(specified16);
+    if (EncodingUtils::FindEncodingForLabel(specified16, aCharset)) {
+      mCharset.Assign(aCharset);
 #ifdef PR_LOGGING
-    if (! aCharset.IsEmpty()) {
-      LOG(("  Setting from parent sheet: %s",
-           PromiseFlatCString(aCharset).get()));
+      LOG(("  Setting from charset attribute to: %s",
+          PromiseFlatCString(aCharset).get()));
+#endif
+      return NS_OK;
     }
-#endif
   }
 
-  if (aCharset.IsEmpty() && mLoader->mDocument) {
+  // In the preload case, the value of the charset attribute on <link> comes
+  // in via mCharsetHint instead.
+  if (EncodingUtils::FindEncodingForLabel(mCharsetHint, aCharset)) {
+    mCharset.Assign(aCharset);
+#ifdef PR_LOGGING
+      LOG(("  Setting from charset attribute (preload case) to: %s",
+          PromiseFlatCString(aCharset).get()));
+#endif
+    return NS_OK;
+  }
+
+  // Try charset from the parent stylesheet.
+  if (mParentData) {
+    aCharset = mParentData->mCharset;
+    if (!aCharset.IsEmpty()) {
+      mCharset.Assign(aCharset);
+#ifdef PR_LOGGING
+      LOG(("  Setting from parent sheet to: %s",
+          PromiseFlatCString(aCharset).get()));
+#endif
+      return NS_OK;
+    }
+  }
+
+  if (mLoader->mDocument) {
     // no useful data on charset.  Try the document charset.
     aCharset = mLoader->mDocument->GetDocumentCharacterSet();
+    MOZ_ASSERT(!aCharset.IsEmpty());
+    mCharset.Assign(aCharset);
 #ifdef PR_LOGGING
-    LOG(("  Set from document: %s", PromiseFlatCString(aCharset).get()));
+    LOG(("  Setting from document to: %s", PromiseFlatCString(aCharset).get()));
 #endif
+    return NS_OK;
   }
 
-  if (aCharset.IsEmpty()) {
-    NS_WARNING("Unable to determine charset for sheet, using ISO-8859-1!");
+  aCharset.AssignLiteral("UTF-8");
+  mCharset = aCharset;
 #ifdef PR_LOGGING
-    LOG_WARN(("  Falling back to ISO-8859-1"));
+  LOG(("  Setting from default to: %s", PromiseFlatCString(aCharset).get()));
 #endif
-    aCharset.AssignLiteral("ISO-8859-1");
-  }
-
-  mCharset = aCharset;
   return NS_OK;
 }
 
 already_AddRefed<nsIURI>
 SheetLoadData::GetReferrerURI()
 {
   nsCOMPtr<nsIURI> uri;
   if (mParentData)