Bug 631751 - Sniff Basic Latin BOMless UTF-16 for IE compat. r=bzbarsky, a=bzbarsky.
author: Henri Sivonen <hsivonen@iki.fi>
Wed, 16 Feb 2011 08:40:35 +0200
changeset 62647 8eb1b3531dd96be66ae821dbce402941ed63540d
parent 62646 cc67cf730f015242834055cf62cff39a0613888a
child 62648 87537a1f7e1f6fb73e9660456c8d54a3ce931df8
push id: unknown
push user: unknown
push date: unknown
reviewers: bzbarsky, bzbarsky
bugs: 631751
milestone: 2.0b12pre
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 631751 - Sniff Basic Latin BOMless UTF-16 for IE compat. r=bzbarsky, a=bzbarsky.
extensions/universalchardet/tests/Makefile.in
extensions/universalchardet/tests/bug631751be_text.html
extensions/universalchardet/tests/bug631751le_text.html
extensions/universalchardet/tests/test_bug631751be.html
extensions/universalchardet/tests/test_bug631751le.html
parser/html/nsHtml5StreamParser.cpp
parser/html/nsHtml5StreamParser.h
parser/htmlparser/public/nsIParser.h
--- a/extensions/universalchardet/tests/Makefile.in
+++ b/extensions/universalchardet/tests/Makefile.in
@@ -65,12 +65,16 @@ relativesrcdir = extensions/universalcha
 		test_bug431054.html \
 		test_bug431054-japanese.html \
 		bug488426_text.html \
 		test_bug488426.html \
 		bug547487_text.html \
 		test_bug547487.html \
 		bug620106_text.html \
 		test_bug620106.html \
+		bug631751le_text.html \
+		test_bug631751le.html \
+		bug631751be_text.html \
+		test_bug631751be.html \
 		$(NULL)
 
 libs:: $(_TEST_FILES)
 	$(INSTALL) $(foreach f,$^,"$f") $(DEPTH)/_tests/testing/mochitest/chrome/$(relativesrcdir)
new file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..104d50399860172e3ba0d4b0a41529b4afb00686
GIT binary patch
literal 354
zc$~eEy$*sv5QM*)r+^1A#>R?J;7=uy0t-t7BbNXcTw;8ChSiv$y}4X&W_NaGbHqe!
zIkKVPOwNvw22V#zLO}e(u9Vbo=5e~MY6+?EjfHTpzLTn#S1Rw!6oXF4NjZC|R;<My
uHs6@8RNqkvAA~38X};pF*PE+X;&PT5SY^J`MH4koWB=IcJM*>C=g&rs{5hNe
new file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..a1e5f6bfbdc404e098ca47a4afdb7a0586a60133
GIT binary patch
literal 366
zc$~eEy$*sv5QM*)r@#j=#>R?J;7^6YLJLX+6D~n0yu|SK4vUEadUM&_&hE_4W+PS{
z$k=lw=faVY8qbr4n1E=39u!nx<Z+5dwV2fM5v>)iSWetl6B9X{t@F|gxykJ$)|Q>n
u+myBO3iTbO@>KaSf6$k7`n^r{^4(0F1FK9oyGv2g8~Wd7bLVld-~B!$@jO%j
new file mode 100644
--- /dev/null
+++ b/extensions/universalchardet/tests/test_bug631751be.html
@@ -0,0 +1,33 @@
+<!DOCTYPE HTML>
+<html>
+<!--
+https://bugzilla.mozilla.org/show_bug.cgi?id=631751
+-->
+<head>
+  <title>Test for Bug 631751</title>
+  <script type="text/javascript" 
+          src="chrome://mochikit/content/MochiKit/packed.js"></script>
+  <script type="text/javascript" 
+          src="chrome://mochikit/content/tests/SimpleTest/SimpleTest.js">
+          </script>
+  <script type="text/javascript" src="CharsetDetectionTests.js"></script>
+  <link rel="stylesheet" type="text/css" 
+        href="chrome://mochikit/content/tests/SimpleTest/test.css" />
+</head>
+<body>
+<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=631751">Mozilla Bug 631751</a>
+<p id="display"></p>
+<div id="content" style="display: none">  
+</div>
+<iframe id="testframe"></iframe>
+<pre id="test">
+<script class="testbody" type="text/javascript">
+/** Test for Bug 631751 **/
+/* Note! This test uses the chardet test harness but doesn't test chardet! */
+CharsetDetectionTests("bug631751be_text.html",
+		      "UTF-16BE",
+		      new Array(""));
+</script>
+</pre>
+</body>
+</html>
new file mode 100644
--- /dev/null
+++ b/extensions/universalchardet/tests/test_bug631751le.html
@@ -0,0 +1,33 @@
+<!DOCTYPE HTML>
+<html>
+<!--
+https://bugzilla.mozilla.org/show_bug.cgi?id=631751
+-->
+<head>
+  <title>Test for Bug 631751</title>
+  <script type="text/javascript" 
+          src="chrome://mochikit/content/MochiKit/packed.js"></script>
+  <script type="text/javascript" 
+          src="chrome://mochikit/content/tests/SimpleTest/SimpleTest.js">
+          </script>
+  <script type="text/javascript" src="CharsetDetectionTests.js"></script>
+  <link rel="stylesheet" type="text/css" 
+        href="chrome://mochikit/content/tests/SimpleTest/test.css" />
+</head>
+<body>
+<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=631751">Mozilla Bug 631751</a>
+<p id="display"></p>
+<div id="content" style="display: none">  
+</div>
+<iframe id="testframe"></iframe>
+<pre id="test">
+<script class="testbody" type="text/javascript">
+/** Test for Bug 631751 **/
+/* Note! This test uses the chardet test harness but doesn't test chardet! */
+CharsetDetectionTests("bug631751le_text.html",
+		      "UTF-16LE",
+		      new Array(""));
+</script>
+</pre>
+</body>
+</html>
--- a/parser/html/nsHtml5StreamParser.cpp
+++ b/parser/html/nsHtml5StreamParser.cpp
@@ -329,28 +329,74 @@ nsHtml5StreamParser::SetupDecodingFromBo
   mFeedChardet = PR_FALSE;
   mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
   mSniffingBuffer = nsnull;
   mMetaScanner = nsnull;
   mBomState = BOM_SNIFFING_OVER;
   return rv;
 }
 
+void
+nsHtml5StreamParser::SniffBOMlessUTF16BasicLatin(const PRUint8* aFromSegment,
+                                                 PRUint32 aCountToSniffingLimit)
+{
+  // Need at least 30 bytes: "<title></title>" is 15 UTF-16 code units.
+  if (mSniffingLength + aCountToSniffingLimit < 30) {
+    return;
+  }
+  // byteNonZero[0]: non-zero byte seen at an even offset; [1]: at an odd offset
+  PRBool byteNonZero[2] = { PR_FALSE, PR_FALSE };
+  PRUint32 i = 0; // buffered bytes scanned; keeps parity continuous into aFromSegment
+  if (mSniffingBuffer) {
+    for (; i < mSniffingLength; ++i) {
+      if (mSniffingBuffer[i]) {
+        if (byteNonZero[1 - (i % 2)]) {
+          return; // non-zero bytes at both parities: leave the charset untouched
+        }
+        byteNonZero[i % 2] = PR_TRUE;
+      }
+    }
+  }
+  if (aFromSegment) { // may be null; then only the sniffing buffer is examined
+    for (PRUint32 j = 0; j < aCountToSniffingLimit; ++j) {
+      if (aFromSegment[j]) {
+        if (byteNonZero[1 - ((i + j) % 2)]) {
+          return; // non-zero bytes at both parities: leave the charset untouched
+        }
+        byteNonZero[(i + j) % 2] = PR_TRUE;
+      }
+    }
+  }
+
+  if (byteNonZero[0]) { // non-zero bytes at even offsets only: low byte comes first
+    mCharset.Assign("UTF-16LE");
+  } else { // NOTE(review): also reached when every sniffed byte was zero
+    mCharset.Assign("UTF-16BE");
+  }
+  mCharsetSource = kCharsetFromIrreversibleAutoDetection;
+  mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
+  mFeedChardet = PR_FALSE; // makes FinalizeSniffing skip its chardet step
+}
+
 nsresult
 nsHtml5StreamParser::FinalizeSniffing(const PRUint8* aFromSegment, // can be null
                                       PRUint32 aCount,
                                       PRUint32* aWriteCount,
                                       PRUint32 aCountToSniffingLimit)
 {
   NS_ASSERTION(IsParserThread(), "Wrong thread!");
   // meta scan failed.
   if (mCharsetSource >= kCharsetFromHintPrevDoc) {
     mFeedChardet = PR_FALSE;
     return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment, aCount, aWriteCount);
   }
+  // Check for BOMless UTF-16 with Basic
+  // Latin content for compat with IE. See bug 631751.
+  SniffBOMlessUTF16BasicLatin(aFromSegment, aCountToSniffingLimit);
+  // the charset may have been set now
   // maybe try chardet now; 
   if (mFeedChardet) {
     PRBool dontFeed;
     nsresult rv;
     if (mSniffingBuffer) {
       rv = mChardet->DoIt((const char*)mSniffingBuffer.get(), mSniffingLength, &dontFeed);
       mFeedChardet = !dontFeed;
       NS_ENSURE_SUCCESS(rv, rv);
--- a/parser/html/nsHtml5StreamParser.h
+++ b/parser/html/nsHtml5StreamParser.h
@@ -251,17 +251,23 @@ class nsHtml5StreamParser : public nsISt
                               PRUint32* aWriteCount);
 
     /**
      * Push bytes from network when there is a Unicode decoder already
      */
     nsresult WriteStreamBytes(const PRUint8* aFromSegment,
                               PRUint32 aCount,
                               PRUint32* aWriteCount);
-    
+
+    /**
+     * Sniff BOMless UTF-16LE/BE: all non-zero bytes must share one byte parity.
+     */
+    void SniffBOMlessUTF16BasicLatin(const PRUint8* aFromSegment,
+                                     PRUint32 aCountToSniffingLimit);
+
     /**
      * <meta charset> scan failed. Try chardet if applicable. After this, the
      * the parser will have some encoding even if a last resolt fallback.
      *
      * @param aFromSegment The current network buffer or null if the sniffing
      *                     buffer is being flushed due to network stream ending.
      * @param aCount       The number of bytes in aFromSegment (ignored if
      *                     aFromSegment is null)
--- a/parser/htmlparser/public/nsIParser.h
+++ b/parser/htmlparser/public/nsIParser.h
@@ -93,23 +93,24 @@ enum eParserDocType {
 #define kCharsetFromUserDefault         2
 #define kCharsetFromDocTypeDefault      3
 #define kCharsetFromCache               4
 #define kCharsetFromParentFrame         5
 #define kCharsetFromAutoDetection       6
 #define kCharsetFromHintPrevDoc         7
 #define kCharsetFromMetaPrescan         8 // this one and smaller: HTML5 Tentative
 #define kCharsetFromMetaTag             9 // this one and greater: HTML5 Confident
-#define kCharsetFromByteOrderMark      10
-#define kCharsetFromChannel            11
-#define kCharsetFromOtherComponent     12
+#define kCharsetFromIrreversibleAutoDetection 10
+#define kCharsetFromByteOrderMark      11
+#define kCharsetFromChannel            12
+#define kCharsetFromOtherComponent     13
 // Levels below here will be forced onto childframes too
-#define kCharsetFromParentForced       13
-#define kCharsetFromUserForced         14
-#define kCharsetFromPreviousLoading    15
+#define kCharsetFromParentForced       14
+#define kCharsetFromUserForced         15
+#define kCharsetFromPreviousLoading    16
 
 enum eStreamState {eNone,eOnStart,eOnDataAvail,eOnStop};
 
 /** 
  *  FOR DEBUG PURPOSE ONLY
  *
  *  Use this interface to query objects that contain content information.
  *  Ex. Parser can trigger dump content by querying the sink that has