Bug 716579 - Let a BOM override HTTP-level charset in the HTML and XML parsers. r=smaug.
author: Henri Sivonen <hsivonen@iki.fi>
Tue, 06 Nov 2012 13:57:51 +0200
changeset 112404 12288a8a5037d6389ac228d2972b1219102dfd7d
parent 112403 b0e7f060ac7a0593dad1c524e057bf3a9e69864d
child 112405 7c1dc22a0e39b1d95f33f8a0fa3773b454987c89
push id: 1997
push user: akeybl@mozilla.com
push date: Mon, 07 Jan 2013 21:25:26 +0000
treeherder: mozilla-esr52@cf8750abee06 [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: smaug
bugs: 716579
milestone: 19.0a1
Bug 716579 - Let a BOM override HTTP-level charset in the HTML and XML parsers. r=smaug.
parser/html/nsHtml5StreamParser.cpp
parser/htmlparser/src/nsParser.cpp
parser/htmlparser/src/nsParser.h
parser/htmlparser/src/nsScanner.cpp
parser/htmlparser/src/nsScanner.h
parser/htmlparser/tests/mochitest/Makefile.in
parser/htmlparser/tests/mochitest/file_bug716579-16.html
parser/htmlparser/tests/mochitest/file_bug716579-16.html^headers^
parser/htmlparser/tests/mochitest/file_bug716579-16.xhtml
parser/htmlparser/tests/mochitest/file_bug716579-16.xhtml^headers^
parser/htmlparser/tests/mochitest/file_bug716579-8.html
parser/htmlparser/tests/mochitest/file_bug716579-8.html^headers^
parser/htmlparser/tests/mochitest/file_bug716579-8.xhtml
parser/htmlparser/tests/mochitest/file_bug716579-8.xhtml^headers^
parser/htmlparser/tests/mochitest/test_bug716579.html
parser/nsCharsetSource.h
--- a/parser/html/nsHtml5StreamParser.cpp
+++ b/parser/html/nsHtml5StreamParser.cpp
@@ -19,16 +19,17 @@
 #include "nsHtml5RefPtr.h"
 #include "nsIScriptError.h"
 #include "mozilla/Preferences.h"
 #include "nsHtml5Highlighter.h"
 #include "expat_config.h"
 #include "expat.h"
 #include "nsINestedURI.h"
 #include "nsCharsetSource.h"
+#include "nsIWyciwygChannel.h"
 
 using namespace mozilla;
 
 
 int32_t nsHtml5StreamParser::sTimerInitialDelay = 120;
 int32_t nsHtml5StreamParser::sTimerSubsequentDelay = 120;
 
 // static
@@ -490,18 +491,18 @@ HandleProcessingInstruction(void* aUserD
 
 nsresult
 nsHtml5StreamParser::FinalizeSniffing(const uint8_t* aFromSegment, // can be null
                                       uint32_t aCount,
                                       uint32_t* aWriteCount,
                                       uint32_t aCountToSniffingLimit)
 {
   NS_ASSERTION(IsParserThread(), "Wrong thread!");
-  NS_ASSERTION(mCharsetSource < kCharsetFromMetaTag,
-      "Should not finalize sniffing when already confident.");
+  NS_ASSERTION(mCharsetSource < kCharsetFromParentForced,
+      "Should not finalize sniffing when using forced charset.");
   if (mMode == VIEW_SOURCE_XML) {
     static const XML_Memory_Handling_Suite memsuite =
       {
         (void *(*)(size_t))moz_xmalloc,
         (void *(*)(void *, size_t))moz_xrealloc,
         moz_free
       };
 
@@ -629,16 +630,21 @@ nsHtml5StreamParser::FinalizeSniffing(co
 nsresult
 nsHtml5StreamParser::SniffStreamBytes(const uint8_t* aFromSegment,
                                       uint32_t aCount,
                                       uint32_t* aWriteCount)
 {
   NS_ASSERTION(IsParserThread(), "Wrong thread!");
   nsresult rv = NS_OK;
   uint32_t writeCount;
+
+  // mCharset and mCharsetSource potentially have come from channel or higher
+  // by now. If we find a BOM, SetupDecodingFromBom() will overwrite them.
+  // If we don't find a BOM, the previously set values of mCharset and
+  // mCharsetSource are not modified by the BOM sniffing here.
   for (uint32_t i = 0; i < aCount && mBomState != BOM_SNIFFING_OVER; i++) {
     switch (mBomState) {
       case BOM_SNIFFING_NOT_STARTED:
         NS_ASSERTION(i == 0, "Bad BOM sniffing state.");
         switch (*aFromSegment) {
           case 0xEF:
             mBomState = SEEN_UTF_8_FIRST_BYTE;
             break;
@@ -696,18 +702,46 @@ nsHtml5StreamParser::SniffStreamBytes(co
         }
         mBomState = BOM_SNIFFING_OVER;
         break;
       default:
         mBomState = BOM_SNIFFING_OVER;
         break;
     }
   }
-  // if we get here, there either was no BOM or the BOM sniffing isn't complete yet
+  // if we get here, there either was no BOM or the BOM sniffing isn't complete
+  // yet
   
+  if (mBomState == BOM_SNIFFING_OVER &&
+    mCharsetSource >= kCharsetFromChannel) {
+    // There was no BOM and the charset came from channel or higher. mCharset
+    // still contains the charset from the channel or higher as set by an
+    // earlier call to SetDocumentCharset(), because no BOM was found that
+    // would have overwritten mCharset.
+    nsCOMPtr<nsICharsetConverterManager> convManager =
+      do_GetService(NS_CHARSETCONVERTERMANAGER_CONTRACTID);
+    convManager->GetUnicodeDecoder(mCharset.get(),
+                                   getter_AddRefs(mUnicodeDecoder));
+    if (mUnicodeDecoder) {
+      mUnicodeDecoder->SetInputErrorBehavior(
+          nsIUnicodeDecoder::kOnError_Recover);
+      mFeedChardet = false;
+      mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
+      mMetaScanner = nullptr;
+      return WriteSniffingBufferAndCurrentSegment(aFromSegment,
+                                                  aCount,
+                                                  aWriteCount);
+    } else {
+      // nsHTMLDocument is supposed to make sure this does not happen. Let's
+      // deal with this anyway, since who knows how kCharsetFromOtherComponent
+      // is used.
+      mCharsetSource = kCharsetFromWeakDocTypeDefault;
+    }
+  }
+
   if (!mMetaScanner && (mMode == NORMAL ||
                         mMode == VIEW_SOURCE_HTML ||
                         mMode == LOAD_AS_DATA)) {
     mMetaScanner = new nsHtml5MetaScanner();
   }
   
   if (mSniffingLength + aCount >= NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE) {
     // this is the last buffer
@@ -958,17 +992,23 @@ nsHtml5StreamParser::OnStartRequest(nsIR
     // Remember this in case chardet overwrites mCharsetSource
     mInitialEncodingWasFromParentFrame = true;
   }
 
   if (mCharsetSource >= kCharsetFromAutoDetection) {
     mFeedChardet = false;
   }
   
-  if (mCharsetSource <= kCharsetFromMetaPrescan) {
+  nsCOMPtr<nsIWyciwygChannel> wyciwygChannel(do_QueryInterface(mRequest));
+  if (wyciwygChannel) {
+    mReparseForbidden = true;
+    mFeedChardet = false;
+    // If we are reloading a document.open()ed doc, fall through to converter
+    // instantiation here and avoid BOM sniffing.
+  } else if (mCharsetSource < kCharsetFromParentForced) {
     // we aren't ready to commit to an encoding yet
     // leave converter uninstantiated for now
     return NS_OK;
   }
   
   nsCOMPtr<nsICharsetConverterManager> convManager = do_GetService(NS_CHARSETCONVERTERMANAGER_CONTRACTID, &rv);
   NS_ENSURE_SUCCESS(rv, rv);
   rv = convManager->GetUnicodeDecoder(mCharset.get(), getter_AddRefs(mUnicodeDecoder));
--- a/parser/htmlparser/src/nsParser.cpp
+++ b/parser/htmlparser/src/nsParser.cpp
@@ -36,16 +36,17 @@
 #include "nsDataHashtable.h"
 #include "nsIThreadPool.h"
 #include "nsXPCOMCIDInternal.h"
 #include "nsMimeTypes.h"
 #include "mozilla/CondVar.h"
 #include "mozilla/Mutex.h"
 #include "nsParserConstants.h"
 #include "nsCharsetSource.h"
+#include "nsContentUtils.h"
 
 using namespace mozilla;
 
 #define NS_PARSER_FLAG_PARSER_ENABLED         0x00000002
 #define NS_PARSER_FLAG_OBSERVERS_ENABLED      0x00000004
 #define NS_PARSER_FLAG_PENDING_CONTINUE_EVENT 0x00000008
 #define NS_PARSER_FLAG_FLUSH_TOKENS           0x00000020
 #define NS_PARSER_FLAG_CAN_TOKENIZE           0x00000040
@@ -1245,18 +1246,17 @@ nsParser::Parse(nsIURI* aURL,
   if (aURL) {
     nsAutoCString spec;
     nsresult rv = aURL->GetSpec(spec);
     if (rv != NS_OK) {
       return rv;
     }
     NS_ConvertUTF8toUTF16 theName(spec);
 
-    nsScanner* theScanner = new nsScanner(theName, false, mCharset,
-                                          mCharsetSource);
+    nsScanner* theScanner = new nsScanner(theName, false);
     CParserContext* pc = new CParserContext(mParserContext, theScanner, aKey,
                                             mCommand, aListener);
     if (pc && theScanner) {
       pc->mMultipart = true;
       pc->mContextType = CParserContext::eCTURL;
       pc->mDTDMode = aMode;
       PushContext(*pc);
 
@@ -1306,17 +1306,17 @@ nsParser::Parse(const nsAString& aSource
     CParserContext* pc = mParserContext;
     while (pc && pc->mKey != aKey) {
       pc = pc->mPrevContext;
     }
 
     if (!pc) {
       // Only make a new context if we don't have one, OR if we do, but has a
       // different context key.
-      nsScanner* theScanner = new nsScanner(mUnusedInput, mCharset, mCharsetSource);
+      nsScanner* theScanner = new nsScanner(mUnusedInput);
       NS_ENSURE_TRUE(theScanner, NS_ERROR_OUT_OF_MEMORY);
 
       eAutoDetectResult theStatus = eUnknownDetect;
 
       if (mParserContext &&
           mParserContext->mMimeType.EqualsLiteral("application/xml")) {
         // Ref. Bug 90379
         NS_ASSERTION(mDTD, "How come the DTD is null?");
@@ -1669,310 +1669,121 @@ nsParser::OnStartRequest(nsIRequest *req
   }
 
   rv = NS_OK;
 
   return rv;
 }
 
 
-#define UTF16_BOM "UTF-16"
-#define UTF16_BE "UTF-16BE"
-#define UTF16_LE "UTF-16LE"
-#define UTF8 "UTF-8"
-
 static inline bool IsSecondMarker(unsigned char aChar)
 {
   switch (aChar) {
     case '!':
     case '?':
     case 'h':
     case 'H':
       return true;
     default:
       return false;
   }
 }
 
 static bool
-DetectByteOrderMark(const unsigned char* aBytes, int32_t aLen,
-                    nsCString& oCharset, int32_t& oCharsetSource)
+ExtractCharsetFromXmlDeclaration(const unsigned char* aBytes, int32_t aLen,
+                                 nsCString& oCharset)
 {
- oCharsetSource= kCharsetFromAutoDetection;
- oCharset.Truncate();
- // See http://www.w3.org/TR/2000/REC-xml-20001006#sec-guessing
- // for details
- // Also, MS Win2K notepad now generate 3 bytes BOM in UTF8 as UTF8 signature
- // We need to check that
- // UCS2 BOM FEFF = UTF8 EF BB BF
- switch(aBytes[0])
-	 {
-   case 0x00:
-     if((0x3C==aBytes[1]) && (0x00==aBytes[2])) {
-        // 00 3C 00
-        if(IsSecondMarker(aBytes[3])) {
-           // 00 3C 00 SM UTF-16,  big-endian, no Byte Order Mark 
-           oCharset.Assign(UTF16_BE); 
-           oCharsetSource = kCharsetFromByteOrderMark;
-        } 
-     }
-   break;
-   case 0x3C:
-     if(0x00==aBytes[1] && (0x00==aBytes[3])) {
-        // 3C 00 XX 00
-        if(IsSecondMarker(aBytes[2])) {
-           // 3C 00 SM 00 UTF-16,  little-endian, no Byte Order Mark 
-           oCharset.Assign(UTF16_LE); 
-           oCharsetSource = kCharsetFromByteOrderMark;
-        } 
-     // For html, meta tag detector is invoked before this so that we have 
-     // to deal only with XML here.
-     } else if(                     (0x3F==aBytes[1]) &&
-               (0x78==aBytes[2]) && (0x6D==aBytes[3]) &&
-               (0 == PL_strncmp("<?xml", (char*)aBytes, 5 ))) {
-       // 3C 3F 78 6D
-       // ASCII characters are in their normal positions, so we can safely
-       // deal with the XML declaration in the old C way
-       // The shortest string so far (strlen==5):
-       // <?xml
-       int32_t i;
-       bool versionFound = false, encodingFound = false;
-       for (i=6; i < aLen && !encodingFound; ++i) {
-         // end of XML declaration?
-         if ((((char*)aBytes)[i] == '?') && 
-           ((i+1) < aLen) &&
-           (((char*)aBytes)[i+1] == '>')) {
-           break;
-         }
-         // Version is required.
-         if (!versionFound) {
-           // Want to avoid string comparisons, hence looking for 'n'
-           // and only if found check the string leading to it. Not
-           // foolproof, but fast.
-           // The shortest string allowed before this is  (strlen==13):
-           // <?xml version
-           if ((((char*)aBytes)[i] == 'n') &&
-             (i >= 12) && 
-             (0 == PL_strncmp("versio", (char*)(aBytes+i-6), 6 ))) {
-             // Fast forward through version
-             char q = 0;
-             for (++i; i < aLen; ++i) {
-               char qi = ((char*)aBytes)[i];
-               if (qi == '\'' || qi == '"') {
-                 if (q && q == qi) {
-                   //  ending quote
-                   versionFound = true;
-                   break;
-                 } else {
-                   // Starting quote
-                   q = qi;
-                 }
-               }
-             }
-           }
-         } else {
-           // encoding must follow version
-           // Want to avoid string comparisons, hence looking for 'g'
-           // and only if found check the string leading to it. Not
-           // foolproof, but fast.
-           // The shortest allowed string before this (strlen==26):
-           // <?xml version="1" encoding
-           if ((((char*)aBytes)[i] == 'g') &&
-             (i >= 25) && 
-             (0 == PL_strncmp("encodin", (char*)(aBytes+i-7), 7 ))) {
-             int32_t encStart = 0;
-             char q = 0;
-             for (++i; i < aLen; ++i) {
-               char qi = ((char*)aBytes)[i];
-               if (qi == '\'' || qi == '"') {
-                 if (q && q == qi) {
-                   int32_t count = i - encStart;
-                   // encoding value is invalid if it is UTF-16
-                   if (count > 0 && 
-                     (0 != PL_strcmp("UTF-16", (char*)(aBytes+encStart)))) {
-                     oCharset.Assign((char*)(aBytes+encStart),count);
-                     oCharsetSource = kCharsetFromMetaTag;
-                   }
-                   encodingFound = true;
-                   break;
-                 } else {
-                   encStart = i+1;
-                   q = qi;
-                 }
-               }
-             }
-           }
-         } // if (!versionFound)
-       } // for
-     }
-   break;
-   case 0xEF:  
-     if((0xBB==aBytes[1]) && (0xBF==aBytes[2])) {
-        // EF BB BF
-        // Win2K UTF-8 BOM
-        oCharset.Assign(UTF8); 
-        oCharsetSource= kCharsetFromByteOrderMark;
-     }
-   break;
-   case 0xFE:
-     if(0xFF==aBytes[1]) {
-        // FE FF UTF-16, big-endian 
-        oCharset.Assign(UTF16_BOM); 
-        oCharsetSource= kCharsetFromByteOrderMark;
-     }
-   break;
-   case 0xFF:
-     if(0xFE==aBytes[1]) {
-       // FF FE
-       // UTF-16, little-endian 
-       oCharset.Assign(UTF16_BOM); 
-       oCharsetSource= kCharsetFromByteOrderMark;
-     }
-   break;
-   // case 0x4C: if((0x6F==aBytes[1]) && ((0xA7==aBytes[2] && (0x94==aBytes[3])) {
-   //   We do not care EBCIDIC here....
-   // }
-   // break;
- }  // switch
- return !oCharset.IsEmpty();
+  // This code is rather pointless to have. Might as well reuse expat as
+  // seen in nsHtml5StreamParser. -- hsivonen
+  oCharset.Truncate();
+  if ((aLen >= 5) &&
+      ('<' == aBytes[0]) &&
+      ('?' == aBytes[1]) &&
+      ('x' == aBytes[2]) &&
+      ('m' == aBytes[3]) &&
+      ('l' == aBytes[4])) {
+    int32_t i;
+    bool versionFound = false, encodingFound = false;
+    for (i = 6; i < aLen && !encodingFound; ++i) {
+      // end of XML declaration?
+      if ((((char*) aBytes)[i] == '?') &&
+          ((i + 1) < aLen) &&
+          (((char*) aBytes)[i + 1] == '>')) {
+        break;
+      }
+      // Version is required.
+      if (!versionFound) {
+        // Want to avoid string comparisons, hence looking for 'n'
+        // and only if found check the string leading to it. Not
+        // foolproof, but fast.
+        // The shortest string allowed before this is  (strlen==13):
+        // <?xml version
+        if ((((char*) aBytes)[i] == 'n') &&
+            (i >= 12) &&
+            (0 == PL_strncmp("versio", (char*) (aBytes + i - 6), 6))) {
+          // Fast forward through version
+          char q = 0;
+          for (++i; i < aLen; ++i) {
+            char qi = ((char*) aBytes)[i];
+            if (qi == '\'' || qi == '"') {
+              if (q && q == qi) {
+                //  ending quote
+                versionFound = true;
+                break;
+              } else {
+                // Starting quote
+                q = qi;
+              }
+            }
+          }
+        }
+      } else {
+        // encoding must follow version
+        // Want to avoid string comparisons, hence looking for 'g'
+        // and only if found check the string leading to it. Not
+        // foolproof, but fast.
+        // The shortest allowed string before this (strlen==26):
+        // <?xml version="1" encoding
+        if ((((char*) aBytes)[i] == 'g') && (i >= 25) && (0 == PL_strncmp(
+            "encodin", (char*) (aBytes + i - 7), 7))) {
+          int32_t encStart = 0;
+          char q = 0;
+          for (++i; i < aLen; ++i) {
+            char qi = ((char*) aBytes)[i];
+            if (qi == '\'' || qi == '"') {
+              if (q && q == qi) {
+                int32_t count = i - encStart;
+                // encoding value is invalid if it is UTF-16
+                if (count > 0 && (0 != PL_strcmp("UTF-16",
+                    (char*) (aBytes + encStart)))) {
+                  oCharset.Assign((char*) (aBytes + encStart), count);
+                }
+                encodingFound = true;
+                break;
+              } else {
+                encStart = i + 1;
+                q = qi;
+              }
+            }
+          }
+        }
+      } // if (!versionFound)
+    } // for
+  }
+  return !oCharset.IsEmpty();
 }
 
 inline const char
 GetNextChar(nsACString::const_iterator& aStart,
             nsACString::const_iterator& aEnd)
 {
   NS_ASSERTION(aStart != aEnd, "end of buffer");
   return (++aStart != aEnd) ? *aStart : '\0';
 }
 
-bool
-nsParser::DetectMetaTag(const char* aBytes,
-                        int32_t aLen,
-                        nsCString& aCharset,
-                        int32_t& aCharsetSource)
-{
-  aCharsetSource= kCharsetFromMetaTag;
-  aCharset.SetLength(0);
-
-  // XXX Only look inside HTML documents for now. For XML
-  // documents we should be looking inside the XMLDecl.
-  if (!mParserContext->mMimeType.EqualsLiteral(TEXT_HTML)) {
-    return false;
-  }
-
-  // Fast and loose parsing to determine if we have a complete
-  // META tag in this block, looking upto 2k into it.
-  const nsASingleFragmentCString& str =
-      Substring(aBytes, aBytes + NS_MIN(aLen, 2048));
-  // XXXldb Should be const_char_iterator when FindInReadable supports it.
-  nsACString::const_iterator begin, end;
-
-  str.BeginReading(begin);
-  str.EndReading(end);
-  nsACString::const_iterator currPos(begin);
-  nsACString::const_iterator tokEnd;
-  nsACString::const_iterator tagEnd(begin);
-
-  while (currPos != end) {
-    if (!FindCharInReadable('<', currPos, end))
-      break; // no tag found in this buffer
-
-    if (GetNextChar(currPos, end) == '!') {
-      if (GetNextChar(currPos, end) != '-' ||
-          GetNextChar(currPos, end) != '-') {
-        // If we only see a <! not followed by --, just skip to the next >.
-        if (!FindCharInReadable('>', currPos, end)) {
-          return false; // No more tags to follow.
-        }
-
-        // Continue searching for a meta tag following this "comment".
-        ++currPos;
-        continue;
-      }
-
-      // Found MDO ( <!-- ). Now search for MDC ( --[*s]> )
-      bool foundMDC = false;
-      bool foundMatch = false;
-      while (!foundMDC) {
-        if (GetNextChar(currPos, end) == '-' &&
-            GetNextChar(currPos, end) == '-') {
-          foundMatch = !foundMatch; // toggle until we've matching "--"
-        } else if (currPos == end) {
-          return false; // Couldn't find --[*s]> in this buffer
-        } else if (foundMatch && *currPos == '>') {
-          foundMDC = true; // found comment end delimiter.
-          ++currPos;
-        }
-      }
-      continue; // continue searching for META tag.
-    }
-
-    // Find the end of the tag, break if incomplete
-    tagEnd = currPos;
-    if (!FindCharInReadable('>', tagEnd, end))
-      break;
-
-    // If this is not a META tag, continue to next loop
-    if ( (*currPos != 'm' && *currPos != 'M') ||
-         (*(++currPos) != 'e' && *currPos != 'E') ||
-         (*(++currPos) != 't' && *currPos != 'T') ||
-         (*(++currPos) != 'a' && *currPos != 'A') ||
-         !nsCRT::IsAsciiSpace(*(++currPos))) {
-      currPos = tagEnd;
-      continue;
-    }
-
-    // If could not find "charset" in this tag, skip this tag and try next
-    tokEnd = tagEnd;
-    if (!CaseInsensitiveFindInReadable(NS_LITERAL_CSTRING("CHARSET"),
-                                       currPos, tokEnd)) {
-      currPos = tagEnd;
-      continue;
-    }
-    currPos = tokEnd;
-
-    // skip spaces before '='
-    while (*currPos == kSpace || *currPos == kNewLine ||
-           *currPos == kCR || *currPos == kTab) {
-      ++currPos;
-    }
-    // skip '='
-    if (*currPos != '=') {
-      currPos = tagEnd;
-      continue;
-    }
-    ++currPos;
-    // skip spaces after '='
-    while (*currPos == kSpace || *currPos == kNewLine ||
-           *currPos == kCR || *currPos == kTab) {
-      ++currPos;
-    }
-
-    // skip open quote
-    if (*currPos == '\'' || *currPos == '\"')
-      ++currPos;
-
-    // find the end of charset string
-    tokEnd = currPos;
-    while (*tokEnd != '\'' && *tokEnd != '\"' && tokEnd != tagEnd)
-      ++tokEnd;
-
-    // return true if we successfully got something for charset
-    if (currPos != tokEnd) {
-      aCharset.Assign(currPos.get(), tokEnd.get() - currPos.get());
-      return true;
-    }
-
-    // Nothing specified as charset, continue next loop
-    currPos = tagEnd;
-  }
-
-  return false;
-}
-
 static NS_METHOD
 NoOpParserWriteFunc(nsIInputStream* in,
                 void* closure,
                 const char* fromRawSegment,
                 uint32_t toOffset,
                 uint32_t count,
                 uint32_t *writeCount)
 {
@@ -1998,65 +1809,56 @@ ParserWriteFunc(nsIInputStream* in,
                 void* closure,
                 const char* fromRawSegment,
                 uint32_t toOffset,
                 uint32_t count,
                 uint32_t *writeCount)
 {
   nsresult result;
   ParserWriteStruct* pws = static_cast<ParserWriteStruct*>(closure);
-  const char* buf = fromRawSegment;
+  const unsigned char* buf =
+    reinterpret_cast<const unsigned char*> (fromRawSegment);
   uint32_t theNumRead = count;
 
   if (!pws) {
     return NS_ERROR_FAILURE;
   }
 
   if (pws->mNeedCharsetCheck) {
-    int32_t guessSource;
-    nsAutoCString guess;
+    pws->mNeedCharsetCheck = false;
+    int32_t source;
     nsAutoCString preferred;
+    nsAutoCString maybePrefer;
+    pws->mParser->GetDocumentCharset(preferred, source);
 
-    pws->mNeedCharsetCheck = false;
-    if (pws->mParser->DetectMetaTag(buf, theNumRead, guess, guessSource) ||
-        ((count >= 4) &&
-         DetectByteOrderMark((const unsigned char*)buf,
-                             theNumRead, guess, guessSource))) {
-      result = nsCharsetAlias::GetPreferred(guess, preferred);
-      // Only continue if it's a recognized charset and not
-      // one of a designated set that we ignore.
-      if (NS_SUCCEEDED(result) &&
-          ((kCharsetFromByteOrderMark == guessSource) ||
-           (!preferred.EqualsLiteral("UTF-16") &&
-            !preferred.EqualsLiteral("UTF-16BE") &&
-            !preferred.EqualsLiteral("UTF-16LE")))) {
-        guess = preferred;
-        pws->mParser->SetDocumentCharset(guess, guessSource);
-        pws->mParser->SetSinkCharset(preferred);
-        nsCOMPtr<nsICachingChannel> channel(do_QueryInterface(pws->mRequest));
-        if (channel) {
-          nsCOMPtr<nsISupports> cacheToken;
-          channel->GetCacheToken(getter_AddRefs(cacheToken));
-          if (cacheToken) {
-            nsCOMPtr<nsICacheEntryDescriptor> cacheDescriptor(do_QueryInterface(cacheToken));
-            if (cacheDescriptor) {
-#ifdef DEBUG
-              nsresult rv =
-#endif
-                cacheDescriptor->SetMetaDataElement("charset",
-                                                    guess.get());
-              NS_ASSERTION(NS_SUCCEEDED(rv),"cannot SetMetaDataElement");
-            }
-          }
+    // This code was bogus when I found it. It expects the BOM or the XML
+    // declaration to be entirely in the first network buffer. -- hsivonen
+    if (nsContentUtils::CheckForBOM(buf, count, maybePrefer)) {
+      // The decoder will swallow the BOM. The UTF-16 decoder will re-sniff for
+      // endianness. The value of preferred is now either "UTF-8" or "UTF-16".
+      preferred.Assign(maybePrefer);
+      source = kCharsetFromByteOrderMark;
+    } else if (source < kCharsetFromChannel) {
+      nsAutoCString declCharset;
+
+      if (ExtractCharsetFromXmlDeclaration(buf, count, declCharset)) {
+        nsresult rv = nsCharsetAlias::GetPreferred(declCharset, maybePrefer);
+        if (NS_SUCCEEDED(rv)) {
+          preferred.Assign(maybePrefer);
+          source = kCharsetFromMetaTag;
         }
       }
     }
+
+    pws->mParser->SetDocumentCharset(preferred, source);
+    pws->mParser->SetSinkCharset(preferred);
+
   }
 
-  result = pws->mScanner->Append(buf, theNumRead, pws->mRequest);
+  result = pws->mScanner->Append(fromRawSegment, theNumRead, pws->mRequest);
   if (NS_SUCCEEDED(result)) {
     *writeCount = count;
   }
 
   return result;
 }
 
 nsresult
@@ -2098,18 +1900,17 @@ nsParser::OnDataAvailable(nsIRequest *re
         nsScannerIterator iter;
         theContext->mScanner->EndReading(iter);
         theContext->mScanner->SetPosition(iter, true);
       }
     }
 
     uint32_t totalRead;
     ParserWriteStruct pws;
-    pws.mNeedCharsetCheck =
-      (0 == sourceOffset) && (mCharsetSource < kCharsetFromMetaTag);
+    pws.mNeedCharsetCheck = true;
     pws.mParser = this;
     pws.mScanner = theContext->mScanner;
     pws.mRequest = request;
 
     rv = pIStream->ReadSegments(ParserWriteFunc, &pws, aLength, &totalRead);
     if (NS_FAILED(rv)) {
       return rv;
     }
--- a/parser/htmlparser/src/nsParser.h
+++ b/parser/htmlparser/src/nsParser.h
@@ -241,25 +241,16 @@ class nsParser : public nsIParser,
      */
     NS_IMETHOD GetDTD(nsIDTD** aDTD);
   
     /**
      * Get the nsIStreamListener for this parser
      */
     virtual nsIStreamListener* GetStreamListener();
 
-    /** 
-     * Detects the existence of a META tag with charset information in 
-     * the given buffer.
-     */
-    bool DetectMetaTag(const char* aBytes, 
-                         int32_t aLen, 
-                         nsCString& oCharset, 
-                         int32_t& oCharsetSource);
-
     void SetSinkCharset(nsACString& aCharset);
 
     /**
      *  Removes continue parsing events
      *  @update  kmcclusk 5/18/98
      */
 
     NS_IMETHODIMP CancelParsingEvents();
--- a/parser/htmlparser/src/nsScanner.cpp
+++ b/parser/htmlparser/src/nsScanner.cpp
@@ -52,18 +52,17 @@ const int   kBufsize=64;
  *  Use this constructor if you want i/o to be based on 
  *  a single string you hand in during construction.
  *  This short cut was added for Javascript.
  *
  *  @update  gess 5/12/98
  *  @param   aMode represents the parser mode (nav, other)
  *  @return  
  */
-nsScanner::nsScanner(const nsAString& anHTMLString, const nsACString& aCharset,
-                     int32_t aSource)
+nsScanner::nsScanner(const nsAString& anHTMLString)
 {
   MOZ_COUNT_CTOR(nsScanner);
 
   mSlidingBuffer = nullptr;
   mCountRemaining = 0;
   mFirstNonWhitespacePosition = -1;
   if (AppendToBuffer(anHTMLString)) {
     mSlidingBuffer->BeginReading(mCurrentPosition);
@@ -79,23 +78,18 @@ nsScanner::nsScanner(const nsAString& an
   mHasInvalidCharacter = false;
   mReplacementCharacter = PRUnichar(0x0);
 }
 
 /**
  *  Use this constructor if you want i/o to be based on strings 
  *  the scanner receives. If you pass a null filename, you
  *  can still provide data to the scanner via append.
- *
- *  @update  gess 5/12/98
- *  @param   aFilename --
- *  @return  
  */
-nsScanner::nsScanner(nsString& aFilename,bool aCreateStream,
-                     const nsACString& aCharset, int32_t aSource)
+nsScanner::nsScanner(nsString& aFilename, bool aCreateStream)
   : mFilename(aFilename)
 {
   MOZ_COUNT_CTOR(nsScanner);
   NS_ASSERTION(!aCreateStream, "This is always true.");
 
   mSlidingBuffer = nullptr;
 
   // XXX This is a big hack.  We need to initialize the iterators to something.
@@ -110,48 +104,43 @@ nsScanner::nsScanner(nsString& aFilename
   mIncremental = true;
   mFirstNonWhitespacePosition = -1;
   mCountRemaining = 0;
 
   mUnicodeDecoder = 0;
   mCharsetSource = kCharsetUninitialized;
   mHasInvalidCharacter = false;
   mReplacementCharacter = PRUnichar(0x0);
-  SetDocumentCharset(aCharset, aSource);
+  // XML defaults to UTF-8 and about:blank is UTF-8, too.
+  SetDocumentCharset(NS_LITERAL_CSTRING("UTF-8"), kCharsetFromDocTypeDefault);
 }
 
 nsresult nsScanner::SetDocumentCharset(const nsACString& aCharset , int32_t aSource)
 {
   if (aSource < mCharsetSource) // priority is lower the the current one , just
     return NS_OK;
 
   nsresult res = NS_OK;
   if (!mCharset.IsEmpty())
   {
     bool same;
     res = nsCharsetAlias::Equals(aCharset, mCharset, &same);
     if(NS_SUCCEEDED(res) && same)
     {
+      mCharsetSource = aSource;
       return NS_OK; // no difference, don't change it
     }
   }
 
   // different, need to change it
   nsCString charsetName;
   res = nsCharsetAlias::GetPreferred(aCharset, charsetName);
+  MOZ_ASSERT(NS_SUCCEEDED(res), "Should never call with a bogus aCharset.");
 
-  if(NS_FAILED(res) && (mCharsetSource == kCharsetUninitialized))
-  {
-     // failed - unknown alias , fallback to ISO-8859-1
-    mCharset.AssignLiteral("ISO-8859-1");
-  }
-  else
-  {
-    mCharset.Assign(charsetName);
-  }
+  mCharset.Assign(charsetName);
 
   mCharsetSource = aSource;
 
   NS_ASSERTION(nsParser::GetCharsetConverterManager(),
                "Must have the charset converter manager!");
 
   res = nsParser::GetCharsetConverterManager()->
     GetUnicodeDecoderRaw(mCharset.get(), getter_AddRefs(mUnicodeDecoder));
--- a/parser/htmlparser/src/nsScanner.h
+++ b/parser/htmlparser/src/nsScanner.h
@@ -37,39 +37,25 @@ private:
   nsReadEndCondition(const nsReadEndCondition& aOther); // No copying
   void operator=(const nsReadEndCondition& aOther); // No assigning
 };
 
 class nsScanner {
   public:
 
       /**
-       *  Use this constructor if you want i/o to be based on 
-       *  a single string you hand in during construction.
-       *  This short cut was added for Javascript.
-       *
-       *  @update  ftang 3/02/99
-       *  @param   aCharset charset
-       *  @param   aCharsetSource - where the charset info came from 
-       *  @param   aMode represents the parser mode (nav, other)
-       *  @return  
+       *  Use this constructor for the XML fragment parsing case
        */
-      nsScanner(const nsAString& anHTMLString, const nsACString& aCharset, int32_t aSource);
+      nsScanner(const nsAString& anHTMLString);
 
       /**
        *  Use this constructor if you want i/o to be based on 
        *  a file (therefore a stream) or just data you provide via Append().
-       *
-       *  @update  ftang 3/02/99
-       *  @param   aCharset charset
-       *  @param   aCharsetSource - where the charset info came from 
-       *  @param   aMode represents the parser mode (nav, other)
-       *  @return  
        */
-      nsScanner(nsString& aFilename,bool aCreateStream, const nsACString& aCharset, int32_t aSource);
+      nsScanner(nsString& aFilename, bool aCreateStream);
 
       ~nsScanner();
 
       /**
        *  retrieve next char from internal input stream
        *  
        *  @update  gess 3/25/98
        *  @param   ch is the char to accept new value
--- a/parser/htmlparser/tests/mochitest/Makefile.in
+++ b/parser/htmlparser/tests/mochitest/Makefile.in
@@ -70,16 +70,25 @@ MOCHITEST_FILES =	parser_datreader.js \
 		file_bug672453_http_unsupported.html \
 		file_bug672453_http_unsupported.html^headers^ \
 		file_bug672453_bomless_utf16.html \
 		file_bug672453_meta_utf16.html \
 		file_bug672453_meta_non_superset.html \
 		test_viewsource.html \
 		test_bug715112.html \
 		test_bug715739.html \
+		test_bug716579.html \
+		file_bug716579-8.html \
+		file_bug716579-8.html^headers^ \
+		file_bug716579-16.html \
+		file_bug716579-16.html^headers^ \
+		file_bug716579-8.xhtml \
+		file_bug716579-8.xhtml^headers^ \
+		file_bug716579-16.xhtml \
+		file_bug716579-16.xhtml^headers^ \
 		test_bug717180.html \
 		file_bug717180.html \
 		$(NULL)
 
 # Test disabled on mobile. See bug 737020.
 ifneq ($(OS_TARGET),Android)
 ifndef MOZ_PLATFORM_MAEMO
 		MOCHITEST_FILES += test_bug709083.html
new file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..1cd07ca9a500831fbdcee3000209bf4354d5b49f
GIT binary patch
literal 82
zc%1wH&xWCxA(^3wA(Nqip@hMXfeT0{0{N*7c|g7%Lk5t{WyoPLWH4h;V6X*ZC5AN$
ON(|OOl{O6e7`g$|{tpEJ
new file mode 100644
--- /dev/null
+++ b/parser/htmlparser/tests/mochitest/file_bug716579-16.html^headers^
@@ -0,0 +1,1 @@
+Content-Type: text/html; charset=windows-874
new file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..cc828a7ce71ffe6a07fada4b66caa79442648c17
GIT binary patch
literal 214
zc${U9VG4pk6okLuo<jBjEfNTpc9+Uf!BTXgx>IMY##adv!+XmNZ)W!O%&7^SX^FYn
zdts!&nI7ob;DxJUrCDot3{+ARGJ}`*m*dzGk#o>h$^FYlj`B3h3#Rn!#ZPJr%KJ1Y
Yr20jnaXezj)|~#&J88>wKetc(4Pc@q8UO$Q
new file mode 100644
--- /dev/null
+++ b/parser/htmlparser/tests/mochitest/file_bug716579-16.xhtml^headers^
@@ -0,0 +1,1 @@
+Content-Type: application/xhtml+xml; charset=windows-874
new file mode 100644
--- /dev/null
+++ b/parser/htmlparser/tests/mochitest/file_bug716579-8.html
@@ -0,0 +1,3 @@
+<script>
+parent.html8 = "€";
+</script>
new file mode 100644
--- /dev/null
+++ b/parser/htmlparser/tests/mochitest/file_bug716579-8.html^headers^
@@ -0,0 +1,1 @@
+Content-Type: text/html; charset=windows-874
new file mode 100644
--- /dev/null
+++ b/parser/htmlparser/tests/mochitest/file_bug716579-8.xhtml
@@ -0,0 +1,7 @@
+<html xmlns="http://www.w3.org/1999/xhtml">
+<body>
+<script>
+parent.xml8 = "€";
+</script>
+</body>
+</html>
new file mode 100644
--- /dev/null
+++ b/parser/htmlparser/tests/mochitest/file_bug716579-8.xhtml^headers^
@@ -0,0 +1,1 @@
+Content-Type: application/xhtml+xml; charset=windows-874
new file mode 100644
--- /dev/null
+++ b/parser/htmlparser/tests/mochitest/test_bug716579.html
@@ -0,0 +1,44 @@
+<!DOCTYPE HTML>
+<html>
+<!--
+https://bugzilla.mozilla.org/show_bug.cgi?id=716579
+-->
+<head>
+  <meta charset="windows-1251">
+  <title>Test for Bug 716579</title>
+  <script type="application/javascript" src="/tests/SimpleTest/SimpleTest.js"></script>
+  <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/>
+</head>
+<body>
+<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=716579">Mozilla Bug 716579</a>
+<p id="display"></p>
+<pre id="test">
+<script type="application/javascript">
+
+/** Test for Bug 716579: a BOM must override the HTTP-level charset in the HTML and XML parsers. **/
+
+var html8 = "FAIL";
+var html16 = "FAIL";
+var xml8 = "FAIL";
+var xml16 = "FAIL"; 
+
+SimpleTest.waitForExplicitFinish();
+
+window.onload = function() {
+  is(html8, "\u20AC", "HTML UTF-8 failed.");
+  is(html16, "\u20AC", "HTML UTF-16 failed.");
+  is(xml8, "\u20AC", "XML UTF-8 failed.");
+  is(xml16, "\u20AC", "XML UTF-16 failed.");
+  SimpleTest.finish();
+};
+
+</script>
+</pre>
+<div id="content" style="display: none">
+<iframe src="file_bug716579-8.html"></iframe>  
+<iframe src="file_bug716579-16.html"></iframe>  
+<iframe src="file_bug716579-8.xhtml"></iframe>  
+<iframe src="file_bug716579-16.xhtml"></iframe>  
+</div>
+</body>
+</html>
--- a/parser/nsCharsetSource.h
+++ b/parser/nsCharsetSource.h
@@ -12,17 +12,17 @@
 #define kCharsetFromDocTypeDefault      3 // This and up confident for XHR
 #define kCharsetFromCache               4
 #define kCharsetFromParentFrame         5
 #define kCharsetFromAutoDetection       6
 #define kCharsetFromHintPrevDoc         7
 #define kCharsetFromMetaPrescan         8 // this one and smaller: HTML5 Tentative
 #define kCharsetFromMetaTag             9 // this one and greater: HTML5 Confident
 #define kCharsetFromIrreversibleAutoDetection 10
-#define kCharsetFromByteOrderMark      11
-#define kCharsetFromChannel            12
-#define kCharsetFromOtherComponent     13
+#define kCharsetFromChannel            11
+#define kCharsetFromOtherComponent     12
+#define kCharsetFromByteOrderMark      13 // numbered above kCharsetFromChannel so that a BOM overrides the HTTP-level charset (bug 716579)
 // Levels below here will be forced onto childframes too
 #define kCharsetFromParentForced       14
 #define kCharsetFromUserForced         15
 #define kCharsetFromPreviousLoading    16
 
 #endif /* nsCharsetSource_h_ */