Bug 716579 - Let a BOM override HTTP-level charset in the HTML and XML parsers. r=smaug.
author Henri Sivonen <hsivonen@iki.fi>
Tue, 06 Nov 2012 13:57:51 +0200
changeset 121053 12288a8a5037d6389ac228d2972b1219102dfd7d
parent 121052 b0e7f060ac7a0593dad1c524e057bf3a9e69864d
child 121054 7c1dc22a0e39b1d95f33f8a0fa3773b454987c89
push id 273
push user lsblakk@mozilla.com
push date Thu, 14 Feb 2013 23:19:38 +0000
treeherder mozilla-release@c5e807a3f8b8 [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers smaug
bugs 716579
milestone 19.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 716579 - Let a BOM override HTTP-level charset in the HTML and XML parsers. r=smaug.
parser/html/nsHtml5StreamParser.cpp
parser/htmlparser/src/nsParser.cpp
parser/htmlparser/src/nsParser.h
parser/htmlparser/src/nsScanner.cpp
parser/htmlparser/src/nsScanner.h
parser/htmlparser/tests/mochitest/Makefile.in
parser/htmlparser/tests/mochitest/file_bug716579-16.html
parser/htmlparser/tests/mochitest/file_bug716579-16.html^headers^
parser/htmlparser/tests/mochitest/file_bug716579-16.xhtml
parser/htmlparser/tests/mochitest/file_bug716579-16.xhtml^headers^
parser/htmlparser/tests/mochitest/file_bug716579-8.html
parser/htmlparser/tests/mochitest/file_bug716579-8.html^headers^
parser/htmlparser/tests/mochitest/file_bug716579-8.xhtml
parser/htmlparser/tests/mochitest/file_bug716579-8.xhtml^headers^
parser/htmlparser/tests/mochitest/test_bug716579.html
parser/nsCharsetSource.h
--- a/parser/html/nsHtml5StreamParser.cpp
+++ b/parser/html/nsHtml5StreamParser.cpp
@@ -19,16 +19,17 @@
 #include "nsHtml5RefPtr.h"
 #include "nsIScriptError.h"
 #include "mozilla/Preferences.h"
 #include "nsHtml5Highlighter.h"
 #include "expat_config.h"
 #include "expat.h"
 #include "nsINestedURI.h"
 #include "nsCharsetSource.h"
+#include "nsIWyciwygChannel.h"
 
 using namespace mozilla;
 
 
 int32_t nsHtml5StreamParser::sTimerInitialDelay = 120;
 int32_t nsHtml5StreamParser::sTimerSubsequentDelay = 120;
 
 // static
@@ -490,18 +491,18 @@ HandleProcessingInstruction(void* aUserD
 
 nsresult
 nsHtml5StreamParser::FinalizeSniffing(const uint8_t* aFromSegment, // can be null
                                       uint32_t aCount,
                                       uint32_t* aWriteCount,
                                       uint32_t aCountToSniffingLimit)
 {
   NS_ASSERTION(IsParserThread(), "Wrong thread!");
-  NS_ASSERTION(mCharsetSource < kCharsetFromMetaTag,
-      "Should not finalize sniffing when already confident.");
+  NS_ASSERTION(mCharsetSource < kCharsetFromParentForced,
+      "Should not finalize sniffing when using forced charset.");
   if (mMode == VIEW_SOURCE_XML) {
     static const XML_Memory_Handling_Suite memsuite =
       {
         (void *(*)(size_t))moz_xmalloc,
         (void *(*)(void *, size_t))moz_xrealloc,
         moz_free
       };
 
@@ -629,16 +630,21 @@ nsHtml5StreamParser::FinalizeSniffing(co
 nsresult
 nsHtml5StreamParser::SniffStreamBytes(const uint8_t* aFromSegment,
                                       uint32_t aCount,
                                       uint32_t* aWriteCount)
 {
   NS_ASSERTION(IsParserThread(), "Wrong thread!");
   nsresult rv = NS_OK;
   uint32_t writeCount;
+
+  // mCharset and mCharsetSource potentially have come from channel or higher
+  // by now. If we find a BOM, SetupDecodingFromBom() will overwrite them.
+  // If we don't find a BOM, the previously set values of mCharset and
+  // mCharsetSource are not modified by the BOM sniffing here.
   for (uint32_t i = 0; i < aCount && mBomState != BOM_SNIFFING_OVER; i++) {
     switch (mBomState) {
       case BOM_SNIFFING_NOT_STARTED:
         NS_ASSERTION(i == 0, "Bad BOM sniffing state.");
         switch (*aFromSegment) {
           case 0xEF:
             mBomState = SEEN_UTF_8_FIRST_BYTE;
             break;
@@ -696,18 +702,46 @@ nsHtml5StreamParser::SniffStreamBytes(co
         }
         mBomState = BOM_SNIFFING_OVER;
         break;
       default:
         mBomState = BOM_SNIFFING_OVER;
         break;
     }
   }
-  // if we get here, there either was no BOM or the BOM sniffing isn't complete yet
+  // if we get here, there either was no BOM or the BOM sniffing isn't complete
+  // yet
   
+  if (mBomState == BOM_SNIFFING_OVER &&
+    mCharsetSource >= kCharsetFromChannel) {
+    // There was no BOM and the charset came from channel or higher. mCharset
+    // still contains the charset from the channel or higher as set by an
+    // earlier call to SetDocumentCharset(), since we didn't find a BOM that
+    // would have overwritten mCharset.
+    nsCOMPtr<nsICharsetConverterManager> convManager =
+      do_GetService(NS_CHARSETCONVERTERMANAGER_CONTRACTID);
+    convManager->GetUnicodeDecoder(mCharset.get(),
+                                   getter_AddRefs(mUnicodeDecoder));
+    if (mUnicodeDecoder) {
+      mUnicodeDecoder->SetInputErrorBehavior(
+          nsIUnicodeDecoder::kOnError_Recover);
+      mFeedChardet = false;
+      mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
+      mMetaScanner = nullptr;
+      return WriteSniffingBufferAndCurrentSegment(aFromSegment,
+                                                  aCount,
+                                                  aWriteCount);
+    } else {
+      // nsHTMLDocument is supposed to make sure this does not happen. Let's
+      // deal with this anyway, since who knows how kCharsetFromOtherComponent
+      // is used.
+      mCharsetSource = kCharsetFromWeakDocTypeDefault;
+    }
+  }
+
   if (!mMetaScanner && (mMode == NORMAL ||
                         mMode == VIEW_SOURCE_HTML ||
                         mMode == LOAD_AS_DATA)) {
     mMetaScanner = new nsHtml5MetaScanner();
   }
   
   if (mSniffingLength + aCount >= NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE) {
     // this is the last buffer
@@ -958,17 +992,23 @@ nsHtml5StreamParser::OnStartRequest(nsIR
     // Remember this in case chardet overwrites mCharsetSource
     mInitialEncodingWasFromParentFrame = true;
   }
 
   if (mCharsetSource >= kCharsetFromAutoDetection) {
     mFeedChardet = false;
   }
   
-  if (mCharsetSource <= kCharsetFromMetaPrescan) {
+  nsCOMPtr<nsIWyciwygChannel> wyciwygChannel(do_QueryInterface(mRequest));
+  if (wyciwygChannel) {
+    mReparseForbidden = true;
+    mFeedChardet = false;
+    // If we are reloading a document.open()ed doc, fall through to converter
+    // instantiation here and avoid BOM sniffing.
+  } else if (mCharsetSource < kCharsetFromParentForced) {
     // we aren't ready to commit to an encoding yet
     // leave converter uninstantiated for now
     return NS_OK;
   }
   
   nsCOMPtr<nsICharsetConverterManager> convManager = do_GetService(NS_CHARSETCONVERTERMANAGER_CONTRACTID, &rv);
   NS_ENSURE_SUCCESS(rv, rv);
   rv = convManager->GetUnicodeDecoder(mCharset.get(), getter_AddRefs(mUnicodeDecoder));
--- a/parser/htmlparser/src/nsParser.cpp
+++ b/parser/htmlparser/src/nsParser.cpp
@@ -36,16 +36,17 @@
 #include "nsDataHashtable.h"
 #include "nsIThreadPool.h"
 #include "nsXPCOMCIDInternal.h"
 #include "nsMimeTypes.h"
 #include "mozilla/CondVar.h"
 #include "mozilla/Mutex.h"
 #include "nsParserConstants.h"
 #include "nsCharsetSource.h"
+#include "nsContentUtils.h"
 
 using namespace mozilla;
 
 #define NS_PARSER_FLAG_PARSER_ENABLED         0x00000002
 #define NS_PARSER_FLAG_OBSERVERS_ENABLED      0x00000004
 #define NS_PARSER_FLAG_PENDING_CONTINUE_EVENT 0x00000008
 #define NS_PARSER_FLAG_FLUSH_TOKENS           0x00000020
 #define NS_PARSER_FLAG_CAN_TOKENIZE           0x00000040
@@ -1245,18 +1246,17 @@ nsParser::Parse(nsIURI* aURL,
   if (aURL) {
     nsAutoCString spec;
     nsresult rv = aURL->GetSpec(spec);
     if (rv != NS_OK) {
       return rv;
     }
     NS_ConvertUTF8toUTF16 theName(spec);
 
-    nsScanner* theScanner = new nsScanner(theName, false, mCharset,
-                                          mCharsetSource);
+    nsScanner* theScanner = new nsScanner(theName, false);
     CParserContext* pc = new CParserContext(mParserContext, theScanner, aKey,
                                             mCommand, aListener);
     if (pc && theScanner) {
       pc->mMultipart = true;
       pc->mContextType = CParserContext::eCTURL;
       pc->mDTDMode = aMode;
       PushContext(*pc);
 
@@ -1306,17 +1306,17 @@ nsParser::Parse(const nsAString& aSource
     CParserContext* pc = mParserContext;
     while (pc && pc->mKey != aKey) {
       pc = pc->mPrevContext;
     }
 
     if (!pc) {
       // Only make a new context if we don't have one, OR if we do, but has a
       // different context key.
-      nsScanner* theScanner = new nsScanner(mUnusedInput, mCharset, mCharsetSource);
+      nsScanner* theScanner = new nsScanner(mUnusedInput);
       NS_ENSURE_TRUE(theScanner, NS_ERROR_OUT_OF_MEMORY);
 
       eAutoDetectResult theStatus = eUnknownDetect;
 
       if (mParserContext &&
           mParserContext->mMimeType.EqualsLiteral("application/xml")) {
         // Ref. Bug 90379
         NS_ASSERTION(mDTD, "How come the DTD is null?");
@@ -1669,310 +1669,121 @@ nsParser::OnStartRequest(nsIRequest *req
   }
 
   rv = NS_OK;
 
   return rv;
 }
 
 
-#define UTF16_BOM "UTF-16"
-#define UTF16_BE "UTF-16BE"
-#define UTF16_LE "UTF-16LE"
-#define UTF8 "UTF-8"
-
 static inline bool IsSecondMarker(unsigned char aChar)
 {
   switch (aChar) {
     case '!':
     case '?':
     case 'h':
     case 'H':
       return true;
     default:
       return false;
   }
 }
 
 static bool
-DetectByteOrderMark(const unsigned char* aBytes, int32_t aLen,
-                    nsCString& oCharset, int32_t& oCharsetSource)
+ExtractCharsetFromXmlDeclaration(const unsigned char* aBytes, int32_t aLen,
+                                 nsCString& oCharset)
 {
- oCharsetSource= kCharsetFromAutoDetection;
- oCharset.Truncate();
- // See http://www.w3.org/TR/2000/REC-xml-20001006#sec-guessing
- // for details
- // Also, MS Win2K notepad now generate 3 bytes BOM in UTF8 as UTF8 signature
- // We need to check that
- // UCS2 BOM FEFF = UTF8 EF BB BF
- switch(aBytes[0])
-	 {
-   case 0x00:
-     if((0x3C==aBytes[1]) && (0x00==aBytes[2])) {
-        // 00 3C 00
-        if(IsSecondMarker(aBytes[3])) {
-           // 00 3C 00 SM UTF-16,  big-endian, no Byte Order Mark 
-           oCharset.Assign(UTF16_BE); 
-           oCharsetSource = kCharsetFromByteOrderMark;
-        } 
-     }
-   break;
-   case 0x3C:
-     if(0x00==aBytes[1] && (0x00==aBytes[3])) {
-        // 3C 00 XX 00
-        if(IsSecondMarker(aBytes[2])) {
-           // 3C 00 SM 00 UTF-16,  little-endian, no Byte Order Mark 
-           oCharset.Assign(UTF16_LE); 
-           oCharsetSource = kCharsetFromByteOrderMark;
-        } 
-     // For html, meta tag detector is invoked before this so that we have 
-     // to deal only with XML here.
-     } else if(                     (0x3F==aBytes[1]) &&
-               (0x78==aBytes[2]) && (0x6D==aBytes[3]) &&
-               (0 == PL_strncmp("<?xml", (char*)aBytes, 5 ))) {
-       // 3C 3F 78 6D
-       // ASCII characters are in their normal positions, so we can safely
-       // deal with the XML declaration in the old C way
-       // The shortest string so far (strlen==5):
-       // <?xml
-       int32_t i;
-       bool versionFound = false, encodingFound = false;
-       for (i=6; i < aLen && !encodingFound; ++i) {
-         // end of XML declaration?
-         if ((((char*)aBytes)[i] == '?') && 
-           ((i+1) < aLen) &&
-           (((char*)aBytes)[i+1] == '>')) {
-           break;
-         }
-         // Version is required.
-         if (!versionFound) {
-           // Want to avoid string comparisons, hence looking for 'n'
-           // and only if found check the string leading to it. Not
-           // foolproof, but fast.
-           // The shortest string allowed before this is  (strlen==13):
-           // <?xml version
-           if ((((char*)aBytes)[i] == 'n') &&
-             (i >= 12) && 
-             (0 == PL_strncmp("versio", (char*)(aBytes+i-6), 6 ))) {
-             // Fast forward through version
-             char q = 0;
-             for (++i; i < aLen; ++i) {
-               char qi = ((char*)aBytes)[i];
-               if (qi == '\'' || qi == '"') {
-                 if (q && q == qi) {
-                   //  ending quote
-                   versionFound = true;
-                   break;
-                 } else {
-                   // Starting quote
-                   q = qi;
-                 }
-               }
-             }
-           }
-         } else {
-           // encoding must follow version
-           // Want to avoid string comparisons, hence looking for 'g'
-           // and only if found check the string leading to it. Not
-           // foolproof, but fast.
-           // The shortest allowed string before this (strlen==26):
-           // <?xml version="1" encoding
-           if ((((char*)aBytes)[i] == 'g') &&
-             (i >= 25) && 
-             (0 == PL_strncmp("encodin", (char*)(aBytes+i-7), 7 ))) {
-             int32_t encStart = 0;
-             char q = 0;
-             for (++i; i < aLen; ++i) {
-               char qi = ((char*)aBytes)[i];
-               if (qi == '\'' || qi == '"') {
-                 if (q && q == qi) {
-                   int32_t count = i - encStart;
-                   // encoding value is invalid if it is UTF-16
-                   if (count > 0 && 
-                     (0 != PL_strcmp("UTF-16", (char*)(aBytes+encStart)))) {
-                     oCharset.Assign((char*)(aBytes+encStart),count);
-                     oCharsetSource = kCharsetFromMetaTag;
-                   }
-                   encodingFound = true;
-                   break;
-                 } else {
-                   encStart = i+1;
-                   q = qi;
-                 }
-               }
-             }
-           }
-         } // if (!versionFound)
-       } // for
-     }
-   break;
-   case 0xEF:  
-     if((0xBB==aBytes[1]) && (0xBF==aBytes[2])) {
-        // EF BB BF
-        // Win2K UTF-8 BOM
-        oCharset.Assign(UTF8); 
-        oCharsetSource= kCharsetFromByteOrderMark;
-     }
-   break;
-   case 0xFE:
-     if(0xFF==aBytes[1]) {
-        // FE FF UTF-16, big-endian 
-        oCharset.Assign(UTF16_BOM); 
-        oCharsetSource= kCharsetFromByteOrderMark;
-     }
-   break;
-   case 0xFF:
-     if(0xFE==aBytes[1]) {
-       // FF FE
-       // UTF-16, little-endian 
-       oCharset.Assign(UTF16_BOM); 
-       oCharsetSource= kCharsetFromByteOrderMark;
-     }
-   break;
-   // case 0x4C: if((0x6F==aBytes[1]) && ((0xA7==aBytes[2] && (0x94==aBytes[3])) {
-   //   We do not care EBCIDIC here....
-   // }
-   // break;
- }  // switch
- return !oCharset.IsEmpty();
+  // This code is rather pointless to have. Might as well reuse expat as
+  // seen in nsHtml5StreamParser. -- hsivonen
+  oCharset.Truncate();
+  if ((aLen >= 5) &&
+      ('<' == aBytes[0]) &&
+      ('?' == aBytes[1]) &&
+      ('x' == aBytes[2]) &&
+      ('m' == aBytes[3]) &&
+      ('l' == aBytes[4])) {
+    int32_t i;
+    bool versionFound = false, encodingFound = false;
+    for (i = 6; i < aLen && !encodingFound; ++i) {
+      // end of XML declaration?
+      if ((((char*) aBytes)[i] == '?') &&
+          ((i + 1) < aLen) &&
+          (((char*) aBytes)[i + 1] == '>')) {
+        break;
+      }
+      // Version is required.
+      if (!versionFound) {
+        // Want to avoid string comparisons, hence looking for 'n'
+        // and only if found check the string leading to it. Not
+        // foolproof, but fast.
+        // The shortest string allowed before this is  (strlen==13):
+        // <?xml version
+        if ((((char*) aBytes)[i] == 'n') &&
+            (i >= 12) &&
+            (0 == PL_strncmp("versio", (char*) (aBytes + i - 6), 6))) {
+          // Fast forward through version
+          char q = 0;
+          for (++i; i < aLen; ++i) {
+            char qi = ((char*) aBytes)[i];
+            if (qi == '\'' || qi == '"') {
+              if (q && q == qi) {
+                //  ending quote
+                versionFound = true;
+                break;
+              } else {
+                // Starting quote
+                q = qi;
+              }
+            }
+          }
+        }
+      } else {
+        // encoding must follow version
+        // Want to avoid string comparisons, hence looking for 'g'
+        // and only if found check the string leading to it. Not
+        // foolproof, but fast.
+        // The shortest allowed string before this (strlen==26):
+        // <?xml version="1" encoding
+        if ((((char*) aBytes)[i] == 'g') && (i >= 25) && (0 == PL_strncmp(
+            "encodin", (char*) (aBytes + i - 7), 7))) {
+          int32_t encStart = 0;
+          char q = 0;
+          for (++i; i < aLen; ++i) {
+            char qi = ((char*) aBytes)[i];
+            if (qi == '\'' || qi == '"') {
+              if (q && q == qi) {
+                int32_t count = i - encStart;
+                // encoding value is invalid if it is UTF-16
+                if (count > 0 && (0 != PL_strcmp("UTF-16",
+                    (char*) (aBytes + encStart)))) {
+                  oCharset.Assign((char*) (aBytes + encStart), count);
+                }
+                encodingFound = true;
+                break;
+              } else {
+                encStart = i + 1;
+                q = qi;
+              }
+            }
+          }
+        }
+      } // if (!versionFound)
+    } // for
+  }
+  return !oCharset.IsEmpty();
 }
 
 inline const char
 GetNextChar(nsACString::const_iterator& aStart,
             nsACString::const_iterator& aEnd)
 {
   NS_ASSERTION(aStart != aEnd, "end of buffer");
   return (++aStart != aEnd) ? *aStart : '\0';
 }
 
-bool
-nsParser::DetectMetaTag(const char* aBytes,
-                        int32_t aLen,
-                        nsCString& aCharset,
-                        int32_t& aCharsetSource)
-{
-  aCharsetSource= kCharsetFromMetaTag;
-  aCharset.SetLength(0);
-
-  // XXX Only look inside HTML documents for now. For XML
-  // documents we should be looking inside the XMLDecl.
-  if (!mParserContext->mMimeType.EqualsLiteral(TEXT_HTML)) {
-    return false;
-  }
-
-  // Fast and loose parsing to determine if we have a complete
-  // META tag in this block, looking upto 2k into it.
-  const nsASingleFragmentCString& str =
-      Substring(aBytes, aBytes + NS_MIN(aLen, 2048));
-  // XXXldb Should be const_char_iterator when FindInReadable supports it.
-  nsACString::const_iterator begin, end;
-
-  str.BeginReading(begin);
-  str.EndReading(end);
-  nsACString::const_iterator currPos(begin);
-  nsACString::const_iterator tokEnd;
-  nsACString::const_iterator tagEnd(begin);
-
-  while (currPos != end) {
-    if (!FindCharInReadable('<', currPos, end))
-      break; // no tag found in this buffer
-
-    if (GetNextChar(currPos, end) == '!') {
-      if (GetNextChar(currPos, end) != '-' ||
-          GetNextChar(currPos, end) != '-') {
-        // If we only see a <! not followed by --, just skip to the next >.
-        if (!FindCharInReadable('>', currPos, end)) {
-          return false; // No more tags to follow.
-        }
-
-        // Continue searching for a meta tag following this "comment".
-        ++currPos;
-        continue;
-      }
-
-      // Found MDO ( <!-- ). Now search for MDC ( --[*s]> )
-      bool foundMDC = false;
-      bool foundMatch = false;
-      while (!foundMDC) {
-        if (GetNextChar(currPos, end) == '-' &&
-            GetNextChar(currPos, end) == '-') {
-          foundMatch = !foundMatch; // toggle until we've matching "--"
-        } else if (currPos == end) {
-          return false; // Couldn't find --[*s]> in this buffer
-        } else if (foundMatch && *currPos == '>') {
-          foundMDC = true; // found comment end delimiter.
-          ++currPos;
-        }
-      }
-      continue; // continue searching for META tag.
-    }
-
-    // Find the end of the tag, break if incomplete
-    tagEnd = currPos;
-    if (!FindCharInReadable('>', tagEnd, end))
-      break;
-
-    // If this is not a META tag, continue to next loop
-    if ( (*currPos != 'm' && *currPos != 'M') ||
-         (*(++currPos) != 'e' && *currPos != 'E') ||
-         (*(++currPos) != 't' && *currPos != 'T') ||
-         (*(++currPos) != 'a' && *currPos != 'A') ||
-         !nsCRT::IsAsciiSpace(*(++currPos))) {
-      currPos = tagEnd;
-      continue;
-    }
-
-    // If could not find "charset" in this tag, skip this tag and try next
-    tokEnd = tagEnd;
-    if (!CaseInsensitiveFindInReadable(NS_LITERAL_CSTRING("CHARSET"),
-                                       currPos, tokEnd)) {
-      currPos = tagEnd;
-      continue;
-    }
-    currPos = tokEnd;
-
-    // skip spaces before '='
-    while (*currPos == kSpace || *currPos == kNewLine ||
-           *currPos == kCR || *currPos == kTab) {
-      ++currPos;
-    }
-    // skip '='
-    if (*currPos != '=') {
-      currPos = tagEnd;
-      continue;
-    }
-    ++currPos;
-    // skip spaces after '='
-    while (*currPos == kSpace || *currPos == kNewLine ||
-           *currPos == kCR || *currPos == kTab) {
-      ++currPos;
-    }
-
-    // skip open quote
-    if (*currPos == '\'' || *currPos == '\"')
-      ++currPos;
-
-    // find the end of charset string
-    tokEnd = currPos;
-    while (*tokEnd != '\'' && *tokEnd != '\"' && tokEnd != tagEnd)
-      ++tokEnd;
-
-    // return true if we successfully got something for charset
-    if (currPos != tokEnd) {
-      aCharset.Assign(currPos.get(), tokEnd.get() - currPos.get());
-      return true;
-    }
-
-    // Nothing specified as charset, continue next loop
-    currPos = tagEnd;
-  }
-
-  return false;
-}
-
 static NS_METHOD
 NoOpParserWriteFunc(nsIInputStream* in,
                 void* closure,
                 const char* fromRawSegment,
                 uint32_t toOffset,
                 uint32_t count,
                 uint32_t *writeCount)
 {
@@ -1998,65 +1809,56 @@ ParserWriteFunc(nsIInputStream* in,
                 void* closure,
                 const char* fromRawSegment,
                 uint32_t toOffset,
                 uint32_t count,
                 uint32_t *writeCount)
 {
   nsresult result;
   ParserWriteStruct* pws = static_cast<ParserWriteStruct*>(closure);
-  const char* buf = fromRawSegment;
+  const unsigned char* buf =
+    reinterpret_cast<const unsigned char*> (fromRawSegment);
   uint32_t theNumRead = count;
 
   if (!pws) {
     return NS_ERROR_FAILURE;
   }
 
   if (pws->mNeedCharsetCheck) {
-    int32_t guessSource;
-    nsAutoCString guess;
+    pws->mNeedCharsetCheck = false;
+    int32_t source;
     nsAutoCString preferred;
+    nsAutoCString maybePrefer;
+    pws->mParser->GetDocumentCharset(preferred, source);
 
-    pws->mNeedCharsetCheck = false;
-    if (pws->mParser->DetectMetaTag(buf, theNumRead, guess, guessSource) ||
-        ((count >= 4) &&
-         DetectByteOrderMark((const unsigned char*)buf,
-                             theNumRead, guess, guessSource))) {
-      result = nsCharsetAlias::GetPreferred(guess, preferred);
-      // Only continue if it's a recognized charset and not
-      // one of a designated set that we ignore.
-      if (NS_SUCCEEDED(result) &&
-          ((kCharsetFromByteOrderMark == guessSource) ||
-           (!preferred.EqualsLiteral("UTF-16") &&
-            !preferred.EqualsLiteral("UTF-16BE") &&
-            !preferred.EqualsLiteral("UTF-16LE")))) {
-        guess = preferred;
-        pws->mParser->SetDocumentCharset(guess, guessSource);
-        pws->mParser->SetSinkCharset(preferred);
-        nsCOMPtr<nsICachingChannel> channel(do_QueryInterface(pws->mRequest));
-        if (channel) {
-          nsCOMPtr<nsISupports> cacheToken;
-          channel->GetCacheToken(getter_AddRefs(cacheToken));
-          if (cacheToken) {
-            nsCOMPtr<nsICacheEntryDescriptor> cacheDescriptor(do_QueryInterface(cacheToken));
-            if (cacheDescriptor) {
-#ifdef DEBUG
-              nsresult rv =
-#endif
-                cacheDescriptor->SetMetaDataElement("charset",
-                                                    guess.get());
-              NS_ASSERTION(NS_SUCCEEDED(rv),"cannot SetMetaDataElement");
-            }
-          }
+    // This code was bogus when I found it. It expects the BOM or the XML
+    // declaration to be entirely in the first network buffer. -- hsivonen
+    if (nsContentUtils::CheckForBOM(buf, count, maybePrefer)) {
+      // The decoder will swallow the BOM. The UTF-16 decoder will re-sniff
+      // for endianness. The value of preferred is now "UTF-8" or "UTF-16".
+      preferred.Assign(maybePrefer);
+      source = kCharsetFromByteOrderMark;
+    } else if (source < kCharsetFromChannel) {
+      nsAutoCString declCharset;
+
+      if (ExtractCharsetFromXmlDeclaration(buf, count, declCharset)) {
+        nsresult rv = nsCharsetAlias::GetPreferred(declCharset, maybePrefer);
+        if (NS_SUCCEEDED(rv)) {
+          preferred.Assign(maybePrefer);
+          source = kCharsetFromMetaTag;
         }
       }
     }
+
+    pws->mParser->SetDocumentCharset(preferred, source);
+    pws->mParser->SetSinkCharset(preferred);
+
   }
 
-  result = pws->mScanner->Append(buf, theNumRead, pws->mRequest);
+  result = pws->mScanner->Append(fromRawSegment, theNumRead, pws->mRequest);
   if (NS_SUCCEEDED(result)) {
     *writeCount = count;
   }
 
   return result;
 }
 
 nsresult
@@ -2098,18 +1900,17 @@ nsParser::OnDataAvailable(nsIRequest *re
         nsScannerIterator iter;
         theContext->mScanner->EndReading(iter);
         theContext->mScanner->SetPosition(iter, true);
       }
     }
 
     uint32_t totalRead;
     ParserWriteStruct pws;
-    pws.mNeedCharsetCheck =
-      (0 == sourceOffset) && (mCharsetSource < kCharsetFromMetaTag);
+    pws.mNeedCharsetCheck = true;
     pws.mParser = this;
     pws.mScanner = theContext->mScanner;
     pws.mRequest = request;
 
     rv = pIStream->ReadSegments(ParserWriteFunc, &pws, aLength, &totalRead);
     if (NS_FAILED(rv)) {
       return rv;
     }
--- a/parser/htmlparser/src/nsParser.h
+++ b/parser/htmlparser/src/nsParser.h
@@ -241,25 +241,16 @@ class nsParser : public nsIParser,
      */
     NS_IMETHOD GetDTD(nsIDTD** aDTD);
   
     /**
      * Get the nsIStreamListener for this parser
      */
     virtual nsIStreamListener* GetStreamListener();
 
-    /** 
-     * Detects the existence of a META tag with charset information in 
-     * the given buffer.
-     */
-    bool DetectMetaTag(const char* aBytes, 
-                         int32_t aLen, 
-                         nsCString& oCharset, 
-                         int32_t& oCharsetSource);
-
     void SetSinkCharset(nsACString& aCharset);
 
     /**
      *  Removes continue parsing events
      *  @update  kmcclusk 5/18/98
      */
 
     NS_IMETHODIMP CancelParsingEvents();
--- a/parser/htmlparser/src/nsScanner.cpp
+++ b/parser/htmlparser/src/nsScanner.cpp
@@ -52,18 +52,17 @@ const int   kBufsize=64;
  *  Use this constructor if you want i/o to be based on 
  *  a single string you hand in during construction.
  *  This short cut was added for Javascript.
  *
  *  @update  gess 5/12/98
  *  @param   aMode represents the parser mode (nav, other)
  *  @return  
  */
-nsScanner::nsScanner(const nsAString& anHTMLString, const nsACString& aCharset,
-                     int32_t aSource)
+nsScanner::nsScanner(const nsAString& anHTMLString)
 {
   MOZ_COUNT_CTOR(nsScanner);
 
   mSlidingBuffer = nullptr;
   mCountRemaining = 0;
   mFirstNonWhitespacePosition = -1;
   if (AppendToBuffer(anHTMLString)) {
     mSlidingBuffer->BeginReading(mCurrentPosition);
@@ -79,23 +78,18 @@ nsScanner::nsScanner(const nsAString& an
   mHasInvalidCharacter = false;
   mReplacementCharacter = PRUnichar(0x0);
 }
 
 /**
  *  Use this constructor if you want i/o to be based on strings 
  *  the scanner receives. If you pass a null filename, you
  *  can still provide data to the scanner via append.
- *
- *  @update  gess 5/12/98
- *  @param   aFilename --
- *  @return  
  */
-nsScanner::nsScanner(nsString& aFilename,bool aCreateStream,
-                     const nsACString& aCharset, int32_t aSource)
+nsScanner::nsScanner(nsString& aFilename, bool aCreateStream)
   : mFilename(aFilename)
 {
   MOZ_COUNT_CTOR(nsScanner);
   NS_ASSERTION(!aCreateStream, "This is always true.");
 
   mSlidingBuffer = nullptr;
 
   // XXX This is a big hack.  We need to initialize the iterators to something.
@@ -110,48 +104,43 @@ nsScanner::nsScanner(nsString& aFilename
   mIncremental = true;
   mFirstNonWhitespacePosition = -1;
   mCountRemaining = 0;
 
   mUnicodeDecoder = 0;
   mCharsetSource = kCharsetUninitialized;
   mHasInvalidCharacter = false;
   mReplacementCharacter = PRUnichar(0x0);
-  SetDocumentCharset(aCharset, aSource);
+  // XML defaults to UTF-8 and about:blank is UTF-8, too.
+  SetDocumentCharset(NS_LITERAL_CSTRING("UTF-8"), kCharsetFromDocTypeDefault);
 }
 
 nsresult nsScanner::SetDocumentCharset(const nsACString& aCharset , int32_t aSource)
 {
   if (aSource < mCharsetSource) // priority is lower the the current one , just
     return NS_OK;
 
   nsresult res = NS_OK;
   if (!mCharset.IsEmpty())
   {
     bool same;
     res = nsCharsetAlias::Equals(aCharset, mCharset, &same);
     if(NS_SUCCEEDED(res) && same)
     {
+      mCharsetSource = aSource;
       return NS_OK; // no difference, don't change it
     }
   }
 
   // different, need to change it
   nsCString charsetName;
   res = nsCharsetAlias::GetPreferred(aCharset, charsetName);
+  MOZ_ASSERT(NS_SUCCEEDED(res), "Should never call with a bogus aCharset.");
 
-  if(NS_FAILED(res) && (mCharsetSource == kCharsetUninitialized))
-  {
-     // failed - unknown alias , fallback to ISO-8859-1
-    mCharset.AssignLiteral("ISO-8859-1");
-  }
-  else
-  {
-    mCharset.Assign(charsetName);
-  }
+  mCharset.Assign(charsetName);
 
   mCharsetSource = aSource;
 
   NS_ASSERTION(nsParser::GetCharsetConverterManager(),
                "Must have the charset converter manager!");
 
   res = nsParser::GetCharsetConverterManager()->
     GetUnicodeDecoderRaw(mCharset.get(), getter_AddRefs(mUnicodeDecoder));
--- a/parser/htmlparser/src/nsScanner.h
+++ b/parser/htmlparser/src/nsScanner.h
@@ -37,39 +37,25 @@ private:
   nsReadEndCondition(const nsReadEndCondition& aOther); // No copying
   void operator=(const nsReadEndCondition& aOther); // No assigning
 };
 
 class nsScanner {
   public:
 
       /**
-       *  Use this constructor if you want i/o to be based on 
-       *  a single string you hand in during construction.
-       *  This short cut was added for Javascript.
-       *
-       *  @update  ftang 3/02/99
-       *  @param   aCharset charset
-       *  @param   aCharsetSource - where the charset info came from 
-       *  @param   aMode represents the parser mode (nav, other)
-       *  @return  
+       *  Use this constructor for the XML fragment parsing case
        */
-      nsScanner(const nsAString& anHTMLString, const nsACString& aCharset, int32_t aSource);
+      nsScanner(const nsAString& anHTMLString);
 
       /**
        *  Use this constructor if you want i/o to be based on 
        *  a file (therefore a stream) or just data you provide via Append().
-       *
-       *  @update  ftang 3/02/99
-       *  @param   aCharset charset
-       *  @param   aCharsetSource - where the charset info came from 
-       *  @param   aMode represents the parser mode (nav, other)
-       *  @return  
        */
-      nsScanner(nsString& aFilename,bool aCreateStream, const nsACString& aCharset, int32_t aSource);
+      nsScanner(nsString& aFilename, bool aCreateStream);
 
       ~nsScanner();
 
       /**
        *  retrieve next char from internal input stream
        *  
        *  @update  gess 3/25/98
        *  @param   ch is the char to accept new value
--- a/parser/htmlparser/tests/mochitest/Makefile.in
+++ b/parser/htmlparser/tests/mochitest/Makefile.in
@@ -70,16 +70,25 @@ MOCHITEST_FILES =	parser_datreader.js \
 		file_bug672453_http_unsupported.html \
 		file_bug672453_http_unsupported.html^headers^ \
 		file_bug672453_bomless_utf16.html \
 		file_bug672453_meta_utf16.html \
 		file_bug672453_meta_non_superset.html \
 		test_viewsource.html \
 		test_bug715112.html \
 		test_bug715739.html \
+		test_bug716579.html \
+		file_bug716579-8.html \
+		file_bug716579-8.html^headers^ \
+		file_bug716579-16.html \
+		file_bug716579-16.html^headers^ \
+		file_bug716579-8.xhtml \
+		file_bug716579-8.xhtml^headers^ \
+		file_bug716579-16.xhtml \
+		file_bug716579-16.xhtml^headers^ \
 		test_bug717180.html \
 		file_bug717180.html \
 		$(NULL)
 
 # Test disabled on mobile. See bug 737020.
 ifneq ($(OS_TARGET),Android)
 ifndef MOZ_PLATFORM_MAEMO
 		MOCHITEST_FILES += test_bug709083.html
new file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..1cd07ca9a500831fbdcee3000209bf4354d5b49f
GIT binary patch
literal 82
zc%1wH&xWCxA(^3wA(Nqip@hMXfeT0{0{N*7c|g7%Lk5t{WyoPLWH4h;V6X*ZC5AN$
ON(|OOl{O6e7`g$|{tpEJ
new file mode 100644
--- /dev/null
+++ b/parser/htmlparser/tests/mochitest/file_bug716579-16.html^headers^
@@ -0,0 +1,1 @@
+Content-Type: text/html; charset=windows-874
new file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..cc828a7ce71ffe6a07fada4b66caa79442648c17
GIT binary patch
literal 214
zc${U9VG4pk6okLuo<jBjEfNTpc9+Uf!BTXgx>IMY##adv!+XmNZ)W!O%&7^SX^FYn
zdts!&nI7ob;DxJUrCDot3{+ARGJ}`*m*dzGk#o>h$^FYlj`B3h3#Rn!#ZPJr%KJ1Y
Yr20jnaXezj)|~#&J88>wKetc(4Pc@q8UO$Q
new file mode 100644
--- /dev/null
+++ b/parser/htmlparser/tests/mochitest/file_bug716579-16.xhtml^headers^
@@ -0,0 +1,1 @@
+Content-Type: application/xhtml+xml; charset=windows-874
new file mode 100644
--- /dev/null
+++ b/parser/htmlparser/tests/mochitest/file_bug716579-8.html
@@ -0,0 +1,3 @@
+<script>
+parent.html8 = "€";
+</script>
new file mode 100644
--- /dev/null
+++ b/parser/htmlparser/tests/mochitest/file_bug716579-8.html^headers^
@@ -0,0 +1,1 @@
+Content-Type: text/html; charset=windows-874
new file mode 100644
--- /dev/null
+++ b/parser/htmlparser/tests/mochitest/file_bug716579-8.xhtml
@@ -0,0 +1,7 @@
+<html xmlns="http://www.w3.org/1999/xhtml">
+<body>
+<script>
+parent.xml8 = "€";
+</script>
+</body>
+</html>
new file mode 100644
--- /dev/null
+++ b/parser/htmlparser/tests/mochitest/file_bug716579-8.xhtml^headers^
@@ -0,0 +1,1 @@
+Content-Type: application/xhtml+xml; charset=windows-874
new file mode 100644
--- /dev/null
+++ b/parser/htmlparser/tests/mochitest/test_bug716579.html
@@ -0,0 +1,44 @@
+<!DOCTYPE HTML>
+<html>
+<!--
+https://bugzilla.mozilla.org/show_bug.cgi?id=716579
+-->
+<head>
+  <meta charset="windows-1251">
+  <title>Test for Bug 716579</title>
+  <script type="application/javascript" src="/tests/SimpleTest/SimpleTest.js"></script>
+  <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/>
+</head>
+<body>
+<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=716579">Mozilla Bug 716579</a>
+<p id="display"></p>
+<pre id="test">
+<script type="application/javascript">
+
+/** Test for Bug 716579: a BOM must override the HTTP-level charset in the HTML and XML parsers. **/
+
+var html8 = "FAIL"; // overwritten via parent.html8 by file_bug716579-8.html
+var html16 = "FAIL"; // presumably overwritten by file_bug716579-16.html (binary in this patch) - confirm
+var xml8 = "FAIL"; // overwritten via parent.xml8 by file_bug716579-8.xhtml
+var xml16 = "FAIL"; // presumably overwritten by file_bug716579-16.xhtml (binary in this patch) - confirm
+
+SimpleTest.waitForExplicitFinish();
+
+window.onload = function() { // checks run from the load handler, then the test is finished
+  is(html8, "\u20AC", "HTML UTF-8 failed."); // U+20AC is the euro sign the iframe script assigns
+  is(html16, "\u20AC", "HTML UTF-16 failed.");
+  is(xml8, "\u20AC", "XML UTF-8 failed.");
+  is(xml16, "\u20AC", "XML UTF-16 failed.");
+  SimpleTest.finish();
+};
+
+</script>
+</pre>
+<div id="content" style="display: none">
+<iframe src="file_bug716579-8.html"></iframe>  
+<iframe src="file_bug716579-16.html"></iframe>  
+<iframe src="file_bug716579-8.xhtml"></iframe>  
+<iframe src="file_bug716579-16.xhtml"></iframe>  
+</div>
+</body>
+</html>
--- a/parser/nsCharsetSource.h
+++ b/parser/nsCharsetSource.h
@@ -12,17 +12,17 @@
 #define kCharsetFromDocTypeDefault      3 // This and up confident for XHR
 #define kCharsetFromCache               4
 #define kCharsetFromParentFrame         5
 #define kCharsetFromAutoDetection       6
 #define kCharsetFromHintPrevDoc         7
 #define kCharsetFromMetaPrescan         8 // this one and smaller: HTML5 Tentative
 #define kCharsetFromMetaTag             9 // this one and greater: HTML5 Confident
 #define kCharsetFromIrreversibleAutoDetection 10
-#define kCharsetFromByteOrderMark      11
-#define kCharsetFromChannel            12
-#define kCharsetFromOtherComponent     13
+#define kCharsetFromChannel            11
+#define kCharsetFromOtherComponent     12
+#define kCharsetFromByteOrderMark      13 // moved above Channel/OtherComponent (bug 716579: BOM overrides HTTP-level charset)
 // Levels below here will be forced onto childframes too
 #define kCharsetFromParentForced       14
 #define kCharsetFromUserForced         15
 #define kCharsetFromPreviousLoading    16
 
 #endif /* nsCharsetSource_h_ */