Bug 482921 part 6 - Use the internal encoding declaration in the XML View Source case. r=Olli.Pettay.
author: Henri Sivonen <hsivonen@iki.fi>
Tue, 01 Nov 2011 13:33:11 +0200
changeset 81156 e224c18554c3c6474c9f011b9ef0ef056cd85146
parent 81155 e06b72c42fc82ab88b579d107f99d0aa5dc7a587
child 81157 2266dd224ea03564189c12ecf5ae4b61d33ba119
push id: 90
push user: ffxbld
push date: Sun, 29 Jan 2012 07:46:52 +0000
treeherder: mozilla-release@acddb6b6a01c [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: Olli.Pettay
bugs: 482921
milestone: 10.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 482921 part 6 - Use the internal encoding declaration in the XML View Source case. r=Olli.Pettay.
parser/html/nsHtml5StreamParser.cpp
parser/html/nsHtml5StreamParser.h
--- a/parser/html/nsHtml5StreamParser.cpp
+++ b/parser/html/nsHtml5StreamParser.cpp
@@ -49,16 +49,18 @@
 #include "nsHtml5Parser.h"
 #include "nsHtml5TreeBuilder.h"
 #include "nsHtml5AtomTable.h"
 #include "nsHtml5Module.h"
 #include "nsHtml5RefPtr.h"
 #include "nsIScriptError.h"
 #include "mozilla/Preferences.h"
 #include "nsHtml5Highlighter.h"
+#include "expat_config.h"
+#include "expat.h"
 
 using namespace mozilla;
 
 static NS_DEFINE_CID(kCharsetAliasCID, NS_CHARSETALIAS_CID);
 
 PRInt32 nsHtml5StreamParser::sTimerInitialDelay = 120;
 PRInt32 nsHtml5StreamParser::sTimerSubsequentDelay = 120;
 
@@ -398,23 +400,157 @@ nsHtml5StreamParser::SniffBOMlessUTF16Ba
   } else {
     mCharset.Assign("UTF-16BE");
   }
   mCharsetSource = kCharsetFromIrreversibleAutoDetection;
   mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
   mFeedChardet = false;
 }
 
+void
+nsHtml5StreamParser::MaybeSetEncodingFromExpat(const PRUnichar* aEncoding)
+{
+  nsDependentString utf16(aEncoding);
+  nsCAutoString utf8;
+  CopyUTF16toUTF8(utf16, utf8);
+  if (PreferredForInternalEncodingDecl(utf8)) {
+    mCharset.Assign(utf8);
+    mCharsetSource = kCharsetFromMetaTag; // closest for XML
+  }
+}
+
+// A separate user data struct is used instead of passing the
+// nsHtml5StreamParser instance as user data in order to avoid including
+// expat.h in nsHtml5StreamParser.h. Doing that would cause naming conflicts.
+// Using a separate user data struct also avoids bloating nsHtml5StreamParser
+// by one pointer.
+struct UserData {
+  XML_Parser mExpat;
+  nsHtml5StreamParser* mStreamParser;
+};
+
+// Using no-namespace handler callbacks to avoid including expat.h in
+// nsHtml5StreamParser.h, since doing so would cause naming conflicts.
+static void
+HandleXMLDeclaration(void* aUserData,
+                     const XML_Char* aVersion,
+                     const XML_Char* aEncoding,
+                     int aStandalone)
+{
+  UserData* ud = static_cast<UserData*>(aUserData);
+  ud->mStreamParser->MaybeSetEncodingFromExpat(
+      reinterpret_cast<const PRUnichar*>(aEncoding));
+  XML_StopParser(ud->mExpat, false);
+}
+
+static void
+HandleStartElement(void* aUserData,
+                   const XML_Char* aName,
+                   const XML_Char **aAtts)
+{
+  UserData* ud = static_cast<UserData*>(aUserData);
+  XML_StopParser(ud->mExpat, false);
+}
+
+static void
+HandleEndElement(void* aUserData,
+                 const XML_Char* aName)
+{
+  UserData* ud = static_cast<UserData*>(aUserData);
+  XML_StopParser(ud->mExpat, false);
+}
+
+static void
+HandleComment(void* aUserData,
+              const XML_Char* aName)
+{
+  UserData* ud = static_cast<UserData*>(aUserData);
+  XML_StopParser(ud->mExpat, false);
+}
+
+static void
+HandleProcessingInstruction(void* aUserData,
+                            const XML_Char* aTarget,
+                            const XML_Char* aData)
+{
+  UserData* ud = static_cast<UserData*>(aUserData);
+  XML_StopParser(ud->mExpat, false);
+}
+
 nsresult
 nsHtml5StreamParser::FinalizeSniffing(const PRUint8* aFromSegment, // can be null
                                       PRUint32 aCount,
                                       PRUint32* aWriteCount,
                                       PRUint32 aCountToSniffingLimit)
 {
   NS_ASSERTION(IsParserThread(), "Wrong thread!");
+  NS_ASSERTION(mCharsetSource < kCharsetFromMetaTag,
+      "Should not finalize sniffing when already confident.");
+  if (mMode == VIEW_SOURCE_XML) {
+    static const XML_Memory_Handling_Suite memsuite =
+      {
+        (void *(*)(size_t))moz_xmalloc,
+        (void *(*)(void *, size_t))moz_xrealloc,
+        moz_free
+      };
+
+    static const PRUnichar kExpatSeparator[] = { 0xFFFF, '\0' };
+
+    static const PRUnichar kISO88591[] =
+        { 'I', 'S', 'O', '-', '8', '8', '5', '9', '-', '1', '\0' };
+
+    UserData ud;
+    ud.mStreamParser = this;
+
+    // If we got this far, the stream didn't have a BOM. UTF-16-encoded XML
+    // documents MUST begin with a BOM. We don't support EBCDIC and such.
+    // Thus, at this point, what we have is garbage or something encoded using
+    // a rough ASCII superset. ISO-8859-1 allows us to decode ASCII bytes
+    // without throwing errors when bytes have the most significant bit set
+    // and without triggering expat's unknown encoding code paths. This is
+    // enough to be able to use expat to parse the XML declaration in order
+    // to extract the encoding name from it.
+    ud.mExpat = XML_ParserCreate_MM(kISO88591, &memsuite, kExpatSeparator);
+    XML_SetXmlDeclHandler(ud.mExpat, HandleXMLDeclaration);
+    XML_SetElementHandler(ud.mExpat, HandleStartElement, HandleEndElement);
+    XML_SetCommentHandler(ud.mExpat, HandleComment);
+    XML_SetProcessingInstructionHandler(ud.mExpat, HandleProcessingInstruction);
+    XML_SetUserData(ud.mExpat, static_cast<void*>(&ud));
+
+    XML_Status status = XML_STATUS_OK;
+    if (mSniffingBuffer) {
+      status = XML_Parse(ud.mExpat,
+                         reinterpret_cast<const char*>(mSniffingBuffer.get()),
+                         mSniffingLength,
+                         false);
+    }
+    if (status == XML_STATUS_OK &&
+        mCharsetSource < kCharsetFromMetaTag &&
+        aFromSegment) {
+      status = XML_Parse(ud.mExpat,
+                         reinterpret_cast<const char*>(aFromSegment),
+                         aCountToSniffingLimit,
+                         false);
+    }
+    XML_ParserFree(ud.mExpat);
+
+    if (mCharsetSource < kCharsetFromMetaTag) {
+      // Failed to get an encoding from the XML declaration. XML defaults
+      // confidently to UTF-8 in this case.
+      // It is also possible that the document has an XML declaration that is
+      // longer than 1024 bytes, but that case is not worth worrying about.
+      mCharset.AssignLiteral("UTF-8");
+      mCharsetSource = kCharsetFromMetaTag; // means confident
+    }
+
+    return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment,
+                                                                aCount,
+                                                                aWriteCount);
+  }
+
   // meta scan failed.
   if (mCharsetSource >= kCharsetFromHintPrevDoc) {
     mFeedChardet = false;
     return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment, aCount, aWriteCount);
   }
   // Check for BOMless UTF-16 with Basic
   // Latin content for compat with IE. See bug 631751.
   SniffBOMlessUTF16BasicLatin(aFromSegment, aCountToSniffingLimit);
@@ -532,49 +668,62 @@ nsHtml5StreamParser::SniffStreamBytes(co
         break;
       default:
         mBomState = BOM_SNIFFING_OVER;
         break;
     }
   }
   // if we get here, there either was no BOM or the BOM sniffing isn't complete yet
   
-  if (!mMetaScanner) {
+  if (!mMetaScanner && (mMode == NORMAL || mMode == VIEW_SOURCE_HTML)) {
     mMetaScanner = new nsHtml5MetaScanner();
   }
   
   if (mSniffingLength + aCount >= NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE) {
     // this is the last buffer
-    PRUint32 countToSniffingLimit = NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE - mSniffingLength;
-    nsHtml5ByteReadable readable(aFromSegment, aFromSegment + countToSniffingLimit);
+    PRUint32 countToSniffingLimit =
+        NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE - mSniffingLength;
+    if (mMode == NORMAL || mMode == VIEW_SOURCE_HTML) {
+      nsHtml5ByteReadable readable(aFromSegment, aFromSegment
+          + countToSniffingLimit);
+      mMetaScanner->sniff(&readable, getter_AddRefs(mUnicodeDecoder), mCharset);
+      if (mUnicodeDecoder) {
+        mUnicodeDecoder->SetInputErrorBehavior(
+            nsIUnicodeDecoder::kOnError_Recover);
+        // meta scan successful
+        mCharsetSource = kCharsetFromMetaPrescan;
+        mFeedChardet = false;
+        mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
+        mMetaScanner = nsnull;
+        return WriteSniffingBufferAndCurrentSegment(aFromSegment, aCount,
+            aWriteCount);
+      }
+    }
+    return FinalizeSniffing(aFromSegment, aCount, aWriteCount,
+        countToSniffingLimit);
+  }
+
+  // not the last buffer
+  if (mMode == NORMAL || mMode == VIEW_SOURCE_HTML) {
+    nsHtml5ByteReadable readable(aFromSegment, aFromSegment + aCount);
     mMetaScanner->sniff(&readable, getter_AddRefs(mUnicodeDecoder), mCharset);
     if (mUnicodeDecoder) {
-      mUnicodeDecoder->SetInputErrorBehavior(nsIUnicodeDecoder::kOnError_Recover);
       // meta scan successful
+      mUnicodeDecoder->SetInputErrorBehavior(
+          nsIUnicodeDecoder::kOnError_Recover);
       mCharsetSource = kCharsetFromMetaPrescan;
       mFeedChardet = false;
       mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
       mMetaScanner = nsnull;
-      return WriteSniffingBufferAndCurrentSegment(aFromSegment, aCount, aWriteCount);
+      return WriteSniffingBufferAndCurrentSegment(aFromSegment, 
+                                                  aCount,
+                                                  aWriteCount);
     }
-    return FinalizeSniffing(aFromSegment, aCount, aWriteCount, countToSniffingLimit);
   }
 
-  // not the last buffer
-  nsHtml5ByteReadable readable(aFromSegment, aFromSegment + aCount);
-  mMetaScanner->sniff(&readable, getter_AddRefs(mUnicodeDecoder), mCharset);
-  if (mUnicodeDecoder) {
-    // meta scan successful
-    mUnicodeDecoder->SetInputErrorBehavior(nsIUnicodeDecoder::kOnError_Recover);
-    mCharsetSource = kCharsetFromMetaPrescan;
-    mFeedChardet = false;
-    mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
-    mMetaScanner = nsnull;
-    return WriteSniffingBufferAndCurrentSegment(aFromSegment, aCount, aWriteCount);
-  }
   if (!mSniffingBuffer) {
     const mozilla::fallible_t fallible = mozilla::fallible_t();
     mSniffingBuffer = new (fallible)
       PRUint8[NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE];
     if (!mSniffingBuffer) {
       return NS_ERROR_OUT_OF_MEMORY;
     }
   }
@@ -937,32 +1086,19 @@ nsHtml5StreamParser::OnDataAvailable(nsI
                                                                 totalRead);
   if (NS_FAILED(mThread->Dispatch(dataAvailable, nsIThread::DISPATCH_NORMAL))) {
     NS_WARNING("Dispatching DataAvailable event failed.");
   }
   return rv;
 }
 
 bool
-nsHtml5StreamParser::internalEncodingDeclaration(nsString* aEncoding)
+nsHtml5StreamParser::PreferredForInternalEncodingDecl(nsACString& aEncoding)
 {
-  // This code needs to stay in sync with
-  // nsHtml5MetaScanner::tryCharset. Unfortunately, the
-  // trickery with member fields there leads to some copy-paste reuse. :-(
-  NS_ASSERTION(IsParserThread(), "Wrong thread!");
-  if (mCharsetSource >= kCharsetFromMetaTag) { // this threshold corresponds to "confident" in the HTML5 spec
-    return false;
-  }
-
-  if (mReparseForbidden) {
-    return false; // not reparsing even if we wanted to
-  }
-
-  nsCAutoString newEncoding;
-  CopyUTF16toUTF8(*aEncoding, newEncoding);
+  nsCAutoString newEncoding(aEncoding);
   newEncoding.Trim(" \t\r\n\f");
   if (newEncoding.LowerCaseEqualsLiteral("utf-16") ||
       newEncoding.LowerCaseEqualsLiteral("utf-16be") ||
       newEncoding.LowerCaseEqualsLiteral("utf-16le")) {
     newEncoding.Assign("UTF-8");
   }
 
   nsresult rv = NS_OK;
@@ -999,21 +1135,46 @@ nsHtml5StreamParser::internalEncodingDec
       preferred.LowerCaseEqualsLiteral("utf-7") ||
       preferred.LowerCaseEqualsLiteral("jis_x0212-1990") ||
       preferred.LowerCaseEqualsLiteral("x-jis0208") ||
       preferred.LowerCaseEqualsLiteral("x-imap4-modified-utf7") ||
       preferred.LowerCaseEqualsLiteral("x-user-defined")) {
     // Not a rough ASCII superset
     return false;
   }
+  aEncoding.Assign(preferred);
+  return true;
+}
+
+bool
+nsHtml5StreamParser::internalEncodingDeclaration(nsString* aEncoding)
+{
+  // This code needs to stay in sync with
+  // nsHtml5MetaScanner::tryCharset. Unfortunately, the
+  // trickery with member fields there leads to some copy-paste reuse. :-(
+  NS_ASSERTION(IsParserThread(), "Wrong thread!");
+  if (mCharsetSource >= kCharsetFromMetaTag) { // this threshold corresponds to "confident" in the HTML5 spec
+    return false;
+  }
+
+  if (mReparseForbidden) {
+    return false; // not reparsing even if we wanted to
+  }
+
+  nsCAutoString newEncoding;
+  CopyUTF16toUTF8(*aEncoding, newEncoding);
+
+  if (!PreferredForInternalEncodingDecl(newEncoding)) {
+    return false;
+  }
 
   // Avoid having the chardet ask for another restart after this restart
   // request.
   mFeedChardet = false;
-  mTreeBuilder->NeedsCharsetSwitchTo(preferred, kCharsetFromMetaTag);
+  mTreeBuilder->NeedsCharsetSwitchTo(newEncoding, kCharsetFromMetaTag);
   FlushTreeOpsAndDisarmTimer();
   Interrupt();
   // the tree op executor will cause the stream parser to terminate
   // if the charset switch request is accepted or it'll uninterrupt 
   // if the request failed. Note that if the restart request fails,
   // we don't bother trying to make chardet resume. Might as well
   // assume that chardet-requested restarts would fail, too.
   return true;
--- a/parser/html/nsHtml5StreamParser.h
+++ b/parser/html/nsHtml5StreamParser.h
@@ -203,16 +203,22 @@ class nsHtml5StreamParser : public nsISt
 
     void Terminate() {
       mozilla::MutexAutoLock autoLock(mTerminatedMutex);
       mTerminated = true;
     }
     
     void DropTimer();
 
+    /**
+     * Sets mCharset and mCharsetSource appropriately for the XML View Source
+     * case if aEncoding names a supported rough ASCII superset.
+     */
+    void MaybeSetEncodingFromExpat(const PRUnichar* aEncoding);
+
   private:
 
 #ifdef DEBUG
     bool IsParserThread() {
       bool ret;
       mThread->IsOnCurrentThread(&ret);
       return ret;
     }
@@ -345,16 +351,26 @@ class nsHtml5StreamParser : public nsISt
      * @param aDecoderCharsetName The actual name for the decoder's charset
      *                            (UTF-16BE, UTF-16LE or UTF-8; the BOM has
      *                            been swallowed)
      */
     nsresult SetupDecodingFromBom(const char* aCharsetName,
                                   const char* aDecoderCharsetName);
 
     /**
+     * Become confident or resolve an encoding name to its preferred form.
+     * @param aEncoding the value of an internal encoding decl. Acts as an
+     *                  out param, too, when the method returns true.
+     * @return true if the parser needs to start using the new value of
+     *         aEncoding and false if the parser became confident or if
+     *         the encoding name did not specify a usable encoding
+     */
+    bool PreferredForInternalEncodingDecl(nsACString& aEncoding);
+
+    /**
      * Callback for mFlushTimer.
      */
     static void TimerCallback(nsITimer* aTimer, void* aClosure);
 
     /**
      * Parser thread entry point for (maybe) flushing the ops and posting
      * a flush runnable back on the main thread.
      */