Bug 741776 - Treat JSON, WebVTT and AppCache manifests as UTF-8 when loaded as plain text. r=Ehsan
author Henri Sivonen <hsivonen@hsivonen.fi>
Thu, 09 Jun 2016 14:29:30 +0300
changeset 324917 9f1de4aeae3a3495b6425668acbafbdf7eebb18b
parent 324916 9c229cf88ec224739f127082c1e8ec4c322bcd6f
child 324918 da9707f081f9ee54f59bc52da81f22540f64d313
push id 24
push user maklebus@msu.edu
push date Tue, 20 Dec 2016 03:11:33 +0000
reviewers Ehsan
bugs 741776
milestone 53.0a1
Bug 741776 - Treat JSON, WebVTT and AppCache manifests as UTF-8 when loaded as plain text. r=Ehsan MozReview-Commit-ID: 5UvYqJVvX0r
dom/base/nsContentUtils.cpp
dom/base/nsContentUtils.h
dom/html/nsHTMLDocument.cpp
dom/html/reftests/741776-1-ref.html
dom/html/reftests/741776-1.vtt
dom/html/reftests/reftest.list
parser/html/nsHtml5StreamParser.cpp
parser/nsCharsetSource.h
uriloader/exthandler/nsExternalHelperAppService.cpp
--- a/dom/base/nsContentUtils.cpp
+++ b/dom/base/nsContentUtils.cpp
@@ -3753,16 +3753,26 @@ nsContentUtils::IsPlainTextType(const ns
   return aContentType.EqualsLiteral(TEXT_PLAIN) ||
          aContentType.EqualsLiteral(TEXT_CSS) ||
          aContentType.EqualsLiteral(TEXT_CACHE_MANIFEST) ||
          aContentType.EqualsLiteral(TEXT_VTT) ||
          IsScriptType(aContentType);
 }
 
 bool
+nsContentUtils::IsUtf8OnlyPlainTextType(const nsACString& aContentType)
+{
+  // NOTE: Must be a subset of IsPlainTextType() (JSON presumably via IsScriptType(); confirm).
+  return aContentType.EqualsLiteral(TEXT_CACHE_MANIFEST) ||
+         aContentType.EqualsLiteral(APPLICATION_JSON) ||
+         aContentType.EqualsLiteral(TEXT_JSON) ||
+         aContentType.EqualsLiteral(TEXT_VTT);
+}
+
+bool
 nsContentUtils::GetWrapperSafeScriptFilename(nsIDocument* aDocument,
                                              nsIURI* aURI,
                                              nsACString& aScriptURI,
                                              nsresult* aRv)
 {
   MOZ_ASSERT(aRv);
   bool scriptFileNameModified = false;
   *aRv = NS_OK;
--- a/dom/base/nsContentUtils.h
+++ b/dom/base/nsContentUtils.h
@@ -1021,26 +1021,32 @@ public:
   static bool IsChromeDoc(nsIDocument *aDocument);
 
   /**
    * Returns true if aDocument is in a docshell whose parent is the same type
    */
   static bool IsChildOfSameType(nsIDocument* aDoc);
 
   /**
-  '* Returns true if the content-type is any of the supported script types.
+   * Returns true if the content-type is any of the supported script types.
    */
   static bool IsScriptType(const nsACString& aContentType);
 
   /**
-  '* Returns true if the content-type will be rendered as plain-text.
+   * Returns true if the content-type will be rendered as plain-text.
    */
   static bool IsPlainTextType(const nsACString& aContentType);
 
   /**
+   * Returns true iff the type is rendered as plain text and doesn't support
+   * non-UTF-8 encodings.
+   */
+  static bool IsUtf8OnlyPlainTextType(const nsACString& aContentType);
+
+  /**
    * Get the script file name to use when compiling the script
    * referenced by aURI. In cases where there's no need for any extra
    * security wrapper automation the script file name that's returned
    * will be the spec in aURI, else it will be the spec in aDocument's
    * URI followed by aURI's spec, separated by " -> ". Returns true
    * if the script file name was modified, false if it's aURI's
    * spec.
    */
--- a/dom/html/nsHTMLDocument.cpp
+++ b/dom/html/nsHTMLDocument.cpp
@@ -545,16 +545,19 @@ nsHTMLDocument::StartDocumentLoad(const 
   bool html = contentType.EqualsLiteral(TEXT_HTML);
   bool xhtml = !html && (contentType.EqualsLiteral(APPLICATION_XHTML_XML) || contentType.EqualsLiteral(APPLICATION_WAPXHTML_XML));
   bool plainText = !html && !xhtml && nsContentUtils::IsPlainTextType(contentType);
   if (!(html || xhtml || plainText || viewSource)) {
     MOZ_ASSERT(false, "Channel with bad content type.");
     return NS_ERROR_INVALID_ARG;
   }
 
+  bool forceUtf8 = plainText &&
+    nsContentUtils::IsUtf8OnlyPlainTextType(contentType);
+
   bool loadAsHtml5 = true;
 
   if (!viewSource && xhtml) {
       // We're parsing XHTML as XML, remember that.
       mType = eXHTML;
       mCompatMode = eCompatibility_FullStandards;
       loadAsHtml5 = false;
   }
@@ -664,17 +667,22 @@ nsHTMLDocument::StartDocumentLoad(const 
     executor = static_cast<nsHtml5TreeOpExecutor*> (mParser->GetContentSink());
     if (mReferrerPolicySet) {
       // CSP may have set the referrer policy, so a speculative parser should
       // start with the new referrer policy.
       executor->SetSpeculationReferrerPolicy(static_cast<ReferrerPolicy>(mReferrerPolicy));
     }
   }
 
-  if (!IsHTMLDocument() || !docShell) { // no docshell for text/html XHR
+  if (forceUtf8) {
+    charsetSource = kCharsetFromUtf8OnlyMime;
+    charset.AssignLiteral("UTF-8");
+    parserCharsetSource = charsetSource;
+    parserCharset = charset;
+  } else if (!IsHTMLDocument() || !docShell) { // no docshell for text/html XHR
     charsetSource = IsHTMLDocument() ? kCharsetFromFallback
                                      : kCharsetFromDocTypeDefault;
     charset.AssignLiteral("UTF-8");
     TryChannelCharset(aChannel, charsetSource, charset, executor);
     parserCharsetSource = charsetSource;
     parserCharset = charset;
   } else {
     NS_ASSERTION(docShell, "Unexpected null value");
@@ -3613,17 +3621,17 @@ nsHTMLDocument::DocAddSizeOfExcludingThi
 
 bool
 nsHTMLDocument::WillIgnoreCharsetOverride()
 {
   if (mType != eHTML) {
     MOZ_ASSERT(mType == eXHTML);
     return true;
   }
-  if (mCharacterSetSource == kCharsetFromByteOrderMark) {
+  if (mCharacterSetSource >= kCharsetFromByteOrderMark) {
     return true;
   }
   if (!EncodingUtils::IsAsciiCompatible(mCharacterSet)) {
     return true;
   }
   nsCOMPtr<nsIWyciwygChannel> wyciwyg = do_QueryInterface(mChannel);
   if (wyciwyg) {
     return true;
new file mode 100644
--- /dev/null
+++ b/dom/html/reftests/741776-1-ref.html
@@ -0,0 +1,1 @@
+<meta charset=utf-8><pre>ää
new file mode 100644
--- /dev/null
+++ b/dom/html/reftests/741776-1.vtt
@@ -0,0 +1,1 @@
+ää
--- a/dom/html/reftests/reftest.list
+++ b/dom/html/reftests/reftest.list
@@ -22,16 +22,17 @@ include toblob-todataurl/reftest.list
 == 573322-no-quirks.html 573322-no-quirks-ref.html
 == 596455-1a.html 596455-ref-1.html
 == 596455-1b.html 596455-ref-1.html
 == 596455-2a.html 596455-ref-2.html
 == 596455-2b.html 596455-ref-2.html
 == 610935.html 610935-ref.html
 == 649134-1.html 649134-ref.html
 skip-if(Android) == 649134-2.html 649134-2-ref.html
+== 741776-1.vtt 741776-1-ref.html
 
 == bug448564-1_malformed.html bug448564-1_well-formed.html
 == bug448564-1_malformed.html bug448564-1_ideal.html
 
 == bug448564-4a.html          bug448564-4b.html
 == bug502168-1_malformed.html bug502168-1_well-formed.html
 
 == responsive-image-load-shortcircuit.html responsive-image-load-shortcircuit-ref.html
--- a/parser/html/nsHtml5StreamParser.cpp
+++ b/parser/html/nsHtml5StreamParser.cpp
@@ -976,23 +976,25 @@ nsHtml5StreamParser::OnStartRequest(nsIR
     mInitialEncodingWasFromParentFrame = true;
   }
 
   if (mCharsetSource >= kCharsetFromAutoDetection) {
     mFeedChardet = false;
   }
   
   nsCOMPtr<nsIWyciwygChannel> wyciwygChannel(do_QueryInterface(mRequest));
-  if (!wyciwygChannel) {
+  if (mCharsetSource < kCharsetFromUtf8OnlyMime && !wyciwygChannel) {
     // we aren't ready to commit to an encoding yet
     // leave converter uninstantiated for now
     return NS_OK;
   }
 
-  // We are reloading a document.open()ed doc.
+  // We are reloading a document.open()ed doc or loading JSON/WebVTT/etc. into
+  // a browsing context. In the latter case, there's no need to remove the
+  // BOM manually here, because the UTF-8 decoder removes it.
   mReparseForbidden = true;
   mFeedChardet = false;
 
   // Instantiate the converter here to avoid BOM sniffing.
   mUnicodeDecoder = EncodingUtils::DecoderForEncoding(mCharset);
   return NS_OK;
 }
 
--- a/parser/nsCharsetSource.h
+++ b/parser/nsCharsetSource.h
@@ -17,10 +17,11 @@
 #define kCharsetFromMetaPrescan         8 // this one and smaller: HTML5 Tentative
 #define kCharsetFromMetaTag             9 // this one and greater: HTML5 Confident
 #define kCharsetFromIrreversibleAutoDetection 10
 #define kCharsetFromChannel            11
 #define kCharsetFromOtherComponent     12
 #define kCharsetFromParentForced       13 // propagates to child frames
 #define kCharsetFromUserForced         14 // propagates to child frames
 #define kCharsetFromByteOrderMark      15
+#define kCharsetFromUtf8OnlyMime       16 // For JSON, WebVTT and such
 
 #endif /* nsCharsetSource_h_ */
--- a/uriloader/exthandler/nsExternalHelperAppService.cpp
+++ b/uriloader/exthandler/nsExternalHelperAppService.cpp
@@ -586,16 +586,19 @@ static const nsExtraMimeTypeEntry extraM
   { IMAGE_JPEG, "jpeg,jpg,jfif,pjpeg,pjp", "JPEG Image" },
   { IMAGE_PNG, "png", "PNG Image" },
   { IMAGE_APNG, "apng", "APNG Image" },
   { IMAGE_TIFF, "tiff,tif", "TIFF Image" },
   { IMAGE_XBM, "xbm", "XBM Image" },
   { IMAGE_SVG_XML, "svg", "Scalable Vector Graphics" },
   { MESSAGE_RFC822, "eml", "RFC-822 data" },
   { TEXT_PLAIN, "txt,text", "Text File" },
+  { APPLICATION_JSON, "json", "JavaScript Object Notation" },
+  { TEXT_VTT, "vtt", "Web Video Text Tracks" },
+  { TEXT_CACHE_MANIFEST, "appcache", "Application Cache Manifest" },
   { TEXT_HTML, "html,htm,shtml,ehtml", "HyperText Markup Language" },
   { "application/xhtml+xml", "xhtml,xht", "Extensible HyperText Markup Language" },
   { APPLICATION_MATHML_XML, "mml", "Mathematical Markup Language" },
   { APPLICATION_RDF, "rdf", "Resource Description Framework" },
   { TEXT_XUL, "xul", "XML-Based User Interface Language" },
   { TEXT_XML, "xml,xsl,xbl", "Extensible Markup Language" },
   { TEXT_CSS, "css", "Style Sheet" },
   { TEXT_VCARD, "vcf,vcard", "Contact Information" },