Bug 741776 - Treat JSON, WebVTT and AppCache manifests as UTF-8 when loaded as plain text. r=Ehsan
author Henri Sivonen <hsivonen@hsivonen.fi>
Thu, 09 Jun 2016 14:29:30 +0300
changeset 372159 9f1de4aeae3a3495b6425668acbafbdf7eebb18b
parent 372158 9c229cf88ec224739f127082c1e8ec4c322bcd6f
child 372160 da9707f081f9ee54f59bc52da81f22540f64d313
push id 1419
push user jlund@mozilla.com
push date Mon, 10 Apr 2017 20:44:07 +0000
treeherder mozilla-release@5e6801b73ef6 [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers Ehsan
bugs 741776
milestone 53.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 741776 - Treat JSON, WebVTT and AppCache manifests as UTF-8 when loaded as plain text. r=Ehsan MozReview-Commit-ID: 5UvYqJVvX0r
dom/base/nsContentUtils.cpp
dom/base/nsContentUtils.h
dom/html/nsHTMLDocument.cpp
dom/html/reftests/741776-1-ref.html
dom/html/reftests/741776-1.vtt
dom/html/reftests/reftest.list
parser/html/nsHtml5StreamParser.cpp
parser/nsCharsetSource.h
uriloader/exthandler/nsExternalHelperAppService.cpp
--- a/dom/base/nsContentUtils.cpp
+++ b/dom/base/nsContentUtils.cpp
@@ -3753,16 +3753,26 @@ nsContentUtils::IsPlainTextType(const ns
   return aContentType.EqualsLiteral(TEXT_PLAIN) ||
          aContentType.EqualsLiteral(TEXT_CSS) ||
          aContentType.EqualsLiteral(TEXT_CACHE_MANIFEST) ||
          aContentType.EqualsLiteral(TEXT_VTT) ||
          IsScriptType(aContentType);
 }
 
 bool
+nsContentUtils::IsUtf8OnlyPlainTextType(const nsACString& aContentType)
+{
+  // NOTE: Every type accepted here must also be accepted by IsPlainTextType().
+  return aContentType.EqualsLiteral(TEXT_CACHE_MANIFEST) ||
+         aContentType.EqualsLiteral(APPLICATION_JSON) ||
+         aContentType.EqualsLiteral(TEXT_JSON) ||
+         aContentType.EqualsLiteral(TEXT_VTT);
+}
+
+bool
 nsContentUtils::GetWrapperSafeScriptFilename(nsIDocument* aDocument,
                                              nsIURI* aURI,
                                              nsACString& aScriptURI,
                                              nsresult* aRv)
 {
   MOZ_ASSERT(aRv);
   bool scriptFileNameModified = false;
   *aRv = NS_OK;
--- a/dom/base/nsContentUtils.h
+++ b/dom/base/nsContentUtils.h
@@ -1021,26 +1021,32 @@ public:
   static bool IsChromeDoc(nsIDocument *aDocument);
 
   /**
    * Returns true if aDocument is in a docshell whose parent is the same type
    */
   static bool IsChildOfSameType(nsIDocument* aDoc);
 
   /**
-  '* Returns true if the content-type is any of the supported script types.
+   * Returns true if the content-type is any of the supported script types.
    */
   static bool IsScriptType(const nsACString& aContentType);
 
   /**
-  '* Returns true if the content-type will be rendered as plain-text.
+   * Returns true if the content-type will be rendered as plain-text.
    */
   static bool IsPlainTextType(const nsACString& aContentType);
 
   /**
+   * Returns true iff the type is rendered as plain text and doesn't support
+   * non-UTF-8 encodings.
+   */
+  static bool IsUtf8OnlyPlainTextType(const nsACString& aContentType);
+
+  /**
    * Get the script file name to use when compiling the script
    * referenced by aURI. In cases where there's no need for any extra
    * security wrapper automation the script file name that's returned
    * will be the spec in aURI, else it will be the spec in aDocument's
    * URI followed by aURI's spec, separated by " -> ". Returns true
    * if the script file name was modified, false if it's aURI's
    * spec.
    */
--- a/dom/html/nsHTMLDocument.cpp
+++ b/dom/html/nsHTMLDocument.cpp
@@ -545,16 +545,19 @@ nsHTMLDocument::StartDocumentLoad(const 
   bool html = contentType.EqualsLiteral(TEXT_HTML);
   bool xhtml = !html && (contentType.EqualsLiteral(APPLICATION_XHTML_XML) || contentType.EqualsLiteral(APPLICATION_WAPXHTML_XML));
   bool plainText = !html && !xhtml && nsContentUtils::IsPlainTextType(contentType);
   if (!(html || xhtml || plainText || viewSource)) {
     MOZ_ASSERT(false, "Channel with bad content type.");
     return NS_ERROR_INVALID_ARG;
   }
 
+  bool forceUtf8 = plainText &&
+    nsContentUtils::IsUtf8OnlyPlainTextType(contentType);
+
   bool loadAsHtml5 = true;
 
   if (!viewSource && xhtml) {
       // We're parsing XHTML as XML, remember that.
       mType = eXHTML;
       mCompatMode = eCompatibility_FullStandards;
       loadAsHtml5 = false;
   }
@@ -664,17 +667,22 @@ nsHTMLDocument::StartDocumentLoad(const 
     executor = static_cast<nsHtml5TreeOpExecutor*> (mParser->GetContentSink());
     if (mReferrerPolicySet) {
       // CSP may have set the referrer policy, so a speculative parser should
       // start with the new referrer policy.
       executor->SetSpeculationReferrerPolicy(static_cast<ReferrerPolicy>(mReferrerPolicy));
     }
   }
 
-  if (!IsHTMLDocument() || !docShell) { // no docshell for text/html XHR
+  if (forceUtf8) {
+    charsetSource = kCharsetFromUtf8OnlyMime;
+    charset.AssignLiteral("UTF-8");
+    parserCharsetSource = charsetSource;
+    parserCharset = charset;
+  } else if (!IsHTMLDocument() || !docShell) { // no docshell for text/html XHR
     charsetSource = IsHTMLDocument() ? kCharsetFromFallback
                                      : kCharsetFromDocTypeDefault;
     charset.AssignLiteral("UTF-8");
     TryChannelCharset(aChannel, charsetSource, charset, executor);
     parserCharsetSource = charsetSource;
     parserCharset = charset;
   } else {
     NS_ASSERTION(docShell, "Unexpected null value");
@@ -3613,17 +3621,17 @@ nsHTMLDocument::DocAddSizeOfExcludingThi
 
 bool
 nsHTMLDocument::WillIgnoreCharsetOverride()
 {
   if (mType != eHTML) {
     MOZ_ASSERT(mType == eXHTML);
     return true;
   }
-  if (mCharacterSetSource == kCharsetFromByteOrderMark) {
+  if (mCharacterSetSource >= kCharsetFromByteOrderMark) {
     return true;
   }
   if (!EncodingUtils::IsAsciiCompatible(mCharacterSet)) {
     return true;
   }
   nsCOMPtr<nsIWyciwygChannel> wyciwyg = do_QueryInterface(mChannel);
   if (wyciwyg) {
     return true;
new file mode 100644
--- /dev/null
+++ b/dom/html/reftests/741776-1-ref.html
@@ -0,0 +1,1 @@
+<meta charset=utf-8><pre>ää
new file mode 100644
--- /dev/null
+++ b/dom/html/reftests/741776-1.vtt
@@ -0,0 +1,1 @@
+ää
--- a/dom/html/reftests/reftest.list
+++ b/dom/html/reftests/reftest.list
@@ -22,16 +22,17 @@ include toblob-todataurl/reftest.list
 == 573322-no-quirks.html 573322-no-quirks-ref.html
 == 596455-1a.html 596455-ref-1.html
 == 596455-1b.html 596455-ref-1.html
 == 596455-2a.html 596455-ref-2.html
 == 596455-2b.html 596455-ref-2.html
 == 610935.html 610935-ref.html
 == 649134-1.html 649134-ref.html
 skip-if(Android) == 649134-2.html 649134-2-ref.html
+== 741776-1.vtt 741776-1-ref.html
 
 == bug448564-1_malformed.html bug448564-1_well-formed.html
 == bug448564-1_malformed.html bug448564-1_ideal.html
 
 == bug448564-4a.html          bug448564-4b.html
 == bug502168-1_malformed.html bug502168-1_well-formed.html
 
 == responsive-image-load-shortcircuit.html responsive-image-load-shortcircuit-ref.html
--- a/parser/html/nsHtml5StreamParser.cpp
+++ b/parser/html/nsHtml5StreamParser.cpp
@@ -976,23 +976,25 @@ nsHtml5StreamParser::OnStartRequest(nsIR
     mInitialEncodingWasFromParentFrame = true;
   }
 
   if (mCharsetSource >= kCharsetFromAutoDetection) {
     mFeedChardet = false;
   }
   
   nsCOMPtr<nsIWyciwygChannel> wyciwygChannel(do_QueryInterface(mRequest));
-  if (!wyciwygChannel) {
+  if (mCharsetSource < kCharsetFromUtf8OnlyMime && !wyciwygChannel) {
     // we aren't ready to commit to an encoding yet
     // leave converter uninstantiated for now
     return NS_OK;
   }
 
-  // We are reloading a document.open()ed doc.
+  // We are reloading a document.open()ed doc or loading JSON/WebVTT/etc. into
+  // a browsing context. In the latter case, there's no need to remove the
+  // BOM manually here, because the UTF-8 decoder removes it.
   mReparseForbidden = true;
   mFeedChardet = false;
 
   // Instantiate the converter here to avoid BOM sniffing.
   mUnicodeDecoder = EncodingUtils::DecoderForEncoding(mCharset);
   return NS_OK;
 }
 
--- a/parser/nsCharsetSource.h
+++ b/parser/nsCharsetSource.h
@@ -17,10 +17,11 @@
 #define kCharsetFromMetaPrescan         8 // this one and smaller: HTML5 Tentative
 #define kCharsetFromMetaTag             9 // this one and greater: HTML5 Confident
 #define kCharsetFromIrreversibleAutoDetection 10
 #define kCharsetFromChannel            11
 #define kCharsetFromOtherComponent     12
 #define kCharsetFromParentForced       13 // propagates to child frames
 #define kCharsetFromUserForced         14 // propagates to child frames
 #define kCharsetFromByteOrderMark      15
+#define kCharsetFromUtf8OnlyMime       16 // For JSON, WebVTT and such
 
 #endif /* nsCharsetSource_h_ */
--- a/uriloader/exthandler/nsExternalHelperAppService.cpp
+++ b/uriloader/exthandler/nsExternalHelperAppService.cpp
@@ -586,16 +586,19 @@ static const nsExtraMimeTypeEntry extraM
   { IMAGE_JPEG, "jpeg,jpg,jfif,pjpeg,pjp", "JPEG Image" },
   { IMAGE_PNG, "png", "PNG Image" },
   { IMAGE_APNG, "apng", "APNG Image" },
   { IMAGE_TIFF, "tiff,tif", "TIFF Image" },
   { IMAGE_XBM, "xbm", "XBM Image" },
   { IMAGE_SVG_XML, "svg", "Scalable Vector Graphics" },
   { MESSAGE_RFC822, "eml", "RFC-822 data" },
   { TEXT_PLAIN, "txt,text", "Text File" },
+  { APPLICATION_JSON, "json", "JavaScript Object Notation" },
+  { TEXT_VTT, "vtt", "Web Video Text Tracks" },
+  { TEXT_CACHE_MANIFEST, "appcache", "Application Cache Manifest" },
   { TEXT_HTML, "html,htm,shtml,ehtml", "HyperText Markup Language" },
   { "application/xhtml+xml", "xhtml,xht", "Extensible HyperText Markup Language" },
   { APPLICATION_MATHML_XML, "mml", "Mathematical Markup Language" },
   { APPLICATION_RDF, "rdf", "Resource Description Framework" },
   { TEXT_XUL, "xul", "XML-Based User Interface Language" },
   { TEXT_XML, "xml,xsl,xbl", "Extensible Markup Language" },
   { TEXT_CSS, "css", "Style Sheet" },
   { TEXT_VCARD, "vcf,vcard", "Contact Information" },