Bug 801487 - Remove encoding detection using BOM. r=sicking
author Masatoshi Kimura <VYV03354@nifty.ne.jp>
Tue, 06 Nov 2012 18:23:14 -0500
changeset 121141 6c91d0bc259e203a2adb9fbe803d73b587eaaa67
parent 121140 4aeebb3cacfa9bd7093cd4580ea40037e58af7ea
child 121142 3788795c4a18c2e3fa2b4beec9ccd67b45665fac
push id 273
push user lsblakk@mozilla.com
push date Thu, 14 Feb 2013 23:19:38 +0000
treeherder mozilla-release@c5e807a3f8b8 [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers sicking
bugs 801487
milestone 19.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 801487 - Remove encoding detection using BOM. r=sicking
dom/encoding/TextDecoder.cpp
dom/encoding/TextDecoder.h
dom/encoding/test/test_BOMEncoding.js
dom/encoding/test/test_TextDecoder.js
dom/encoding/test/unit/test_singlebytes.js
--- a/dom/encoding/TextDecoder.cpp
+++ b/dom/encoding/TextDecoder.cpp
@@ -16,47 +16,31 @@ static const PRUnichar kReplacementChar 
 void
 TextDecoder::Init(const nsAString& aEncoding,
                   const TextDecoderOptions& aFatal,
                   ErrorResult& aRv)
 {
   nsAutoString label(aEncoding);
   EncodingUtils::TrimSpaceCharacters(label);
 
-  // If label is a case-insensitive match for "utf-16"
-  // then set the internal useBOM flag.
-  if (label.LowerCaseEqualsLiteral("utf-16")) {
-    mUseBOM = true;
-    mIsUTF16Family = true;
-    mEncoding = "utf-16le";
-    // If BOM is used, we can't determine the converter yet.
-    return;
-  }
-
   // Let encoding be the result of getting an encoding from label.
   // If encoding is failure, throw a TypeError.
   if (!EncodingUtils::FindEncodingForLabel(label, mEncoding)) {
     aRv.ThrowTypeError(MSG_ENCODING_NOT_SUPPORTED, &label);
     return;
   }
 
   mIsUTF16Family = !strcmp(mEncoding, "utf-16le") ||
                    !strcmp(mEncoding, "utf-16be");
 
   // If the constructor is called with an options argument,
   // and the fatal property of the dictionary is set,
   // set the internal fatal flag of the decoder object.
   mFatal = aFatal.fatal;
 
-  CreateDecoder(aRv);
-}
-
-void
-TextDecoder::CreateDecoder(ErrorResult& aRv)
-{
   // Create a decoder object for mEncoding.
   nsCOMPtr<nsICharsetConverterManager> ccm =
     do_GetService(NS_CHARSETCONVERTERMANAGER_CONTRACTID);
   if (!ccm) {
     aRv.Throw(NS_ERROR_UNEXPECTED);
     return;
   }
 
@@ -67,22 +51,20 @@ TextDecoder::CreateDecoder(ErrorResult& 
   }
 
   if (mFatal) {
     mDecoder->SetInputErrorBehavior(nsIUnicodeDecoder::kOnError_Signal);
   }
 }
 
 void
-TextDecoder::ResetDecoder(bool aResetOffset)
+TextDecoder::ResetDecoder()
 {
   mDecoder->Reset();
-  if (aResetOffset) {
-    mOffset = 0;
-  }
+  mOffset = 0;
 }
 
 void
 TextDecoder::Decode(const ArrayBufferView* aView,
                     const TextDecodeOptions& aOptions,
                     nsAString& aOutDecodedString,
                     ErrorResult& aRv)
 {
@@ -186,32 +168,17 @@ TextDecoder::HandleBOM(const char*& aDat
   aLength -= 2 - mOffset;
   mOffset = 2;
 
   const char* encoding = "";
   if (!EncodingUtils::IdentifyDataOffset(mInitialBytes, 2, encoding) ||
       strcmp(encoding, mEncoding)) {
     // If the stream doesn't start with BOM or the BOM doesn't match the
     // encoding, feed a BOM to workaround decoder's bug (bug 634541).
-    if (!mUseBOM) {
-      FeedBytes(!strcmp(mEncoding, "utf-16le") ? "\xFF\xFE" : "\xFE\xFF");
-    }
-  }
-  if (mUseBOM) {
-    // Select a decoder corresponding to the BOM.
-    if (!*encoding) {
-      encoding = "utf-16le";
-    }
-    // If the endian has not been changed, reuse the decoder.
-    if (mDecoder && !strcmp(encoding, mEncoding)) {
-      ResetDecoder(false);
-    } else {
-      mEncoding = encoding;
-      CreateDecoder(aRv);
-    }
+    FeedBytes(!strcmp(mEncoding, "utf-16le") ? "\xFF\xFE" : "\xFE\xFF");
   }
   FeedBytes(mInitialBytes, &aOutString);
 }
 
 void
 TextDecoder::FeedBytes(const char* aBytes, nsAString* aOutString)
 {
   PRUnichar buf[3];
@@ -229,17 +196,17 @@ TextDecoder::FeedBytes(const char* aByte
 void
 TextDecoder::GetEncoding(nsAString& aEncoding)
 {
   // Our utf-16 converter does not comply with the Encoding Standard.
   // As a result the utf-16le converter is used for the encoding label
   // "utf-16".
   // This workaround should not be exposed to the public API and so "utf-16"
   // is returned by GetEncoding() if the internal encoding name is "utf-16le".
-  if (mUseBOM || !strcmp(mEncoding, "utf-16le")) {
+  if (!strcmp(mEncoding, "utf-16le")) {
     aEncoding.AssignLiteral("utf-16");
     return;
   }
 
   aEncoding.AssignASCII(mEncoding);
 }
 
 NS_IMPL_CYCLE_COLLECTING_ADDREF(TextDecoder)
--- a/dom/encoding/TextDecoder.h
+++ b/dom/encoding/TextDecoder.h
@@ -36,18 +36,17 @@ public:
     txtDecoder->Init(aEncoding, aFatal, aRv);
     if (aRv.Failed()) {
       return nullptr;
     }
     return txtDecoder.forget();
   }
 
   TextDecoder(nsISupports* aGlobal)
-    : mGlobal(aGlobal)
-    , mFatal(false), mUseBOM(false), mOffset(0), mIsUTF16Family(false)
+    : mGlobal(aGlobal), mFatal(false), mOffset(0), mIsUTF16Family(false)
   {
     MOZ_ASSERT(aGlobal);
     SetIsDOMBinding();
   }
 
   virtual
   ~TextDecoder()
   {}
@@ -92,17 +91,16 @@ public:
               nsAString& aOutDecodedString,
               ErrorResult& aRv);
 
 private:
   const char* mEncoding;
   nsCOMPtr<nsIUnicodeDecoder> mDecoder;
   nsCOMPtr<nsISupports> mGlobal;
   bool mFatal;
-  bool mUseBOM;
   uint8_t mOffset;
   char mInitialBytes[3];
   bool mIsUTF16Family;
 
   /**
    * Validates provided encoding and throws an exception if invalid encoding.
    * If no encoding is provided then mEncoding is default initialised to "utf-8".
    *
@@ -112,18 +110,17 @@ private:
    *                     exception or not.
    * @return aRv         EncodingError exception else null.
    */
   void Init(const nsAString& aEncoding,
             const TextDecoderOptions& aFatal,
             ErrorResult& aRv);
 
   // Internal helper functions.
-  void CreateDecoder(ErrorResult& aRv);
-  void ResetDecoder(bool aResetOffset = true);
+  void ResetDecoder();
   void HandleBOM(const char*& aData, uint32_t& aLength,
                  const TextDecodeOptions& aOptions,
                  nsAString& aOutString, ErrorResult& aRv);
   void FeedBytes(const char* aBytes, nsAString* aOutString = nullptr);
 };
 
 } // dom
 } // mozilla
--- a/dom/encoding/test/test_BOMEncoding.js
+++ b/dom/encoding/test/test_BOMEncoding.js
@@ -60,28 +60,27 @@ function testMoreBOMEncoding() {
                   msg: "test decoder invalid BOM encoding for utf-16be fatal."});
 
   testBOMCharset({encoding: "utf-16be", data: data, expected: "\ufffe" + expectedString,
                   msg: "test decoder invalid BOM encoding for utf-16be."});
 
   // Testing user provided encoding is UTF-16LE & bom encoding is utf-16be
   var dataUTF16 = [0xFE, 0xFF, 0x22, 0x00, 0x12, 0x04, 0x41, 0x04, 0x35, 0x04, 0x20, 0x00, 0x41, 0x04, 0x47, 0x04, 0x30, 0x04, 0x41, 0x04, 0x42, 0x04, 0x3B, 0x04, 0x38, 0x04, 0x32, 0x04, 0x4B, 0x04, 0x35, 0x04, 0x20, 0x00, 0x41, 0x04, 0x35, 0x04, 0x3C, 0x04, 0x4C, 0x04, 0x38, 0x04, 0x20, 0x00, 0x3F, 0x04, 0x3E, 0x04, 0x45, 0x04, 0x3E, 0x04, 0x36, 0x04, 0x38, 0x04, 0x20, 0x00, 0x34, 0x04, 0x40, 0x04, 0x43, 0x04, 0x33, 0x04, 0x20, 0x00, 0x3D, 0x04, 0x30, 0x04, 0x20, 0x00, 0x34, 0x04, 0x40, 0x04, 0x43, 0x04, 0x33, 0x04, 0x30, 0x04, 0x2C, 0x00, 0x20, 0x00, 0x3A, 0x04, 0x30, 0x04, 0x36, 0x04, 0x34, 0x04, 0x30, 0x04, 0x4F, 0x04, 0x20, 0x00, 0x3D, 0x04, 0x35, 0x04, 0x41, 0x04, 0x47, 0x04, 0x30, 0x04, 0x41, 0x04, 0x42, 0x04, 0x3B, 0x04, 0x38, 0x04, 0x32, 0x04, 0x30, 0x04, 0x4F, 0x04, 0x20, 0x00, 0x41, 0x04, 0x35, 0x04, 0x3C, 0x04, 0x4C, 0x04, 0x4F, 0x04, 0x20, 0x00, 0x3D, 0x04, 0x35, 0x04, 0x41, 0x04, 0x47, 0x04, 0x30, 0x04, 0x41, 0x04, 0x42, 0x04, 0x3B, 0x04, 0x38, 0x04, 0x32, 0x04, 0x30, 0x04, 0x20, 0x00, 0x3F, 0x04, 0x3E, 0x04, 0x2D, 0x00, 0x41, 0x04, 0x32, 0x04, 0x3E, 0x04, 0x35, 0x04, 0x3C, 0x04, 0x43, 0x04, 0x2E, 0x00, 0x22, 0x00];
   testBOMCharset({encoding: "utf-16le", fatal: true, data: dataUTF16, expected: "\ufffe" + expectedString,
-                  msg: "test decoder invalid BOM encoding for utf-16 fatal."});
+                  msg: "test decoder invalid BOM encoding for utf-16le fatal."});
 
   testBOMCharset({encoding: "utf-16le", data: dataUTF16, expected: "\ufffe" + expectedString,
-                  msg: "test decoder invalid BOM encoding for utf-16."});
+                  msg: "test decoder invalid BOM encoding for utf-16le."});
 
   // Testing user provided encoding is UTF-16 & bom encoding is utf-16be
-  data = [0xFE, 0xFF, 0x00, 0x22, 0x04, 0x12, 0x04, 0x41, 0x04, 0x35, 0x00, 0x20, 0x04, 0x41, 0x04, 0x47, 0x04, 0x30, 0x04, 0x41, 0x04, 0x42, 0x04, 0x3B, 0x04, 0x38, 0x04, 0x32, 0x04, 0x4B, 0x04, 0x35, 0x00, 0x20, 0x04, 0x41, 0x04, 0x35, 0x04, 0x3C, 0x04, 0x4C, 0x04, 0x38, 0x00, 0x20, 0x04, 0x3F, 0x04, 0x3E, 0x04, 0x45, 0x04, 0x3E, 0x04, 0x36, 0x04, 0x38, 0x00, 0x20, 0x04, 0x34, 0x04, 0x40, 0x04, 0x43, 0x04, 0x33, 0x00, 0x20, 0x04, 0x3D, 0x04, 0x30, 0x00, 0x20, 0x04, 0x34, 0x04, 0x40, 0x04, 0x43, 0x04, 0x33, 0x04, 0x30, 0x00, 0x2C, 0x00, 0x20, 0x04, 0x3A, 0x04, 0x30, 0x04, 0x36, 0x04, 0x34, 0x04, 0x30, 0x04, 0x4F, 0x00, 0x20, 0x04, 0x3D, 0x04, 0x35, 0x04, 0x41, 0x04, 0x47, 0x04, 0x30, 0x04, 0x41, 0x04, 0x42, 0x04, 0x3B, 0x04, 0x38, 0x04, 0x32, 0x04, 0x30, 0x04, 0x4F, 0x00, 0x20, 0x04, 0x41, 0x04, 0x35, 0x04, 0x3C, 0x04, 0x4C, 0x04, 0x4F, 0x00, 0x20, 0x04, 0x3D, 0x04, 0x35, 0x04, 0x41, 0x04, 0x47, 0x04, 0x30, 0x04, 0x41, 0x04, 0x42, 0x04, 0x3B, 0x04, 0x38, 0x04, 0x32, 0x04, 0x30, 0x00, 0x20, 0x04, 0x3F, 0x04, 0x3E, 0x00, 0x2D, 0x04, 0x41, 0x04, 0x32, 0x04, 0x3E, 0x04, 0x35, 0x04, 0x3C, 0x04, 0x43, 0x00, 0x2E, 0x00, 0x22];
-  testBOMCharset({encoding: "utf-16", fatal: true, data: data, expected: expectedString,
-                  msg: "test decoder BOM encoding for utf-16 fatal."});
+  testBOMCharset({encoding: "utf-16", fatal: true, data: dataUTF16, expected: "\ufffe" + expectedString,
+                  msg: "test decoder invalid BOM encoding for utf-16 fatal."});
 
-  testBOMCharset({encoding: "utf-16", data: data, expected: expectedString,
-                  msg: "test decoder BOM encoding for utf-16."});
+  testBOMCharset({encoding: "utf-16", data: dataUTF16, expected: "\ufffe" + expectedString,
+                  msg: "test decoder invalid BOM encoding for utf-16."});
 
   // Testing user provided encoding is UTF-16 & bom encoding is utf-16le
   dataUTF16 = [0xFF, 0xFE, 0x22, 0x00, 0x12, 0x04, 0x41, 0x04, 0x35, 0x04, 0x20, 0x00, 0x41, 0x04, 0x47, 0x04, 0x30, 0x04, 0x41, 0x04, 0x42, 0x04, 0x3B, 0x04, 0x38, 0x04, 0x32, 0x04, 0x4B, 0x04, 0x35, 0x04, 0x20, 0x00, 0x41, 0x04, 0x35, 0x04, 0x3C, 0x04, 0x4C, 0x04, 0x38, 0x04, 0x20, 0x00, 0x3F, 0x04, 0x3E, 0x04, 0x45, 0x04, 0x3E, 0x04, 0x36, 0x04, 0x38, 0x04, 0x20, 0x00, 0x34, 0x04, 0x40, 0x04, 0x43, 0x04, 0x33, 0x04, 0x20, 0x00, 0x3D, 0x04, 0x30, 0x04, 0x20, 0x00, 0x34, 0x04, 0x40, 0x04, 0x43, 0x04, 0x33, 0x04, 0x30, 0x04, 0x2C, 0x00, 0x20, 0x00, 0x3A, 0x04, 0x30, 0x04, 0x36, 0x04, 0x34, 0x04, 0x30, 0x04, 0x4F, 0x04, 0x20, 0x00, 0x3D, 0x04, 0x35, 0x04, 0x41, 0x04, 0x47, 0x04, 0x30, 0x04, 0x41, 0x04, 0x42, 0x04, 0x3B, 0x04, 0x38, 0x04, 0x32, 0x04, 0x30, 0x04, 0x4F, 0x04, 0x20, 0x00, 0x41, 0x04, 0x35, 0x04, 0x3C, 0x04, 0x4C, 0x04, 0x4F, 0x04, 0x20, 0x00, 0x3D, 0x04, 0x35, 0x04, 0x41, 0x04, 0x47, 0x04, 0x30, 0x04, 0x41, 0x04, 0x42, 0x04, 0x3B, 0x04, 0x38, 0x04, 0x32, 0x04, 0x30, 0x04, 0x20, 0x00, 0x3F, 0x04, 0x3E, 0x04, 0x2D, 0x00, 0x41, 0x04, 0x32, 0x04, 0x3E, 0x04, 0x35, 0x04, 0x3C, 0x04, 0x43, 0x04, 0x2E, 0x00, 0x22, 0x00];
   testBOMCharset({encoding: "utf-16", fatal: true, data: dataUTF16, expected: expectedString,
                   msg: "test decoder BOM encoding for utf-16 fatal."});
 
   testBOMCharset({encoding: "utf-16", data: dataUTF16, expected: expectedString,
                   msg: "test decoder BOM encoding for utf-16."});
--- a/dom/encoding/test/test_TextDecoder.js
+++ b/dom/encoding/test/test_TextDecoder.js
@@ -179,20 +179,20 @@ function testDecodeStreamCompositions() 
     {encoding: "utf-8", input: [0xC2,0x80], expected: ["","\x80"]},
     {encoding: "utf-8", input: [0xEF,0xBB,0xBF,0xC2,0x80], expected: ["","","","","\x80"]},
     {encoding: "utf-16", input: [0x01,0x00], expected: ["","\x01"]},
     {encoding: "utf-16", input: [0x01,0x00,0x03,0x02], expected: ["","\x01","","\u0203"]},
     {encoding: "utf-16", input: [0xFF,0xFE], expected: ["",""]},
     {encoding: "utf-16", input: [0xFF,0xFE,0x01,0x00], expected: ["","","","\x01"]},
     {encoding: "utf-16", input: [0xFF,0xFE,0xFF,0xFE], expected: ["","","","\uFEFF"]},
     {encoding: "utf-16", input: [0xFF,0xFE,0xFE,0xFF], expected: ["","","","\uFFFE"]},
-    {encoding: "utf-16", input: [0xFE,0xFF], expected: ["",""]},
-    {encoding: "utf-16", input: [0xFE,0xFF,0x01,0x00], expected: ["","","","\u0100"]},
-    {encoding: "utf-16", input: [0xFE,0xFF,0xFF,0xFE], expected: ["","","","\uFFFE"]},
-    {encoding: "utf-16", input: [0xFE,0xFF,0xFE,0xFF], expected: ["","","","\uFEFF"]},
+    {encoding: "utf-16", input: [0xFE,0xFF], expected: ["","\uFFFE"]},
+    {encoding: "utf-16", input: [0xFE,0xFF,0x01,0x00], expected: ["","\uFFFE","","\x01"]},
+    {encoding: "utf-16", input: [0xFE,0xFF,0xFF,0xFE], expected: ["","\uFFFE","","\uFEFF"]},
+    {encoding: "utf-16", input: [0xFE,0xFF,0xFE,0xFF], expected: ["","\uFFFE","","\uFFFE"]},
     {encoding: "utf-16le", input: [0x01,0x00], expected: ["","\x01"]},
     {encoding: "utf-16le", input: [0x01,0x00,0x03,0x02], expected: ["","\x01","","\u0203"]},
     {encoding: "utf-16le", input: [0xFF,0xFE,0x01,0x00], expected: ["","","","\x01"]},
     {encoding: "utf-16le", input: [0xFE,0xFF,0x01,0x00], expected: ["","\uFFFE","","\x01"]},
     {encoding: "utf-16be", input: [0x01,0x00], expected: ["","\u0100"]},
     {encoding: "utf-16be", input: [0x01,0x00,0x03,0x02], expected: ["","\u0100","","\u0302"]},
     {encoding: "utf-16be", input: [0xFF,0xFE,0x01,0x00], expected: ["","\uFFFE","","\u0100"]},
     {encoding: "utf-16be", input: [0xFE,0xFF,0x01,0x00], expected: ["","","","\u0100"]},
--- a/dom/encoding/test/unit/test_singlebytes.js
+++ b/dom/encoding/test/unit/test_singlebytes.js
@@ -268,17 +268,16 @@ test(
 
     var string = "z\xA2\u6C34\uD834\uDD1E\uDBFF\uDFFD"; // z, cent, CJK water, G-Clef, Private-use character
 
     // Basic cases
     equal(TextDecoder('utf-8').decode(new Uint8Array(utf8)), string);
     equal(TextDecoder('utf-16le').decode(new Uint8Array(utf16le)), string);
     equal(TextDecoder('utf-16be').decode(new Uint8Array(utf16be)), string);
     equal(TextDecoder('utf-16').decode(new Uint8Array(utf16le)), string);
-    equal(TextDecoder('utf-16').decode(new Uint8Array(utf16be)), string);
 
     /*
     // TODO: New API?
     // Verify that BOM wins
     equal(stringEncoding.decode(new Uint8Array(utf8), 'utf-16le'), string);
     equal(stringEncoding.decode(new Uint8Array(utf8), 'utf-16be'), string);
     equal(stringEncoding.decode(new Uint8Array(utf16le), 'utf-8'), string);
     equal(stringEncoding.decode(new Uint8Array(utf16le), 'utf-16be'), string);