Bug 801487 - Remove encoding detection using BOM. r=sicking
author Masatoshi Kimura <VYV03354@nifty.ne.jp>
Tue, 06 Nov 2012 18:23:14 -0500
changeset 117134 6c91d0bc259e203a2adb9fbe803d73b587eaaa67
parent 117133 4aeebb3cacfa9bd7093cd4580ea40037e58af7ea
child 117135 3788795c4a18c2e3fa2b4beec9ccd67b45665fac
push id unknown
push user unknown
push date unknown
reviewers sicking
bugs 801487
milestone 19.0a1
Bug 801487 - Remove encoding detection using BOM. r=sicking
dom/encoding/TextDecoder.cpp
dom/encoding/TextDecoder.h
dom/encoding/test/test_BOMEncoding.js
dom/encoding/test/test_TextDecoder.js
dom/encoding/test/unit/test_singlebytes.js
--- a/dom/encoding/TextDecoder.cpp
+++ b/dom/encoding/TextDecoder.cpp
@@ -16,47 +16,31 @@ static const PRUnichar kReplacementChar 
 void
 TextDecoder::Init(const nsAString& aEncoding,
                   const TextDecoderOptions& aFatal,
                   ErrorResult& aRv)
 {
   nsAutoString label(aEncoding);
   EncodingUtils::TrimSpaceCharacters(label);
 
-  // If label is a case-insensitive match for "utf-16"
-  // then set the internal useBOM flag.
-  if (label.LowerCaseEqualsLiteral("utf-16")) {
-    mUseBOM = true;
-    mIsUTF16Family = true;
-    mEncoding = "utf-16le";
-    // If BOM is used, we can't determine the converter yet.
-    return;
-  }
-
   // Let encoding be the result of getting an encoding from label.
   // If encoding is failure, throw a TypeError.
   if (!EncodingUtils::FindEncodingForLabel(label, mEncoding)) {
     aRv.ThrowTypeError(MSG_ENCODING_NOT_SUPPORTED, &label);
     return;
   }
 
   mIsUTF16Family = !strcmp(mEncoding, "utf-16le") ||
                    !strcmp(mEncoding, "utf-16be");
 
   // If the constructor is called with an options argument,
   // and the fatal property of the dictionary is set,
   // set the internal fatal flag of the decoder object.
   mFatal = aFatal.fatal;
 
-  CreateDecoder(aRv);
-}
-
-void
-TextDecoder::CreateDecoder(ErrorResult& aRv)
-{
   // Create a decoder object for mEncoding.
   nsCOMPtr<nsICharsetConverterManager> ccm =
     do_GetService(NS_CHARSETCONVERTERMANAGER_CONTRACTID);
   if (!ccm) {
     aRv.Throw(NS_ERROR_UNEXPECTED);
     return;
   }
 
@@ -67,22 +51,20 @@ TextDecoder::CreateDecoder(ErrorResult& 
   }
 
   if (mFatal) {
     mDecoder->SetInputErrorBehavior(nsIUnicodeDecoder::kOnError_Signal);
   }
 }
 
 void
-TextDecoder::ResetDecoder(bool aResetOffset)
+TextDecoder::ResetDecoder()
 {
   mDecoder->Reset();
-  if (aResetOffset) {
-    mOffset = 0;
-  }
+  mOffset = 0;
 }
 
 void
 TextDecoder::Decode(const ArrayBufferView* aView,
                     const TextDecodeOptions& aOptions,
                     nsAString& aOutDecodedString,
                     ErrorResult& aRv)
 {
@@ -186,32 +168,17 @@ TextDecoder::HandleBOM(const char*& aDat
   aLength -= 2 - mOffset;
   mOffset = 2;
 
   const char* encoding = "";
   if (!EncodingUtils::IdentifyDataOffset(mInitialBytes, 2, encoding) ||
       strcmp(encoding, mEncoding)) {
     // If the stream doesn't start with BOM or the BOM doesn't match the
     // encoding, feed a BOM to workaround decoder's bug (bug 634541).
-    if (!mUseBOM) {
-      FeedBytes(!strcmp(mEncoding, "utf-16le") ? "\xFF\xFE" : "\xFE\xFF");
-    }
-  }
-  if (mUseBOM) {
-    // Select a decoder corresponding to the BOM.
-    if (!*encoding) {
-      encoding = "utf-16le";
-    }
-    // If the endian has not been changed, reuse the decoder.
-    if (mDecoder && !strcmp(encoding, mEncoding)) {
-      ResetDecoder(false);
-    } else {
-      mEncoding = encoding;
-      CreateDecoder(aRv);
-    }
+    FeedBytes(!strcmp(mEncoding, "utf-16le") ? "\xFF\xFE" : "\xFE\xFF");
   }
   FeedBytes(mInitialBytes, &aOutString);
 }
 
 void
 TextDecoder::FeedBytes(const char* aBytes, nsAString* aOutString)
 {
   PRUnichar buf[3];
@@ -229,17 +196,17 @@ TextDecoder::FeedBytes(const char* aByte
 void
 TextDecoder::GetEncoding(nsAString& aEncoding)
 {
   // Our utf-16 converter does not comply with the Encoding Standard.
   // As a result the utf-16le converter is used for the encoding label
   // "utf-16".
   // This workaround should not be exposed to the public API and so "utf-16"
   // is returned by GetEncoding() if the internal encoding name is "utf-16le".
-  if (mUseBOM || !strcmp(mEncoding, "utf-16le")) {
+  if (!strcmp(mEncoding, "utf-16le")) {
     aEncoding.AssignLiteral("utf-16");
     return;
   }
 
   aEncoding.AssignASCII(mEncoding);
 }
 
 NS_IMPL_CYCLE_COLLECTING_ADDREF(TextDecoder)
--- a/dom/encoding/TextDecoder.h
+++ b/dom/encoding/TextDecoder.h
@@ -36,18 +36,17 @@ public:
     txtDecoder->Init(aEncoding, aFatal, aRv);
     if (aRv.Failed()) {
       return nullptr;
     }
     return txtDecoder.forget();
   }
 
   TextDecoder(nsISupports* aGlobal)
-    : mGlobal(aGlobal)
-    , mFatal(false), mUseBOM(false), mOffset(0), mIsUTF16Family(false)
+    : mGlobal(aGlobal), mFatal(false), mOffset(0), mIsUTF16Family(false)
   {
     MOZ_ASSERT(aGlobal);
     SetIsDOMBinding();
   }
 
   virtual
   ~TextDecoder()
   {}
@@ -92,17 +91,16 @@ public:
               nsAString& aOutDecodedString,
               ErrorResult& aRv);
 
 private:
   const char* mEncoding;
   nsCOMPtr<nsIUnicodeDecoder> mDecoder;
   nsCOMPtr<nsISupports> mGlobal;
   bool mFatal;
-  bool mUseBOM;
   uint8_t mOffset;
   char mInitialBytes[3];
   bool mIsUTF16Family;
 
   /**
    * Validates provided encoding and throws an exception if invalid encoding.
    * If no encoding is provided then mEncoding is default initialised to "utf-8".
    *
@@ -112,18 +110,17 @@ private:
    *                     exception or not.
    * @return aRv         EncodingError exception else null.
    */
   void Init(const nsAString& aEncoding,
             const TextDecoderOptions& aFatal,
             ErrorResult& aRv);
 
   // Internal helper functions.
-  void CreateDecoder(ErrorResult& aRv);
-  void ResetDecoder(bool aResetOffset = true);
+  void ResetDecoder();
   void HandleBOM(const char*& aData, uint32_t& aLength,
                  const TextDecodeOptions& aOptions,
                  nsAString& aOutString, ErrorResult& aRv);
   void FeedBytes(const char* aBytes, nsAString* aOutString = nullptr);
 };
 
 } // dom
 } // mozilla
--- a/dom/encoding/test/test_BOMEncoding.js
+++ b/dom/encoding/test/test_BOMEncoding.js
@@ -60,28 +60,27 @@ function testMoreBOMEncoding() {
                   msg: "test decoder invalid BOM encoding for utf-16be fatal."});
 
   testBOMCharset({encoding: "utf-16be", data: data, expected: "\ufffe" + expectedString,
                   msg: "test decoder invalid BOM encoding for utf-16be."});
 
   // Testing user provided encoding is UTF-16LE & bom encoding is utf-16be
   var dataUTF16 = [0xFE, 0xFF, 0x22, 0x00, 0x12, 0x04, 0x41, 0x04, 0x35, 0x04, 0x20, 0x00, 0x41, 0x04, 0x47, 0x04, 0x30, 0x04, 0x41, 0x04, 0x42, 0x04, 0x3B, 0x04, 0x38, 0x04, 0x32, 0x04, 0x4B, 0x04, 0x35, 0x04, 0x20, 0x00, 0x41, 0x04, 0x35, 0x04, 0x3C, 0x04, 0x4C, 0x04, 0x38, 0x04, 0x20, 0x00, 0x3F, 0x04, 0x3E, 0x04, 0x45, 0x04, 0x3E, 0x04, 0x36, 0x04, 0x38, 0x04, 0x20, 0x00, 0x34, 0x04, 0x40, 0x04, 0x43, 0x04, 0x33, 0x04, 0x20, 0x00, 0x3D, 0x04, 0x30, 0x04, 0x20, 0x00, 0x34, 0x04, 0x40, 0x04, 0x43, 0x04, 0x33, 0x04, 0x30, 0x04, 0x2C, 0x00, 0x20, 0x00, 0x3A, 0x04, 0x30, 0x04, 0x36, 0x04, 0x34, 0x04, 0x30, 0x04, 0x4F, 0x04, 0x20, 0x00, 0x3D, 0x04, 0x35, 0x04, 0x41, 0x04, 0x47, 0x04, 0x30, 0x04, 0x41, 0x04, 0x42, 0x04, 0x3B, 0x04, 0x38, 0x04, 0x32, 0x04, 0x30, 0x04, 0x4F, 0x04, 0x20, 0x00, 0x41, 0x04, 0x35, 0x04, 0x3C, 0x04, 0x4C, 0x04, 0x4F, 0x04, 0x20, 0x00, 0x3D, 0x04, 0x35, 0x04, 0x41, 0x04, 0x47, 0x04, 0x30, 0x04, 0x41, 0x04, 0x42, 0x04, 0x3B, 0x04, 0x38, 0x04, 0x32, 0x04, 0x30, 0x04, 0x20, 0x00, 0x3F, 0x04, 0x3E, 0x04, 0x2D, 0x00, 0x41, 0x04, 0x32, 0x04, 0x3E, 0x04, 0x35, 0x04, 0x3C, 0x04, 0x43, 0x04, 0x2E, 0x00, 0x22, 0x00];
   testBOMCharset({encoding: "utf-16le", fatal: true, data: dataUTF16, expected: "\ufffe" + expectedString,
-                  msg: "test decoder invalid BOM encoding for utf-16 fatal."});
+                  msg: "test decoder invalid BOM encoding for utf-16le fatal."});
 
   testBOMCharset({encoding: "utf-16le", data: dataUTF16, expected: "\ufffe" + expectedString,
-                  msg: "test decoder invalid BOM encoding for utf-16."});
+                  msg: "test decoder invalid BOM encoding for utf-16le."});
 
   // Testing user provided encoding is UTF-16 & bom encoding is utf-16be
-  data = [0xFE, 0xFF, 0x00, 0x22, 0x04, 0x12, 0x04, 0x41, 0x04, 0x35, 0x00, 0x20, 0x04, 0x41, 0x04, 0x47, 0x04, 0x30, 0x04, 0x41, 0x04, 0x42, 0x04, 0x3B, 0x04, 0x38, 0x04, 0x32, 0x04, 0x4B, 0x04, 0x35, 0x00, 0x20, 0x04, 0x41, 0x04, 0x35, 0x04, 0x3C, 0x04, 0x4C, 0x04, 0x38, 0x00, 0x20, 0x04, 0x3F, 0x04, 0x3E, 0x04, 0x45, 0x04, 0x3E, 0x04, 0x36, 0x04, 0x38, 0x00, 0x20, 0x04, 0x34, 0x04, 0x40, 0x04, 0x43, 0x04, 0x33, 0x00, 0x20, 0x04, 0x3D, 0x04, 0x30, 0x00, 0x20, 0x04, 0x34, 0x04, 0x40, 0x04, 0x43, 0x04, 0x33, 0x04, 0x30, 0x00, 0x2C, 0x00, 0x20, 0x04, 0x3A, 0x04, 0x30, 0x04, 0x36, 0x04, 0x34, 0x04, 0x30, 0x04, 0x4F, 0x00, 0x20, 0x04, 0x3D, 0x04, 0x35, 0x04, 0x41, 0x04, 0x47, 0x04, 0x30, 0x04, 0x41, 0x04, 0x42, 0x04, 0x3B, 0x04, 0x38, 0x04, 0x32, 0x04, 0x30, 0x04, 0x4F, 0x00, 0x20, 0x04, 0x41, 0x04, 0x35, 0x04, 0x3C, 0x04, 0x4C, 0x04, 0x4F, 0x00, 0x20, 0x04, 0x3D, 0x04, 0x35, 0x04, 0x41, 0x04, 0x47, 0x04, 0x30, 0x04, 0x41, 0x04, 0x42, 0x04, 0x3B, 0x04, 0x38, 0x04, 0x32, 0x04, 0x30, 0x00, 0x20, 0x04, 0x3F, 0x04, 0x3E, 0x00, 0x2D, 0x04, 0x41, 0x04, 0x32, 0x04, 0x3E, 0x04, 0x35, 0x04, 0x3C, 0x04, 0x43, 0x00, 0x2E, 0x00, 0x22];
-  testBOMCharset({encoding: "utf-16", fatal: true, data: data, expected: expectedString,
-                  msg: "test decoder BOM encoding for utf-16 fatal."});
+  testBOMCharset({encoding: "utf-16", fatal: true, data: dataUTF16, expected: "\ufffe" + expectedString,
+                  msg: "test decoder invalid BOM encoding for utf-16 fatal."});
 
-  testBOMCharset({encoding: "utf-16", data: data, expected: expectedString,
-                  msg: "test decoder BOM encoding for utf-16."});
+  testBOMCharset({encoding: "utf-16", data: dataUTF16, expected: "\ufffe" + expectedString,
+                  msg: "test decoder invalid BOM encoding for utf-16."});
 
   // Testing user provided encoding is UTF-16 & bom encoding is utf-16le
   dataUTF16 = [0xFF, 0xFE, 0x22, 0x00, 0x12, 0x04, 0x41, 0x04, 0x35, 0x04, 0x20, 0x00, 0x41, 0x04, 0x47, 0x04, 0x30, 0x04, 0x41, 0x04, 0x42, 0x04, 0x3B, 0x04, 0x38, 0x04, 0x32, 0x04, 0x4B, 0x04, 0x35, 0x04, 0x20, 0x00, 0x41, 0x04, 0x35, 0x04, 0x3C, 0x04, 0x4C, 0x04, 0x38, 0x04, 0x20, 0x00, 0x3F, 0x04, 0x3E, 0x04, 0x45, 0x04, 0x3E, 0x04, 0x36, 0x04, 0x38, 0x04, 0x20, 0x00, 0x34, 0x04, 0x40, 0x04, 0x43, 0x04, 0x33, 0x04, 0x20, 0x00, 0x3D, 0x04, 0x30, 0x04, 0x20, 0x00, 0x34, 0x04, 0x40, 0x04, 0x43, 0x04, 0x33, 0x04, 0x30, 0x04, 0x2C, 0x00, 0x20, 0x00, 0x3A, 0x04, 0x30, 0x04, 0x36, 0x04, 0x34, 0x04, 0x30, 0x04, 0x4F, 0x04, 0x20, 0x00, 0x3D, 0x04, 0x35, 0x04, 0x41, 0x04, 0x47, 0x04, 0x30, 0x04, 0x41, 0x04, 0x42, 0x04, 0x3B, 0x04, 0x38, 0x04, 0x32, 0x04, 0x30, 0x04, 0x4F, 0x04, 0x20, 0x00, 0x41, 0x04, 0x35, 0x04, 0x3C, 0x04, 0x4C, 0x04, 0x4F, 0x04, 0x20, 0x00, 0x3D, 0x04, 0x35, 0x04, 0x41, 0x04, 0x47, 0x04, 0x30, 0x04, 0x41, 0x04, 0x42, 0x04, 0x3B, 0x04, 0x38, 0x04, 0x32, 0x04, 0x30, 0x04, 0x20, 0x00, 0x3F, 0x04, 0x3E, 0x04, 0x2D, 0x00, 0x41, 0x04, 0x32, 0x04, 0x3E, 0x04, 0x35, 0x04, 0x3C, 0x04, 0x43, 0x04, 0x2E, 0x00, 0x22, 0x00];
   testBOMCharset({encoding: "utf-16", fatal: true, data: dataUTF16, expected: expectedString,
                   msg: "test decoder BOM encoding for utf-16 fatal."});
 
   testBOMCharset({encoding: "utf-16", data: dataUTF16, expected: expectedString,
                   msg: "test decoder BOM encoding for utf-16."});
--- a/dom/encoding/test/test_TextDecoder.js
+++ b/dom/encoding/test/test_TextDecoder.js
@@ -179,20 +179,20 @@ function testDecodeStreamCompositions() 
     {encoding: "utf-8", input: [0xC2,0x80], expected: ["","\x80"]},
     {encoding: "utf-8", input: [0xEF,0xBB,0xBF,0xC2,0x80], expected: ["","","","","\x80"]},
     {encoding: "utf-16", input: [0x01,0x00], expected: ["","\x01"]},
     {encoding: "utf-16", input: [0x01,0x00,0x03,0x02], expected: ["","\x01","","\u0203"]},
     {encoding: "utf-16", input: [0xFF,0xFE], expected: ["",""]},
     {encoding: "utf-16", input: [0xFF,0xFE,0x01,0x00], expected: ["","","","\x01"]},
     {encoding: "utf-16", input: [0xFF,0xFE,0xFF,0xFE], expected: ["","","","\uFEFF"]},
     {encoding: "utf-16", input: [0xFF,0xFE,0xFE,0xFF], expected: ["","","","\uFFFE"]},
-    {encoding: "utf-16", input: [0xFE,0xFF], expected: ["",""]},
-    {encoding: "utf-16", input: [0xFE,0xFF,0x01,0x00], expected: ["","","","\u0100"]},
-    {encoding: "utf-16", input: [0xFE,0xFF,0xFF,0xFE], expected: ["","","","\uFFFE"]},
-    {encoding: "utf-16", input: [0xFE,0xFF,0xFE,0xFF], expected: ["","","","\uFEFF"]},
+    {encoding: "utf-16", input: [0xFE,0xFF], expected: ["","\uFFFE"]},
+    {encoding: "utf-16", input: [0xFE,0xFF,0x01,0x00], expected: ["","\uFFFE","","\x01"]},
+    {encoding: "utf-16", input: [0xFE,0xFF,0xFF,0xFE], expected: ["","\uFFFE","","\uFEFF"]},
+    {encoding: "utf-16", input: [0xFE,0xFF,0xFE,0xFF], expected: ["","\uFFFE","","\uFFFE"]},
     {encoding: "utf-16le", input: [0x01,0x00], expected: ["","\x01"]},
     {encoding: "utf-16le", input: [0x01,0x00,0x03,0x02], expected: ["","\x01","","\u0203"]},
     {encoding: "utf-16le", input: [0xFF,0xFE,0x01,0x00], expected: ["","","","\x01"]},
     {encoding: "utf-16le", input: [0xFE,0xFF,0x01,0x00], expected: ["","\uFFFE","","\x01"]},
     {encoding: "utf-16be", input: [0x01,0x00], expected: ["","\u0100"]},
     {encoding: "utf-16be", input: [0x01,0x00,0x03,0x02], expected: ["","\u0100","","\u0302"]},
     {encoding: "utf-16be", input: [0xFF,0xFE,0x01,0x00], expected: ["","\uFFFE","","\u0100"]},
     {encoding: "utf-16be", input: [0xFE,0xFF,0x01,0x00], expected: ["","","","\u0100"]},
--- a/dom/encoding/test/unit/test_singlebytes.js
+++ b/dom/encoding/test/unit/test_singlebytes.js
@@ -268,17 +268,16 @@ test(
 
     var string = "z\xA2\u6C34\uD834\uDD1E\uDBFF\uDFFD"; // z, cent, CJK water, G-Clef, Private-use character
 
     // Basic cases
     equal(TextDecoder('utf-8').decode(new Uint8Array(utf8)), string);
     equal(TextDecoder('utf-16le').decode(new Uint8Array(utf16le)), string);
     equal(TextDecoder('utf-16be').decode(new Uint8Array(utf16be)), string);
     equal(TextDecoder('utf-16').decode(new Uint8Array(utf16le)), string);
-    equal(TextDecoder('utf-16').decode(new Uint8Array(utf16be)), string);
 
     /*
     // TODO: New API?
     // Verify that BOM wins
     equal(stringEncoding.decode(new Uint8Array(utf8), 'utf-16le'), string);
     equal(stringEncoding.decode(new Uint8Array(utf8), 'utf-16be'), string);
     equal(stringEncoding.decode(new Uint8Array(utf16le), 'utf-8'), string);
     equal(stringEncoding.decode(new Uint8Array(utf16le), 'utf-16be'), string);