Bug 638379 - Part 1: Implement kOnError_Recover to the UTF-8 decoder. r=smontagu
author Masatoshi Kimura <VYV03354@nifty.ne.jp>
Mon, 10 Dec 2012 09:10:28 -0500
changeset 115517 78b0678417f94d4d0af041228f510f50d9349873
parent 115516 21d667b49f3aade2a0fec41f8ac632f208c76bae
child 115518 f2d8e0807127f1a4a8437de9d8180d74adb1051b
push id 24015
push user emorley@mozilla.com
push date Tue, 11 Dec 2012 15:51:15 +0000
treeherder mozilla-central@87f8165c5a0b [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers smontagu
bugs 638379
milestone 20.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 638379 - Part 1: Implement kOnError_Recover to the UTF-8 decoder. r=smontagu
intl/locale/public/nsCharsetAlias.h
intl/uconv/src/nsScriptableUConv.cpp
intl/uconv/src/nsUTF8ToUnicode.cpp
--- a/intl/locale/public/nsCharsetAlias.h
+++ b/intl/locale/public/nsCharsetAlias.h
@@ -5,19 +5,21 @@
 
 #ifndef nsCharsetAlias_h___
 #define nsCharsetAlias_h___
 
 #include "nscore.h"
 #include "nsStringGlue.h"
 
 class nsCharsetConverterManager;
+class nsScriptableUnicodeConverter;
 
 class nsCharsetAlias
 {
    friend class nsCharsetConverterManager;
+   friend class nsScriptableUnicodeConverter;
    static nsresult GetPreferredInternal(const nsACString& aAlias, nsACString& aResult);
 public:
    static nsresult GetPreferred(const nsACString& aAlias, nsACString& aResult);
    static nsresult Equals(const nsACString& aCharset1, const nsACString& aCharset2, bool* aResult);
 };
 
 #endif /* nsCharsetAlias_h___ */
--- a/intl/uconv/src/nsScriptableUConv.cpp
+++ b/intl/uconv/src/nsScriptableUConv.cpp
@@ -8,16 +8,17 @@
 #include "nsReadableUtils.h"
 #include "nsIServiceManager.h"
 #include "nsICharsetConverterManager.h"
 #include "nsIScriptableUConv.h"
 #include "nsScriptableUConv.h"
 #include "nsIStringStream.h"
 #include "nsCRT.h"
 #include "nsComponentManagerUtils.h"
+#include "nsCharsetAlias.h"
 
 static int32_t          gInstanceCount = 0;
 
 /* Implementation file */
 NS_IMPL_ISUPPORTS1(nsScriptableUnicodeConverter, nsIScriptableUnicodeConverter)
 
 nsScriptableUnicodeConverter::nsScriptableUnicodeConverter()
 : mIsInternal(false)
@@ -252,28 +253,45 @@ nsScriptableUnicodeConverter::SetIsInter
 
 nsresult
 nsScriptableUnicodeConverter::InitConverter()
 {
   nsresult rv = NS_OK;
   mEncoder = nullptr;
 
   nsCOMPtr<nsICharsetConverterManager> ccm = do_GetService(NS_CHARSETCONVERTERMANAGER_CONTRACTID, &rv);
+  if (NS_FAILED(rv) || !ccm) {
+    return rv;
+  }
 
-  if (NS_SUCCEEDED(rv) && ccm) {
-    // get charset atom due to getting unicode converter
-    
-    // get an unicode converter
-    rv = ccm->GetUnicodeEncoder(mCharset.get(), getter_AddRefs(mEncoder));
-    if(NS_SUCCEEDED(rv)) {
-      rv = mEncoder->SetOutputErrorBehavior(nsIUnicodeEncoder::kOnError_Replace, nullptr, (PRUnichar)'?');
-      if(NS_SUCCEEDED(rv)) {
-        rv = mIsInternal ?
-          ccm->GetUnicodeDecoderInternal(mCharset.get(),
-                                         getter_AddRefs(mDecoder)) :
-          ccm->GetUnicodeDecoder(mCharset.get(),
-                                 getter_AddRefs(mDecoder));
-      }
-    }
+  // get a Unicode encoder
+  rv = ccm->GetUnicodeEncoder(mCharset.get(), getter_AddRefs(mEncoder));
+  if (NS_FAILED(rv)) {
+    return rv;
+  }
+
+  rv = mEncoder->SetOutputErrorBehavior(nsIUnicodeEncoder::kOnError_Replace, nullptr, (PRUnichar)'?');
+  if (NS_FAILED(rv)) {
+    return rv;
+  }
+
+  nsAutoCString charset;
+  rv = mIsInternal ? nsCharsetAlias::GetPreferredInternal(mCharset, charset)
+                   : nsCharsetAlias::GetPreferred(mCharset, charset);
+  if (NS_FAILED(rv)) {
+    return rv;
+  }
+
+  rv = ccm->GetUnicodeDecoderRaw(charset.get(), getter_AddRefs(mDecoder));
+  if (NS_FAILED(rv)) {
+    return rv;
+  }
+
+  // The UTF-8 decoder used to throw regardless of the error behavior.
+  // Simulate the old behavior for compatibility with legacy callers
+  // (including addons). If callers want control over the behavior,
+  // they should switch to TextDecoder.
+  if (charset.EqualsLiteral("UTF-8")) {
+    mDecoder->SetInputErrorBehavior(nsIUnicodeDecoder::kOnError_Signal);
   }
 
   return rv ;
 }
--- a/intl/uconv/src/nsUTF8ToUnicode.cpp
+++ b/intl/uconv/src/nsUTF8ToUnicode.cpp
@@ -183,22 +183,21 @@ NS_IMETHODIMP nsUTF8ToUnicode::Convert(c
 
   PRUnichar *out, *outend;
   outend = aDest + aDestLen;
 
   nsresult res = NS_OK; // conversion result
 
   out = aDest;
   if (mState == 0xFF) {
-    // Emit supplementary character left over from previous iteration. If the
-    // buffer size is insufficient, treat it as an illegal character.
+    // Emit supplementary character left over from previous iteration. It is
+    // the caller's responsibility to provide a sufficient buffer.
     if (aDestLen < 2) {
-      NS_ERROR("Output buffer insufficient to hold supplementary character");
-      mState = 0;
-      return NS_ERROR_ILLEGAL_INPUT;
+      *aSrcLength = *aDestLength = 0;
+      return NS_OK_UDEC_MOREOUTPUT;
     }
     out = EmitSurrogatePair(mUcs4, out);
     mUcs4 = 0;
     mState = 0;
     mBytes = 1;
     mFirst = false;
   }
 
@@ -220,18 +219,22 @@ NS_IMETHODIMP nsUTF8ToUnicode::Convert(c
       // multi-octet sequence.
       if (c < 0x80) {  // 00..7F
         int32_t max_loops = NS_MIN(inend - in, outend - out);
         Convert_ascii_run(in, out, max_loops);
         --in; // match the rest of the cases
         mBytes = 1;
       } else if (c < 0xC2) {  // C0/C1
         // Overlong 2 octet sequence
-        res = NS_ERROR_ILLEGAL_INPUT;
-        break;
+        if (mErrBehavior == kOnError_Signal) {
+          res = NS_ERROR_ILLEGAL_INPUT;
+          break;
+        }
+        *out++ = UCS2_REPLACEMENT_CHAR;
+        mFirst = false;
       } else if (c < 0xE0) {  // C2..DF
         // First octet of 2 octet sequence
         mUcs4 = c;
         mUcs4 = (mUcs4 & 0x1F) << 6;
         mState = 1;
         mBytes = 2;
       } else if (c < 0xF0) {  // E0..EF
         // First octet of 3 octet sequence
@@ -243,40 +246,50 @@ NS_IMETHODIMP nsUTF8ToUnicode::Convert(c
         // First octet of 4 octet sequence
         mUcs4 = c;
         mUcs4 = (mUcs4 & 0x07) << 18;
         mState = 3;
         mBytes = 4;
       } else {  // F5..FF
         /* Current octet is neither in the US-ASCII range nor a legal first
          * octet of a multi-octet sequence.
-         *
-         * Return an error condition. Caller is responsible for flushing and
-         * refilling the buffer and resetting state.
          */
-        res = NS_ERROR_ILLEGAL_INPUT;
-        break;
+        if (mErrBehavior == kOnError_Signal) {
+          /* Return an error condition. Caller is responsible for flushing and
+           * refilling the buffer and resetting state.
+           */
+          res = NS_ERROR_ILLEGAL_INPUT;
+          break;
+        }
+        *out++ = UCS2_REPLACEMENT_CHAR;
+        mFirst = false;
       }
     } else {
       // When mState is non-zero, we expect a continuation of the multi-octet
       // sequence
       if (0x80 == (0xC0 & c)) {
         if (mState > 1) {
           // If we are here, all possibilities are:
           // mState == 2 && mBytes == 3 ||
           // mState == 2 && mBytes == 4 ||
           // mState == 3 && mBytes == 4
           if (mBytes == 3 && (!mUcs4 && c < 0xA0 ||  // E0 80..9F
                               mUcs4 == 0xD000 && c > 0x9F) ||  // ED A0..BF
               mState == 3 && (!mUcs4 && c < 0x90 ||  // F0 80..8F
                               mUcs4 == 0x100000 && c > 0x8F)) {  // F4 90..BF
             // illegal sequences or sequences converted into illegal ranges.
             in--;
-            res = NS_ERROR_ILLEGAL_INPUT;
-            break;
+            if (mErrBehavior == kOnError_Signal) {
+              res = NS_ERROR_ILLEGAL_INPUT;
+              break;
+            }
+            *out++ = UCS2_REPLACEMENT_CHAR;
+            mState = 0;
+            mFirst = false;
+            continue;
           }
         }
 
         // Legal continuation.
         uint32_t shift = (mState - 1) * 6;
         uint32_t tmp = c;
         tmp = (tmp & 0x0000003FL) << shift;
         mUcs4 |= tmp;
@@ -310,18 +323,23 @@ NS_IMETHODIMP nsUTF8ToUnicode::Convert(c
       } else {
         /* ((0xC0 & c != 0x80) && (mState != 0))
          * 
          * Incomplete multi-octet sequence. Unconsume this
          * octet and return an error condition. Caller is responsible
          * for flushing and refilling the buffer and resetting state.
          */
         in--;
-        res = NS_ERROR_ILLEGAL_INPUT;
-        break;
+        if (mErrBehavior == kOnError_Signal) {
+          res = NS_ERROR_ILLEGAL_INPUT;
+          break;
+        }
+        *out++ = UCS2_REPLACEMENT_CHAR;
+        mState = 0;
+        mFirst = false;
       }
     }
   }
 
   // output not finished, output buffer too short
   if ((NS_OK == res) && (in < inend) && (out >= outend))
     res = NS_OK_UDEC_MOREOUTPUT;