Bug 421576 - Unpaired surrogate handled wrongly (Acid3 #68). acid3++ r=dbaron
author Jeff Walden <jwalden@mit.edu>
Mon, 02 Jun 2008 21:29:00 -0400
changeset 15210 645e72edd5c5aad193d7129a5d739511816f70e6
parent 15209 65860f70c8cec4959b7781b2e244de635e1f5051
child 15211 c5b6d415d822328376500147eaa066ff90ae7cc7
push id unknown
push user unknown
push date unknown
reviewers dbaron
bugs 421576
milestone 1.9.1a1pre
Bug 421576 - Unpaired surrogate handled wrongly (Acid3 #68). acid3++ r=dbaron
layout/xul/base/src/nsMenuBarFrame.cpp
xpcom/string/public/nsUTF8Utils.h
xpcom/tests/Makefile.in
xpcom/tests/TestEncoding.cpp
xpcom/tests/TestHarness.h
--- a/layout/xul/base/src/nsMenuBarFrame.cpp
+++ b/layout/xul/base/src/nsMenuBarFrame.cpp
@@ -239,20 +239,19 @@ nsMenuBarFrame::FindMenuWithShortcut(nsI
 
     // See if it's a menu item.
     if (nsXULPopupManager::IsValidMenuItem(PresContext(), current, PR_FALSE)) {
       // Get the shortcut attribute.
       nsAutoString shortcutKey;
       current->GetAttr(kNameSpaceID_None, nsGkAtoms::accesskey, shortcutKey);
       if (!shortcutKey.IsEmpty()) {
         ToLowerCase(shortcutKey);
-        nsAutoString::const_iterator start, end;
-        shortcutKey.BeginReading(start);
-        shortcutKey.EndReading(end);
-        PRUint32 ch = UTF16CharEnumerator::NextChar(start, end);
+        const PRUnichar* start = shortcutKey.BeginReading();
+        const PRUnichar* end = shortcutKey.EndReading();
+        PRUint32 ch = UTF16CharEnumerator::NextChar(&start, end);
         PRUint32 index = accessKeys.IndexOf(ch);
         if (index != accessKeys.NoIndex &&
             (foundIndex == kNotFound || index < foundIndex)) {
           foundMenu = currFrame;
           foundIndex = index;
         }
       }
     }
--- a/xpcom/string/public/nsUTF8Utils.h
+++ b/xpcom/string/public/nsUTF8Utils.h
@@ -330,23 +330,28 @@ public:
               *err = PR_FALSE;
             *buffer = p;
             return ucs4;
           }
         else
           {
             // Found a high surrogate followed by something other than
             // a low surrogate. Flag this as an error and return the
-            // Unicode replacement character 0xFFFD.
-
+            // Unicode replacement character 0xFFFD.  Note that the
+            // pointer to the next character points to the second 16-bit
+            // value, not beyond it, as per Unicode 5.0.0 Chapter 3 C10,
+            // only the first code unit of an illegal sequence must be
+            // treated as an illegally terminated code unit sequence
+            // (also Chapter 3 D91, "isolated [not paired and ill-formed]
+            // UTF-16 code units in the range D800..DFFF are ill-formed").
             NS_WARNING("got a High Surrogate but no low surrogate");
 
             if (err)
               *err = PR_TRUE;
-            *buffer = p;
+            *buffer = p - 1;
             return 0xFFFD;
           }
       }
     else // U+DC00 - U+DFFF
       {
         // DC00- DFFF - Low Surrogate
 
         // Found a low surrogate w/o a preceeding high surrogate. Flag
@@ -359,101 +364,16 @@ public:
         *buffer = p;
         return 0xFFFD;
       }
 
     if (err)
       *err = PR_TRUE;
     return 0;
   }
-
-#ifdef MOZILLA_INTERNAL_API
-
-  static PRUint32 NextChar(nsAString::const_iterator& iter,
-                           const nsAString::const_iterator& end,
-                           PRBool *err = nsnull)
-  {
-    if (iter == end)
-      {
-        if (err)
-          *err = PR_TRUE;
-
-        return 0;
-      }
-
-    PRUnichar c = *iter++;
-
-    if (!IS_SURROGATE(c)) // U+0000 - U+D7FF,U+E000 - U+FFFF
-      {
-        if (err)
-          *err = PR_FALSE;
-        return c;
-      }
-    else if (NS_IS_HIGH_SURROGATE(c)) // U+D800 - U+DBFF
-      {
-        if (iter == end)
-          {
-            // Found a high surrogate the end of the buffer. Flag this
-            // as an error and return the Unicode replacement
-            // character 0xFFFD.
-
-            NS_WARNING("Unexpected end of buffer after high surrogate");
-
-            if (err)
-              *err = PR_TRUE;
-            return 0xFFFD;
-          }
-
-        // D800- DBFF - High Surrogate
-        PRUnichar h = c;
-
-        c = *iter++;
-
-        if (NS_IS_LOW_SURROGATE(c))
-          {
-            // DC00- DFFF - Low Surrogate
-            // N = (H - D800) *400 + 10000 + ( L - DC00 )
-            PRUint32 ucs4 = SURROGATE_TO_UCS4(h, c);
-            if (err)
-              *err = PR_FALSE;
-            return ucs4;
-          }
-        else
-          {
-            // Found a high surrogate followed by something other than
-            // a low surrogate. Flag this as an error and return the
-            // Unicode replacement character 0xFFFD.
-
-            NS_WARNING("got a High Surrogate but no low surrogate");
-
-            if (err)
-              *err = PR_TRUE;
-            return 0xFFFD;
-          }
-      }
-    else // U+DC00 - U+DFFF
-      {
-        // DC00- DFFF - Low Surrogate
-
-        // Found a low surrogate w/o a preceeding high surrogate. Flag
-        // this as an error and return the Unicode replacement
-        // character 0xFFFD.
-
-        NS_WARNING("got a low Surrogate but no high surrogate");
-
-        if (err)
-          *err = PR_TRUE;
-        return 0xFFFD;
-      }
-
-    if (err)
-      *err = PR_TRUE;
-    return 0;
-  }
-#endif // MOZILLA_INTERNAL_API
 };
 
 
 /**
  * A character sink (see |copy_string| in nsAlgorithm.h) for converting
  * UTF-8 to UTF-16
  */
 class ConvertUTF8toUTF16
@@ -682,16 +602,25 @@ class ConvertUTF16toUTF8
                   {
                     // Treat broken characters as the Unicode
                     // replacement character 0xFFFD (0xEFBFBD in
                     // UTF-8)
                     *out++ = 0xEF;
                     *out++ = 0xBF;
                     *out++ = 0xBD;
 
+                    // The pointer to the next character points to the second
+                    // 16-bit value, not beyond it, as per Unicode 5.0.0
+                    // Chapter 3 C10, only the first code unit of an illegal
+                    // sequence must be treated as an illegally terminated
+                    // code unit sequence (also Chapter 3 D91, "isolated [not
+                    // paired and ill-formed] UTF-16 code units in the range
+                    // D800..DFFF are ill-formed").
+                    p--;
+
                     NS_WARNING("got a High Surrogate but no low surrogate");
                   }
               }
             else // U+DC00 - U+DFFF
               {
                 // Treat broken characters as the Unicode replacement
                 // character 0xFFFD (0xEFBFBD in UTF-8)
                 *out++ = 0xEF;
@@ -763,16 +692,25 @@ class CalculateUTF8Size
                   mSize += 4;
                 else
                   {
                     // Treat broken characters as the Unicode
                     // replacement character 0xFFFD (0xEFBFBD in
                     // UTF-8)
                     mSize += 3;
 
+                    // The next code unit is the second 16-bit value, not
+                    // the one beyond it, as per Unicode 5.0.0 Chapter 3 C10,
+                    // only the first code unit of an illegal sequence must
+                    // be treated as an illegally terminated code unit
+                    // sequence (also Chapter 3 D91, "isolated [not paired and
+                    // ill-formed] UTF-16 code units in the range D800..DFFF
+                    // are ill-formed").
+                    p--;
+
                     NS_WARNING("got a high Surrogate but no low surrogate");
                   }
               }
             else // U+DC00 - U+DFFF
               {
                 // Treat broken characters as the Unicode replacement
                 // character 0xFFFD (0xEFBFBD in UTF-8)
                 mSize += 3;
--- a/xpcom/tests/Makefile.in
+++ b/xpcom/tests/Makefile.in
@@ -83,16 +83,17 @@ CPPSRCS		= \
 
 ifndef MOZ_ENABLE_LIBXUL
 CPPSRCS += \
 		TestArray.cpp \
 		TestTArray.cpp \
 		TestAtoms.cpp \
 		TestAutoLock.cpp \
 		TestCRT.cpp \
+		TestEncoding.cpp \
 		TestPermanentAtoms.cpp \
 		TestPipes.cpp \
 		TestThreads.cpp \
 		TestThreadPool.cpp \
 		TestXPIDLString.cpp \
 		TestDeque.cpp \
 		TestStrings.cpp \
 		TestStorageStream.cpp \
@@ -141,16 +142,17 @@ CPP_UNIT_TESTS = \
   TestTextFormatter \
   $(NULL)
 
 ifndef MOZ_ENABLE_LIBXUL
 CPP_UNIT_TESTS += \
   TestArray \
   TestAutoLock \
   TestCRT \
+  TestEncoding \
   TestExpirationTracker \
   TestPipes \
   TestProxies \
   TestThreads \
   TestThreadPool \
   TestXPIDLString \
   TestDeque \
   TestStrings \
new file mode 100644
--- /dev/null
+++ b/xpcom/tests/TestEncoding.cpp
@@ -0,0 +1,232 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* ***** BEGIN LICENSE BLOCK *****
+ * Version: MPL 1.1/GPL 2.0/LGPL 2.1
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * The Original Code is mozilla.org code.
+ *
+ * The Initial Developer of the Original Code is
+ * Jeff Walden <jwalden+code@mit.edu>.
+ * Portions created by the Initial Developer are Copyright (C) 2008
+ * the Initial Developer. All Rights Reserved.
+ *
+ * Contributor(s):
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * either of the GNU General Public License Version 2 or later (the "GPL"),
+ * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+ * in which case the provisions of the GPL or the LGPL are applicable instead
+ * of those above. If you wish to allow use of your version of this file only
+ * under the terms of either the GPL or the LGPL, and not to allow others to
+ * use your version of this file under the terms of the MPL, indicate your
+ * decision by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL or the LGPL. If you do not delete
+ * the provisions above, a recipient may use your version of this file under
+ * the terms of any one of the MPL, the GPL or the LGPL.
+ *
+ * ***** END LICENSE BLOCK ***** */
+
+#include "TestHarness.h"
+
+nsresult TestGoodSurrogatePair()
+{
+  // When this string is decoded, the surrogate pair is U+10302 and the rest of
+  // the string is specified by indexes 2 onward.
+  const PRUnichar goodPairData[] = {  0xD800, 0xDF02, 0x65, 0x78, 0x0 };
+  nsDependentString goodPair16(goodPairData);
+
+  PRUint32 byteCount = 0;
+  char* goodPair8 = ToNewUTF8String(goodPair16, &byteCount);
+  if (!goodPair8)
+  {
+    fail("out of memory creating goodPair8");
+    return NS_ERROR_OUT_OF_MEMORY;
+  }
+
+  // Written as a string literal so bytes >= 0x80 need no int-to-char narrowing.
+  const char expected8[] = "\xF0\x90\x8C\x82" "ex";
+  nsDependentCString expected(expected8);
+
+  // Checks are chained so goodPair8 is freed on every exit path.  The last one
+  // uses CompareUTF8toUTF16, a different code path from the conversion above,
+  // to make sure UTF-16 enumeration stays in sync with UTF-8 enumeration.
+  nsresult rv = NS_ERROR_FAILURE;
+  if (byteCount != 6)
+  {
+    // PRUint32 is not necessarily unsigned long, which is what %lu expects.
+    fail("wrong number of bytes; expected 6, got %lu",
+         (unsigned long) byteCount);
+  }
+  else if (0 != memcmp(expected8, goodPair8, sizeof(expected8)))
+    fail("wrong translation to UTF8");
+  else if (0 != CompareUTF8toUTF16(expected, goodPair16))
+    fail("bad comparison between UTF-8 and equivalent UTF-16");
+  else
+  {
+    passed("TestGoodSurrogatePair");
+    rv = NS_OK;
+  }
+
+  NS_Free(goodPair8);
+  return rv;
+}
+
+nsresult TestBackwardsSurrogatePair()
+{
+  // When this string is decoded, the two surrogates are wrongly ordered and
+  // must each be interpreted as U+FFFD.
+  const PRUnichar backwardsPairData[] = { 0xDDDD, 0xD863, 0x65, 0x78, 0x0 };
+  nsDependentString backwardsPair16(backwardsPairData);
+
+  PRUint32 byteCount = 0;
+  char* backwardsPair8 = ToNewUTF8String(backwardsPair16, &byteCount);
+  if (!backwardsPair8)
+  {
+    fail("out of memory creating backwardsPair8");
+    return NS_ERROR_OUT_OF_MEMORY;
+  }
+
+  // Two encoded U+FFFD sequences followed by "ex".  Written as a string
+  // literal so bytes >= 0x80 need no int-to-char narrowing.
+  const char expected8[] = "\xEF\xBF\xBD" "\xEF\xBF\xBD" "ex";
+  nsDependentCString expected(expected8);
+
+  // Checks are chained so backwardsPair8 is freed on every exit path.  The
+  // last one uses CompareUTF8toUTF16, a different code path from the conversion
+  // above, to make sure UTF-16 enumeration stays in sync with UTF-8 enumeration.
+  nsresult rv = NS_ERROR_FAILURE;
+  if (byteCount != 8)
+  {
+    // PRUint32 is not necessarily unsigned long, which is what %lu expects.
+    fail("wrong number of bytes; expected 8, got %lu",
+         (unsigned long) byteCount);
+  }
+  else if (0 != memcmp(expected8, backwardsPair8, sizeof(expected8)))
+    fail("wrong translation to UTF8");
+  else if (0 != CompareUTF8toUTF16(expected, backwardsPair16))
+    fail("bad comparison between UTF-8 and malformed but equivalent UTF-16");
+  else
+  {
+    passed("TestBackwardsSurrogatePair");
+    rv = NS_OK;
+  }
+
+  NS_Free(backwardsPair8);
+  return rv;
+}
+
+nsresult TestMalformedUTF16OrphanHighSurrogate()
+{
+  // When this string is decoded, the high surrogate should be replaced and the
+  // rest of the string is specified by indexes 1 onward.
+  const PRUnichar highSurrogateData[] = { 0xD863, 0x74, 0x65, 0x78, 0x74, 0x0 };
+  nsDependentString highSurrogate16(highSurrogateData);
+
+  PRUint32 byteCount = 0;
+  char* highSurrogate8 = ToNewUTF8String(highSurrogate16, &byteCount);
+  if (!highSurrogate8)
+  {
+    fail("out of memory creating highSurrogate8");
+    return NS_ERROR_OUT_OF_MEMORY;
+  }
+
+  // Written as a string literal so bytes >= 0x80 need no int-to-char narrowing.
+  const char expected8[] = "\xEF\xBF\xBD" "text";
+  nsDependentCString expected(expected8);
+
+  // Checks are chained so highSurrogate8 is freed on every exit path.  The
+  // last one uses CompareUTF8toUTF16, a different code path from the conversion
+  // above, to make sure UTF-16 enumeration stays in sync with UTF-8 enumeration.
+  nsresult rv = NS_ERROR_FAILURE;
+  if (byteCount != 7)
+  {
+    // PRUint32 is not necessarily unsigned long, which is what %lu expects.
+    fail("wrong number of bytes; expected 7, got %lu",
+         (unsigned long) byteCount);
+  }
+  else if (0 != memcmp(expected8, highSurrogate8, sizeof(expected8)))
+    fail("wrong translation to UTF8");
+  else if (0 != CompareUTF8toUTF16(expected, highSurrogate16))
+    fail("bad comparison between UTF-8 and malformed but equivalent UTF-16");
+  else
+  {
+    passed("TestMalformedUTF16OrphanHighSurrogate");
+    rv = NS_OK;
+  }
+
+  NS_Free(highSurrogate8);
+  return rv;
+}
+
+nsresult TestMalformedUTF16OrphanLowSurrogate()
+{
+  // When this string is decoded, the low surrogate should be replaced and the
+  // rest of the string is specified by indexes 1 onward.
+  const PRUnichar lowSurrogateData[] = { 0xDDDD, 0x74, 0x65, 0x78, 0x74, 0x0 };
+  nsDependentString lowSurrogate16(lowSurrogateData);
+
+  PRUint32 byteCount = 0;
+  char* lowSurrogate8 = ToNewUTF8String(lowSurrogate16, &byteCount);
+  if (!lowSurrogate8)
+  {
+    fail("out of memory creating lowSurrogate8");
+    return NS_ERROR_OUT_OF_MEMORY;
+  }
+
+  // Written as a string literal so bytes >= 0x80 need no int-to-char narrowing.
+  const char expected8[] = "\xEF\xBF\xBD" "text";
+  nsDependentCString expected(expected8);
+
+  // Checks are chained so lowSurrogate8 is freed on every exit path.  The
+  // last one uses CompareUTF8toUTF16, a different code path from the conversion
+  // above, to make sure UTF-16 enumeration stays in sync with UTF-8 enumeration.
+  nsresult rv = NS_ERROR_FAILURE;
+  if (byteCount != 7)
+  {
+    // PRUint32 is not necessarily unsigned long, which is what %lu expects.
+    fail("wrong number of bytes; expected 7, got %lu",
+         (unsigned long) byteCount);
+  }
+  else if (0 != memcmp(expected8, lowSurrogate8, sizeof(expected8)))
+    fail("wrong translation to UTF8");
+  else if (0 != CompareUTF8toUTF16(expected, lowSurrogate16))
+    fail("bad comparison between UTF-8 and malformed but equivalent UTF-16");
+  else
+  {
+    passed("TestMalformedUTF16OrphanLowSurrogate");
+    rv = NS_OK;
+  }
+
+  NS_Free(lowSurrogate8);
+  return rv;
+}
+
+
+int main(int argc, char** argv)
+{
+  ScopedXPCOM xpcom("TestEncoding");
+  if (xpcom.failed())
+    return 1;
+
+  // Run every test even after a failure so one run reports all failures.
+  typedef nsresult (*TestFunc)();
+  static const TestFunc tests[] = {
+    TestGoodSurrogatePair, TestBackwardsSurrogatePair,
+    TestMalformedUTF16OrphanHighSurrogate, TestMalformedUTF16OrphanLowSurrogate
+  };
+
+  int rv = 0;
+  for (unsigned int i = 0; i < sizeof(tests) / sizeof(tests[0]); ++i)
+    if (NS_FAILED(tests[i]()))
+      rv = 1;
+  return rv;
+}
--- a/xpcom/tests/TestHarness.h
+++ b/xpcom/tests/TestHarness.h
@@ -32,55 +32,86 @@
  * and other provisions required by the GPL or the LGPL. If you do not delete
  * the provisions above, a recipient may use your version of this file under
  * the terms of any one of the MPL, the GPL or the LGPL.
  *
  * ***** END LICENSE BLOCK ***** */
 
 /*
  * Test harness for XPCOM objects, providing a scoped XPCOM initializer,
- * nsCOMPtr, nsRefPtr, do_CreateInstance, and stdio.h/stdlib.h.
+ * nsCOMPtr, nsRefPtr, do_CreateInstance, do_GetService, ns(Auto|C|)String,
+ * and stdio.h/stdlib.h.
  */
 
 #ifndef TestHarness_h__
 #define TestHarness_h__
 
-#include "nsIServiceManager.h"
 #include "nsComponentManagerUtils.h"
+#include "nsServiceManagerUtils.h"
 #include "nsCOMPtr.h"
 #include "nsAutoPtr.h"
+#include "nsStringGlue.h"
 #include <stdio.h>
 #include <stdlib.h>
+#include <stdarg.h>
+
+/**
+ * Prints the given printf-style failure message and arguments to stdout,
+ * prepending "FAIL " for the benefit of the test harness and appending "\n" to
+ * eliminate having to type it at each call site.  Declared inline because this
+ * header defines it and may be included by more than one translation unit.
+ */
+inline void fail(const char* msg, ...)
+{
+  va_list ap;
+  printf("FAIL ");
+
+  va_start(ap, msg);
+  vprintf(msg, ap);
+  va_end(ap);
+
+  putchar('\n');
+}
+
+/**
+ * Prints the given test name followed by " PASSED!\n"; call it at the end of a
+ * successful test function.  Inline so multiple includers don't clash at link.
+ */
+inline void passed(const char* test)
+{
+  printf("%s PASSED!\n", test);
+}
+
 
 class ScopedXPCOM
 {
   public:
     ScopedXPCOM(const char* testName,
                 nsIDirectoryServiceProvider *dirSvcProvider = NULL)
     {
       mTestName = testName;
       printf("Running %s tests...\n", mTestName);
 
       nsresult rv = NS_InitXPCOM2(&mServMgr, NULL, dirSvcProvider);
       if (NS_FAILED(rv))
       {
-        printf("FAIL NS_InitXPCOM2 returned failure code %x\n", rv);
+        fail("NS_InitXPCOM2 returned failure code 0x%x", rv);
         mServMgr = NULL;
       }
     }
 
     ~ScopedXPCOM()
     {
       if (mServMgr)
       {
         NS_RELEASE(mServMgr);
         nsresult rv = NS_ShutdownXPCOM(NULL);
         if (NS_FAILED(rv))
         {
-          printf("FAIL XPCOM shutdown failed with code %x\n", rv);
+          fail("XPCOM shutdown failed with code 0x%x", rv);
           exit(1);
         }
       }
 
       printf("Finished running %s tests.\n", mTestName);
     }
 
     PRBool failed()