Bug 511859 - Reject overlong UTF-8 encodings of code points rather than converting them to U+FFFD or the code point they supposedly describe. r=jwalden
author Masahiro Yamada <masa141421356@gmail.com>
Fri, 13 Nov 2009 03:36:54 +0100
changeset 32906 e42c563313a0
parent 32905 72110846033e
child 32907 64c0d07ef768
push id 674
push user rsayre@mozilla.com
push date 2009-11-13 03:39 +0000
reviewers jwalden
bugs 511859
milestone 1.9.2b3pre
Bug 511859 - Reject overlong UTF-8 encodings of code points rather than converting them to U+FFFD or the code point they supposedly describe. r=jwalden
js/src/jsstr.cpp
--- a/js/src/jsstr.cpp
+++ b/js/src/jsstr.cpp
@@ -43,16 +43,18 @@
  *
  * In order to avoid unnecessary js_LockGCThing/js_UnlockGCThing calls, these
  * native methods store strings (possibly newborn) converted from their 'this'
  * parameter and arguments on the stack: 'this' conversions at argv[-1], arg
  * conversions at their index (argv[0], argv[1]).  This is a legitimate method
  * of rooting things that might lose their newborn root due to subsequent GC
  * allocations in the same native method.
  */
+#define __STDC_LIMIT_MACROS
+
 #include <stdlib.h>
 #include <string.h>
 #include "jstypes.h"
 #include "jsstdint.h"
 #include "jsutil.h" /* Added by JSIFY */
 #include "jshash.h" /* Added by JSIFY */
 #include "jsprf.h"
 #include "jsapi.h"
@@ -292,16 +294,18 @@ static JSBool
 str_decodeURI_Component(JSContext *cx, uintN argc, jsval *vp);
 
 static JSBool
 str_encodeURI(JSContext *cx, uintN argc, jsval *vp);
 
 static JSBool
 str_encodeURI_Component(JSContext *cx, uintN argc, jsval *vp);
 
+static const uint32 OVERLONG_UTF8 = UINT32_MAX;
+
 static uint32
 Utf8ToOneUcs4Char(const uint8 *utf8Buffer, int utf8Length);
 
 /*
  * Contributions from the String class to the set of methods defined for the
  * global object.  escape and unescape used to be defined in the Mocha library,
  * but as ECMA decided to spec them, they've been moved to the core engine
  * and made ECMA-compliant.  (Incomplete escapes are interpreted as literal
@@ -3638,17 +3642,17 @@ js_InflateStringToBuffer(JSContext *cx, 
     while (srclen) {
         v = (uint8) *src;
         n = 1;
         if (v & 0x80) {
             while (v & (0x80 >> n))
                 n++;
             if (n > srclen)
                 goto bufferTooSmall;
-            if (n == 1 || n > 6)
+            if (n == 1 || n > 4)
                 goto badCharacter;
             for (j = 1; j < n; j++) {
                 if ((src[j] & 0xC0) != 0x80)
                     goto badCharacter;
             }
             v = Utf8ToOneUcs4Char((uint8 *)src, n);
             if (v >= 0x10000) {
                 v -= 0x10000;
@@ -5157,17 +5161,17 @@ static JSBool
 Encode(JSContext *cx, JSString *str, const jschar *unescapedSet,
        const jschar *unescapedSet2, jsval *rval)
 {
     size_t length, j, k, L;
     JSCharBuffer cb(cx);
     const jschar *chars;
     jschar c, c2;
     uint32 v;
-    uint8 utf8buf[6];
+    uint8 utf8buf[4];
     jschar hexBuf[4];
     static const char HexDigits[] = "0123456789ABCDEF"; /* NB: uppercase */
 
     str->getCharsAndLength(chars, length);
     if (length == 0) {
         *rval = STRING_TO_JSVAL(cx->runtime->emptyString);
         return JS_TRUE;
     }
@@ -5221,17 +5225,17 @@ static JSBool
 Decode(JSContext *cx, JSString *str, const jschar *reservedSet, jsval *rval)
 {
     size_t length, start, k;
     JSCharBuffer cb(cx);
     const jschar *chars;
     jschar c, H;
     uint32 v;
     jsuint B;
-    uint8 octets[6];
+    uint8 octets[4];
     intN j, n;
 
     str->getCharsAndLength(chars, length);
     if (length == 0) {
         *rval = STRING_TO_JSVAL(cx->runtime->emptyString);
         return JS_TRUE;
     }
 
@@ -5247,17 +5251,17 @@ Decode(JSContext *cx, JSString *str, con
             B = JS7_UNHEX(chars[k+1]) * 16 + JS7_UNHEX(chars[k+2]);
             k += 2;
             if (!(B & 0x80)) {
                 c = (jschar)B;
             } else {
                 n = 1;
                 while (B & (0x80 >> n))
                     n++;
-                if (n == 1 || n > 6)
+                if (n == 1 || n > 4)
                     goto report_bad_uri;
                 octets[0] = (uint8)B;
                 if (k + 3 * (n - 1) >= length)
                     goto report_bad_uri;
                 for (j = 1; j < n; j++) {
                     k++;
                     if (chars[k] != '%')
                         goto report_bad_uri;
@@ -5346,24 +5350,24 @@ str_encodeURI_Component(JSContext *cx, u
     str = ArgToRootedString(cx, argc, vp, 0);
     if (!str)
         return JS_FALSE;
     return Encode(cx, str, js_uriUnescaped_ucstr, NULL, vp);
 }
 
 /*
  * Convert one UCS-4 char and write it into a UTF-8 buffer, which must be at
- * least 6 bytes long.  Return the number of UTF-8 bytes of data written.
+ * least 4 bytes long.  Return the number of UTF-8 bytes of data written.
  */
 int
 js_OneUcs4ToUtf8Char(uint8 *utf8Buffer, uint32 ucs4Char)
 {
     int utf8Length = 1;
 
-    JS_ASSERT(ucs4Char <= 0x7FFFFFFF);
+    JS_ASSERT(ucs4Char <= 0x10FFFF);
     if (ucs4Char < 0x80) {
         *utf8Buffer = (uint8)ucs4Char;
     } else {
         int i;
         uint32 a = ucs4Char >> 11;
         utf8Length = 2;
         while (a) {
             a >>= 5;
@@ -5386,34 +5390,35 @@ js_OneUcs4ToUtf8Char(uint8 *utf8Buffer, 
  */
 static uint32
 Utf8ToOneUcs4Char(const uint8 *utf8Buffer, int utf8Length)
 {
     uint32 ucs4Char;
     uint32 minucs4Char;
     /* from Unicode 3.1, non-shortest form is illegal */
     static const uint32 minucs4Table[] = {
-        0x00000080, 0x00000800, 0x0001000, 0x0020000, 0x0400000
+        0x00000080, 0x00000800, 0x00010000
     };
 
-    JS_ASSERT(utf8Length >= 1 && utf8Length <= 6);
+    JS_ASSERT(utf8Length >= 1 && utf8Length <= 4);
     if (utf8Length == 1) {
         ucs4Char = *utf8Buffer;
         JS_ASSERT(!(ucs4Char & 0x80));
     } else {
         JS_ASSERT((*utf8Buffer & (0x100 - (1 << (7-utf8Length)))) ==
                   (0x100 - (1 << (8-utf8Length))));
         ucs4Char = *utf8Buffer++ & ((1<<(7-utf8Length))-1);
         minucs4Char = minucs4Table[utf8Length-2];
         while (--utf8Length) {
             JS_ASSERT((*utf8Buffer & 0xC0) == 0x80);
             ucs4Char = ucs4Char<<6 | (*utf8Buffer++ & 0x3F);
         }
-        if (ucs4Char < minucs4Char ||
-            ucs4Char == 0xFFFE || ucs4Char == 0xFFFF) {
+        if (JS_UNLIKELY(ucs4Char < minucs4Char)) {
+            ucs4Char = OVERLONG_UTF8;
+        } else if (ucs4Char == 0xFFFE || ucs4Char == 0xFFFF) {
             ucs4Char = 0xFFFD;
         }
     }
     return ucs4Char;
 }
 
 #if defined DEBUG || defined JS_DUMP_PROPTREE_STATS