Bug 1135377 - Part 8: Disallow extended pattern in RegExp with unicode flag. r=till, f=anba
author Tooru Fujisawa <arai_a@mac.com>
Fri, 07 Aug 2015 08:12:51 +0900
changeset 277063 3bd3d3ed5fe4ffc440f6e9ae2d2161481034daae
parent 277062 93113598bb3b40b26f8c141b11522ffbcaf29ceb
child 277064 5b0ced0284a9e13609fad337abc442a290ee30de
push id 16724
push user cbook@mozilla.com
push date Mon, 21 Dec 2015 11:00:52 +0000
treeherder fx-team@3f3f0361567c [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers till
bugs 1135377
milestone 46.0a1
Bug 1135377 - Part 8: Disallow extended pattern in RegExp with unicode flag. r=till, f=anba
js/src/irregexp/RegExpParser.cpp
js/src/js.msg
js/src/tests/ecma_6/RegExp/unicode-disallow-extended.js
js/src/vm/Xdr.h
--- a/js/src/irregexp/RegExpParser.cpp
+++ b/js/src/irregexp/RegExpParser.cpp
@@ -400,16 +400,41 @@ RangeAtom(LifoAlloc* alloc, char16_t fro
 }
 
 static inline RegExpTree*
 NegativeLookahead(LifoAlloc* alloc, char16_t from, char16_t to)
 {
     return alloc->newInfallible<RegExpLookahead>(RangeAtom(alloc, from, to), false, 0, 0);
 }
 
+static bool
+IsSyntaxCharacter(widechar c)
+{
+  switch (c) {
+    case '^':
+    case '$':
+    case '\\':
+    case '.':
+    case '*':
+    case '+':
+    case '?':
+    case '(':
+    case ')':
+    case '[':
+    case ']':
+    case '{':
+    case '}':
+    case '|':
+    case '/':
+      return true;
+    default:
+      return false;
+  }
+}
+
 #ifdef DEBUG
 // Currently only used in an assert.kASSERT.
 static bool
 IsSpecialClassEscape(widechar c)
 {
   switch (c) {
     case 'd': case 'D':
     case 's': case 'S':
@@ -454,47 +479,67 @@ RegExpParser<CharT>::ParseClassCharacter
       case 'v':
         Advance();
         *code = '\v';
         return true;
       case 'c': {
         widechar controlLetter = Next();
         widechar letter = controlLetter & ~('A' ^ 'a');
         // For compatibility with JSC, inside a character class
-        // we also accept digits and underscore as control characters.
-        if ((controlLetter >= '0' && controlLetter <= '9') ||
-            controlLetter == '_' ||
-            (letter >= 'A' && letter <= 'Z')) {
+        // we also accept digits and underscore as control characters,
+        // but only in non-unicode mode
+        if ((!unicode_ &&
+             ((controlLetter >= '0' && controlLetter <= '9') ||
+              controlLetter == '_')) ||
+            (letter >= 'A' && letter <= 'Z'))
+        {
             Advance(2);
             // Control letters mapped to ASCII control characters in the range
             // 0x00-0x1f.
             *code = controlLetter & 0x1f;
             return true;
         }
+        if (unicode_) {
+            ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
+            return false;
+        }
         // We match JSC in reading the backslash as a literal
         // character instead of as starting an escape.
         *code = '\\';
         return true;
       }
       case '0': case '1': case '2': case '3': case '4': case '5':
       case '6': case '7':
-        // For compatibility, we interpret a decimal escape that isn't
-        // a back reference (and therefore either \0 or not valid according
-        // to the specification) as a 1..3 digit octal character code.
+        if (unicode_) {
+            if (current() == '0') {
+                *code = 0;
+                return true;
+            }
+            ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
+            return false;
+        }
+        // For compatibility, outside of unicode mode, we interpret a decimal
+        // escape that isn't a back reference (and therefore either \0 or not
+        // valid according to the specification) as a 1..3 digit octal
+        // character code.
         *code = ParseOctalLiteral();
         return true;
       case 'x': {
         Advance();
         size_t value;
         if (ParseHexEscape(2, &value)) {
             *code = value;
             return true;
         }
+        if (unicode_) {
+            ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
+            return false;
+        }
         // If \x is not followed by a two-digit hexadecimal, treat it
-        // as an identity escape.
+        // as an identity escape in non-unicode mode.
         *code = 'x';
         return true;
       }
       case 'u': {
         Advance();
         size_t value;
         if (unicode_) {
             if (current() == '{') {
@@ -522,20 +567,24 @@ RegExpParser<CharT>::ParseClassCharacter
             return true;
         }
         // If \u is not followed by a four-digit or braced hexadecimal, treat it
         // as an identity escape.
         *code = 'u';
         return true;
       }
       default: {
-        // Extended identity escape. We accept any character that hasn't
-        // been matched by a more specific case, not just the subset required
-        // by the ECMAScript specification.
+        // Extended identity escape (non-unicode only). We accept any character
+        // that hasn't been matched by a more specific case, not just the subset
+        // required by the ECMAScript specification.
         widechar result = current();
+        if (unicode_ && result != '-' && !IsSyntaxCharacter(result)) {
+            ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
+            return false;
+        }
         Advance();
         *code = result;
         return true;
       }
     }
     return true;
 }
 
@@ -1383,16 +1432,18 @@ RegExpParser<CharT>::ParseDisjunction()
                            group_type == NEGATIVE_LOOKAHEAD);
                 bool is_positive = (group_type == POSITIVE_LOOKAHEAD);
                 body = alloc->newInfallible<RegExpLookahead>(body,
                                                    is_positive,
                                                    end_capture_index - capture_index,
                                                    capture_index);
             }
             builder->AddAtom(body);
+            if (unicode_ && (group_type == POSITIVE_LOOKAHEAD || group_type == NEGATIVE_LOOKAHEAD))
+                continue;
             // For compatability with JSC and ES3, we allow quantifiers after
             // lookaheads, and break in all cases.
             break;
           }
           case '|': {
             Advance();
             builder->NewAlternative();
             continue;
@@ -1522,26 +1573,36 @@ RegExpParser<CharT>::ParseDisjunction()
                     if (capture == nullptr) {
                         builder->AddEmpty();
                         break;
                     }
                     RegExpTree* atom = alloc->newInfallible<RegExpBackReference>(capture);
                     builder->AddAtom(atom);
                     break;
                 }
+                if (unicode_)
+                    return ReportError(JSMSG_BACK_REF_OUT_OF_RANGE);
                 widechar first_digit = Next();
                 if (first_digit == '8' || first_digit == '9') {
                     // Treat as identity escape
                     builder->AddCharacter(first_digit);
                     Advance(2);
                     break;
                 }
               }
                 // FALLTHROUGH
               case '0': {
+                if (unicode_) {
+                    Advance(2);
+                    if (IsDecimalDigit(current()))
+                        return ReportError(JSMSG_INVALID_DECIMAL_ESCAPE);
+                    builder->AddCharacter(0);
+                    break;
+                }
+
                 Advance();
                 size_t octal = ParseOctalLiteral();
                 builder->AddCharacter(octal);
                 break;
               }
                 // ControlEscape :: one of
                 //   f n r t v
               case 'f':
@@ -1566,16 +1627,18 @@ RegExpParser<CharT>::ParseDisjunction()
                 break;
               case 'c': {
                 Advance();
                 widechar controlLetter = Next();
                 // Special case if it is an ASCII letter.
                 // Convert lower case letters to uppercase.
                 widechar letter = controlLetter & ~('a' ^ 'A');
                 if (letter < 'A' || 'Z' < letter) {
+                    if (unicode_)
+                        return ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
                     // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
                     // This is outside the specification. We match JSC in
                     // reading the backslash as a literal character instead
                     // of as starting an escape.
                     builder->AddCharacter('\\');
                 } else {
                     Advance(2);
                     builder->AddCharacter(controlLetter & 0x1f);
@@ -1583,16 +1646,18 @@ RegExpParser<CharT>::ParseDisjunction()
                 break;
               }
               case 'x': {
                 Advance(2);
                 size_t value;
                 if (ParseHexEscape(2, &value)) {
                     builder->AddCharacter(value);
                 } else {
+                    if (unicode_)
+                        return ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
                     builder->AddCharacter('x');
                 }
                 break;
               }
               case 'u': {
                 Advance(2);
                 size_t value;
                 if (unicode_) {
@@ -1634,38 +1699,46 @@ RegExpParser<CharT>::ParseDisjunction()
                     builder->AddCharacter(value);
                 } else {
                     builder->AddCharacter('u');
                 }
                 break;
               }
               default:
                 // Identity escape.
+                if (unicode_ && !IsSyntaxCharacter(Next()))
+                    return ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
                 builder->AddCharacter(Next());
                 Advance(2);
                 break;
             }
             break;
           case '{': {
+            if (unicode_)
+                return ReportError(JSMSG_RAW_BRACE_IN_REGEP);
             int dummy;
             if (ParseIntervalQuantifier(&dummy, &dummy))
                 return ReportError(JSMSG_NOTHING_TO_REPEAT);
             // fallthrough
           }
           default:
             if (unicode_) {
                 char16_t lead, trail;
                 if (ParseRawSurrogatePair(&lead, &trail)) {
                     builder->AddAtom(SurrogatePairAtom(alloc, lead, trail, ignore_case_));
                 } else {
                     widechar c = current();
                     if (unicode::IsLeadSurrogate(c))
                         builder->AddAtom(LeadSurrogateAtom(alloc, c));
                     else if (unicode::IsTrailSurrogate(c))
                         builder->AddAtom(TrailSurrogateAtom(alloc, c));
+                    else if (c == ']')
+                        return ReportError(JSMSG_RAW_BRACKET_IN_REGEP);
+                    else if (c == '}')
+                        return ReportError(JSMSG_RAW_BRACE_IN_REGEP);
                     else
                         builder->AddCharacter(c);
                     Advance();
                 }
                 break;
             }
             builder->AddCharacter(current());
             Advance();
--- a/js/src/js.msg
+++ b/js/src/js.msg
@@ -438,27 +438,31 @@ MSG_DEF(JSMSG_INVALID_DIGITS_VALUE,    1
 MSG_DEF(JSMSG_INVALID_LANGUAGE_TAG,    1, JSEXN_RANGEERR, "invalid language tag: {0}")
 MSG_DEF(JSMSG_INVALID_LOCALES_ELEMENT, 0, JSEXN_TYPEERR, "invalid element in locales argument")
 MSG_DEF(JSMSG_INVALID_LOCALE_MATCHER,  1, JSEXN_RANGEERR, "invalid locale matcher in supportedLocalesOf(): {0}")
 MSG_DEF(JSMSG_INVALID_OPTION_VALUE,    2, JSEXN_RANGEERR, "invalid value {1} for option {0}")
 MSG_DEF(JSMSG_INVALID_TIME_ZONE,       1, JSEXN_RANGEERR, "invalid time zone in DateTimeFormat(): {0}")
 MSG_DEF(JSMSG_UNDEFINED_CURRENCY,      0, JSEXN_TYPEERR, "undefined currency in NumberFormat() with currency style")
 
 // RegExp
+MSG_DEF(JSMSG_BACK_REF_OUT_OF_RANGE,   0, JSEXN_SYNTAXERR, "back reference out of range in regular expression")
 MSG_DEF(JSMSG_BAD_CLASS_RANGE,         0, JSEXN_SYNTAXERR, "invalid range in character class")
 MSG_DEF(JSMSG_DEPRECATED_REGEXP_MULTILINE, 0, JSEXN_SYNTAXERR, "RegExp.multiline is deprecated. Use m flag instead")
 MSG_DEF(JSMSG_ESCAPE_AT_END_OF_REGEXP, 0, JSEXN_SYNTAXERR, "\\ at end of pattern")
+MSG_DEF(JSMSG_INVALID_DECIMAL_ESCAPE, 0, JSEXN_SYNTAXERR, "invalid decimal escape in regular expression")
 MSG_DEF(JSMSG_INVALID_GROUP,           0, JSEXN_SYNTAXERR, "invalid regexp group")
 MSG_DEF(JSMSG_INVALID_IDENTITY_ESCAPE, 0, JSEXN_SYNTAXERR, "invalid identity escape in regular expression")
 MSG_DEF(JSMSG_INVALID_UNICODE_ESCAPE,  0, JSEXN_SYNTAXERR, "invalid unicode escape in regular expression")
 MSG_DEF(JSMSG_MISSING_PAREN,           0, JSEXN_SYNTAXERR, "unterminated parenthetical")
 MSG_DEF(JSMSG_NEWREGEXP_FLAGGED,       0, JSEXN_TYPEERR, "can't supply flags when constructing one RegExp from another")
 MSG_DEF(JSMSG_NOTHING_TO_REPEAT,       0, JSEXN_SYNTAXERR, "nothing to repeat")
 MSG_DEF(JSMSG_NUMBERS_OUT_OF_ORDER,    0, JSEXN_SYNTAXERR, "numbers out of order in {} quantifier.")
 MSG_DEF(JSMSG_RANGE_WITH_CLASS_ESCAPE, 0, JSEXN_SYNTAXERR, "character class escape cannot be used in class range in regular expression")
+MSG_DEF(JSMSG_RAW_BRACE_IN_REGEP,      0, JSEXN_SYNTAXERR, "raw brace is not allowed in regular expression with unicode flag")
+MSG_DEF(JSMSG_RAW_BRACKET_IN_REGEP,    0, JSEXN_SYNTAXERR, "raw bracket is not allowed in regular expression with unicode flag")
 MSG_DEF(JSMSG_TOO_MANY_PARENS,         0, JSEXN_INTERNALERR, "too many parentheses in regular expression")
 MSG_DEF(JSMSG_UNICODE_OVERFLOW,        0, JSEXN_SYNTAXERR, "unicode codepoint should not be greater than 0x10FFFF in regular expression")
 MSG_DEF(JSMSG_UNMATCHED_RIGHT_PAREN,   0, JSEXN_SYNTAXERR, "unmatched ) in regular expression")
 MSG_DEF(JSMSG_UNTERM_CLASS,            0, JSEXN_SYNTAXERR, "unterminated character class")
 
 // Self-hosting
 MSG_DEF(JSMSG_DEFAULT_LOCALE_ERROR,    0, JSEXN_ERR, "internal error getting the default locale")
 MSG_DEF(JSMSG_NO_SUCH_SELF_HOSTED_PROP,1, JSEXN_ERR, "No such property on self-hosted object: {0}")
new file mode 100644
--- /dev/null
+++ b/js/src/tests/ecma_6/RegExp/unicode-disallow-extended.js
@@ -0,0 +1,117 @@
+var BUGNUMBER = 1135377;
+var summary = "Implement RegExp unicode flag -- disallow extended patterns.";
+
+print(BUGNUMBER + ": " + summary);
+
+// IdentityEscape
+
+assertEqArray(/\^\$\\\.\*\+\?\(\)\[\]\{\}\|/u.exec("^$\\.*+?()[]{}|"),
+              ["^$\\.*+?()[]{}|"]);
+assertThrowsInstanceOf(() => eval(`/\\A/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\-/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\U{10}/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\U0000/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\uD83D\\U0000/u`), SyntaxError);
+
+assertEqArray(/[\^\$\\\.\*\+\?\(\)\[\]\{\}\|]+/u.exec("^$\\.*+?()[]{}|"),
+              ["^$\\.*+?()[]{}|"]);
+assertThrowsInstanceOf(() => eval(`/[\\A]/u`), SyntaxError);
+assertEqArray(/[A\-Z]+/u.exec("a-zABC"),
+              ["-"]);
+assertThrowsInstanceOf(() => eval(`/[\\U{10}]/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/[\\U0000]/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/[\\uD83D\\U0000]/u`), SyntaxError);
+
+// PatternCharacter
+assertThrowsInstanceOf(() => eval(`/{}/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/{/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/}/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/]/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/{0}/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/{1,}/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/{1,2}/u`), SyntaxError);
+
+// QuantifiableAssertion
+assertEqArray(/.B(?=A)/u.exec("cBaCBA"),
+              ["CB"]);
+assertEqArray(/.B(?!A)/u.exec("CBAcBa"),
+              ["cB"]);
+assertEqArray(/.B(?:A)/u.exec("cBaCBA"),
+              ["CBA"]);
+assertEqArray(/.B(A)/u.exec("cBaCBA"),
+              ["CBA", "A"]);
+
+assertThrowsInstanceOf(() => eval(`/.B(?=A)+/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/.B(?!A)+/u`), SyntaxError);
+assertEqArray(/.B(?:A)+/u.exec("cBaCBA"),
+              ["CBA"]);
+assertEqArray(/.B(A)+/u.exec("cBaCBA"),
+              ["CBA", "A"]);
+
+// ControlLetter
+assertEqArray(/\cA/u.exec("\u0001"),
+              ["\u0001"]);
+assertEqArray(/\cZ/u.exec("\u001a"),
+              ["\u001a"]);
+assertEqArray(/\ca/u.exec("\u0001"),
+              ["\u0001"]);
+assertEqArray(/\cz/u.exec("\u001a"),
+              ["\u001a"]);
+
+assertEqArray(/[\cA]/u.exec("\u0001"),
+              ["\u0001"]);
+assertEqArray(/[\cZ]/u.exec("\u001a"),
+              ["\u001a"]);
+assertEqArray(/[\ca]/u.exec("\u0001"),
+              ["\u0001"]);
+assertEqArray(/[\cz]/u.exec("\u001a"),
+              ["\u001a"]);
+
+assertThrowsInstanceOf(() => eval(`/\\c/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\c1/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\c_/u`), SyntaxError);
+
+assertThrowsInstanceOf(() => eval(`/[\\c]/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/[\\c1]/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/[\\c_]/u`), SyntaxError);
+
+// HexEscapeSequence
+assertThrowsInstanceOf(() => eval(`/\\x/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\x0/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\x1/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\x1G/u`), SyntaxError);
+
+// LegacyOctalEscapeSequence
+assertThrowsInstanceOf(() => eval(`/\\52/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\052/u`), SyntaxError);
+
+// DecimalEscape
+assertEqArray(/\0/u.exec("\0"),
+              ["\0"]);
+assertEqArray(/[\0]/u.exec("\0"),
+              ["\0"]);
+assertEqArray(/\0A/u.exec("\0A"),
+              ["\0A"]);
+assertEqArray(/\0G/u.exec("\0G"),
+              ["\0G"]);
+assertEqArray(/(A.)\1/u.exec("ABACABAB"),
+              ["ABAB", "AB"]);
+assertEqArray(/(A.)(B.)(C.)(D.)(E.)(F.)(G.)(H.)(I.)(J.)(K.)\10/u.exec("A1B2C3D4E5F6G7H8I9JaKbJa"),
+              ["A1B2C3D4E5F6G7H8I9JaKbJa", "A1", "B2", "C3", "D4", "E5", "F6", "G7", "H8", "I9", "Ja", "Kb"]);
+
+assertThrowsInstanceOf(() => eval(`/\\00/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\01/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\09/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\1/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\2/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\3/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\4/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\5/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\6/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\7/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\8/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\9/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\10/u`), SyntaxError);
+
+if (typeof reportCompare === "function")
+    reportCompare(true, true);
--- a/js/src/vm/Xdr.h
+++ b/js/src/vm/Xdr.h
@@ -24,21 +24,21 @@ namespace js {
  * versions.  If deserialization fails, the data should be invalidated if
  * possible.
  *
  * When you change this, run make_opcode_doc.py and copy the new output into
  * this wiki page:
  *
  *  https://developer.mozilla.org/en-US/docs/SpiderMonkey/Internals/Bytecode
  */
-static const uint32_t XDR_BYTECODE_VERSION_SUBTRAHEND = 336;
+static const uint32_t XDR_BYTECODE_VERSION_SUBTRAHEND = 337;
 static const uint32_t XDR_BYTECODE_VERSION =
     uint32_t(0xb973c0de - XDR_BYTECODE_VERSION_SUBTRAHEND);
 
-static_assert(JSErr_Limit == 425,
+static_assert(JSErr_Limit == 429,
               "GREETINGS, POTENTIAL SUBTRAHEND INCREMENTER! If you added or "
               "removed MSG_DEFs from js.msg, you should increment "
               "XDR_BYTECODE_VERSION_SUBTRAHEND and update this assertion's "
               "expected JSErr_Limit value.");
 
 class XDRBuffer {
   public:
     explicit XDRBuffer(JSContext* cx)