Bug 576837: Fix YARR character-class range parsing. (r=jwalden)
☠☠ backed out by f4444a398ec1 ☠ ☠
authorChris Leary <cdleary@mozilla.com>
Mon, 08 Nov 2010 18:08:32 -0800
changeset 57745 8ae5fce0f19b273d3b5248f5df4e177c76e2209e
parent 57744 ebffb2a709e5e07291fad9bcdf2f51b1f1a8d506
child 57746 f4444a398ec19777abce40eede9b1d12ca389f48
push id17032
push userrsayre@mozilla.com
push dateWed, 17 Nov 2010 21:55:39 +0000
treeherdermozilla-central@78a42f77bb90 [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersjwalden
bugs576837
milestone2.0b8pre
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 576837: Fix YARR character-class range parsing. (r=jwalden)
js/src/jit-test/tests/basic/bug576837-regexp.js
js/src/jsregexp.cpp
js/src/tests/ecma_3/RegExp/jstests.list
js/src/tests/ecma_3/RegExp/regress-375715-02.js
js/src/yarr/yarr/RegexParser.h
new file mode 100644
--- /dev/null
+++ b/js/src/jit-test/tests/basic/bug576837-regexp.js
@@ -0,0 +1,43 @@
+// Check that builtin character classes within ranges produce syntax
+// errors.
+
+function isRegExpSyntaxError(pattern) {
+    try {
+        var re = new RegExp(pattern);
+    } catch (e) {
+        if (e instanceof SyntaxError)
+            return true;
+    }
+    return false;
+}
+
+function testRangeSyntax(end1, end2, shouldFail) {
+    var makePattern = function(e1, e2) {
+        var pattern = '[' + e1 + '-' + e2 + ']';
+        print(uneval(pattern));
+        return pattern;
+    };
+    assertEq(isRegExpSyntaxError(makePattern(end1, end2)), shouldFail);
+    assertEq(isRegExpSyntaxError(makePattern(end2, end1)), shouldFail);
+}
+
+function checkRangeValid(end1, end2) {
+    testRangeSyntax(end1, end2, false);
+}
+
+function checkRangeInvalid(end1, end2) {
+    testRangeSyntax(end1, end2, true);
+}
+
+checkRangeInvalid('C', '\\s');
+checkRangeInvalid('C', '\\d');
+checkRangeInvalid('C', '\\W');
+checkRangeInvalid('C', '\\W');
+checkRangeValid('C', '');
+checkRangeValid('C', 'C');
+checkRangeInvalid('\\s', '\\s');
+checkRangeInvalid('\\W', '\\s');
+checkRangeValid('\\b', '\\b');
+checkRangeValid('\\B', '\\B');
+checkRangeInvalid('\\w', '\\B');
+checkRangeInvalid('\\w', '\\b');
--- a/js/src/jsregexp.cpp
+++ b/js/src/jsregexp.cpp
@@ -53,16 +53,18 @@
 #include "jsobj.h"
 #include "jsregexp.h"
 #include "jsstr.h"
 #include "jsvector.h"
 
 #include "jsobjinlines.h"
 #include "jsregexpinlines.h"
 
+#include "yarr/RegexParser.h"
+
 #ifdef JS_TRACER
 #include "jstracer.h"
 using namespace avmplus;
 using namespace nanojit;
 #endif
 
 using namespace js;
 using namespace js::gc;
@@ -166,46 +168,33 @@ js_ObjectIsRegExp(JSObject *obj)
 
 /*
  * js::RegExp
  */
 
 void
 RegExp::handleYarrError(JSContext *cx, int error)
 {
-    /* Hack: duplicated from yarr/yarr/RegexParser.h */
-    enum ErrorCode {
-        NoError,
-        PatternTooLarge,
-        QuantifierOutOfOrder,
-        QuantifierWithoutAtom,
-        MissingParentheses,
-        ParenthesesUnmatched,
-        ParenthesesTypeInvalid,     /* "(?" with bad next char or end of pattern. */
-        CharacterClassUnmatched,
-        CharacterClassOutOfOrder,
-        QuantifierTooLarge,
-        EscapeUnterminated
-    };
     switch (error) {
-      case NoError:
+      case JSC::Yarr::NoError:
         JS_NOT_REACHED("Precondition violation: an error must have occurred.");
         return;
 #define COMPILE_EMSG(__code, __msg) \
-      case __code: \
+      case JSC::Yarr::__code: \
         JS_ReportErrorFlagsAndNumberUC(cx, JSREPORT_ERROR, js_GetErrorMessage, NULL, __msg); \
         return
       COMPILE_EMSG(PatternTooLarge, JSMSG_REGEXP_TOO_COMPLEX);
       COMPILE_EMSG(QuantifierOutOfOrder, JSMSG_BAD_QUANTIFIER);
       COMPILE_EMSG(QuantifierWithoutAtom, JSMSG_BAD_QUANTIFIER);
       COMPILE_EMSG(MissingParentheses, JSMSG_MISSING_PAREN);
       COMPILE_EMSG(ParenthesesUnmatched, JSMSG_UNMATCHED_RIGHT_PAREN);
-      COMPILE_EMSG(ParenthesesTypeInvalid, JSMSG_BAD_QUANTIFIER);
+      COMPILE_EMSG(ParenthesesTypeInvalid, JSMSG_BAD_QUANTIFIER); /* "(?" with bad next char */
       COMPILE_EMSG(CharacterClassUnmatched, JSMSG_BAD_CLASS_RANGE);
       COMPILE_EMSG(CharacterClassOutOfOrder, JSMSG_BAD_CLASS_RANGE);
+      COMPILE_EMSG(CharacterClassRangeSingleChar, JSMSG_BAD_CLASS_RANGE);
       COMPILE_EMSG(EscapeUnterminated, JSMSG_TRAILING_SLASH);
       COMPILE_EMSG(QuantifierTooLarge, JSMSG_BAD_QUANTIFIER);
 #undef COMPILE_EMSG
       default:
         JS_NOT_REACHED("Precondition violation: unknown Yarr error code.");
     }
 }
 
--- a/js/src/tests/ecma_3/RegExp/jstests.list
+++ b/js/src/tests/ecma_3/RegExp/jstests.list
@@ -48,17 +48,16 @@ script regress-31316.js
 skip script regress-330684.js # slow
 script regress-334158.js
 script regress-346090.js
 script regress-367888.js
 script regress-375642.js
 script regress-375651.js
 script regress-375711.js
 script regress-375715-01-n.js
-script regress-375715-02.js
 script regress-375715-03.js
 script regress-375715-04.js
 script regress-436700.js
 script regress-465862.js
 script regress-57572.js
 script regress-57631.js
 script regress-67773.js
 script regress-72964.js
deleted file mode 100644
--- a/js/src/tests/ecma_3/RegExp/regress-375715-02.js
+++ /dev/null
@@ -1,59 +0,0 @@
-/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
-/* ***** BEGIN LICENSE BLOCK *****
- * Version: MPL 1.1/GPL 2.0/LGPL 2.1
- *
- * The contents of this file are subject to the Mozilla Public License Version
- * 1.1 (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- * http://www.mozilla.org/MPL/
- *
- * Software distributed under the License is distributed on an "AS IS" basis,
- * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
- * for the specific language governing rights and limitations under the
- * License.
- *
- * The Original Code is JavaScript Engine testing utilities.
- *
- * The Initial Developer of the Original Code is
- * Mozilla Foundation.
- * Portions created by the Initial Developer are Copyright (C) 2007
- * the Initial Developer. All Rights Reserved.
- *
- * Contributor(s): Jesse Ruderman
- *
- * Alternatively, the contents of this file may be used under the terms of
- * either the GNU General Public License Version 2 or later (the "GPL"), or
- * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
- * in which case the provisions of the GPL or the LGPL are applicable instead
- * of those above. If you wish to allow use of your version of this file only
- * under the terms of either the GPL or the LGPL, and not to allow others to
- * use your version of this file under the terms of the MPL, indicate your
- * decision by deleting the provisions above and replace them with the notice
- * and other provisions required by the GPL or the LGPL. If you do not delete
- * the provisions above, a recipient may use your version of this file under
- * the terms of any one of the MPL, the GPL or the LGPL.
- *
- * ***** END LICENSE BLOCK ***** */
-
-//-----------------------------------------------------------------------------
-var BUGNUMBER = 375715;
-var summary = 'Do not assert: (c2 <= cs->length) && (c1 <= c2)';
-var actual = '';
-var expect = '';
-
-
-//-----------------------------------------------------------------------------
-test();
-//-----------------------------------------------------------------------------
-
-function test()
-{
-  enterFunc ('test');
-  printBugNumber(BUGNUMBER);
-  printStatus (summary);
- 
-  /[\s-:]/;
-  reportCompare(expect, actual, summary + '/[\s-:]/');
-
-  exitFunc ('test');
-}
--- a/js/src/yarr/yarr/RegexParser.h
+++ b/js/src/yarr/yarr/RegexParser.h
@@ -34,53 +34,55 @@ namespace JSC { namespace Yarr {
 
 enum BuiltInCharacterClassID {
     DigitClassID,
     SpaceClassID,
     WordClassID,
     NewlineClassID
 };
 
+enum ErrorCode {
+    NoError,
+    PatternTooLarge,
+    QuantifierOutOfOrder,
+    QuantifierWithoutAtom,
+    MissingParentheses,
+    ParenthesesUnmatched,
+    ParenthesesTypeInvalid,
+    CharacterClassUnmatched,
+    CharacterClassOutOfOrder,
+    CharacterClassRangeSingleChar,
+    EscapeUnterminated,
+    QuantifierTooLarge,
+    NumberOfErrorCodes
+};
+
 // The Parser class should not be used directly - only via the Yarr::parse() method.
 template<class Delegate>
 class Parser {
 private:
     template<class FriendDelegate>
     friend int parse(FriendDelegate& delegate, const UString& pattern, unsigned backReferenceLimit);
 
-    enum ErrorCode {
-        NoError,
-        PatternTooLarge,
-        QuantifierOutOfOrder,
-        QuantifierWithoutAtom,
-        MissingParentheses,
-        ParenthesesUnmatched,
-        ParenthesesTypeInvalid,
-        CharacterClassUnmatched,
-        CharacterClassOutOfOrder,
-        EscapeUnterminated,
-        QuantifierTooLarge,
-        NumberOfErrorCodes
-    };
-
     /*
      * CharacterClassParserDelegate:
      *
      * The class CharacterClassParserDelegate is used in the parsing of character
      * classes.  This class handles detection of character ranges.  This class
      * implements enough of the delegate interface such that it can be passed to
      * parseEscape() as an EscapeDelegate.  This allows parseEscape() to be reused
      * to perform the parsing of escape characters in character sets.
      */
     class CharacterClassParserDelegate {
     public:
         CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
             : m_delegate(delegate)
             , m_err(err)
             , m_state(empty)
+            , m_sawCharacterClass(false)
         {
         }
 
         /*
          * begin():
          *
          * Called at beginning of construction.
          */
@@ -102,16 +104,24 @@ private:
         {
             switch (m_state) {
             case empty:
                 m_character = ch;
                 m_state = cachedCharacter;
                 break;
 
             case cachedCharacter:
+                // This guard handles things like /[\s-d]/.
+                if ((m_character == '-') && m_sawCharacterClass) {
+                    m_err = CharacterClassRangeSingleChar;
+                    m_state = empty;
+                    break;
+                }
+
+                m_sawCharacterClass = false;
                 if (ch == '-')
                     m_state = cachedCharacterHyphen;
                 else {
                     m_delegate.atomCharacterClassAtom(m_character);
                     m_character = ch;
                 }
                 break;
 
@@ -133,27 +143,40 @@ private:
         void atomPatternCharacter(UChar ch)
         {
             // Flush if a character is already pending to prevent the
             // hyphen from begin interpreted as indicating a range.
             if((ch == '-') && (m_state == cachedCharacter))
                 flush();
 
             atomPatternCharacterUnescaped(ch);
+            m_sawCharacterClass = false;
         }
 
         /*
          * atomBuiltInCharacterClass():
          *
          * Adds a built-in character class, called by parseEscape().
          */
         void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
         {
+            // The first part of this guard handles things like /[a-\d]/ and the
+            // second part handles things like /[\w-\s]/.
+            if (m_state == cachedCharacterHyphen ||
+                (m_sawCharacterClass && (m_state == cachedCharacter) && m_character == '-')) {
+                // If the RHS of a range does not contain exacly one character then a SyntaxError
+                // must be thrown.
+                // Assumes none of the built in character classes contain a single character.
+                m_err = CharacterClassRangeSingleChar;
+                m_state = empty;
+                return;
+            }
             flush();
             m_delegate.atomCharacterClassBuiltIn(classID, invert);
+            m_sawCharacterClass = true;
         }
 
         /*
          * end():
          *
          * Called at end of construction.
          */
         void end()
@@ -179,16 +202,18 @@ private:
     
         Delegate& m_delegate;
         ErrorCode& m_err;
         enum CharacterClassConstructionState {
             empty,
             cachedCharacter,
             cachedCharacterHyphen
         } m_state;
+        // Used to verify that there is not a character class on the LHS of a range.
+        bool m_sawCharacterClass;
         UChar m_character;
     };
 
     Parser(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit)
         : m_delegate(delegate)
         , m_backReferenceLimit(backReferenceLimit)
         , m_err(NoError)
         , m_data(const_cast<UString &>(pattern).chars())
@@ -399,17 +424,17 @@ private:
     void parseCharacterClassEscape(CharacterClassParserDelegate& delegate)
     {
         parseEscape<true>(delegate);
     }
 
     /*
      * parseCharacterClass():
      *
-     * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape)
+     * Helper for parseTokens(); calls directly and indirectly (via parseCharacterClassEscape)
      * to an instance of CharacterClassParserDelegate, to describe the character class to the
      * delegate.
      */
     void parseCharacterClass()
     {
         JS_ASSERT(!m_err);
         JS_ASSERT(peek() == '[');
         consume();