Bug 576837: Fix YARR character-class range parsing. (r=jwalden)
☠☠ backed out by f4444a398ec1 ☠ ☠
authorChris Leary <cdleary@mozilla.com>
Mon, 08 Nov 2010 18:08:32 -0800
changeset 57745 8ae5fce0f19b273d3b5248f5df4e177c76e2209e
parent 57744 ebffb2a709e5e07291fad9bcdf2f51b1f1a8d506
child 57746 f4444a398ec19777abce40eede9b1d12ca389f48
push id1
push usershaver@mozilla.com
push dateTue, 04 Jan 2011 17:58:04 +0000
reviewersjwalden
bugs576837
milestone2.0b8pre
Bug 576837: Fix YARR character-class range parsing. (r=jwalden)
js/src/jit-test/tests/basic/bug576837-regexp.js
js/src/jsregexp.cpp
js/src/tests/ecma_3/RegExp/jstests.list
js/src/tests/ecma_3/RegExp/regress-375715-02.js
js/src/yarr/yarr/RegexParser.h
new file mode 100644
--- /dev/null
+++ b/js/src/jit-test/tests/basic/bug576837-regexp.js
@@ -0,0 +1,43 @@
+// Check that builtin character classes within ranges produce syntax
+// errors.
+
+function isRegExpSyntaxError(pattern) {
+    try {
+        var re = new RegExp(pattern);
+    } catch (e) {
+        if (e instanceof SyntaxError)
+            return true;
+    }
+    return false;
+}
+
+function testRangeSyntax(end1, end2, shouldFail) {
+    var makePattern = function(e1, e2) {
+        var pattern = '[' + e1 + '-' + e2 + ']';
+        print(uneval(pattern));
+        return pattern;
+    };
+    assertEq(isRegExpSyntaxError(makePattern(end1, end2)), shouldFail);
+    assertEq(isRegExpSyntaxError(makePattern(end2, end1)), shouldFail);
+}
+
+function checkRangeValid(end1, end2) {
+    testRangeSyntax(end1, end2, false);
+}
+
+function checkRangeInvalid(end1, end2) {
+    testRangeSyntax(end1, end2, true);
+}
+
+checkRangeInvalid('C', '\\s');
+checkRangeInvalid('C', '\\d');
+checkRangeInvalid('C', '\\W');
+checkRangeInvalid('C', '\\W');
+checkRangeValid('C', '');
+checkRangeValid('C', 'C');
+checkRangeInvalid('\\s', '\\s');
+checkRangeInvalid('\\W', '\\s');
+checkRangeValid('\\b', '\\b');
+checkRangeValid('\\B', '\\B');
+checkRangeInvalid('\\w', '\\B');
+checkRangeInvalid('\\w', '\\b');
--- a/js/src/jsregexp.cpp
+++ b/js/src/jsregexp.cpp
@@ -53,16 +53,18 @@
 #include "jsobj.h"
 #include "jsregexp.h"
 #include "jsstr.h"
 #include "jsvector.h"
 
 #include "jsobjinlines.h"
 #include "jsregexpinlines.h"
 
+#include "yarr/RegexParser.h"
+
 #ifdef JS_TRACER
 #include "jstracer.h"
 using namespace avmplus;
 using namespace nanojit;
 #endif
 
 using namespace js;
 using namespace js::gc;
@@ -166,46 +168,33 @@ js_ObjectIsRegExp(JSObject *obj)
 
 /*
  * js::RegExp
  */
 
 void
 RegExp::handleYarrError(JSContext *cx, int error)
 {
-    /* Hack: duplicated from yarr/yarr/RegexParser.h */
-    enum ErrorCode {
-        NoError,
-        PatternTooLarge,
-        QuantifierOutOfOrder,
-        QuantifierWithoutAtom,
-        MissingParentheses,
-        ParenthesesUnmatched,
-        ParenthesesTypeInvalid,     /* "(?" with bad next char or end of pattern. */
-        CharacterClassUnmatched,
-        CharacterClassOutOfOrder,
-        QuantifierTooLarge,
-        EscapeUnterminated
-    };
     switch (error) {
-      case NoError:
+      case JSC::Yarr::NoError:
         JS_NOT_REACHED("Precondition violation: an error must have occurred.");
         return;
 #define COMPILE_EMSG(__code, __msg) \
-      case __code: \
+      case JSC::Yarr::__code: \
         JS_ReportErrorFlagsAndNumberUC(cx, JSREPORT_ERROR, js_GetErrorMessage, NULL, __msg); \
         return
       COMPILE_EMSG(PatternTooLarge, JSMSG_REGEXP_TOO_COMPLEX);
       COMPILE_EMSG(QuantifierOutOfOrder, JSMSG_BAD_QUANTIFIER);
       COMPILE_EMSG(QuantifierWithoutAtom, JSMSG_BAD_QUANTIFIER);
       COMPILE_EMSG(MissingParentheses, JSMSG_MISSING_PAREN);
       COMPILE_EMSG(ParenthesesUnmatched, JSMSG_UNMATCHED_RIGHT_PAREN);
-      COMPILE_EMSG(ParenthesesTypeInvalid, JSMSG_BAD_QUANTIFIER);
+      COMPILE_EMSG(ParenthesesTypeInvalid, JSMSG_BAD_QUANTIFIER); /* "(?" with bad next char */
       COMPILE_EMSG(CharacterClassUnmatched, JSMSG_BAD_CLASS_RANGE);
       COMPILE_EMSG(CharacterClassOutOfOrder, JSMSG_BAD_CLASS_RANGE);
+      COMPILE_EMSG(CharacterClassRangeSingleChar, JSMSG_BAD_CLASS_RANGE);
       COMPILE_EMSG(EscapeUnterminated, JSMSG_TRAILING_SLASH);
       COMPILE_EMSG(QuantifierTooLarge, JSMSG_BAD_QUANTIFIER);
 #undef COMPILE_EMSG
       default:
         JS_NOT_REACHED("Precondition violation: unknown Yarr error code.");
     }
 }
 
--- a/js/src/tests/ecma_3/RegExp/jstests.list
+++ b/js/src/tests/ecma_3/RegExp/jstests.list
@@ -48,17 +48,16 @@ script regress-31316.js
 skip script regress-330684.js # slow
 script regress-334158.js
 script regress-346090.js
 script regress-367888.js
 script regress-375642.js
 script regress-375651.js
 script regress-375711.js
 script regress-375715-01-n.js
-script regress-375715-02.js
 script regress-375715-03.js
 script regress-375715-04.js
 script regress-436700.js
 script regress-465862.js
 script regress-57572.js
 script regress-57631.js
 script regress-67773.js
 script regress-72964.js
deleted file mode 100644
--- a/js/src/tests/ecma_3/RegExp/regress-375715-02.js
+++ /dev/null
@@ -1,59 +0,0 @@
-/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
-/* ***** BEGIN LICENSE BLOCK *****
- * Version: MPL 1.1/GPL 2.0/LGPL 2.1
- *
- * The contents of this file are subject to the Mozilla Public License Version
- * 1.1 (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- * http://www.mozilla.org/MPL/
- *
- * Software distributed under the License is distributed on an "AS IS" basis,
- * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
- * for the specific language governing rights and limitations under the
- * License.
- *
- * The Original Code is JavaScript Engine testing utilities.
- *
- * The Initial Developer of the Original Code is
- * Mozilla Foundation.
- * Portions created by the Initial Developer are Copyright (C) 2007
- * the Initial Developer. All Rights Reserved.
- *
- * Contributor(s): Jesse Ruderman
- *
- * Alternatively, the contents of this file may be used under the terms of
- * either the GNU General Public License Version 2 or later (the "GPL"), or
- * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
- * in which case the provisions of the GPL or the LGPL are applicable instead
- * of those above. If you wish to allow use of your version of this file only
- * under the terms of either the GPL or the LGPL, and not to allow others to
- * use your version of this file under the terms of the MPL, indicate your
- * decision by deleting the provisions above and replace them with the notice
- * and other provisions required by the GPL or the LGPL. If you do not delete
- * the provisions above, a recipient may use your version of this file under
- * the terms of any one of the MPL, the GPL or the LGPL.
- *
- * ***** END LICENSE BLOCK ***** */
-
-//-----------------------------------------------------------------------------
-var BUGNUMBER = 375715;
-var summary = 'Do not assert: (c2 <= cs->length) && (c1 <= c2)';
-var actual = '';
-var expect = '';
-
-
-//-----------------------------------------------------------------------------
-test();
-//-----------------------------------------------------------------------------
-
-function test()
-{
-  enterFunc ('test');
-  printBugNumber(BUGNUMBER);
-  printStatus (summary);
- 
-  /[\s-:]/;
-  reportCompare(expect, actual, summary + '/[\s-:]/');
-
-  exitFunc ('test');
-}
--- a/js/src/yarr/yarr/RegexParser.h
+++ b/js/src/yarr/yarr/RegexParser.h
@@ -34,53 +34,55 @@ namespace JSC { namespace Yarr {
 
 enum BuiltInCharacterClassID {
     DigitClassID,
     SpaceClassID,
     WordClassID,
     NewlineClassID
 };
 
+enum ErrorCode {
+    NoError,
+    PatternTooLarge,
+    QuantifierOutOfOrder,
+    QuantifierWithoutAtom,
+    MissingParentheses,
+    ParenthesesUnmatched,
+    ParenthesesTypeInvalid,
+    CharacterClassUnmatched,
+    CharacterClassOutOfOrder,
+    CharacterClassRangeSingleChar,
+    EscapeUnterminated,
+    QuantifierTooLarge,
+    NumberOfErrorCodes
+};
+
 // The Parser class should not be used directly - only via the Yarr::parse() method.
 template<class Delegate>
 class Parser {
 private:
     template<class FriendDelegate>
     friend int parse(FriendDelegate& delegate, const UString& pattern, unsigned backReferenceLimit);
 
-    enum ErrorCode {
-        NoError,
-        PatternTooLarge,
-        QuantifierOutOfOrder,
-        QuantifierWithoutAtom,
-        MissingParentheses,
-        ParenthesesUnmatched,
-        ParenthesesTypeInvalid,
-        CharacterClassUnmatched,
-        CharacterClassOutOfOrder,
-        EscapeUnterminated,
-        QuantifierTooLarge,
-        NumberOfErrorCodes
-    };
-
     /*
      * CharacterClassParserDelegate:
      *
      * The class CharacterClassParserDelegate is used in the parsing of character
      * classes.  This class handles detection of character ranges.  This class
      * implements enough of the delegate interface such that it can be passed to
      * parseEscape() as an EscapeDelegate.  This allows parseEscape() to be reused
      * to perform the parsing of escape characters in character sets.
      */
     class CharacterClassParserDelegate {
     public:
         CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
             : m_delegate(delegate)
             , m_err(err)
             , m_state(empty)
+            , m_sawCharacterClass(false)
         {
         }
 
         /*
          * begin():
          *
          * Called at beginning of construction.
          */
@@ -102,16 +104,24 @@ private:
         {
             switch (m_state) {
             case empty:
                 m_character = ch;
                 m_state = cachedCharacter;
                 break;
 
             case cachedCharacter:
+                // This guard handles things like /[\s-d]/.
+                if ((m_character == '-') && m_sawCharacterClass) {
+                    m_err = CharacterClassRangeSingleChar;
+                    m_state = empty;
+                    break;
+                }
+
+                m_sawCharacterClass = false;
                 if (ch == '-')
                     m_state = cachedCharacterHyphen;
                 else {
                     m_delegate.atomCharacterClassAtom(m_character);
                     m_character = ch;
                 }
                 break;
 
@@ -133,27 +143,40 @@ private:
         void atomPatternCharacter(UChar ch)
         {
             // Flush if a character is already pending to prevent the
             // hyphen from begin interpreted as indicating a range.
             if((ch == '-') && (m_state == cachedCharacter))
                 flush();
 
             atomPatternCharacterUnescaped(ch);
+            m_sawCharacterClass = false;
         }
 
         /*
          * atomBuiltInCharacterClass():
          *
          * Adds a built-in character class, called by parseEscape().
          */
         void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
         {
+            // The first part of this guard handles things like /[a-\d]/ and the
+            // second part handles things like /[\w-\s]/.
+            if (m_state == cachedCharacterHyphen ||
+                (m_sawCharacterClass && (m_state == cachedCharacter) && m_character == '-')) {
+                // If the RHS of a range does not contain exacly one character then a SyntaxError
+                // must be thrown.
+                // Assumes none of the built in character classes contain a single character.
+                m_err = CharacterClassRangeSingleChar;
+                m_state = empty;
+                return;
+            }
             flush();
             m_delegate.atomCharacterClassBuiltIn(classID, invert);
+            m_sawCharacterClass = true;
         }
 
         /*
          * end():
          *
          * Called at end of construction.
          */
         void end()
@@ -179,16 +202,18 @@ private:
     
         Delegate& m_delegate;
         ErrorCode& m_err;
         enum CharacterClassConstructionState {
             empty,
             cachedCharacter,
             cachedCharacterHyphen
         } m_state;
+        // Used to verify that there is not a character class on the LHS of a range.
+        bool m_sawCharacterClass;
         UChar m_character;
     };
 
     Parser(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit)
         : m_delegate(delegate)
         , m_backReferenceLimit(backReferenceLimit)
         , m_err(NoError)
         , m_data(const_cast<UString &>(pattern).chars())
@@ -399,17 +424,17 @@ private:
     void parseCharacterClassEscape(CharacterClassParserDelegate& delegate)
     {
         parseEscape<true>(delegate);
     }
 
     /*
      * parseCharacterClass():
      *
-     * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape)
+     * Helper for parseTokens(); calls directly and indirectly (via parseCharacterClassEscape)
      * to an instance of CharacterClassParserDelegate, to describe the character class to the
      * delegate.
      */
     void parseCharacterClass()
     {
         JS_ASSERT(!m_err);
         JS_ASSERT(peek() == '[');
         consume();