Bug 898912 (part 4) - Remove unnecessary FirstCharKinds and reorder FirstCharKind handling. r=till.
author: Nicholas Nethercote <nnethercote@mozilla.com>
Mon, 29 Jul 2013 18:30:23 -0700
changeset 153151 0a4f1b961a988844a3e6f37103a5f30aab073dbd
parent 153150 c996032668bf1847cdddd85e15682ceafc4aa793
child 153152 a418ca312cbd24ac8b9ee21a2c8902d0ff036bea
push id: 2859
push user: akeybl@mozilla.com
push date: Mon, 16 Sep 2013 19:14:59 +0000
treeherdermozilla-beta@87d3c51cd2bf [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: till
bugs: 898912
milestone: 25.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 898912 (part 4) - Remove unnecessary FirstCharKinds and reorder FirstCharKind handling. r=till.
js/src/frontend/TokenStream.cpp
--- a/js/src/frontend/TokenStream.cpp
+++ b/js/src/frontend/TokenStream.cpp
@@ -982,28 +982,25 @@ enum FirstCharKind {
     //
     // We represent the 'OneChar' kind with any positive value less than
     // TOK_LIMIT.  This representation lets us associate each one-char token
     // jschar with a TokenKind and thus avoid a subsequent jschar-to-TokenKind
     // conversion.
     OneChar_Min = 0,
     OneChar_Max = TOK_LIMIT - 1,
 
-    Other = TOK_LIMIT,
+    Space = TOK_LIMIT,
     Ident,
-    Dot,
-    Equals,
-    String,
     Dec,
-    Plus,
+    String,
+    EOL,
     BasePrefix,
-    Space,
-    EOL,
+    Other,
 
-    LastCharKind = EOL
+    LastCharKind = Other
 };
 
 /*
  * OneChar: 40,  41,  44,  58,  59,  63,  91,  93,  123, 125, 126:
  *          '(', ')', ',', ':', ';', '?', '[', ']', '{', '}', '~'
  * Ident:   36, 65..90, 95, 97..122: '$', 'A'..'Z', '_', 'a'..'z'
  * Dot:     46: '.'
  * Equals:  61: '='
@@ -1020,19 +1017,19 @@ enum FirstCharKind {
 #define T_BITNOT    TOK_BITNOT
 #define _______ Other
 static const uint8_t firstCharKinds[] = {
 /*         0        1        2        3        4        5        6        7        8        9    */
 /*   0+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______,   Space,
 /*  10+ */     EOL,   Space,   Space,     EOL, _______, _______, _______, _______, _______, _______,
 /*  20+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
 /*  30+ */ _______, _______,   Space, _______,  String, _______,   Ident, _______, _______,  String,
-/*  40+ */  TOK_LP,  TOK_RP, _______,    Plus, T_COMMA,_______,     Dot, _______, BasePrefix,  Dec,
+/*  40+ */  TOK_LP,  TOK_RP, _______, _______, T_COMMA,_______,  _______, _______,BasePrefix,  Dec,
 /*  50+ */     Dec,     Dec,     Dec,     Dec,     Dec,     Dec,     Dec,    Dec,  T_COLON,TOK_SEMI,
-/*  60+ */ _______,  Equals, _______,TOK_HOOK, _______,   Ident,   Ident,   Ident,   Ident,   Ident,
+/*  60+ */ _______, _______, _______,TOK_HOOK, _______,   Ident,   Ident,   Ident,   Ident,   Ident,
 /*  70+ */   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,
 /*  80+ */   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,
 /*  90+ */   Ident,  TOK_LB, _______,  TOK_RB, _______,   Ident, _______,   Ident,   Ident,   Ident,
 /* 100+ */   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,
 /* 110+ */   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,
 /* 120+ */   Ident,   Ident,   Ident,  TOK_LC, _______,  TOK_RC,T_BITNOT, _______
 };
 #undef T_COMMA
@@ -1091,40 +1088,53 @@ TokenStream::getTokenInternal()
             goto identifier;
         }
 
         goto badchar;
     }
 
     /*
      * Get the token kind, based on the first char.  The ordering of c1kind
-     * comparison is based on the frequency of tokens in real code.  Minified
-     * and non-minified code have different characteristics, mostly in that
-     * whitespace occurs much less in minified code.  Token kinds that fall in
-     * the 'Other' category typically account for less than 2% of all tokens,
-     * so their order doesn't matter much.
+     * comparison is based on the frequency of tokens in real code -- Parsemark
+     * (which represents typical JS code on the web) and the Unreal demo (which
+     * represents asm.js code).
+     *
+     *                  Parsemark   Unreal
+     *  OneChar         32.9%       39.7%
+     *  Space           25.0%        0.6%
+     *  Ident           19.2%       36.4%
+     *  Dec              7.2%        5.1%
+     *  String           7.9%        0.0%
+     *  EOL              1.7%        0.0%
+     *  BasePrefix       0.4%        4.9%
+     *  Other            5.7%       13.3%
+     *
+     * The ordering is based mostly on Parsemark frequencies, with Unreal
+     * frequencies used to break close categories (e.g. |Dec| and |String|).
+     * |Other| is biggish, but no other token kind is common enough for it to
+     * be worth adding extra values to FirstCharKind.
      */
     c1kind = FirstCharKind(firstCharKinds[c]);
 
     /*
-     * Skip over non-EOL whitespace chars.
-     */
-    if (c1kind == Space)
-        goto retry;
-
-    /*
      * Look for an unambiguous single-char token.
      */
     if (c1kind < OneChar_Max) {
         tp = newToken(-1);
         tt = (TokenKind)c1kind;
         goto out;
     }
 
     /*
+     * Skip over non-EOL whitespace chars.
+     */
+    if (c1kind == Space)
+        goto retry;
+
+    /*
      * Look for an identifier.
      */
     if (c1kind == Ident) {
         tp = newToken(-1);
         identStart = userbuf.addressOfNextRawChar() - 1;
         hadUnicodeEscape = false;
 
       identifier:
@@ -1173,46 +1183,73 @@ TokenStream::getTokenInternal()
             atom = atomize(cx, tokenbuf);
         if (!atom)
             goto error;
         tp->setName(atom->asPropertyName());
         tt = TOK_NAME;
         goto out;
     }
 
-    if (c1kind == Dot) {
+    /*
+     * Look for a decimal number.
+     */
+    if (c1kind == Dec) {
         tp = newToken(-1);
-        c = getCharIgnoreEOL();
-        if (JS7_ISDEC(c)) {
-            numStart = userbuf.addressOfNextRawChar() - 2;
+        numStart = userbuf.addressOfNextRawChar() - 1;
+
+      decimal:
+        decimalPoint = NoDecimal;
+        hasExp = false;
+        while (JS7_ISDEC(c))
+            c = getCharIgnoreEOL();
+
+        if (c == '.') {
             decimalPoint = HasDecimal;
-            hasExp = false;
-            goto decimal_dot;
+          decimal_dot:
+            do {
+                c = getCharIgnoreEOL();
+            } while (JS7_ISDEC(c));
         }
-        if (c == '.') {
-            qc = getCharIgnoreEOL();
-            if (qc == '.') {
-                tt = TOK_TRIPLEDOT;
-                goto out;
+        if (c == 'e' || c == 'E') {
+            hasExp = true;
+            c = getCharIgnoreEOL();
+            if (c == '+' || c == '-')
+                c = getCharIgnoreEOL();
+            if (!JS7_ISDEC(c)) {
+                ungetCharIgnoreEOL(c);
+                reportError(JSMSG_MISSING_EXPONENT);
+                goto error;
             }
-            ungetCharIgnoreEOL(qc);
+            do {
+                c = getCharIgnoreEOL();
+            } while (JS7_ISDEC(c));
         }
         ungetCharIgnoreEOL(c);
-        tt = TOK_DOT;
-        goto out;
-    }
+
+        if (c != EOF && IsIdentifierStart(c)) {
+            reportError(JSMSG_IDSTART_AFTER_NUMBER);
+            goto error;
+        }
 
-    if (c1kind == Equals) {
-        tp = newToken(-1);
-        if (matchChar('='))
-            tt = matchChar('=') ? TOK_STRICTEQ : TOK_EQ;
-        else if (matchChar('>'))
-            tt = TOK_ARROW;
-        else
-            tt = TOK_ASSIGN;
+        /*
+         * Unlike identifiers and strings, numbers cannot contain escaped
+         * chars, so we don't need to use tokenbuf.  Instead we can just
+         * convert the jschars in userbuf directly to the numeric value.
+         */
+        double dval;
+        const jschar *dummy;
+        if (!((decimalPoint == HasDecimal) || hasExp)) {
+            if (!GetPrefixInteger(cx, numStart, userbuf.addressOfNextRawChar(), 10, &dummy, &dval))
+                goto error;
+        } else {
+            if (!js_strtod(cx, numStart, userbuf.addressOfNextRawChar(), &dummy, &dval))
+                goto error;
+        }
+        tp->setNumber(dval, decimalPoint);
+        tt = TOK_NUMBER;
         goto out;
     }
 
     /*
      * Look for a string.
      */
     if (c1kind == String) {
         tp = newToken(-1);
@@ -1311,82 +1348,25 @@ TokenStream::getTokenInternal()
         if (!atom)
             goto error;
         tp->setAtom(atom);
         tt = TOK_STRING;
         goto out;
     }
 
     /*
-     * Look for a decimal number.
+     * Skip over EOL chars, updating line state along the way.
      */
-    if (c1kind == Dec) {
-        tp = newToken(-1);
-        numStart = userbuf.addressOfNextRawChar() - 1;
-
-      decimal:
-        decimalPoint = NoDecimal;
-        hasExp = false;
-        while (JS7_ISDEC(c))
-            c = getCharIgnoreEOL();
-
-        if (c == '.') {
-            decimalPoint = HasDecimal;
-          decimal_dot:
-            do {
-                c = getCharIgnoreEOL();
-            } while (JS7_ISDEC(c));
-        }
-        if (c == 'e' || c == 'E') {
-            hasExp = true;
-            c = getCharIgnoreEOL();
-            if (c == '+' || c == '-')
-                c = getCharIgnoreEOL();
-            if (!JS7_ISDEC(c)) {
-                ungetCharIgnoreEOL(c);
-                reportError(JSMSG_MISSING_EXPONENT);
-                goto error;
-            }
-            do {
-                c = getCharIgnoreEOL();
-            } while (JS7_ISDEC(c));
-        }
-        ungetCharIgnoreEOL(c);
-
-        if (c != EOF && IsIdentifierStart(c)) {
-            reportError(JSMSG_IDSTART_AFTER_NUMBER);
-            goto error;
-        }
-
-        /*
-         * Unlike identifiers and strings, numbers cannot contain escaped
-         * chars, so we don't need to use tokenbuf.  Instead we can just
-         * convert the jschars in userbuf directly to the numeric value.
-         */
-        double dval;
-        const jschar *dummy;
-        if (!((decimalPoint == HasDecimal) || hasExp)) {
-            if (!GetPrefixInteger(cx, numStart, userbuf.addressOfNextRawChar(), 10, &dummy, &dval))
-                goto error;
-        } else {
-            if (!js_strtod(cx, numStart, userbuf.addressOfNextRawChar(), &dummy, &dval))
-                goto error;
-        }
-        tp->setNumber(dval, decimalPoint);
-        tt = TOK_NUMBER;
-        goto out;
-    }
-
-    if (c1kind == Plus) {
-        tp = newToken(-1);
-        if (matchChar('+'))
-            tt = TOK_INC;
-        else
-            tt = matchChar('=') ? TOK_ADDASSIGN : TOK_PLUS;
-        goto out;
+    if (c1kind == EOL) {
+        /* If it's a \r\n sequence: treat as a single EOL, skip over the \n. */
+        if (c == '\r' && userbuf.hasRawChars())
+            userbuf.matchRawChar('\n');
+        updateLineInfoForEOL();
+        updateFlagsForEOL();
+        goto retry;
     }
 
     // Look for a hexadecimal, octal, or binary number.
     if (c1kind == BasePrefix) {
         tp = newToken(-1);
         int radix;
         c = getCharIgnoreEOL();
         if (c == 'x' || c == 'X') {
@@ -1461,33 +1441,57 @@ TokenStream::getTokenInternal()
         if (!GetPrefixInteger(cx, numStart, userbuf.addressOfNextRawChar(), radix, &dummy, &dval))
             goto error;
         tp->setNumber(dval, NoDecimal);
         tt = TOK_NUMBER;
         goto out;
     }
 
     /*
-     * Skip over EOL chars, updating line state along the way.
-     */
-    if (c1kind == EOL) {
-        /* If it's a \r\n sequence: treat as a single EOL, skip over the \n. */
-        if (c == '\r' && userbuf.hasRawChars())
-            userbuf.matchRawChar('\n');
-        updateLineInfoForEOL();
-        updateFlagsForEOL();
-        goto retry;
-    }
-
-    /*
      * This handles everything else.
      */
     JS_ASSERT(c1kind == Other);
     tp = newToken(-1);
     switch (c) {
+      case '.':
+        c = getCharIgnoreEOL();
+        if (JS7_ISDEC(c)) {
+            numStart = userbuf.addressOfNextRawChar() - 2;
+            decimalPoint = HasDecimal;
+            hasExp = false;
+            goto decimal_dot;
+        }
+        if (c == '.') {
+            qc = getCharIgnoreEOL();
+            if (qc == '.') {
+                tt = TOK_TRIPLEDOT;
+                goto out;
+            }
+            ungetCharIgnoreEOL(qc);
+        }
+        ungetCharIgnoreEOL(c);
+        tt = TOK_DOT;
+        break;
+
+      case '=':
+        if (matchChar('='))
+            tt = matchChar('=') ? TOK_STRICTEQ : TOK_EQ;
+        else if (matchChar('>'))
+            tt = TOK_ARROW;
+        else
+            tt = TOK_ASSIGN;
+        break;
+
+      case '+':
+        if (matchChar('+'))
+            tt = TOK_INC;
+        else
+            tt = matchChar('=') ? TOK_ADDASSIGN : TOK_PLUS;
+        break;
+
       case '\\':
         hadUnicodeEscape = matchUnicodeEscapeIdStart(&qc);
         if (hadUnicodeEscape) {
             identStart = userbuf.addressOfNextRawChar() - 6;
             goto identifier;
         }
         goto badchar;