Bug 543151, part A4: unify the inner text-scanning loops used by GatherIdent, ScanString, NextURL. r=heycam
authorZack Weinberg <zackw@panix.com>
Sat, 16 Feb 2013 18:27:53 -0500
changeset 122156 ef0da3fb98ff5cf149b91ec29152ace8a569bb29
parent 122155 a0a8c0ed0c77d9e5db93d16897be1d26fc7744eb
child 122157 86b0207c62d6f0baf4efb6f46e25dbcd92a2b254
push id24320
push userryanvm@gmail.com
push dateSun, 17 Feb 2013 12:06:45 +0000
treeherdermozilla-central@5e137a87e84f [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersheycam
bugs543151
milestone21.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 543151, part A4: unify the inner text-scanning loops used by GatherIdent, ScanString, NextURL. r=heycam
layout/style/nsCSSScanner.cpp
layout/style/nsCSSScanner.h
--- a/layout/style/nsCSSScanner.cpp
+++ b/layout/style/nsCSSScanner.cpp
@@ -21,125 +21,204 @@ using mozilla::ArrayLength;
 
 static const uint8_t IS_HEX_DIGIT  = 0x01;
 static const uint8_t IS_IDSTART    = 0x02;
 static const uint8_t IS_IDCHAR     = 0x04;
 static const uint8_t IS_URL_CHAR   = 0x08;
 static const uint8_t IS_HSPACE     = 0x10;
 static const uint8_t IS_VSPACE     = 0x20;
 static const uint8_t IS_SPACE      = IS_HSPACE|IS_VSPACE;
+static const uint8_t IS_STRING     = 0x40;
 
 #define H    IS_HSPACE
 #define V    IS_VSPACE
 #define I    IS_IDCHAR
-#define U                                      IS_URL_CHAR
-#define S              IS_IDSTART
-#define UI   IS_IDCHAR                        |IS_URL_CHAR
-#define USI  IS_IDCHAR|IS_IDSTART             |IS_URL_CHAR
-#define UXI  IS_IDCHAR           |IS_HEX_DIGIT|IS_URL_CHAR
-#define UXSI IS_IDCHAR|IS_IDSTART|IS_HEX_DIGIT|IS_URL_CHAR
+#define J    IS_IDSTART
+#define U    IS_URL_CHAR
+#define S    IS_STRING
+#define X    IS_HEX_DIGIT
+
+#define SH    S|H
+#define SU    S|U
+#define SUI   S|U|I
+#define SUIJ  S|U|I|J
+#define SUIX  S|U|I|X
+#define SUIJX S|U|I|J|X
 
 static const uint8_t gLexTable[] = {
-//                                     TAB LF      FF  CR
-   0,  0,  0,  0,  0,  0,  0,  0,  0,  H,  V,  0,  V,  V,  0,  0,
-//
-   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-// SPC !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /
-   H,  U,  0,  U,  U,  U,  U,  0,  0,  0,  U,  U,  U,  UI, U,  U,
-// 0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?
-   UXI,UXI,UXI,UXI,UXI,UXI,UXI,UXI,UXI,UXI,U,  U,  U,  U,  U,  U,
-// @   A   B   C    D    E    F    G   H   I   J   K   L   M   N   O
-   U,UXSI,UXSI,UXSI,UXSI,UXSI,UXSI,USI,USI,USI,USI,USI,USI,USI,USI,USI,
-// P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _
-   USI,USI,USI,USI,USI,USI,USI,USI,USI,USI,USI,U,  S,  U,  U,  USI,
-// `   a   b   c    d    e    f    g   h   i   j   k   l   m   n   o
-   U,UXSI,UXSI,UXSI,UXSI,UXSI,UXSI,USI,USI,USI,USI,USI,USI,USI,USI,USI,
-// p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~
-   USI,USI,USI,USI,USI,USI,USI,USI,USI,USI,USI,U,  U,  U,  U,  0
+// 00    01    02    03    04    05    06    07
+    0,    S,    S,    S,    S,    S,    S,    S,
+// 08   TAB    LF    0B    FF    CR    0E    0F
+    S,   SH,    V,    S,    V,    V,    S,    S,
+// 10    11    12    13    14    15    16    17
+    S,    S,    S,    S,    S,    S,    S,    S,
+// 18    19    1A    1B    1C    1D    1E    1F
+    S,    S,    S,    S,    S,    S,    S,    S,
+//SPC     !     "     #     $     %     &     '
+   SH,   SU,    0,   SU,   SU,   SU,   SU,    0,
+//  (     )     *     +     ,     -     .     /
+    S,    S,   SU,   SU,   SU,  SUI,   SU,   SU,
+//  0     1     2     3     4     5     6     7
+ SUIX, SUIX, SUIX, SUIX, SUIX, SUIX, SUIX, SUIX,
+//  8     9     :     ;     <     =     >     ?
+ SUIX, SUIX,   SU,   SU,   SU,   SU,   SU,   SU,
+//  @     A     B     C     D     E     F     G
+   SU,SUIJX,SUIJX,SUIJX,SUIJX,SUIJX,SUIJX, SUIJ,
+//  H     I     J     K     L     M     N     O
+ SUIJ, SUIJ, SUIJ, SUIJ, SUIJ, SUIJ, SUIJ, SUIJ,
+//  P     Q     R     S     T     U     V     W
+ SUIJ, SUIJ, SUIJ, SUIJ, SUIJ, SUIJ, SUIJ, SUIJ,
+//  X     Y     Z     [     \     ]     ^     _
+ SUIJ, SUIJ, SUIJ,   SU,    J,   SU,   SU, SUIJ,
+//  `     a     b     c     d     e     f     g
+   SU,SUIJX,SUIJX,SUIJX,SUIJX,SUIJX,SUIJX, SUIJ,
+//  h     i     j     k     l     m     n     o
+ SUIJ, SUIJ, SUIJ, SUIJ, SUIJ, SUIJ, SUIJ, SUIJ,
+//  p     q     r     s     t     u     v     w
+ SUIJ, SUIJ, SUIJ, SUIJ, SUIJ, SUIJ, SUIJ, SUIJ,
+//  x     y     z     {     |     }     ~    7F
+ SUIJ, SUIJ, SUIJ,   SU,   SU,   SU,   SU,    S,
 };
 
 MOZ_STATIC_ASSERT(MOZ_ARRAY_LENGTH(gLexTable) == 128,
                   "gLexTable expected to cover all 128 ASCII characters");
 
-#undef H
-#undef V
-#undef S
 #undef I
+#undef J
 #undef U
-#undef UI
-#undef USI
-#undef UXI
-#undef UXSI
+#undef S
+#undef X
+#undef SH
+#undef SU
+#undef SUI
+#undef SUIJ
+#undef SUIX
+#undef SUIJX
+
+/**
+ * True if 'ch' is in character class 'cls', which should be one of
+ * the constants above or some combination of them.  All characters
+ * above U+007F are considered to be in 'cls'.  EOF is never in 'cls'.
+ */
+static inline bool
+IsOpenCharClass(int32_t ch, uint8_t cls) {
+  return ch >= 0 && (ch >= 128 || (gLexTable[ch] & cls) != 0);
+}
 
+/**
+ * True if 'ch' is in character class 'cls', which should be one of
+ * the constants above or some combination of them.  No characters
+ * above U+007F are considered to be in 'cls'. EOF is never in 'cls'.
+ */
+static inline bool
+IsClosedCharClass(int32_t ch, uint8_t cls) {
+  return uint32_t(ch) < 128 && (gLexTable[ch] & cls) != 0;
+}
+
+/**
+ * True if 'ch' is CSS whitespace, i.e. any of the ASCII characters
+ * TAB, LF, FF, CR, or SPC.
+ */
+static inline bool
+IsWhitespace(int32_t ch) {
+  return IsClosedCharClass(ch, IS_SPACE);
+}
+
+/**
+ * True if 'ch' is horizontal whitespace, i.e. TAB or SPC.
+ */
 static inline bool
 IsHorzSpace(int32_t ch) {
-  return uint32_t(ch) < 128 && (gLexTable[ch] & IS_HSPACE) != 0;
+  return IsClosedCharClass(ch, IS_HSPACE);
 }
 
+/**
+ * True if 'ch' is vertical whitespace, i.e. LF, FF, or CR.  Vertical
+ * whitespace requires special handling when consumed, see AdvanceLine.
+ */
 static inline bool
 IsVertSpace(int32_t ch) {
-  return uint32_t(ch) < 128 && (gLexTable[ch] & IS_VSPACE) != 0;
+  return IsClosedCharClass(ch, IS_VSPACE);
 }
 
+/**
+ * True if 'ch' is a character that can appear in the middle of an
+ * identifier.
+ */
 static inline bool
-IsWhitespace(int32_t ch) {
-  return uint32_t(ch) < 128 && (gLexTable[ch] & IS_SPACE) != 0;
+IsIdentChar(int32_t ch) {
+  return IsOpenCharClass(ch, IS_IDCHAR);
 }
 
+/**
+ * True if 'ch' is a character that by itself begins an identifier.
+ * (This is a subset of IsIdentChar.)
+ */
 static inline bool
-IsIdentChar(int32_t ch) {
-  return ch >= 0 && (ch >= 128 || (gLexTable[ch] & IS_IDCHAR) != 0);
+IsIdentStart(int32_t ch) {
+  return IsOpenCharClass(ch, IS_IDSTART);
 }
 
-static inline bool
-IsIdentStart(int32_t ch) {
-  return ch >= 0 && (ch >= 128 || (gLexTable[ch] & IS_IDSTART) != 0);
-}
-
+/**
+ * True if the two-character sequence aFirstChar+aSecondChar begins an
+ * identifier.
+ */
 static inline bool
 StartsIdent(int32_t aFirstChar, int32_t aSecondChar)
 {
   return IsIdentStart(aFirstChar) ||
     (aFirstChar == '-' && IsIdentStart(aSecondChar));
 }
 
-static inline bool
-IsURLChar(int32_t ch) {
-  return ch >= 0 && (ch >= 128 || (gLexTable[ch] & IS_URL_CHAR) != 0);
-}
-
+/**
+ * True if 'ch' is a decimal digit.
+ */
 static inline bool
 IsDigit(int32_t ch) {
   return (ch >= '0') && (ch <= '9');
 }
 
+/**
+ * True if 'ch' is a hexadecimal digit.
+ */
 static inline bool
 IsHexDigit(int32_t ch) {
-  return uint32_t(ch) < 128 && (gLexTable[ch] & IS_HEX_DIGIT) != 0;
+  return IsClosedCharClass(ch, IS_HEX_DIGIT);
 }
 
+/**
+ * Assuming that 'ch' is a decimal digit, return its numeric value.
+ */
 static inline uint32_t
 DecimalDigitValue(int32_t ch)
 {
   return ch - '0';
 }
 
+/**
+ * Assuming that 'ch' is a hexadecimal digit, return its numeric value.
+ */
 static inline uint32_t
 HexDigitValue(int32_t ch)
 {
   if (IsDigit(ch)) {
     return DecimalDigitValue(ch);
   } else {
     // Note: c&7 just keeps the low three bits which causes
     // upper and lower case alphabetics to both yield their
     // "relative to 10" value for computing the hex value.
     return (ch & 0x7) + 9;
   }
 }
 
+/**
+ * If 'ch' can be the first character of a two-character match operator
+ * token, return the token type code for that token, otherwise return
+ * eCSSToken_Symbol to indicate that it can't.
+ */
 static inline nsCSSTokenType
 MatchOperatorType(int32_t ch)
 {
   switch (ch) {
   case '~': return eCSSToken_Includes;
   case '|': return eCSSToken_Dashmatch;
   case '^': return eCSSToken_Beginsmatch;
   case '$': return eCSSToken_Endsmatch;
@@ -517,75 +596,77 @@ nsCSSScanner::GatherEscape(nsString& aOu
     } else if (IsHorzSpace(ch)) {
       Advance();
     }
   }
   return true;
 }
 
 /**
- * Consume a sequence of identifier characters and escape sequences
- * starting with the current read position, and append all of them to
- * |aIdent|.  Returns true if it consumed any characters, false if it
- * did not (this can only happen when there was an invalid escape
- * sequence right at the current read position).
+ * Consume a run of "text" beginning with the current read position,
+ * consisting of characters in the class |aClass| (which must be a
+ * suitable argument to IsOpenCharClass) plus escape sequences.
+ * Append the text to |aText|, after decoding escape sequences.
+ *
+ * Returns true if at least one character was appended to |aText|,
+ * false otherwise.
  */
 bool
-nsCSSScanner::GatherIdent(nsString& aIdent)
+nsCSSScanner::GatherText(uint8_t aClass, nsString& aText)
 {
-  int32_t ch = Peek();
-  MOZ_ASSERT(IsIdentChar(ch) || ch == '\\',
-             "should not have been called");
-#ifdef DEBUG
-  uint32_t n = aIdent.Length();
-#endif
+  // This is all of the character classes currently used with
+  // GatherText.  If you have a need to use this function with a
+  // different class, go ahead and add it.
+  MOZ_ASSERT(aClass == IS_STRING ||
+             aClass == IS_IDCHAR ||
+             aClass == IS_URL_CHAR,
+             "possibly-inappropriate character class");
 
-  if (ch == '\\') {
-    if (!GatherEscape(aIdent, false)) {
-      return false;
-    }
-  }
+  uint32_t start = mOffset;
+  bool inString = aClass == IS_STRING;
+
   for (;;) {
     // Consume runs of unescaped characters in one go.
     uint32_t n = mOffset;
-    while (n < mCount && IsIdentChar(mBuffer[n])) {
+    while (n < mCount && IsOpenCharClass(mBuffer[n], aClass)) {
       n++;
     }
-    // Add to the token what we have so far.
     if (n > mOffset) {
-      aIdent.Append(&mBuffer[mOffset], n - mOffset);
+      aText.Append(&mBuffer[mOffset], n - mOffset);
       mOffset = n;
     }
+    if (n == mCount) {
+      break;
+    }
 
-    ch = Peek();
-    if (ch == '\\') {
-      if (!GatherEscape(aIdent, false)) {
-        break;
-      }
-    } else {
-      MOZ_ASSERT(!IsIdentChar(ch), "should not have exited the inner loop");
+    int32_t ch = Peek();
+    MOZ_ASSERT(!IsOpenCharClass(ch, aClass),
+               "should not have exited the inner loop");
+
+    if (ch != '\\') {
+      break;
+    }
+    if (!GatherEscape(aText, inString)) {
       break;
     }
   }
 
-  // If we get here, we should have added some characters to aIdent.
-  MOZ_ASSERT(aIdent.Length() > n);
-  return true;
+  return mOffset > start;
 }
 
 /**
  * Scan an Ident token.  This also handles Function and URL tokens,
  * both of which begin indistinguishably from an identifier.  It can
  * produce a Symbol token when an apparent identifier actually led
  * into an invalid escape sequence.
  */
 bool
 nsCSSScanner::ScanIdent(nsCSSToken& aToken)
 {
-  if (MOZ_UNLIKELY(!GatherIdent(aToken.mIdent))) {
+  if (MOZ_UNLIKELY(!GatherText(IS_IDCHAR, aToken.mIdent))) {
     aToken.mSymbol = Peek();
     Advance();
     return true;
   }
 
   if (MOZ_LIKELY(Peek() != '(')) {
     aToken.mType = eCSSToken_Ident;
     return true;
@@ -609,17 +690,17 @@ nsCSSScanner::ScanAtKeyword(nsCSSToken& 
   MOZ_ASSERT(Peek() == '@', "should not have been called");
 
   // Fall back for when '@' isn't followed by an identifier.
   aToken.mSymbol = '@';
   Advance();
 
   int32_t ch = Peek();
   if (StartsIdent(ch, Peek(1))) {
-     if (GatherIdent(aToken.mIdent)) {
+    if (GatherText(IS_IDCHAR, aToken.mIdent)) {
        aToken.mType = eCSSToken_AtKeyword;
      }
   }
   return true;
 }
 
 /**
  * Scan a Hash token.  Handles the distinction between eCSSToken_ID
@@ -635,17 +716,17 @@ nsCSSScanner::ScanHash(nsCSSToken& aToke
   aToken.mSymbol = '#';
   Advance();
 
   int32_t ch = Peek();
   if (IsIdentChar(ch) || ch == '\\') {
     nsCSSTokenType type =
       StartsIdent(ch, Peek(1)) ? eCSSToken_ID : eCSSToken_Hash;
     aToken.mIdent.SetLength(0);
-    if (GatherIdent(aToken.mIdent)) {
+    if (GatherText(IS_IDCHAR, aToken.mIdent)) {
       aToken.mType = type;
     }
   }
 
   return true;
 }
 
 /**
@@ -774,17 +855,17 @@ nsCSSScanner::ScanNumber(nsCSSToken& aTo
     aToken.mIntegerValid = true;
   }
 
   nsString& ident = aToken.mIdent;
 
   // Check for Dimension and Percentage tokens.
   if (c >= 0) {
     if (StartsIdent(c, Peek(1))) {
-      if (GatherIdent(ident)) {
+      if (GatherText(IS_IDCHAR, ident)) {
         type = eCSSToken_Dimension;
       }
     } else if (c == '%') {
       Advance();
       type = eCSSToken_Percentage;
       value = value / 100.0f;
       aToken.mIntegerValid = false;
     }
@@ -804,52 +885,36 @@ nsCSSScanner::ScanString(nsCSSToken& aTo
 {
   int32_t aStop = Peek();
   MOZ_ASSERT(aStop == '"' || aStop == '\'', "should not have been called");
   aToken.mType = eCSSToken_String;
   aToken.mSymbol = PRUnichar(aStop); // Remember how it's quoted.
   Advance();
 
   for (;;) {
-    // Consume runs of unescaped characters in one go.
-    uint32_t n = mOffset;
-    int32_t ch = -1;
-    while (n < mCount) {
-      ch = mBuffer[n];
-      if (ch == aStop || ch == '\\' || IsVertSpace(ch)) {
-        break;
-      }
-      n++;
-    }
-    if (n > mOffset) {
-      aToken.mIdent.Append(&mBuffer[mOffset], n - mOffset);
-      mOffset = n;
-    }
-    if (n == mCount) {
+    GatherText(IS_STRING, aToken.mIdent);
+
+    int32_t ch = Peek();
+    if (ch == -1) {
       break; // EOF ends a string token with no error.
     }
     if (ch == aStop) {
       Advance();
       break;
     }
-    if (IsVertSpace(ch)) {
-      aToken.mType = eCSSToken_Bad_String;
-      mReporter->ReportUnexpected("SEUnterminatedString", aToken);
-      break;
+    // Both " and ' are excluded from IS_STRING.
+    if (ch == '"' || ch == '\'') {
+      aToken.mIdent.Append(ch);
+      Advance();
+      continue;
     }
-    MOZ_ASSERT(ch == '\\', "should not have exited the inner loop");
-    if (!GatherEscape(aToken.mIdent, true)) {
-      // For strings, the only case where GatherEscape will return
-      // false is when there's a backslash to start an escape
-      // immediately followed by end-of-stream.  In that case, the
-      // backslash is not included in the Bad_String token.
-      aToken.mType = eCSSToken_Bad_String;
-      mReporter->ReportUnexpected("SEUnterminatedString", aToken);
-      break;
-    }
+
+    aToken.mType = eCSSToken_Bad_String;
+    mReporter->ReportUnexpected("SEUnterminatedString", aToken);
+    break;
   }
   return true;
 }
 
 /**
  * Scan a unicode-range token.  These match the regular expression
  *
  *     u\+[0-9a-f?]{1,6}(-[0-9a-f]{1,6})?
@@ -959,44 +1024,18 @@ nsCSSScanner::NextURL(nsCSSToken& aToken
     ScanString(aToken);
     if (MOZ_UNLIKELY(aToken.mType == eCSSToken_Bad_String)) {
       aToken.mType = eCSSToken_Bad_URL;
       return true;
     }
     MOZ_ASSERT(aToken.mType == eCSSToken_String, "unexpected token type");
 
   } else {
-    // Otherwise, this is the start of a non-quoted url (which may be empty)
-    aToken.mSymbol = PRUnichar(0);
-    for (;;) {
-      // Consume runs of unescaped characters in one go.
-      uint32_t n = mOffset;
-      while (n < mCount) {
-        ch = mBuffer[n];
-        if (!IsURLChar(ch)) {
-          break;
-        }
-        n++;
-      }
-      if (n > mOffset) {
-        aToken.mIdent.Append(&mBuffer[mOffset], n - mOffset);
-        mOffset = n;
-      }
-      if (n == mCount) {
-        break; // EOF ends URL literal with no error.
-      }
-      if (ch != '\\') {
-        break;
-      }
-      if (!GatherEscape(aToken.mIdent, false)) {
-        break; // Bad escape sequence terminates URL.  The backslash
-               // remains unconsumed, so the logic below will produce a
-               // Bad_URL token.
-      }
-    }
+    // Otherwise, this is the start of a non-quoted url (which may be empty).
+    GatherText(IS_URL_CHAR, aToken.mIdent);
   }
 
   // Consume trailing whitespace and then look for a close parenthesis.
   SkipWhitespace();
   ch = Peek();
   if (MOZ_LIKELY(ch < 0 || ch == ')')) {
     Advance();
     aToken.mType = eCSSToken_URL;
--- a/layout/style/nsCSSScanner.h
+++ b/layout/style/nsCSSScanner.h
@@ -189,17 +189,17 @@ protected:
   int32_t Peek(uint32_t n = 0);
   void Advance(uint32_t n = 1);
   void AdvanceLine();
 
   void SkipWhitespace();
   void SkipComment();
 
   bool GatherEscape(nsString& aOutput, bool aInString);
-  bool GatherIdent(nsString& aIdent);
+  bool GatherText(uint8_t aClass, nsString& aIdent);
 
   bool ScanIdent(nsCSSToken& aResult);
   bool ScanAtKeyword(nsCSSToken& aResult);
   bool ScanHash(nsCSSToken& aResult);
   bool ScanNumber(nsCSSToken& aResult);
   bool ScanString(nsCSSToken& aResult);
   bool ScanURange(nsCSSToken& aResult);