Bug 1199775 - mozilla::Tokenizer improvements vol 2. r=nfroyd
authorHonza Bambas <honzab.moz@firemni.cz>
Wed, 02 Sep 2015 06:20:00 +0200
changeset 296230 3f38aafec98d7a2c170b069eb4cec3508659334c
parent 296229 c5d35e37655161dbcbfb7002f6eddcaff2c85c80
child 296231 aa5459a6703d6f96d45f7d5f58eaf49fec542b65
push id962
push userjlund@mozilla.com
push dateFri, 04 Dec 2015 23:28:54 +0000
treeherdermozilla-release@23a2d286e80f [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersnfroyd
bugs1199775
milestone43.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1199775 - mozilla::Tokenizer improvements vol 2. r=nfroyd
xpcom/ds/Tokenizer.cpp
xpcom/ds/Tokenizer.h
xpcom/tests/gtest/TestTokenizer.cpp
--- a/xpcom/ds/Tokenizer.cpp
+++ b/xpcom/ds/Tokenizer.cpp
@@ -37,16 +37,19 @@ Tokenizer::Next(Token& aToken)
 {
   if (!HasInput()) {
     mHasFailed = true;
     return false;
   }
 
   mRollback = mCursor;
   mCursor = Parse(aToken);
+
+  aToken.AssignFragment(mRollback, mCursor);
+
   mPastEof = aToken.Type() == TOKEN_EOF;
   mHasFailed = false;
   return true;
 }
 
 bool
 Tokenizer::Check(const TokenType aTokenType, Token& aResult)
 {
@@ -58,17 +61,21 @@ Tokenizer::Check(const TokenType aTokenT
   nsACString::const_char_iterator next = Parse(aResult);
   if (aTokenType != aResult.Type()) {
     mHasFailed = true;
     return false;
   }
 
   mRollback = mCursor;
   mCursor = next;
+
+  aResult.AssignFragment(mRollback, mCursor);
+
   mPastEof = aResult.Type() == TOKEN_EOF;
+  mHasFailed = false;
   return true;
 }
 
 bool
 Tokenizer::Check(const Token& aToken)
 {
   if (!HasInput()) {
     mHasFailed = true;
@@ -80,16 +87,17 @@ Tokenizer::Check(const Token& aToken)
   if (!aToken.Equals(parsed)) {
     mHasFailed = true;
     return false;
   }
 
   mRollback = mCursor;
   mCursor = next;
   mPastEof = parsed.Type() == TOKEN_EOF;
+  mHasFailed = false;
   return true;
 }
 
 bool
 Tokenizer::HasFailed() const
 {
   return mHasFailed;
 }
@@ -143,16 +151,29 @@ Tokenizer::ReadChar(char* aValue)
     return false;
   }
 
   *aValue = t.AsChar();
   return true;
 }
 
 bool
+Tokenizer::ReadChar(bool (*aClassifier)(const char aChar), char* aValue)
+{
+  MOZ_RELEASE_ASSERT(aValue);
+
+  if (!CheckChar(aClassifier)) {
+    return false;
+  }
+
+  *aValue = *mRollback;
+  return true;
+}
+
+bool
 Tokenizer::ReadWord(nsACString& aValue)
 {
   Token t;
   if (!Check(TOKEN_WORD, t)) {
     return false;
   }
 
   aValue.Assign(t.AsString());
@@ -194,16 +215,25 @@ void
 Tokenizer::Claim(nsACString& aResult, ClaimInclusion aInclusion)
 {
   nsACString::const_char_iterator close = aInclusion == EXCLUDE_LAST
     ? mRollback
     : mCursor;
   aResult.Assign(Substring(mRecord, close));
 }
 
+void
+Tokenizer::Claim(nsDependentCSubstring& aResult, ClaimInclusion aInclusion)
+{
+  nsACString::const_char_iterator close = aInclusion == EXCLUDE_LAST
+    ? mRollback
+    : mCursor;
+  aResult.Rebind(mRecord, close - mRecord);
+}
+
 // protected
 
 bool
 Tokenizer::HasInput() const
 {
   return !mPastEof;
 }
 
@@ -342,16 +372,23 @@ Tokenizer::Token::operator=(const Token&
 {
   mType = aOther.mType;
   mChar = aOther.mChar;
   mWord.Rebind(aOther.mWord.BeginReading(), aOther.mWord.Length());
   mInteger = aOther.mInteger;
   return *this;
 }
 
+void
+Tokenizer::Token::AssignFragment(nsACString::const_char_iterator begin,
+                                 nsACString::const_char_iterator end)
+{
+  mFragment.Rebind(begin, end - begin);
+}
+
 // static
 Tokenizer::Token
 Tokenizer::Token::Word(const nsACString& aValue)
 {
   Token t;
   t.mType = TOKEN_WORD;
   t.mWord.Rebind(aValue.BeginReading(), aValue.Length());
   return t;
--- a/xpcom/ds/Tokenizer.h
+++ b/xpcom/ds/Tokenizer.h
@@ -43,16 +43,25 @@ public:
    * the Tokenizer's methods.
    */
   class Token {
     TokenType mType;
     nsDependentCSubstring mWord;
     char mChar;
     uint64_t mInteger;
 
+    // If this token is the result of the parsing process, this member references
+    // a sub-string within the input buffer.  If this is an externally created
+    // Token, this member is left as an empty string.
+    nsDependentCSubstring mFragment;
+
+    friend class Tokenizer;
+    void AssignFragment(nsACString::const_char_iterator begin,
+                        nsACString::const_char_iterator end);
+
   public:
     Token() : mType(TOKEN_UNKNOWN), mChar(0), mInteger(0) {}
     Token(const Token& aOther);
     Token& operator=(const Token& aOther);
 
     // Static constructors of tokens by type and value
     static Token Word(const nsACString& aWord);
     static Token Char(const char aChar);
@@ -65,16 +74,18 @@ public:
     // Compares the two tokens, type must be identical and value
     // of one of the tokens must be 'any' or equal.
     bool Equals(const Token& aOther) const;
 
     TokenType Type() const { return mType; }
     char AsChar() const;
     nsDependentCSubstring AsString() const;
     uint64_t AsInteger() const;
+
+    nsDependentCSubstring Fragment() const { return mFragment; }
   };
 
 public:
   /**
    * @param aSource
    *    The string to parse.
    *    IMPORTANT NOTE: Tokenizer doesn't ensure the input string buffer lifetime.
    *    It's up to the consumer to make sure the string's buffer outlives the Tokenizer!
@@ -194,16 +205,17 @@ public:
    */
   MOZ_WARN_UNUSED_RESULT
   bool CheckEOF() { return Check(Token::EndOfFile()); }
 
   /**
    * These are shortcuts to obtain the value immediately when the token type matches.
    */
   bool ReadChar(char* aValue);
+  bool ReadChar(bool (*aClassifier)(const char aChar), char* aValue);
   bool ReadWord(nsACString& aValue);
   bool ReadWord(nsDependentCSubstring& aValue);
 
   /**
    * This is an integer read helper.  It returns false and doesn't move the read
    * cursor when any of the following happens:
    *  - the token at the read cursor is not an integer
    *  - the final number doesn't fit the T type
@@ -268,16 +280,17 @@ public:
    */
   void Record(ClaimInclusion aInclude = EXCLUDE_LAST);
   /**
    * Claim result of the record started with Record() call before.  Depending on aInclude
    * the ending of the sub-string result includes or excludes the last parsed or checked
    * token.
    */
   void Claim(nsACString& aResult, ClaimInclusion aInclude = EXCLUDE_LAST);
+  void Claim(nsDependentCSubstring& aResult, ClaimInclusion aInclude = EXCLUDE_LAST);
 
 protected:
   // false if we have already read the EOF token.
   bool HasInput() const;
   // Main parsing function, it doesn't shift the read cursor, just returns the next
   // token position.
   nsACString::const_char_iterator Parse(Token& aToken) const;
   // Is read cursor at the end?
@@ -285,35 +298,35 @@ protected:
   // Is read cursor on a character that is a word start?
   bool IsWordFirst(const char aInput) const;
   // Is read cursor on a character that is an in-word letter?
   bool IsWord(const char aInput) const;
   // Is read cursor on a character that is a valid number?
   // TODO - support multiple radix
   bool IsNumber(const char aInput) const;
 
-private:
-  Tokenizer() = delete;
-  Tokenizer(const Tokenizer&) = delete;
-  Tokenizer(Tokenizer&&) = delete;
-  Tokenizer(const Tokenizer&&) = delete;
-  Tokenizer &operator=(const Tokenizer&) = delete;
-
   // true iff we have already read the EOF token
   bool mPastEof;
   // true iff the last Check*() call has returned false, reverts to true on Rollback() call
   bool mHasFailed;
 
   // Customizable list of whitespaces
   const char* mWhitespaces;
   // Additinal custom word characters
   const char* mAdditionalWordChars;
 
   // All these point to the original buffer passed to the Tokenizer
   nsACString::const_char_iterator mRecord; // Position where the recorded sub-string for Claim() is
   nsACString::const_char_iterator mRollback; // Position of the previous token start
   nsACString::const_char_iterator mCursor; // Position of the current (actually next to read) token start
   nsACString::const_char_iterator mEnd; // End of the input position
+
+private:
+  Tokenizer() = delete;
+  Tokenizer(const Tokenizer&) = delete;
+  Tokenizer(Tokenizer&&) = delete;
+  Tokenizer(const Tokenizer&&) = delete;
+  Tokenizer &operator=(const Tokenizer&) = delete;
 };
 
 } // mozilla
 
 #endif // Tokenizer_h__
--- a/xpcom/tests/gtest/TestTokenizer.cpp
+++ b/xpcom/tests/gtest/TestTokenizer.cpp
@@ -267,30 +267,39 @@ TEST(Tokenizer, HasFailed)
   // HasFailed test
 
   Tokenizer p1(NS_LITERAL_CSTRING("a b"));
 
   while (p1.Next(t) && t.Type() != Tokenizer::TOKEN_CHAR);
   EXPECT_TRUE(p1.HasFailed());
 
 
-  Tokenizer p2(NS_LITERAL_CSTRING("a b"));
+  Tokenizer p2(NS_LITERAL_CSTRING("a b ?!c"));
 
   EXPECT_FALSE(p2.CheckChar('c'));
   EXPECT_TRUE(p2.HasFailed());
   EXPECT_TRUE(p2.CheckChar(HttpHeaderCharacter));
   EXPECT_FALSE(p2.HasFailed());
   p2.SkipWhites();
   EXPECT_FALSE(p2.HasFailed());
+  EXPECT_FALSE(p2.CheckChar('c'));
+  EXPECT_TRUE(p2.HasFailed());
   EXPECT_TRUE(p2.Next(t));
   EXPECT_FALSE(p2.HasFailed());
   EXPECT_TRUE(p2.Next(t));
   EXPECT_FALSE(p2.HasFailed());
   EXPECT_FALSE(p2.CheckChar('c'));
   EXPECT_TRUE(p2.HasFailed());
+  EXPECT_TRUE(p2.Check(Tokenizer::TOKEN_CHAR, t));
+  EXPECT_FALSE(p2.HasFailed());
+  EXPECT_FALSE(p2.CheckChar('#'));
+  EXPECT_TRUE(p2.HasFailed());
+  t = Tokenizer::Token::Char('!');
+  EXPECT_TRUE(p2.Check(t));
+  EXPECT_FALSE(p2.HasFailed());
 
   while (p2.Next(t) && t.Type() != Tokenizer::TOKEN_CHAR);
   EXPECT_TRUE(p2.HasFailed());
 }
 
 TEST(Tokenizer, Construction)
 {
   {
@@ -387,16 +396,120 @@ TEST(Tokenizer, ShortcutChecks)
   EXPECT_TRUE(test2 == "test2");
   EXPECT_TRUE(p.ReadChar(&comma));
   EXPECT_TRUE(comma == ',');
   EXPECT_TRUE(p.ReadInteger(&integer));
   EXPECT_TRUE(integer == 123);
   EXPECT_TRUE(p.CheckEOF());
 }
 
+static bool ABChar(const char aChar)
+{
+  return aChar == 'a' || aChar == 'b';
+}
+
+TEST(Tokenizer, ReadCharClassified)
+{
+  Tokenizer p("abc");
+
+  char c;
+  EXPECT_TRUE(p.ReadChar(ABChar, &c));
+  EXPECT_TRUE(c == 'a');
+  EXPECT_TRUE(p.ReadChar(ABChar, &c));
+  EXPECT_TRUE(c == 'b');
+  EXPECT_FALSE(p.ReadChar(ABChar, &c));
+  nsDependentCSubstring w;
+  EXPECT_TRUE(p.ReadWord(w));
+  EXPECT_TRUE(w == "c");
+  EXPECT_TRUE(p.CheckEOF());
+}
+
+TEST(Tokenizer, ClaimSubstring)
+{
+  Tokenizer p(" abc ");
+
+  EXPECT_TRUE(p.CheckWhite());
+
+  p.Record();
+  EXPECT_TRUE(p.CheckWord("abc"));
+  nsDependentCSubstring v;
+  p.Claim(v, Tokenizer::INCLUDE_LAST);
+  EXPECT_TRUE(v == "abc");
+  EXPECT_TRUE(p.CheckWhite());
+  EXPECT_TRUE(p.CheckEOF());
+}
+
+TEST(Tokenizer, Fragment)
+{
+  const char str[] = "ab;cd:10 ";
+  Tokenizer p(str);
+  nsDependentCSubstring f;
+
+  Tokenizer::Token t1, t2;
+
+  EXPECT_TRUE(p.Next(t1));
+  EXPECT_TRUE(t1.Type() == Tokenizer::TOKEN_WORD);
+  EXPECT_TRUE(t1.Fragment() == "ab");
+  EXPECT_TRUE(t1.Fragment().BeginReading() == &str[0]);
+
+  p.Rollback();
+  EXPECT_TRUE(p.Check(Tokenizer::TOKEN_WORD, t2));
+  EXPECT_TRUE(t2.Fragment() == "ab");
+  EXPECT_TRUE(t2.Fragment().BeginReading() == &str[0]);
+
+
+  EXPECT_TRUE(p.Next(t1));
+  EXPECT_TRUE(t1.Type() == Tokenizer::TOKEN_CHAR);
+  EXPECT_TRUE(t1.Fragment() == ";");
+  EXPECT_TRUE(t1.Fragment().BeginReading() == &str[2]);
+
+  p.Rollback();
+  EXPECT_TRUE(p.Check(Tokenizer::TOKEN_CHAR, t2));
+  EXPECT_TRUE(t2.Fragment() == ";");
+  EXPECT_TRUE(t2.Fragment().BeginReading() == &str[2]);
+
+
+  EXPECT_TRUE(p.Check(Tokenizer::TOKEN_WORD, t2));
+  EXPECT_TRUE(t2.Fragment() == "cd");
+  EXPECT_TRUE(t2.Fragment().BeginReading() == &str[3]);
+
+  p.Rollback();
+  EXPECT_TRUE(p.Next(t1));
+  EXPECT_TRUE(t1.Type() == Tokenizer::TOKEN_WORD);
+  EXPECT_TRUE(t1.Fragment() == "cd");
+  EXPECT_TRUE(t1.Fragment().BeginReading() == &str[3]);
+
+
+  EXPECT_TRUE(p.Check(Tokenizer::TOKEN_CHAR, t2));
+  EXPECT_TRUE(t2.Fragment() == ":");
+  EXPECT_TRUE(t2.Fragment().BeginReading() == &str[5]);
+
+  p.Rollback();
+  EXPECT_TRUE(p.Next(t1));
+  EXPECT_TRUE(t1.Type() == Tokenizer::TOKEN_CHAR);
+  EXPECT_TRUE(t1.Fragment() == ":");
+  EXPECT_TRUE(t1.Fragment().BeginReading() == &str[5]);
+
+
+  EXPECT_TRUE(p.Next(t1));
+  EXPECT_TRUE(t1.Type() == Tokenizer::TOKEN_INTEGER);
+  EXPECT_TRUE(t1.Fragment() == "10");
+  EXPECT_TRUE(t1.Fragment().BeginReading() == &str[6]);
+
+
+  EXPECT_TRUE(p.Check(Tokenizer::TOKEN_WS, t2));
+  EXPECT_TRUE(t2.Fragment() == " ");
+  EXPECT_TRUE(t2.Fragment().BeginReading() == &str[8]);
+
+
+  EXPECT_TRUE(p.Check(Tokenizer::TOKEN_EOF, t1));
+  EXPECT_TRUE(t1.Fragment() == "");
+  EXPECT_TRUE(t1.Fragment().BeginReading() == &str[9]);
+}
+
 TEST(Tokenizer, SkipWhites)
 {
   Tokenizer p("Text1 \nText2 \nText3\n Text4\n ");
 
   EXPECT_TRUE(p.CheckWord("Text1"));
   p.SkipWhites();
   EXPECT_TRUE(p.CheckEOL());