new file mode 100644
--- /dev/null
+++ b/xpcom/ds/IncrementalTokenizer.cpp
@@ -0,0 +1,195 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim: set ts=8 sts=2 et sw=2 tw=80: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "mozilla/IncrementalTokenizer.h"
+
+#include "mozilla/AutoRestore.h"
+
+#include "nsIInputStream.h"
+#include "IncrementalTokenizer.h"
+#include <algorithm>
+
+namespace mozilla {
+
+IncrementalTokenizer::IncrementalTokenizer(Consumer aConsumer,
+ const char * aWhitespaces,
+ const char * aAdditionalWordChars,
+ uint32_t aRawMinBuffered)
+ : TokenizerBase(aWhitespaces, aAdditionalWordChars)
+#ifdef DEBUG
+ , mConsuming(false)
+#endif
+ , mNeedMoreInput(false)
+ , mRollback(false)
+ , mInputCursor(0)
+ , mConsumer(aConsumer)
+{
+ mInputFinished = false;
+ mMinRawDelivery = aRawMinBuffered;
+}
+
+nsresult IncrementalTokenizer::FeedInput(const nsACString & aInput)
+{
+ NS_ENSURE_TRUE(mConsumer, NS_ERROR_NOT_INITIALIZED);
+ MOZ_ASSERT(!mInputFinished);
+
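+ // Drop the already consumed part of the buffer before appending new input.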
+ mInput.Cut(0, mInputCursor);
+ mInputCursor = 0;
+
+ mInput.Append(aInput);
+
+ return Process();
+}
+
+nsresult IncrementalTokenizer::FeedInput(nsIInputStream * aInput, uint32_t aCount)
+{
+ NS_ENSURE_TRUE(mConsumer, NS_ERROR_NOT_INITIALIZED);
+ MOZ_ASSERT(!mInputFinished);
+ MOZ_ASSERT(!mConsuming);
+
+ mInput.Cut(0, mInputCursor);
+ mInputCursor = 0;
+
+ nsresult rv = NS_OK;
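+ // Read the stream in chunks that keep the buffer within the maximum
+ // nsCString length, tokenizing after each chunk.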
+ while (NS_SUCCEEDED(rv) && aCount) {
+ nsCString::index_type remainder = mInput.Length();
+ nsCString::index_type load =
+ std::min<nsCString::index_type>(aCount, PR_UINT32_MAX - remainder);
+
+ if (!load) {
+ // To keep the API simple, we fail when the input data buffer is filled up.
+ // It's highly unlikely there will ever be such an amount of data accumulated
+ // unless there is a logic fault in the consumer code.
+ NS_ERROR("IncrementalTokenizer consumer not reading data?");
+ return NS_ERROR_OUT_OF_MEMORY;
+ }
+
+ if (!mInput.SetLength(remainder + load, fallible)) {
+ return NS_ERROR_OUT_OF_MEMORY;
+ }
+
+ nsCString::char_iterator buffer = mInput.BeginWriting() + remainder;
+
+ uint32_t read;
+ rv = aInput->Read(buffer, load, &read);
+ if (NS_SUCCEEDED(rv)) {
+ // remainder + load fits the uint32_t size, so must remainder + read.
+ mInput.SetLength(remainder + read);
+ aCount -= read;
+
+ rv = Process();
+ }
+ }
+
+ return rv;
+}
+
+nsresult IncrementalTokenizer::FinishInput()
+{
+ NS_ENSURE_TRUE(mConsumer, NS_ERROR_NOT_INITIALIZED);
+ MOZ_ASSERT(!mInputFinished);
+ MOZ_ASSERT(!mConsuming);
+
+ mInput.Cut(0, mInputCursor);
+ mInputCursor = 0;
+
+ mInputFinished = true;
+ nsresult rv = Process();
+ mConsumer = nullptr;
+ return rv;
+}
+
+bool IncrementalTokenizer::Next(Token & aToken)
+{
+ // Assert we are called only from the consumer callback
+ MOZ_ASSERT(mConsuming);
+
+ if (mPastEof) {
+ return false;
+ }
+
+ nsACString::const_char_iterator next = Parse(aToken);
+ mPastEof = aToken.Type() == TOKEN_EOF;
+ if (next == mCursor && !mPastEof) {
+ // Not enough input to make a deterministic decision.
+ return false;
+ }
+
+ AssignFragment(aToken, mCursor, next);
+ mCursor = next;
+ return true;
+}
+
+void IncrementalTokenizer::NeedMoreInput()
+{
+ // Assert we are called only from the consumer callback
+ MOZ_ASSERT(mConsuming);
+
+ // Once the input has been finished we must not set the flag, to prevent
+ // an indefinite wait for more input (which will never come)
+ mNeedMoreInput = !mInputFinished;
+}
+
+void IncrementalTokenizer::Rollback()
+{
+ // Assert we are called only from the consumer callback
+ MOZ_ASSERT(mConsuming);
+
+ mRollback = true;
+}
+
+nsresult IncrementalTokenizer::Process()
+{
+#ifdef DEBUG
+ // Assert we are not re-entered
+ MOZ_ASSERT(!mConsuming);
+
+ AutoRestore<bool> consuming(mConsuming);
+ mConsuming = true;
+#endif
+
+ MOZ_ASSERT(!mPastEof);
+
+ nsresult rv = NS_OK;
+
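+ // Re-derive the iterators from the numerical index; the string buffer may
+ // have been reallocated or cut since the previous call.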
+ mInput.BeginReading(mCursor);
+ mCursor += mInputCursor;
+ mInput.EndReading(mEnd);
+
+ while (NS_SUCCEEDED(rv) && !mPastEof) {
+ Token token;
+ nsACString::const_char_iterator next = Parse(token);
+ mPastEof = token.Type() == TOKEN_EOF;
+ if (next == mCursor && !mPastEof) {
+ // Not enough input to make a deterministic decision.
+ break;
+ }
+
+ AssignFragment(token, mCursor, next);
+
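+ // Remember the start of the current token so we can revert the cursor
+ // when the consumer requests Rollback() or NeedMoreInput().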
+ nsACString::const_char_iterator rollback = mCursor;
+ mCursor = next;
+
+ mNeedMoreInput = mRollback = false;
+
+ rv = mConsumer(token, *this);
+ if (NS_FAILED(rv)) {
+ break;
+ }
+ if (mNeedMoreInput || mRollback) {
+ mCursor = rollback;
+ mPastEof = false;
+ if (mNeedMoreInput) {
+ break;
+ }
+ }
+ }
+
+ mInputCursor = mCursor - mInput.BeginReading();
+ return rv;
+}
+
+} // mozilla
new file mode 100644
--- /dev/null
+++ b/xpcom/ds/IncrementalTokenizer.h
@@ -0,0 +1,122 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim: set ts=8 sts=2 et sw=2 tw=80: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef INCREMENTAL_TOKENIZER_H__
+#define INCREMENTAL_TOKENIZER_H__
+
+#include "mozilla/Tokenizer.h"
+
+#include "nsError.h"
+#include <functional>
+
+class nsIInputStream;
+
+namespace mozilla {
+
+class IncrementalTokenizer : public TokenizerBase
+{
+public:
+ /**
+ * The consumer callback. The function is called for every single token
+ * found in the input. A failure result returned by this callback stops
+ * the tokenization immediately and bubbles up as the result of
+ * Feed/FinishInput.
+ *
+ * Fragment()s of consumed tokens are guaranteed to remain valid until the
+ * next call to Feed/FinishInput and point into a single linear buffer.
+ * Hence, they can safely be used to accumulate data for processing after
+ * Feed/FinishInput has returned.
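+ *
+ * A minimal consumer might look like this (a sketch; the names are
+ * illustrative):
+ *
+ *   IncrementalTokenizer::Consumer consumer =
+ *     [](TokenizerBase::Token const& aToken, IncrementalTokenizer& aTokenizer) -> nsresult
+ *   {
+ *     if (aToken.Type() == TokenizerBase::TOKEN_WORD) {
+ *       // process aToken.AsString() here...
+ *     }
+ *     return NS_OK; // any failure result stops the tokenization
+ *   };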
+ */
+ typedef std::function<nsresult(Token const&, IncrementalTokenizer& i)> Consumer;
+
+ /**
+ * For aWhitespaces and aAdditionalWordChars arguments see TokenizerBase.
+ *
+ * @param aConsumer
+ * A mandatory non-null argument, a function that consumes the tokens as they
+ * come when the tokenizer is fed.
+ * @param aRawMinBuffered
+ * When we have buffered at least aRawMinBuffered bytes but no custom token
+ * has been found so far (because the incremental feed chunks are too small),
+ * deliver the buffered raw data to preserve streaming and to save memory.
+ * This only has an effect in the Mode::CUSTOM_ONLY tokenizing mode.
+ */
+ explicit IncrementalTokenizer(Consumer aConsumer,
+ const char* aWhitespaces = nullptr,
+ const char* aAdditionalWordChars = nullptr,
+ uint32_t aRawMinBuffered = 1024);
+
+ /**
+ * Pushes the input to be tokenized. These methods directly call the Consumer
+ * callback on every found token. The result of the Consumer callback is
+ * returned here.
+ *
+ * The tokenizer must be initialized with a valid consumer prior to calling
+ * these methods. It's not allowed to call Feed/FinishInput from inside the
+ * Consumer callback.
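+ *
+ * A typical feeding sequence (a sketch; the literal inputs and the consumer
+ * are illustrative):
+ *
+ *   IncrementalTokenizer t(consumer);
+ *   nsresult rv = t.FeedInput(NS_LITERAL_CSTRING("some,"));
+ *   if (NS_SUCCEEDED(rv)) {
+ *     rv = t.FeedInput(NS_LITERAL_CSTRING("input"));
+ *   }
+ *   if (NS_SUCCEEDED(rv)) {
+ *     rv = t.FinishInput();
+ *   }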
+ */
+ nsresult FeedInput(const nsACString& aInput);
+ nsresult FeedInput(nsIInputStream* aInput, uint32_t aCount);
+ nsresult FinishInput();
+
+ /**
+ * Can only be called from inside the consumer callback.
+ *
+ * When there is still anything to read from the input, tokenize it, store
+ * the token type and value to aToken result and shift the cursor past this
+ * just parsed token. Each call to Next() reads another token from
+ * the input and shifts the cursor.
+ *
+ * Returns false if there is not enough data to deterministically recognize
+ * tokens or when the last returned token was EOF.
+ */
+ MOZ_MUST_USE
+ bool Next(Token& aToken);
+
+ /**
+ * Can only be called from inside the consumer callback.
+ *
+ * Tells the tokenizer to revert the cursor and stop parsing until the next
+ * feed of the input. This is useful when more than one token is needed
+ * to decide on the syntax but there is not enough input to get the next
+ * token (Next() returned false).
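+ *
+ * A sketch of the intended pattern inside a consumer callback (where i is
+ * the IncrementalTokenizer& passed to the callback):
+ *
+ *   Token t2;
+ *   if (!i.Next(t2)) {
+ *     i.NeedMoreInput(); // the current token will be delivered again later
+ *     return NS_OK;
+ *   }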
+ */
+ void NeedMoreInput();
+
+ /**
+ * Can only be called from inside the consumer callback.
+ *
+ * This makes the tokenizer parse the input from the previous cursor position
+ * again and call the consumer callback with the same token once more. This
+ * is useful when the tokenizer state (custom tokens, tokenization mode) has
+ * changed and we want to re-parse the input with the new settings.
+ */
+ void Rollback();
+
+private:
+ // Loops over the input with TokenizerBase::Parse and calls the Consumer callback.
+ nsresult Process();
+
+#ifdef DEBUG
+ // True when inside the consumer callback, used only for assertions.
+ bool mConsuming;
+#endif // DEBUG
+ // Modifiable only from the Consumer callback; tells the parser to break,
+ // roll back and wait for more input.
+ bool mNeedMoreInput;
+ // Modifiable only from the Consumer callback; tells the parser to roll back
+ // and parse the input again, with the (possibly modified) new settings of
+ // the tokenizer.
+ bool mRollback;
+ // The input buffer. Updated with each call to Feed/FinishInput.
+ nsCString mInput;
+ // Numerical index pointing at the current cursor position. We don't keep a
+ // direct reference into the string buffer since the buffer often gets reallocated.
+ nsCString::index_type mInputCursor;
+ // Reference to the consumer function.
+ Consumer mConsumer;
+};
+
+} // mozilla
+
#endif // INCREMENTAL_TOKENIZER_H__
--- a/xpcom/ds/Tokenizer.cpp
+++ b/xpcom/ds/Tokenizer.cpp
@@ -2,29 +2,28 @@
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "Tokenizer.h"
#include "nsUnicharUtils.h"
+#include <algorithm>
namespace mozilla {
static const char sWhitespaces[] = " \t";
Tokenizer::Tokenizer(const nsACString& aSource,
const char* aWhitespaces,
const char* aAdditionalWordChars)
- : mPastEof(false)
- , mHasFailed(false)
- , mWhitespaces(aWhitespaces ? aWhitespaces : sWhitespaces)
- , mAdditionalWordChars(aAdditionalWordChars)
+ : TokenizerBase(aWhitespaces, aAdditionalWordChars)
{
+ mInputFinished = true;
aSource.BeginReading(mCursor);
mRecord = mRollback = mCursor;
aSource.EndReading(mEnd);
}
Tokenizer::Tokenizer(const char* aSource,
const char* aWhitespaces,
const char* aAdditionalWordChars)
@@ -38,17 +37,17 @@ Tokenizer::Next(Token& aToken)
if (!HasInput()) {
mHasFailed = true;
return false;
}
mRollback = mCursor;
mCursor = Parse(aToken);
- aToken.AssignFragment(mRollback, mCursor);
+ AssignFragment(aToken, mRollback, mCursor);
mPastEof = aToken.Type() == TOKEN_EOF;
mHasFailed = false;
return true;
}
bool
Tokenizer::Check(const TokenType aTokenType, Token& aResult)
@@ -62,17 +61,17 @@ Tokenizer::Check(const TokenType aTokenT
if (aTokenType != aResult.Type()) {
mHasFailed = true;
return false;
}
mRollback = mCursor;
mCursor = next;
- aResult.AssignFragment(mRollback, mCursor);
+ AssignFragment(aResult, mRollback, mCursor);
mPastEof = aResult.Type() == TOKEN_EOF;
mHasFailed = false;
return true;
}
bool
Tokenizer::Check(const Token& aToken)
@@ -91,22 +90,16 @@ Tokenizer::Check(const Token& aToken)
mRollback = mCursor;
mCursor = next;
mPastEof = parsed.Type() == TOKEN_EOF;
mHasFailed = false;
return true;
}
-bool
-Tokenizer::HasFailed() const
-{
- return mHasFailed;
-}
-
void
Tokenizer::SkipWhites(WhiteSkipping aIncludeNewLines)
{
if (!CheckWhite() && (aIncludeNewLines == DONT_INCLUDE_NEW_LINE || !CheckEOL())) {
return;
}
nsACString::const_char_iterator rollback = mRollback;
@@ -270,34 +263,166 @@ void
Tokenizer::Claim(nsDependentCSubstring& aResult, ClaimInclusion aInclusion)
{
nsACString::const_char_iterator close = aInclusion == EXCLUDE_LAST
? mRollback
: mCursor;
aResult.Rebind(mRecord, close - mRecord);
}
-// protected
+// TokenizerBase
+
+TokenizerBase::TokenizerBase(const char* aWhitespaces,
+ const char* aAdditionalWordChars)
+ : mPastEof(false)
+ , mHasFailed(false)
+ , mInputFinished(true)
+ , mMode(Mode::FULL)
+ , mMinRawDelivery(1024)
+ , mWhitespaces(aWhitespaces ? aWhitespaces : sWhitespaces)
+ , mAdditionalWordChars(aAdditionalWordChars)
+ , mCursor(nullptr)
+ , mEnd(nullptr)
+ , mNextCustomTokenID(TOKEN_CUSTOM0)
+{
+}
+
+TokenizerBase::Token
+TokenizerBase::AddCustomToken(const nsACString & aValue,
+ ECaseSensitivity aCaseInsensitivity, bool aEnabled)
+{
+ MOZ_ASSERT(!aValue.IsEmpty());
+
+ UniquePtr<Token>& t = *mCustomTokens.AppendElement();
+ t = MakeUnique<Token>();
+
+ t->mType = static_cast<TokenType>(++mNextCustomTokenID);
+ t->mCustomCaseInsensitivity = aCaseInsensitivity;
+ t->mCustomEnabled = aEnabled;
+ t->mCustom.Assign(aValue);
+ return *t;
+}
+
+void
+TokenizerBase::RemoveCustomToken(Token& aToken)
+{
+ if (aToken.mType == TOKEN_UNKNOWN) {
+ // Already removed
+ return;
+ }
+
+ for (UniquePtr<Token> const& custom : mCustomTokens) {
+ if (custom->mType == aToken.mType) {
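+ // RemoveElement() destroys the stored Token instance; resetting aToken's
+ // type makes the caller's copy inert.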
+ mCustomTokens.RemoveElement(custom);
+ aToken.mType = TOKEN_UNKNOWN;
+ return;
+ }
+ }
+
+ MOZ_ASSERT(false, "Token to remove not found");
+}
+
+void
+TokenizerBase::EnableCustomToken(Token const& aToken, bool aEnabled)
+{
+ if (aToken.mType == TOKEN_UNKNOWN) {
+ // Already removed
+ return;
+ }
+
+ for (UniquePtr<Token> const& custom : mCustomTokens) {
+ if (custom->Type() == aToken.Type()) {
+ // Only flip the enabled flag on the stored token; the instance stays alive.
+ custom->mCustomEnabled = aEnabled;
+ return;
+ }
+ }
+
+ MOZ_ASSERT(false, "Token to change not found");
+}
+
+void
+TokenizerBase::SetTokenizingMode(Mode aMode)
+{
+ mMode = aMode;
+}
bool
-Tokenizer::HasInput() const
+TokenizerBase::HasFailed() const
+{
+ return mHasFailed;
+}
+
+bool
+TokenizerBase::HasInput() const
{
return !mPastEof;
}
nsACString::const_char_iterator
-Tokenizer::Parse(Token& aToken) const
+TokenizerBase::Parse(Token& aToken) const
{
if (mCursor == mEnd) {
+ if (!mInputFinished) {
+ return mCursor;
+ }
+
aToken = Token::EndOfFile();
return mEnd;
}
+ nsACString::size_type available = mEnd - mCursor;
+
+ uint32_t longestCustom = 0;
+ for (UniquePtr<Token> const& custom : mCustomTokens) {
+ if (IsCustom(mCursor, *custom, &longestCustom)) {
+ aToken = *custom;
+ return mCursor + custom->mCustom.Length();
+ }
+ }
+
+ if (!mInputFinished && available < longestCustom) {
+ // Not enough data to deterministically decide.
+ return mCursor;
+ }
+
nsACString::const_char_iterator next = mCursor;
+ if (mMode == Mode::CUSTOM_ONLY) {
+ // We have to do a brute-force search for all of the enabled custom
+ // tokens.
+ while (next < mEnd) {
+ ++next;
+ for (UniquePtr<Token> const& custom : mCustomTokens) {
+ if (IsCustom(next, *custom)) {
+ aToken = Token::Raw();
+ return next;
+ }
+ }
+ }
+
+ if (mInputFinished) {
+ // End of the data reached.
+ aToken = Token::Raw();
+ return next;
+ }
+
+ if (longestCustom < available && available > mMinRawDelivery) {
+ // We can deliver some raw data without waiting for either a custom
+ // token or a call to FinishInput(), as long as we keep the tail in
+ // which any of the custom tokens could still fit; that way no
+ // partially delivered token can be lost. This preserves reasonable
+ // granularity.
+ aToken = Token::Raw();
+ return mEnd - longestCustom + 1;
+ }
+
+ // Not enough data to deterministically decide.
+ return mCursor;
+ }
+
enum State {
PARSE_INTEGER,
PARSE_WORD,
PARSE_CRLF,
PARSE_LF,
PARSE_WS,
PARSE_CHAR,
} state;
@@ -321,36 +446,45 @@ Tokenizer::Parse(Token& aToken) const
while (next < mEnd) {
switch (state) {
case PARSE_INTEGER:
// Keep it simple for now
resultingNumber *= 10;
resultingNumber += static_cast<uint64_t>(*next - '0');
++next;
+ if (IsPending(next)) {
+ break;
+ }
if (IsEnd(next) || !IsNumber(*next)) {
if (!resultingNumber.isValid()) {
aToken = Token::Error();
} else {
aToken = Token::Number(resultingNumber.value());
}
return next;
}
break;
case PARSE_WORD:
++next;
+ if (IsPending(next)) {
+ break;
+ }
if (IsEnd(next) || !IsWord(*next)) {
aToken = Token::Word(Substring(mCursor, next));
return next;
}
break;
case PARSE_CRLF:
++next;
+ if (IsPending(next)) {
+ break;
+ }
if (!IsEnd(next) && *next == '\n') { // LF is optional
++next;
}
aToken = Token::NewLine();
return next;
case PARSE_LF:
++next;
@@ -364,146 +498,210 @@ Tokenizer::Parse(Token& aToken) const
case PARSE_CHAR:
++next;
aToken = Token::Char(*mCursor);
return next;
} // switch (state)
} // while (next < end)
- return next;
+ MOZ_ASSERT(!mInputFinished);
+ return mCursor;
}
bool
-Tokenizer::IsEnd(const nsACString::const_char_iterator& caret) const
+TokenizerBase::IsEnd(const nsACString::const_char_iterator& caret) const
{
return caret == mEnd;
}
bool
-Tokenizer::IsWordFirst(const char aInput) const
+TokenizerBase::IsPending(const nsACString::const_char_iterator& caret) const
+{
+ return IsEnd(caret) && !mInputFinished;
+}
+
+bool
+TokenizerBase::IsWordFirst(const char aInput) const
{
// TODO: make this fully work with unicode
return (ToLowerCase(static_cast<uint32_t>(aInput)) !=
ToUpperCase(static_cast<uint32_t>(aInput))) ||
'_' == aInput ||
(mAdditionalWordChars ? !!strchr(mAdditionalWordChars, aInput) : false);
}
bool
-Tokenizer::IsWord(const char aInput) const
+TokenizerBase::IsWord(const char aInput) const
{
return IsWordFirst(aInput) || IsNumber(aInput);
}
bool
-Tokenizer::IsNumber(const char aInput) const
+TokenizerBase::IsNumber(const char aInput) const
{
// TODO: are there unicode numbers?
return aInput >= '0' && aInput <= '9';
}
-// Tokenizer::Token
+bool
+TokenizerBase::IsCustom(const nsACString::const_char_iterator & caret,
+ const Token & aCustomToken,
+ uint32_t * aLongest) const
+{
+ MOZ_ASSERT(aCustomToken.mType > TOKEN_CUSTOM0);
+ if (!aCustomToken.mCustomEnabled) {
+ return false;
+ }
+
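+ // Remember the longest enabled custom token, so the caller knows how much
+ // tail data has to be retained for a potential partial match.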
+ if (aLongest) {
+ *aLongest = std::max(*aLongest, aCustomToken.mCustom.Length());
+ }
+
+ uint32_t inputLength = mEnd - caret;
+ if (aCustomToken.mCustom.Length() > inputLength) {
+ return false;
+ }
-Tokenizer::Token::Token(const Token& aOther)
+ nsDependentCSubstring inputFragment(caret, aCustomToken.mCustom.Length());
+ if (aCustomToken.mCustomCaseInsensitivity == CASE_INSENSITIVE) {
+ return inputFragment.Equals(aCustomToken.mCustom, nsCaseInsensitiveUTF8StringComparator());
+ }
+ return inputFragment.Equals(aCustomToken.mCustom);
+}
+
+void TokenizerBase::AssignFragment(Token& aToken,
+ nsACString::const_char_iterator begin,
+ nsACString::const_char_iterator end)
+{
+ aToken.AssignFragment(begin, end);
+}
+
+// TokenizerBase::Token
+
+TokenizerBase::Token::Token()
+ : mType(TOKEN_UNKNOWN)
+ , mChar(0)
+ , mInteger(0)
+ , mCustomCaseInsensitivity(CASE_SENSITIVE)
+ , mCustomEnabled(false)
+{
+}
+
+TokenizerBase::Token::Token(const Token& aOther)
: mType(aOther.mType)
+ , mCustom(aOther.mCustom)
, mChar(aOther.mChar)
, mInteger(aOther.mInteger)
+ , mCustomCaseInsensitivity(aOther.mCustomCaseInsensitivity)
+ , mCustomEnabled(aOther.mCustomEnabled)
{
- if (mType == TOKEN_WORD) {
+ if (mType == TOKEN_WORD || mType > TOKEN_CUSTOM0) {
mWord.Rebind(aOther.mWord.BeginReading(), aOther.mWord.Length());
}
}
-Tokenizer::Token&
-Tokenizer::Token::operator=(const Token& aOther)
+TokenizerBase::Token&
+TokenizerBase::Token::operator=(const Token& aOther)
{
mType = aOther.mType;
+ mCustom = aOther.mCustom;
mChar = aOther.mChar;
mWord.Rebind(aOther.mWord.BeginReading(), aOther.mWord.Length());
mInteger = aOther.mInteger;
+ mCustomCaseInsensitivity = aOther.mCustomCaseInsensitivity;
+ mCustomEnabled = aOther.mCustomEnabled;
return *this;
}
void
-Tokenizer::Token::AssignFragment(nsACString::const_char_iterator begin,
- nsACString::const_char_iterator end)
+TokenizerBase::Token::AssignFragment(nsACString::const_char_iterator begin,
+ nsACString::const_char_iterator end)
{
mFragment.Rebind(begin, end - begin);
}
// static
-Tokenizer::Token
-Tokenizer::Token::Word(const nsACString& aValue)
+TokenizerBase::Token
+TokenizerBase::Token::Raw()
+{
+ Token t;
+ t.mType = TOKEN_RAW;
+ return t;
+}
+
+// static
+TokenizerBase::Token
+TokenizerBase::Token::Word(const nsACString& aValue)
{
Token t;
t.mType = TOKEN_WORD;
t.mWord.Rebind(aValue.BeginReading(), aValue.Length());
return t;
}
// static
-Tokenizer::Token
-Tokenizer::Token::Char(const char aValue)
+TokenizerBase::Token
+TokenizerBase::Token::Char(const char aValue)
{
Token t;
t.mType = TOKEN_CHAR;
t.mChar = aValue;
return t;
}
// static
-Tokenizer::Token
-Tokenizer::Token::Number(const uint64_t aValue)
+TokenizerBase::Token
+TokenizerBase::Token::Number(const uint64_t aValue)
{
Token t;
t.mType = TOKEN_INTEGER;
t.mInteger = aValue;
return t;
}
// static
-Tokenizer::Token
-Tokenizer::Token::Whitespace()
+TokenizerBase::Token
+TokenizerBase::Token::Whitespace()
{
Token t;
t.mType = TOKEN_WS;
t.mChar = '\0';
return t;
}
// static
-Tokenizer::Token
-Tokenizer::Token::NewLine()
+TokenizerBase::Token
+TokenizerBase::Token::NewLine()
{
Token t;
t.mType = TOKEN_EOL;
return t;
}
// static
-Tokenizer::Token
-Tokenizer::Token::EndOfFile()
+TokenizerBase::Token
+TokenizerBase::Token::EndOfFile()
{
Token t;
t.mType = TOKEN_EOF;
return t;
}
// static
-Tokenizer::Token
-Tokenizer::Token::Error()
+TokenizerBase::Token
+TokenizerBase::Token::Error()
{
Token t;
t.mType = TOKEN_ERROR;
return t;
}
bool
-Tokenizer::Token::Equals(const Token& aOther) const
+TokenizerBase::Token::Equals(const Token& aOther) const
{
if (mType != aOther.mType) {
return false;
}
switch (mType) {
case TOKEN_INTEGER:
return AsInteger() == aOther.AsInteger();
@@ -512,29 +710,29 @@ Tokenizer::Token::Equals(const Token& aO
case TOKEN_CHAR:
return AsChar() == aOther.AsChar();
default:
return true;
}
}
char
-Tokenizer::Token::AsChar() const
+TokenizerBase::Token::AsChar() const
{
MOZ_ASSERT(mType == TOKEN_CHAR || mType == TOKEN_WS);
return mChar;
}
nsDependentCSubstring
-Tokenizer::Token::AsString() const
+TokenizerBase::Token::AsString() const
{
MOZ_ASSERT(mType == TOKEN_WORD);
return mWord;
}
uint64_t
-Tokenizer::Token::AsInteger() const
+TokenizerBase::Token::AsInteger() const
{
MOZ_ASSERT(mType == TOKEN_INTEGER);
return mInteger;
}
} // mozilla
--- a/xpcom/ds/Tokenizer.h
+++ b/xpcom/ds/Tokenizer.h
@@ -4,66 +4,76 @@
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef Tokenizer_h__
#define Tokenizer_h__
#include "nsString.h"
#include "mozilla/CheckedInt.h"
+#include "mozilla/UniquePtr.h"
+#include "nsTArray.h"
namespace mozilla {
-/**
- * This is a simple implementation of a lexical analyzer or maybe better
- * called a tokenizer. It doesn't allow any user dictionaries or
- * user define token types.
- *
- * It is limited only to ASCII input for now. UTF-8 or any other input
- * encoding must yet be implemented.
- */
-class Tokenizer {
+class TokenizerBase
+{
public:
/**
* The analyzer works with elements in the input cut to a sequence of token
* where each token has an elementary type
*/
- enum TokenType {
+ enum TokenType : uint32_t
+ {
TOKEN_UNKNOWN,
+ TOKEN_RAW,
TOKEN_ERROR,
TOKEN_INTEGER,
TOKEN_WORD,
TOKEN_CHAR,
TOKEN_WS,
TOKEN_EOL,
- TOKEN_EOF
+ TOKEN_EOF,
+ TOKEN_CUSTOM0 = 1000
+ };
+
+ enum ECaseSensitivity
+ {
+ CASE_SENSITIVE,
+ CASE_INSENSITIVE
};
/**
* Class holding the type and the value of a token. It can be manually created
* to allow checks against it via methods of Tokenizer or are results of some of
* the Tokenizer's methods.
*/
- class Token {
+ class Token
+ {
TokenType mType;
nsDependentCSubstring mWord;
+ nsCString mCustom;
char mChar;
uint64_t mInteger;
+ ECaseSensitivity mCustomCaseInsensitivity;
+ bool mCustomEnabled;
// If this token is a result of the parsing process, this member is referencing
// a sub-string in the input buffer. If this is externally created Token this
// member is left an empty string.
nsDependentCSubstring mFragment;
- friend class Tokenizer;
+ friend class TokenizerBase;
void AssignFragment(nsACString::const_char_iterator begin,
nsACString::const_char_iterator end);
+ static Token Raw();
+
public:
- Token() : mType(TOKEN_UNKNOWN), mChar(0), mInteger(0) {}
+ Token();
Token(const Token& aOther);
Token& operator=(const Token& aOther);
// Static constructors of tokens by type and value
static Token Word(const nsACString& aWord);
static Token Char(const char aChar);
static Token Number(const uint64_t aNumber);
static Token Whitespace();
@@ -78,16 +88,130 @@ public:
TokenType Type() const { return mType; }
char AsChar() const;
nsDependentCSubstring AsString() const;
uint64_t AsInteger() const;
nsDependentCSubstring Fragment() const { return mFragment; }
};
+ /**
+ * Consumers may register a custom string that, when found in the input, is
+ * considered a token and is returned by Next*() and accepted by Check*() methods.
+ * AddCustomToken() returns a copy of the registered token that can then be compared
+ * using Token::Equals() against the output from Next*() or be passed to Check*().
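+ *
+ * E.g. (an illustrative sketch):
+ *
+ *   Tokenizer p("one TOKEN two");
+ *   Tokenizer::Token custom = p.AddCustomToken("TOKEN", Tokenizer::CASE_SENSITIVE);
+ *   // ... later, p.Check(custom) succeeds when the cursor is at the custom string.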
+ */
+ Token AddCustomToken(const nsACString& aValue, ECaseSensitivity aCaseInsensitivity, bool aEnabled = true);
+ template <uint32_t N>
+ Token AddCustomToken(const char(&aValue)[N], ECaseSensitivity aCaseInsensitivity, bool aEnabled = true)
+ {
+ return AddCustomToken(nsDependentCSubstring(aValue, N - 1), aCaseInsensitivity, aEnabled);
+ }
+ void RemoveCustomToken(Token& aToken);
+ /**
+ * Only applies to a custom type of a Token (see AddCustomToken above).
+ * This turns recognition of the token on and off. When a custom token is
+ * disabled, it's ignored as if it had never been added.
+ */
+ void EnableCustomToken(Token const& aToken, bool aEnable);
+
+ /**
+ * Mode of tokenization.
+ * FULL tokenization, the default, recognizes built-in tokens and any custom tokens,
+ * if added.
+ * CUSTOM_ONLY will only recognize custom tokens, the rest is seen as 'raw'.
+ * This mode can be understood as a 'binary' mode.
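+ *
+ * E.g. (illustrative): with a custom token "sep" registered and CUSTOM_ONLY
+ * mode set, the input "aaasepbbb" is delivered as TOKEN_RAW("aaa"), the
+ * custom token, and finally TOKEN_RAW("bbb").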
+ */
+ enum class Mode
+ {
+ FULL,
+ CUSTOM_ONLY
+ };
+ void SetTokenizingMode(Mode aMode);
+
+ /**
+ * Returns true iff the last Check*() call has returned false or when we've
+ * read past the end of the input string.
+ */
+ MOZ_MUST_USE bool HasFailed() const;
+
+protected:
+ explicit TokenizerBase(const char* aWhitespaces = nullptr,
+ const char* aAdditionalWordChars = nullptr);
+
+ // false if we have already read the EOF token.
+ bool HasInput() const;
+ // Main parsing function, it doesn't shift the read cursor, just returns the next
+ // token position.
+ nsACString::const_char_iterator Parse(Token& aToken) const;
+ // Is read cursor at the end?
+ bool IsEnd(const nsACString::const_char_iterator& caret) const;
+ // True when we are at the end of the input data but it has not been marked
+ // as complete yet; in that case we cannot yet provide a multi-char token.
+ bool IsPending(const nsACString::const_char_iterator & caret) const;
+ // Is read cursor on a character that is a word start?
+ bool IsWordFirst(const char aInput) const;
+ // Is read cursor on a character that is an in-word letter?
+ bool IsWord(const char aInput) const;
+ // Is read cursor on a character that is a valid number?
+ // TODO - support multiple radix
+ bool IsNumber(const char aInput) const;
+ // Is equal to the given custom token?
+ bool IsCustom(const nsACString::const_char_iterator& caret,
+ const Token& aCustomToken, uint32_t* aLongest = nullptr) const;
+
+ // Friendly helper to assign a fragment on a Token
+ static void AssignFragment(Token& aToken,
+ nsACString::const_char_iterator begin,
+ nsACString::const_char_iterator end);
+
+ // true iff we have already read the EOF token
+ bool mPastEof;
+ // true iff the last Check*() call has returned false; reverts to false on a Rollback() call
+ bool mHasFailed;
+ // true if the input string is final (finished), false when we expect more data
+ // yet to be fed to the tokenizer (see IncrementalTokenizer derived class).
+ bool mInputFinished;
+ // custom only vs full tokenizing mode, see the Parse() method
+ Mode mMode;
+ // minimal raw data chunked delivery during incremental feed
+ uint32_t mMinRawDelivery;
+
+ // Customizable list of whitespaces
+ const char* mWhitespaces;
+ // Additional custom word characters
+ const char* mAdditionalWordChars;
+
+ // All these point to the original buffer passed to the constructor or to the incremental
+ // buffer after FeedInput.
+ nsACString::const_char_iterator mCursor; // Position of the current (actually next to read) token start
+ nsACString::const_char_iterator mEnd; // End of the input position
+
+ // This is the list of tokens user has registered with AddCustomToken()
+ nsTArray<UniquePtr<Token>> mCustomTokens;
+ uint32_t mNextCustomTokenID;
+
+private:
+ TokenizerBase() = delete;
+ TokenizerBase(const TokenizerBase&) = delete;
+ TokenizerBase(TokenizerBase&&) = delete;
+ TokenizerBase(const TokenizerBase&&) = delete;
+ TokenizerBase &operator=(const TokenizerBase&) = delete;
+};
+
+/**
+ * This is a simple implementation of a lexical analyzer, or maybe better
+ * called a tokenizer. Apart from custom string tokens registered via
+ * AddCustomToken(), it doesn't allow any user dictionaries or user-defined
+ * token types.
+ *
+ * It is limited to ASCII input for now. UTF-8 or any other input
+ * encoding is yet to be implemented.
+ */
+class Tokenizer : public TokenizerBase
+{
public:
/**
* @param aSource
* The string to parse.
* IMPORTANT NOTE: Tokenizer doesn't ensure the input string buffer lifetime.
* It's up to the consumer to make sure the string's buffer outlives the Tokenizer!
* @param aWhitespaces
* If non-null Tokenizer will use this custom set of whitespaces for CheckWhite()
@@ -129,23 +253,16 @@ public:
* Same as above method, just compares both token type and token value passed in aToken.
* When both the type and the value equals, shift the cursor and return true. Otherwise
* return false.
*/
MOZ_MUST_USE
bool Check(const Token& aToken);
/**
- * Return false iff the last Check*() call has returned false or when we've read past
- * the end of the input string.
- */
- MOZ_MUST_USE
- bool HasFailed() const;
-
- /**
* SkipWhites method (below) may also skip new line characters automatically.
*/
enum WhiteSkipping {
/**
* SkipWhites will only skip what is defined as a white space (default).
*/
DONT_INCLUDE_NEW_LINE = 0,
/**
@@ -307,46 +424,19 @@ public:
* position it had before ReadUntil was called.
*/
MOZ_MUST_USE bool ReadUntil(Token const& aToken, nsDependentCSubstring& aResult,
ClaimInclusion aInclude = EXCLUDE_LAST);
MOZ_MUST_USE bool ReadUntil(Token const& aToken, nsACString& aResult,
ClaimInclusion aInclude = EXCLUDE_LAST);
protected:
- // false if we have already read the EOF token.
- bool HasInput() const;
- // Main parsing function, it doesn't shift the read cursor, just returns the next
- // token position.
- nsACString::const_char_iterator Parse(Token& aToken) const;
- // Is read cursor at the end?
- bool IsEnd(const nsACString::const_char_iterator& caret) const;
- // Is read cursor on a character that is a word start?
- bool IsWordFirst(const char aInput) const;
- // Is read cursor on a character that is an in-word letter?
- bool IsWord(const char aInput) const;
- // Is read cursor on a character that is a valid number?
- // TODO - support multiple radix
- bool IsNumber(const char aInput) const;
-
- // true iff we have already read the EOF token
- bool mPastEof;
- // true iff the last Check*() call has returned false, reverts to true on Rollback() call
- bool mHasFailed;
-
- // Customizable list of whitespaces
- const char* mWhitespaces;
- // Additinal custom word characters
- const char* mAdditionalWordChars;
-
- // All these point to the original buffer passed to the Tokenizer
+ // All these point to the original buffer passed to the Tokenizer's constructor
nsACString::const_char_iterator mRecord; // Position where the recorded sub-string for Claim() is
nsACString::const_char_iterator mRollback; // Position of the previous token start
- nsACString::const_char_iterator mCursor; // Position of the current (actually next to read) token start
- nsACString::const_char_iterator mEnd; // End of the input position
private:
Tokenizer() = delete;
Tokenizer(const Tokenizer&) = delete;
Tokenizer(Tokenizer&&) = delete;
Tokenizer(const Tokenizer&&) = delete;
Tokenizer &operator=(const Tokenizer&) = delete;
};
--- a/xpcom/ds/moz.build
+++ b/xpcom/ds/moz.build
@@ -57,21 +57,23 @@ EXPORTS += [
'nsStringEnumerator.h',
'nsSupportsArray.h',
'nsSupportsPrimitives.h',
'nsVariant.h',
'nsWhitespaceTokenizer.h',
]
EXPORTS.mozilla += [
+ 'IncrementalTokenizer.h',
'StickyTimeDuration.h',
'Tokenizer.h',
]
UNIFIED_SOURCES += [
+ 'IncrementalTokenizer.cpp',
'nsArray.cpp',
'nsAtomService.cpp',
'nsAtomTable.cpp',
'nsCRT.cpp',
'nsHashPropertyBag.cpp',
'nsINIParserImpl.cpp',
'nsObserverList.cpp',
'nsObserverService.cpp',
--- a/xpcom/tests/gtest/TestTokenizer.cpp
+++ b/xpcom/tests/gtest/TestTokenizer.cpp
@@ -1,15 +1,17 @@
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "mozilla/Tokenizer.h"
+#include "mozilla/IncrementalTokenizer.h"
+#include "mozilla/Unused.h"
#include "gtest/gtest.h"
using namespace mozilla;
static bool IsOperator(char const c)
{
return c == '+' || c == '*';
}
@@ -727,8 +729,406 @@ TEST(Tokenizer, SkipUntil)
p.SkipUntil(Tokenizer::Token::Char(','));
p.Rollback();
EXPECT_TRUE(p.CheckWord("test2"));
EXPECT_TRUE(p.CheckEOF());
}
}
+
+TEST(Tokenizer, Custom)
+{
+ Tokenizer p("aaaaaacustom-1\r,custom-1,Custom-1,Custom-1,00custom-2xxxx,CUSTOM-2");
+
+ Tokenizer::Token c1 = p.AddCustomToken("custom-1", Tokenizer::CASE_INSENSITIVE);
+ Tokenizer::Token c2 = p.AddCustomToken("custom-2", Tokenizer::CASE_SENSITIVE);
+
+ // It's expected to NOT FIND the custom token if it's not on an edge
+ // between other recognizable tokens.
+ EXPECT_TRUE(p.CheckWord("aaaaaacustom"));
+ EXPECT_TRUE(p.CheckChar('-'));
+ EXPECT_TRUE(p.Check(Tokenizer::Token::Number(1)));
+ EXPECT_TRUE(p.CheckEOL());
+ EXPECT_TRUE(p.CheckChar(','));
+
+ EXPECT_TRUE(p.Check(c1));
+ EXPECT_TRUE(p.CheckChar(','));
+
+ EXPECT_TRUE(p.Check(c1));
+ EXPECT_TRUE(p.CheckChar(','));
+
+ p.EnableCustomToken(c1, false);
+ EXPECT_TRUE(p.CheckWord("Custom"));
+ EXPECT_TRUE(p.CheckChar('-'));
+ EXPECT_TRUE(p.Check(Tokenizer::Token::Number(1)));
+ EXPECT_TRUE(p.CheckChar(','));
+
+ EXPECT_TRUE(p.Check(Tokenizer::Token::Number(0)));
+ EXPECT_TRUE(p.Check(c2));
+ EXPECT_TRUE(p.CheckWord("xxxx"));
+ EXPECT_TRUE(p.CheckChar(','));
+
+ EXPECT_TRUE(p.CheckWord("CUSTOM"));
+ EXPECT_TRUE(p.CheckChar('-'));
+ EXPECT_TRUE(p.Check(Tokenizer::Token::Number(2)));
+
+ EXPECT_TRUE(p.CheckEOF());
+}
+
+TEST(Tokenizer, CustomRaw)
+{
+ Tokenizer p("aaaaaacustom-1\r,custom-1,Custom-1,Custom-1,00custom-2xxxx,CUSTOM-2");
+
+ Tokenizer::Token c1 = p.AddCustomToken("custom-1", Tokenizer::CASE_INSENSITIVE);
+ Tokenizer::Token c2 = p.AddCustomToken("custom-2", Tokenizer::CASE_SENSITIVE);
+
+ // In this mode it's expected to find all custom tokens among any kind of input.
+ p.SetTokenizingMode(Tokenizer::Mode::CUSTOM_ONLY);
+
+ Tokenizer::Token t;
+
+ EXPECT_TRUE(p.Next(t));
+ EXPECT_TRUE(t.Type() == Tokenizer::TOKEN_RAW);
+ EXPECT_TRUE(t.Fragment().EqualsLiteral("aaaaaa"));
+
+ EXPECT_TRUE(p.Check(c1));
+
+ EXPECT_TRUE(p.Next(t));
+ EXPECT_TRUE(t.Type() == Tokenizer::TOKEN_RAW);
+ EXPECT_TRUE(t.Fragment().EqualsLiteral("\r,"));
+
+ EXPECT_TRUE(p.Check(c1));
+
+ EXPECT_TRUE(p.Next(t));
+ EXPECT_TRUE(t.Type() == Tokenizer::TOKEN_RAW);
+ EXPECT_TRUE(t.Fragment().EqualsLiteral(","));
+
+ EXPECT_TRUE(p.Check(c1));
+
+ EXPECT_TRUE(p.Next(t));
+ EXPECT_TRUE(t.Type() == Tokenizer::TOKEN_RAW);
+ EXPECT_TRUE(t.Fragment().EqualsLiteral(","));
+
+ EXPECT_TRUE(p.Check(c1));
+
+ EXPECT_TRUE(p.Next(t));
+ EXPECT_TRUE(t.Type() == Tokenizer::TOKEN_RAW);
+ EXPECT_TRUE(t.Fragment().EqualsLiteral(",00"));
+
+ EXPECT_TRUE(p.Check(c2));
+
+ EXPECT_TRUE(p.Next(t));
+ EXPECT_TRUE(t.Type() == Tokenizer::TOKEN_RAW);
+ EXPECT_TRUE(t.Fragment().EqualsLiteral("xxxx,CUSTOM-2"));
+
+ EXPECT_TRUE(p.CheckEOF());
+}
+
+TEST(Tokenizer, Incremental)
+{
+ typedef TokenizerBase::Token Token;
+
+ int test = 0;
+ IncrementalTokenizer i([&](Token const& t, IncrementalTokenizer& i) -> nsresult
+ {
+ switch (++test) {
+ case 1: EXPECT_TRUE(t.Equals(Token::Word(NS_LITERAL_CSTRING("test1")))); break;
+ case 2: EXPECT_TRUE(t.Equals(Token::Char(','))); break;
+ case 3: EXPECT_TRUE(t.Equals(Token::Word(NS_LITERAL_CSTRING("test2")))); break;
+ case 4: EXPECT_TRUE(t.Equals(Token::Char(','))); break;
+ case 5: EXPECT_TRUE(t.Equals(Token::Char(','))); break;
+ case 6: EXPECT_TRUE(t.Equals(Token::Char(','))); break;
+ case 7: EXPECT_TRUE(t.Equals(Token::Word(NS_LITERAL_CSTRING("test3")))); break;
+ case 8: EXPECT_TRUE(t.Equals(Token::EndOfFile())); break;
+ }
+
+ return NS_OK;
+ });
+
+ NS_NAMED_LITERAL_CSTRING(input, "test1,test2,,,test3");
+ auto cur = input.BeginReading();
+ auto end = input.EndReading();
+ for (; cur < end; ++cur) {
+ i.FeedInput(nsDependentCSubstring(cur, 1));
+ }
+
+ EXPECT_TRUE(test == 6);
+ i.FinishInput();
+ EXPECT_TRUE(test == 8);
+}
+
+TEST(Tokenizer, IncrementalRollback)
+{
+ typedef TokenizerBase::Token Token;
+
+ int test = 0;
+ IncrementalTokenizer i([&](Token const& t, IncrementalTokenizer& i) -> nsresult
+ {
+ switch (++test) {
+ case 1: EXPECT_TRUE(t.Equals(Token::Word(NS_LITERAL_CSTRING("test1")))); break;
+ case 2: EXPECT_TRUE(t.Equals(Token::Char(','))); break;
+ case 3: EXPECT_TRUE(t.Equals(Token::Word(NS_LITERAL_CSTRING("test2"))));
+ i.Rollback(); // so that we get the token again
+ break;
+ case 4: EXPECT_TRUE(t.Equals(Token::Word(NS_LITERAL_CSTRING("test2")))); break;
+ case 5: EXPECT_TRUE(t.Equals(Token::Char(','))); break;
+ case 6: EXPECT_TRUE(t.Equals(Token::Char(','))); break;
+ case 7: EXPECT_TRUE(t.Equals(Token::Char(','))); break;
+ case 8: EXPECT_TRUE(t.Equals(Token::Word(NS_LITERAL_CSTRING("test3")))); break;
+ case 9: EXPECT_TRUE(t.Equals(Token::EndOfFile())); break;
+ }
+
+ return NS_OK;
+ });
+
+ NS_NAMED_LITERAL_CSTRING(input, "test1,test2,,,test3");
+ auto cur = input.BeginReading();
+ auto end = input.EndReading();
+ for (; cur < end; ++cur) {
+ i.FeedInput(nsDependentCSubstring(cur, 1));
+ }
+
+ EXPECT_TRUE(test == 7);
+ i.FinishInput();
+ EXPECT_TRUE(test == 9);
+}
+
+TEST(Tokenizer, IncrementalNeedMoreInput)
+{
+ typedef TokenizerBase::Token Token;
+
+ int test = 0;
+ IncrementalTokenizer i([&](Token const& t, IncrementalTokenizer& i) -> nsresult
+ {
+ Token t2;
+ switch (++test) {
+ case 1:
+ EXPECT_TRUE(t.Equals(Token::Word(NS_LITERAL_CSTRING("a"))));
+ break;
+ case 2:
+ case 3:
+ case 4:
+ case 5:
+ EXPECT_TRUE(t.Equals(Token::Whitespace()));
+ if (i.Next(t2)) {
+ EXPECT_TRUE(test == 5);
+ EXPECT_TRUE(t2.Equals(Token::Word(NS_LITERAL_CSTRING("bb"))));
+ } else {
+ EXPECT_TRUE(test < 5);
+ i.NeedMoreInput();
+ }
+ break;
+ case 6:
+ EXPECT_TRUE(t.Equals(Token::Char(',')));
+ break;
+ case 7:
+ EXPECT_TRUE(t.Equals(Token::Word(NS_LITERAL_CSTRING("c"))));
+ return NS_ERROR_FAILURE;
+ default:
+ EXPECT_TRUE(false);
+ break;
+ }
+
+ return NS_OK;
+ });
+
+ NS_NAMED_LITERAL_CSTRING(input, "a bb,c");
+ auto cur = input.BeginReading();
+ auto end = input.EndReading();
+
+ nsresult rv;
+ for (; cur < end; ++cur) {
+ rv = i.FeedInput(nsDependentCSubstring(cur, 1));
+ if (NS_FAILED(rv)) {
+ break;
+ }
+ }
+
+ EXPECT_TRUE(rv == NS_OK);
+ EXPECT_TRUE(test == 6);
+
+ rv = i.FinishInput();
+ EXPECT_TRUE(rv == NS_ERROR_FAILURE);
+ EXPECT_TRUE(test == 7);
+}
+
+TEST(Tokenizer, IncrementalCustom)
+{
+ typedef TokenizerBase::Token Token;
+
+ int test = 0;
+ Token custom;
+ IncrementalTokenizer i([&](Token const& t, IncrementalTokenizer& i) -> nsresult
+ {
+ switch (++test) {
+ case 1: EXPECT_TRUE(t.Equals(custom)); break;
+ case 2: EXPECT_TRUE(t.Equals(Token::Word(NS_LITERAL_CSTRING("bla")))); break;
+ case 3: EXPECT_TRUE(t.Equals(Token::EndOfFile())); break;
+ }
+
+ return NS_OK;
+ }, nullptr, "-");
+
+ custom = i.AddCustomToken("some-test", Tokenizer::CASE_SENSITIVE);
+ i.FeedInput(NS_LITERAL_CSTRING("some-"));
+ EXPECT_TRUE(test == 0);
+ i.FeedInput(NS_LITERAL_CSTRING("tes"));
+ EXPECT_TRUE(test == 0);
+ i.FeedInput(NS_LITERAL_CSTRING("tbla"));
+ EXPECT_TRUE(test == 1);
+ i.FinishInput();
+ EXPECT_TRUE(test == 3);
+}
+
+TEST(Tokenizer, IncrementalCustomRaw)
+{
+ typedef TokenizerBase::Token Token;
+
+ int test = 0;
+ Token custom;
+ IncrementalTokenizer i([&](Token const& t, IncrementalTokenizer& i) -> nsresult
+ {
+ switch (++test) {
+ case 1: EXPECT_TRUE(t.Fragment().EqualsLiteral("test1,")); break;
+ case 2: EXPECT_TRUE(t.Equals(custom)); break;
+ case 3: EXPECT_TRUE(t.Fragment().EqualsLiteral("!,,test3"));
+ i.Rollback();
+ i.SetTokenizingMode(Tokenizer::Mode::FULL);
+ break;
+ case 4: EXPECT_TRUE(t.Equals(Token::Char('!')));
+ i.SetTokenizingMode(Tokenizer::Mode::CUSTOM_ONLY);
+ break;
+ case 5: EXPECT_TRUE(t.Fragment().EqualsLiteral(",,test3")); break;
+ case 6: EXPECT_TRUE(t.Equals(custom)); break;
+ case 7: EXPECT_TRUE(t.Fragment().EqualsLiteral("tes")); break;
+ case 8: EXPECT_TRUE(t.Equals(Token::EndOfFile())); break;
+ }
+
+ return NS_OK;
+ });
+
+ custom = i.AddCustomToken("test2", Tokenizer::CASE_SENSITIVE);
+ i.SetTokenizingMode(Tokenizer::Mode::CUSTOM_ONLY);
+
+ NS_NAMED_LITERAL_CSTRING(input, "test1,test2!,,test3test2tes");
+ auto cur = input.BeginReading();
+ auto end = input.EndReading();
+ for (; cur < end; ++cur) {
+ i.FeedInput(nsDependentCSubstring(cur, 1));
+ }
+
+ EXPECT_TRUE(test == 6);
+ i.FinishInput();
+ EXPECT_TRUE(test == 8);
+}
+
+TEST(Tokenizer, IncrementalCustomRemove)
+{
+ typedef TokenizerBase::Token Token;
+
+ int test = 0;
+ Token custom;
+ IncrementalTokenizer i([&](Token const& t, IncrementalTokenizer& i) -> nsresult
+ {
+ switch (++test) {
+ case 1: EXPECT_TRUE(t.Equals(custom));
+ i.RemoveCustomToken(custom);
+ break;
+ case 2: EXPECT_FALSE(t.Equals(custom)); break;
+ case 3: EXPECT_TRUE(t.Equals(Token::EndOfFile())); break;
+ }
+
+ return NS_OK;
+ });
+
+ custom = i.AddCustomToken("custom1", Tokenizer::CASE_SENSITIVE);
+
+ NS_NAMED_LITERAL_CSTRING(input, "custom1custom1");
+ i.FeedInput(input);
+ EXPECT_TRUE(test == 1);
+ i.FinishInput();
+ EXPECT_TRUE(test == 3);
+}
+
+TEST(Tokenizer, IncrementalBuffering1)
+{
+ typedef TokenizerBase::Token Token;
+
+ int test = 0;
+ Token custom;
+ nsDependentCSubstring observedFragment;
+ IncrementalTokenizer i([&](Token const& t, IncrementalTokenizer& i) -> nsresult
+ {
+ switch (++test) {
+ case 1: EXPECT_TRUE(t.Fragment().EqualsLiteral("012")); break;
+ case 2: EXPECT_TRUE(t.Fragment().EqualsLiteral("3456789")); break;
+ case 3: EXPECT_TRUE(t.Equals(custom)); break;
+ case 4: EXPECT_TRUE(t.Fragment().EqualsLiteral("qwe")); break;
+ case 5: EXPECT_TRUE(t.Fragment().EqualsLiteral("rt")); break;
+ case 6: EXPECT_TRUE(t.Equals(Token::EndOfFile())); break;
+ }
+
+ observedFragment.Rebind(t.Fragment().BeginReading(),
+ t.Fragment().Length());
+ return NS_OK;
+ }, nullptr, nullptr, 3);
+
+ custom = i.AddCustomToken("aaa", Tokenizer::CASE_SENSITIVE);
+ // This externally unused token is added only to check that the internal
+ // algorithm works correctly when there are two custom tokens of different lengths.
+ Unused << i.AddCustomToken("bb", Tokenizer::CASE_SENSITIVE);
+ i.SetTokenizingMode(Tokenizer::Mode::CUSTOM_ONLY);
+
+ i.FeedInput(NS_LITERAL_CSTRING("01234"));
+ EXPECT_TRUE(test == 1);
+ EXPECT_TRUE(observedFragment.EqualsLiteral("012"));
+
+ i.FeedInput(NS_LITERAL_CSTRING("5"));
+ EXPECT_TRUE(test == 1);
+ i.FeedInput(NS_LITERAL_CSTRING("6789aa"));
+ EXPECT_TRUE(test == 2);
+ EXPECT_TRUE(observedFragment.EqualsLiteral("3456789"));
+
+ i.FeedInput(NS_LITERAL_CSTRING("aqwert"));
+ EXPECT_TRUE(test == 4);
+ EXPECT_TRUE(observedFragment.EqualsLiteral("qwe"));
+
+ i.FinishInput();
+ EXPECT_TRUE(test == 6);
+}
+
+TEST(Tokenizer, IncrementalBuffering2)
+{
+ typedef TokenizerBase::Token Token;
+
+ int test = 0;
+ Token custom;
+ IncrementalTokenizer i([&](Token const& t, IncrementalTokenizer& i) -> nsresult
+ {
+ switch (++test) {
+ case 1: EXPECT_TRUE(t.Fragment().EqualsLiteral("01")); break;
+ case 2: EXPECT_TRUE(t.Fragment().EqualsLiteral("234567")); break;
+ case 3: EXPECT_TRUE(t.Fragment().EqualsLiteral("89")); break;
+ case 4: EXPECT_TRUE(t.Equals(custom)); break;
+ case 5: EXPECT_TRUE(t.Fragment().EqualsLiteral("qwert")); break;
+ case 6: EXPECT_TRUE(t.Equals(Token::EndOfFile())); break;
+ }
+ return NS_OK;
+ }, nullptr, nullptr, 3);
+
+ custom = i.AddCustomToken("aaa", Tokenizer::CASE_SENSITIVE);
+ // This externally unused token is added only to check that the internal
+ // algorithm works correctly when there are two custom tokens of different lengths.
+ Unused << i.AddCustomToken("bbbbb", Tokenizer::CASE_SENSITIVE);
+ i.SetTokenizingMode(Tokenizer::Mode::CUSTOM_ONLY);
+
+ i.FeedInput(NS_LITERAL_CSTRING("01234"));
+ EXPECT_TRUE(test == 0);
+ i.FeedInput(NS_LITERAL_CSTRING("5"));
+ EXPECT_TRUE(test == 1);
+ i.FeedInput(NS_LITERAL_CSTRING("6789aa"));
+ EXPECT_TRUE(test == 2);
+ i.FeedInput(NS_LITERAL_CSTRING("aqwert"));
+ EXPECT_TRUE(test == 4);
+ i.FinishInput();
+ EXPECT_TRUE(test == 6);
+}