Bug 1467336 - Introduce code to bifurcate handling of arbitrary-code-point getting in the tokenizer to distinctly and separately handle ASCII and non-ASCII code points. r=arai
author: Jeff Walden <jwalden@mit.edu>
date: Thu, 07 Jun 2018 03:01:38 -0700
changeset: 478651 564254cf34aaac2e59ac1837a00c2cb257d3b323
parent: 478650 2dac13bda7237ea6d069bed9d52f01a65f6a1c67
child: 478652 5a295d6c16887bec916975e723825881142a50f2
push id: 1757
push user: ffxbld-merge
push date: Fri, 24 Aug 2018 17:02:43 +0000
treeherder: mozilla-release@736023aebdb1 [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: arai
bugs: 1467336
milestone: 62.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1467336 - Introduce code to bifurcate handling of arbitrary-code-point getting in the tokenizer to distinctly and separately handle ASCII and non-ASCII code points. r=arai
js/src/frontend/TokenStream.cpp
js/src/frontend/TokenStream.h
--- a/js/src/frontend/TokenStream.cpp
+++ b/js/src/frontend/TokenStream.cpp
@@ -32,16 +32,17 @@
 #include "util/StringBuffer.h"
 #include "util/Unicode.h"
 #include "vm/HelperThreads.h"
 #include "vm/JSAtom.h"
 #include "vm/JSCompartment.h"
 #include "vm/JSContext.h"
 
 using mozilla::ArrayLength;
+using mozilla::IsAscii;
 using mozilla::IsAsciiAlpha;
 using mozilla::IsAsciiDigit;
 using mozilla::MakeScopeExit;
 using mozilla::PodCopy;
 
 struct ReservedWordInfo
 {
     const char* chars;         // C string with reserved word text
@@ -527,16 +528,65 @@ TokenStreamChars<char16_t, AnyCharsAcces
 
     if (!updateLineInfoForEOL())
         return false;
 
     *cp = '\n';
     return true;
 }
 
+template<class AnyCharsAccess>
+bool
+TokenStreamChars<char16_t, AnyCharsAccess>::getNonAsciiCodePoint(char16_t lead, int32_t* codePoint)
+{
+    MOZ_ASSERT(!isAsciiCodePoint(lead),
+               "ASCII code unit/point must be handled separately");
+    MOZ_ASSERT(lead == sourceUnits.previousCodeUnit(),
+               "getNonAsciiCodePoint called incorrectly");
+
+    // Default to |lead| itself; the EOL and surrogate-pair paths overwrite it.
+    *codePoint = lead;
+
+    // Anything that isn't a lead surrogate is complete in a single unit (a
+    // lone trailing surrogate is passed through unchanged as a "code point").
+    if (MOZ_LIKELY(!unicode::IsLeadSurrogate(lead))) {
+        if (MOZ_UNLIKELY(lead == unicode::LINE_SEPARATOR ||
+                         lead == unicode::PARA_SEPARATOR))
+        {
+            if (!updateLineInfoForEOL()) {
+#ifdef DEBUG
+                *codePoint = EOF; // sentinel value to hopefully cause errors
+#endif
+                MOZ_MAKE_MEM_UNDEFINED(codePoint, sizeof(*codePoint));
+                return false;
+            }
+
+            *codePoint = '\n'; // both separators are reported as '\n'
+        } else {
+            MOZ_ASSERT(!SourceUnits::isRawEOLChar(*codePoint));
+        }
+
+        return true;
+    }
+
+    // A lead surrogate at the end of input, or one not followed by a trailing
+    // surrogate, is unpaired: report it unchanged as a "code point".
+    if (MOZ_UNLIKELY(!sourceUnits.hasRawChars() ||
+                     !unicode::IsTrailSurrogate(sourceUnits.peekCodeUnit())))
+    {
+        MOZ_ASSERT(!SourceUnits::isRawEOLChar(*codePoint));
+        return true;
+    }
+
+    // Otherwise consume the trailing surrogate and decode the pair.
+    *codePoint = unicode::UTF16Decode(lead, sourceUnits.getCodeUnit());
+    MOZ_ASSERT(!SourceUnits::isRawEOLChar(*codePoint));
+    return true;
+}
+
 // This gets the next code unit -- the next numeric sub-unit of source text,
 // possibly smaller than a full code point.  It is simple and stupid, and it
 // doesn't understand EOL, update line counters, or anything like that.  If you
 // use it to consume an EOL sequence, line counters *will not* be correct for
 // subsequent code.
 //
 // Only use this if (a) the resulting code unit is guaranteed to be ungotten
 // (by ungetCodeUnit()) if it's an EOL, and (b) the line-related state (lineno,
@@ -554,17 +604,16 @@ GeneralTokenStreamChars<CharT, AnyCharsA
 
 template<typename CharT, class AnyCharsAccess>
 void
 GeneralTokenStreamChars<CharT, AnyCharsAccess>::ungetChar(int32_t c)
 {
     if (c == EOF)
         return;
 
-    MOZ_ASSERT(!sourceUnits.atStart());
     sourceUnits.ungetCodeUnit();
     if (c == '\n') {
         int32_t c2 = sourceUnits.peekCodeUnit();
         MOZ_ASSERT(SourceUnits::isRawEOLChar(c2));
 
         // If it's a \r\n sequence, also unget the \r.
         if (c2 == CharT('\n') && !sourceUnits.atStart())
             sourceUnits.ungetOptionalCRBeforeLF();
@@ -577,17 +626,16 @@ GeneralTokenStreamChars<CharT, AnyCharsA
 
 template<typename CharT>
 void
 TokenStreamCharsBase<CharT>::ungetCodeUnit(int32_t c)
 {
     if (c == EOF)
         return;
 
-    MOZ_ASSERT(!sourceUnits.atStart());
     sourceUnits.ungetCodeUnit();
 }
 
 template<class AnyCharsAccess>
 void
 TokenStreamChars<char16_t, AnyCharsAccess>::ungetCodePointIgnoreEOL(uint32_t codePoint)
 {
     MOZ_ASSERT(!sourceUnits.atStart());
@@ -2050,31 +2098,37 @@ TokenStreamSpecific<CharT, AnyCharsAcces
             }
 
             // Look for a multi-line comment.
             if (matchCodeUnit('*')) {
                 TokenStreamAnyChars& anyChars = anyCharsAccess();
                 unsigned linenoBefore = anyChars.lineno;
 
                 do {
-                    if (!getChar(&c))
-                        return badToken();
-
-                    if (c == EOF) {
+                    int32_t unit = getCodeUnit();
+                    if (unit == EOF) {
                         reportError(JSMSG_UNTERMINATED_COMMENT);
                         return badToken();
                     }
 
-                    if (c == '*' && matchCodeUnit('/'))
+                    if (unit == '*' && matchCodeUnit('/'))
                         break;
 
-                    if (c == '@' || c == '#') {
-                        bool shouldWarn = c == '@';
+                    if (unit == '@' || unit == '#') {
+                        bool shouldWarn = unit == '@';
                         if (!getDirectives(true, shouldWarn))
-                            return false;
+                            return badToken();
+                    } else if (MOZ_LIKELY(isAsciiCodePoint(unit))) {
+                        int32_t codePoint;
+                        if (!getFullAsciiCodePoint(unit, &codePoint))
+                            return badToken();
+                    } else {
+                        int32_t codePoint;
+                        if (!getNonAsciiCodePoint(unit, &codePoint))
+                            return badToken();
                     }
                 } while (true);
 
                 if (linenoBefore != anyChars.lineno)
                     anyChars.updateFlagsForEOL();
 
                 continue;
             }
--- a/js/src/frontend/TokenStream.h
+++ b/js/src/frontend/TokenStream.h
@@ -161,16 +161,17 @@
  */
 
 #include "mozilla/ArrayUtils.h"
 #include "mozilla/Assertions.h"
 #include "mozilla/Attributes.h"
 #include "mozilla/DebugOnly.h"
 #include "mozilla/MemoryChecking.h"
 #include "mozilla/PodOperations.h"
+#include "mozilla/TextUtils.h"
 #include "mozilla/TypeTraits.h"
 #include "mozilla/Unused.h"
 
 #include <stdarg.h>
 #include <stddef.h>
 #include <stdio.h>
 
 #include "jspubtd.h"
@@ -928,16 +929,22 @@ class SourceUnits
         MOZ_ASSERT(offset - startOffset_ <= mozilla::PointerRangeSize(base_, limit_));
         return base_ + (offset - startOffset_);
     }
 
     const CharT* limit() const {
         return limit_;
     }
 
+    CharT previousCodeUnit() {
+        MOZ_ASSERT(ptr, "can't get previous code unit if poisoned");
+        MOZ_ASSERT(!atStart(), "must have a previous code unit to get");
+        return *(ptr - 1);  // reads the unit before |ptr|; |ptr| is not moved
+    }
+
     CharT getCodeUnit() {
         return *ptr++;      // this will nullptr-crash if poisoned
     }
 
     CharT peekCodeUnit() const {
         return *ptr;        // this will nullptr-crash if poisoned
     }
 
@@ -960,16 +967,17 @@ class SourceUnits
                    "function should only be called when a '\\n' was just "
                    "ungotten, and any '\\r' preceding it must also be "
                    "ungotten");
         if (*(ptr - 1) == CharT('\r'))
             ptr--;
     }
 
     void ungetCodeUnit() {
+        MOZ_ASSERT(!atStart(), "can't unget if currently at start");
         MOZ_ASSERT(ptr);     // make sure it hasn't been poisoned
         ptr--;
     }
 
     const CharT* addressOfNextCodeUnit(bool allowPoisoned = false) const {
         MOZ_ASSERT_IF(!allowPoisoned, ptr);     // make sure it hasn't been poisoned
         return ptr;
     }
@@ -1061,16 +1069,25 @@ class TokenStreamCharsBase
                 return false;
 
             cur++;
         }
 
         return true;
     }
 
+    /**
+     * Determine whether a code unit constitutes a complete ASCII code point.
+     * (The code point's exact value might not be used, however, if subsequent
+     * code observes that |unit| is part of a LineTerminatorSequence.)
+     */
+    static MOZ_MUST_USE MOZ_ALWAYS_INLINE bool isAsciiCodePoint(CharT unit) {
+        return mozilla::IsAscii(unit);
+    }
+
   protected:
     /** Code units in the source code being tokenized. */
     SourceUnits sourceUnits;
 
     /** Current token string buffer. */
     CharBuffer tokenbuf;
 };
 
@@ -1231,20 +1248,23 @@ class TokenStreamChars<char16_t, AnyChar
 
     using typename GeneralCharsBase::TokenStreamSpecific;
 
     void matchMultiUnitCodePointSlow(char16_t lead, uint32_t* codePoint);
 
   protected:
     using GeneralCharsBase::anyCharsAccess;
     using GeneralCharsBase::getCodeUnit;
+    using CharsSharedBase::isAsciiCodePoint;
     using GeneralCharsBase::sourceUnits;
     using CharsSharedBase::ungetCodeUnit;
     using GeneralCharsBase::updateLineInfoForEOL;
 
+    using typename GeneralCharsBase::SourceUnits;
+
     using GeneralCharsBase::GeneralCharsBase;
 
     // |c| must be the code unit just gotten.  If it and the subsequent code
     // unit form a valid surrogate pair, get the second code unit, set
     // |*codePoint| to the code point encoded by the surrogate pair, and return
     // true.  Otherwise do not get a second code unit, set |*codePoint = 0|,
     // and return true.
     //
@@ -1274,16 +1294,63 @@ class TokenStreamChars<char16_t, AnyChar
     MOZ_MUST_USE bool getCodePoint(int32_t* cp);
 
     // A deprecated alias for |getCodePoint|: most code using this is being
     // replaced with different approaches.
     MOZ_MUST_USE bool getChar(int32_t* cp) {
         return getCodePoint(cp);
     }
 
+    /**
+     * Given a just-consumed ASCII code unit/point |lead|, consume a full code
+     * point or LineTerminatorSequence (normalizing it to '\n') and store it in
+     * |*codePoint|.  Return true on success; on failure return false and leave
+     * |*codePoint| undefined.
+     *
+     * If a LineTerminatorSequence was consumed, also update line/column info.
+     *
+     * This may change the current |sourceUnits| offset.
+     */
+    MOZ_MUST_USE bool getFullAsciiCodePoint(char16_t lead, int32_t* codePoint) {
+        MOZ_ASSERT(isAsciiCodePoint(lead),
+                   "non-ASCII code units must be handled separately");
+        MOZ_ASSERT(lead == sourceUnits.previousCodeUnit(),
+                   "getFullAsciiCodePoint called incorrectly");
+
+        if (MOZ_UNLIKELY(lead == '\r')) {
+            if (MOZ_LIKELY(sourceUnits.hasRawChars()))
+                sourceUnits.matchCodeUnit('\n'); // treat "\r\n" as one EOL
+        } else if (MOZ_LIKELY(lead != '\n')) {
+            *codePoint = lead; // ordinary ASCII: the unit is the code point
+            return true;
+        }
+
+        *codePoint = '\n'; // reached only when |lead| was '\r' or '\n'
+        bool ok = updateLineInfoForEOL();
+        if (!ok) {
+#ifdef DEBUG
+            *codePoint = EOF; // sentinel value to hopefully cause errors
+#endif
+            MOZ_MAKE_MEM_UNDEFINED(codePoint, sizeof(*codePoint));
+        }
+        return ok;
+    }
+
+    /**
+     * Given a just-consumed non-ASCII code unit (and maybe point) |lead|,
+     * consume a full code point or LineTerminatorSequence (normalizing it to
+     * '\n') and store it in |*codePoint|.  Return true on success; on failure
+     * return false and leave |*codePoint| undefined.
+     *
+     * If a LineTerminatorSequence was consumed, also update line/column info.
+     *
+     * This may change the current |sourceUnits| offset.
+     */
+    MOZ_MUST_USE bool getNonAsciiCodePoint(char16_t lead, int32_t* codePoint);
+
     void ungetCodePointIgnoreEOL(uint32_t codePoint);
 };
 
 // TokenStream is the lexical scanner for JavaScript source text.
 //
 // It takes a buffer of CharT characters (currently only char16_t encoding
 // UTF-16, but we're adding either UTF-8 or Latin-1 single-byte text soon) and
 // linearly scans it into |Token|s.
@@ -1360,16 +1427,19 @@ class MOZ_STACK_CLASS TokenStreamSpecifi
     using CharsSharedBase::atomizeChars;
     using GeneralCharsBase::badToken;
     using GeneralCharsBase::consumeRestOfSingleLineComment;
     using CharsSharedBase::copyTokenbufTo;
     using CharsSharedBase::fillWithTemplateStringContents;
     using CharsBase::getChar;
     using CharsBase::getCodePoint;
     using GeneralCharsBase::getCodeUnit;
+    using CharsBase::getFullAsciiCodePoint;
+    using CharsBase::getNonAsciiCodePoint;
+    using CharsSharedBase::isAsciiCodePoint;
     using CharsSharedBase::matchCodeUnit;
     using CharsBase::matchMultiUnitCodePoint;
     using GeneralCharsBase::newAtomToken;
     using GeneralCharsBase::newNameToken;
     using GeneralCharsBase::newNumberToken;
     using GeneralCharsBase::newRegExpToken;
     using GeneralCharsBase::newSimpleToken;
     using CharsSharedBase::sourceUnits;