Bug 1476866 - Add a getNonAsciiCodePointDontNormalize for use in situations that demand such. r=arai
author Jeff Walden <jwalden@mit.edu>
Mon, 09 Jul 2018 14:38:16 -0700
changeset 427435 3a4e6ae59b597084afaed1d84d1674ee556406d5
parent 427434 44b64b5a44fcfdaf086bd32b7c8038efac5bf652
child 427436 80b3a14e84c23a7215243376d1a3143a985aee8a
push id 34304
push user toros@mozilla.com
push date Fri, 20 Jul 2018 09:57:23 +0000
treeherder mozilla-central@4f12d77b4f9b [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers arai
bugs 1476866
milestone 63.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1476866 - Add a getNonAsciiCodePointDontNormalize for use in situations that demand such. r=arai
js/src/frontend/TokenStream.cpp
js/src/frontend/TokenStream.h
--- a/js/src/frontend/TokenStream.cpp
+++ b/js/src/frontend/TokenStream.cpp
@@ -1324,30 +1324,25 @@ TokenStreamSpecific<CharT, AnyCharsAcces
                     return false;
 
                 continue;
             }
 
             if (unit != '\\' || !matchUnicodeEscapeIdent(&codePoint))
                 break;
         } else {
-            int32_t cp;
-            if (!getNonAsciiCodePoint(unit, &cp))
+            // |restoreNextRawCharAddress| undoes all gets, and this function
+            // doesn't update line/column info.
+            char32_t cp;
+            if (!getNonAsciiCodePointDontNormalize(unit, &cp))
                 return false;
 
-            codePoint = AssertedCast<uint32_t>(cp);
-
-            if (!unicode::IsIdentifierPart(codePoint)) {
-                if (MOZ_UNLIKELY(codePoint == '\n')) {
-                    // |restoreNextRawCharAddress| will undo all gets, but we
-                    // have to revert a line/column update manually.
-                    anyCharsAccess().undoInternalUpdateLineInfoForEOL();
-                }
+            codePoint = cp;
+            if (!unicode::IsIdentifierPart(codePoint))
                 break;
-            }
         }
 
         if (!appendCodePointToCharBuffer(codePoint))
             return false;
     } while (true);
 
     return true;
 }
--- a/js/src/frontend/TokenStream.h
+++ b/js/src/frontend/TokenStream.h
@@ -1313,30 +1313,53 @@ class SpecializedTokenStreamCharsBase;
 
 template<>
 class SpecializedTokenStreamCharsBase<char16_t>
   : public TokenStreamCharsBase<char16_t>
 {
     using CharsBase = TokenStreamCharsBase<char16_t>;
 
   protected:
+    using TokenStreamCharsShared::isAsciiCodePoint;
     // Deliberately don't |using| |sourceUnits| because of bug 1472569.  :-(
 
     using typename CharsBase::SourceUnits;
 
   protected:
     // These APIs are only usable by UTF-16-specific code.
 
     /**
      * Consume the rest of a single-line comment (but not the EOL/EOF that
      * terminates it) -- infallibly because no 16-bit code unit sequence in a
      * comment is an error.
      */
     void infallibleConsumeRestOfSingleLineComment();
 
+    /**
+     * Given |lead| already consumed, consume and return the code point encoded
+     * starting from it.  Infallible because lone surrogates in JS encode a
+     * "code point" of the same value.
+     */
+    char32_t infallibleGetNonAsciiCodePointDontNormalize(char16_t lead) {
+        MOZ_ASSERT(!isAsciiCodePoint(lead));
+        MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == lead);
+
+        // Handle single-unit code points and lone trailing surrogates.
+        if (MOZ_LIKELY(!unicode::IsLeadSurrogate(lead)) ||
+            // Or handle lead surrogates not paired with trailing surrogates.
+            MOZ_UNLIKELY(this->sourceUnits.atEnd() ||
+                         !unicode::IsTrailSurrogate(this->sourceUnits.peekCodeUnit())))
+        {
+            return lead;
+        }
+
+        // Otherwise it's a multi-unit code point.
+        return unicode::UTF16Decode(lead, this->sourceUnits.getCodeUnit());
+    }
+
   protected:
     // These APIs are in both SpecializedTokenStreamCharsBase specializations
     // and so are usable in subclasses no matter what CharT is.
 
     using CharsBase::CharsBase;
 };
 
 template<>
@@ -1520,26 +1543,39 @@ class TokenStreamChars<char16_t, AnyChar
     using GeneralCharsBase::asSpecific;
 
     using typename GeneralCharsBase::TokenStreamSpecific;
 
   protected:
     using GeneralCharsBase::anyCharsAccess;
     using GeneralCharsBase::getCodeUnit;
     using SpecializedCharsBase::infallibleConsumeRestOfSingleLineComment;
+    using SpecializedCharsBase::infallibleGetNonAsciiCodePointDontNormalize;
     using TokenStreamCharsShared::isAsciiCodePoint;
     // Deliberately don't |using| |sourceUnits| because of bug 1472569.  :-(
     using GeneralCharsBase::ungetCodeUnit;
     using GeneralCharsBase::updateLineInfoForEOL;
 
     using typename GeneralCharsBase::SourceUnits;
 
   protected:
     using GeneralCharsBase::GeneralCharsBase;
 
+    /**
+     * Given the non-ASCII |lead| code unit just consumed, consume and return a
+     * complete non-ASCII code point.  Line/column updates are not performed,
+     * and line breaks are returned as-is without normalization.
+     */
+    MOZ_MUST_USE bool getNonAsciiCodePointDontNormalize(char16_t lead, char32_t* codePoint) {
+        // There are no encoding errors in 16-bit JS, so implement this so that
+        // the compiler knows it, too.
+        *codePoint = infallibleGetNonAsciiCodePointDontNormalize(lead);
+        return true;
+    }
+
     // Try to get the next code point, normalizing '\r', '\r\n', '\n', and the
     // Unicode line/paragraph separators into '\n'.  Also updates internal
     // line-counter state.  Return true on success and store the code point in
     // |*c|.  Return false and leave |*c| undefined on failure.
     MOZ_MUST_USE bool getCodePoint(int32_t* cp);
 
     /**
      * Given a just-consumed ASCII code unit/point |lead|, consume a full code
@@ -1721,16 +1757,17 @@ class MOZ_STACK_CLASS TokenStreamSpecifi
     using SpecializedChars::consumeRestOfSingleLineComment;
     using TokenStreamCharsShared::copyCharBufferTo;
     using TokenStreamCharsShared::drainCharBufferIntoAtom;
     using CharsBase::fillCharBufferWithTemplateStringContents;
     using SpecializedChars::getCodePoint;
     using GeneralCharsBase::getCodeUnit;
     using SpecializedChars::getFullAsciiCodePoint;
     using SpecializedChars::getNonAsciiCodePoint;
+    using SpecializedChars::getNonAsciiCodePointDontNormalize;
     using TokenStreamCharsShared::isAsciiCodePoint;
     using CharsBase::matchCodeUnit;
     using GeneralCharsBase::matchUnicodeEscapeIdent;
     using GeneralCharsBase::matchUnicodeEscapeIdStart;
     using GeneralCharsBase::newAtomToken;
     using GeneralCharsBase::newNameToken;
     using GeneralCharsBase::newNumberToken;
     using GeneralCharsBase::newRegExpToken;