Bug 1478170 - Implement TokenStreamChars::getNonAsciiCodePoint for UTF-8. r=arai
author Jeff Walden <jwalden@mit.edu>
Fri, 29 Jun 2018 13:46:09 -0700
changeset 428554 107211d728e60e9b069c2d0107241f74b575d576
parent 428553 cafc89ca6a8788b80c279499701733e62389eb73
child 428555 38216fdba3ddea982cc4d3178c3b2c75637f71a1
push id 34337
push user ncsoregi@mozilla.com
push date Thu, 26 Jul 2018 21:58:45 +0000
treeherder mozilla-central@8f2f847b2f9d [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers arai
bugs 1478170
milestone 63.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1478170 - Implement TokenStreamChars::getNonAsciiCodePoint for UTF-8. r=arai
js/src/frontend/TokenStream.cpp
js/src/frontend/TokenStream.h
--- a/js/src/frontend/TokenStream.cpp
+++ b/js/src/frontend/TokenStream.cpp
@@ -876,16 +876,77 @@ TokenStreamChars<char16_t, AnyCharsAcces
     }
 
     // Otherwise we have a multi-unit code point.
     *codePoint = unicode::UTF16Decode(lead, this->sourceUnits.getCodeUnit());
     MOZ_ASSERT(!IsLineTerminator(AssertedCast<char32_t>(*codePoint)));
     return true;
 }
 
+template<class AnyCharsAccess>
+bool
+TokenStreamChars<Utf8Unit, AnyCharsAccess>::getNonAsciiCodePoint(int32_t unit, int32_t* codePoint)
+{
+    MOZ_ASSERT(unit != EOF);
+    MOZ_ASSERT(!isAsciiCodePoint(unit),
+               "ASCII code unit/point must be handled separately");
+    // |unit| leads a non-ASCII UTF-8 code point: decode it, storing U+2028/U+2029 as '\n'.
+    Utf8Unit lead = Utf8Unit(static_cast<unsigned char>(unit));
+    MOZ_ASSERT(lead == this->sourceUnits.previousCodeUnit(),
+               "getNonAsciiCodePoint called incorrectly");
+    // The functors below hand each kind of decoding failure to the matching member function.
+    auto onBadLeadUnit = [this, &lead]() {
+        this->badLeadUnit(lead);
+    };
+
+    auto onNotEnoughUnits = [this, &lead](uint_fast8_t remaining, uint_fast8_t required) {
+        this->notEnoughUnits(lead, remaining, required);
+    };
+
+    auto onBadTrailingUnit = [this, &lead](uint_fast8_t unitsObserved) {
+        this->badTrailingUnit(lead, unitsObserved);
+    };
+
+    auto onBadCodePoint = [this](char32_t badCodePoint, uint_fast8_t unitsObserved) {
+        this->badCodePoint(badCodePoint, unitsObserved);
+    };
+
+    auto onNotShortestForm = [this](char32_t badCodePoint, uint_fast8_t unitsObserved) {
+        this->notShortestForm(badCodePoint, unitsObserved);
+    };
+
+    // This consumes the full, valid code point or ungets |lead| and calls the
+    // appropriate error functor on failure.
+    SourceUnitsIterator iter(this->sourceUnits);
+    Maybe<char32_t> maybeCodePoint =
+        DecodeOneUtf8CodePoint(lead, &iter, SourceUnitsEnd(),
+                               onBadLeadUnit, onNotEnoughUnits, onBadTrailingUnit, onBadCodePoint,
+                               onNotShortestForm);
+    if (maybeCodePoint.isNothing())
+        return false; // nothing was decoded; |*codePoint| is not written on this path
+    // Decoding succeeded: map U+2028/U+2029 to '\n', store any other code point unchanged.
+    char32_t cp = maybeCodePoint.value();
+    if (MOZ_UNLIKELY(cp == unicode::LINE_SEPARATOR || cp == unicode::PARA_SEPARATOR)) {
+        if (!updateLineInfoForEOL()) {
+#ifdef DEBUG
+            *codePoint = EOF; // sentinel value to hopefully cause errors
+#endif
+            MOZ_MAKE_MEM_UNDEFINED(codePoint, sizeof(*codePoint));
+            return false;
+        }
+
+        *codePoint = '\n';
+    } else {
+        MOZ_ASSERT(!IsLineTerminator(cp));
+        *codePoint = AssertedCast<int32_t>(cp);
+    }
+
+    return true;
+}
+
 template<>
 size_t
 SourceUnits<char16_t>::findWindowStart(size_t offset) const
 {
     // This is JS's understanding of UTF-16 that allows lone surrogates, so
     // we have to exclude lone surrogates from [windowStart, offset) ourselves.
 
     const char16_t* const earliestPossibleStart = codeUnitPtrAt(startOffset_);
--- a/js/src/frontend/TokenStream.h
+++ b/js/src/frontend/TokenStream.h
@@ -2013,18 +2013,16 @@ class TokenStreamChars<char16_t, AnyChar
     using GeneralCharsBase::getCodeUnit;
     using SpecializedCharsBase::infallibleGetNonAsciiCodePointDontNormalize;
     using TokenStreamCharsShared::isAsciiCodePoint;
     using CharsBase::matchLineTerminator;
     // Deliberately don't |using| |sourceUnits| because of bug 1472569.  :-(
     using GeneralCharsBase::ungetCodeUnit;
     using GeneralCharsBase::updateLineInfoForEOL;
 
-    using typename GeneralCharsBase::SourceUnits;
-
   protected:
     using GeneralCharsBase::GeneralCharsBase;
 
     /**
      * Given the non-ASCII |lead| code unit just consumed, consume and return a
      * complete non-ASCII code point.  Line/column updates are not performed,
      * and line breaks are returned as-is without normalization.
      */
@@ -2069,16 +2067,17 @@ class TokenStreamChars<mozilla::Utf8Unit
     using typename SpecializedCharsBase::SourceUnitsEnd;
     using typename SpecializedCharsBase::SourceUnitsIterator;
 
   protected:
     using GeneralCharsBase::anyCharsAccess;
     using GeneralCharsBase::internalComputeLineOfContext;
     using TokenStreamCharsShared::isAsciiCodePoint;
     // Deliberately don't |using| |sourceUnits| because of bug 1472569.  :-(
+    using GeneralCharsBase::updateLineInfoForEOL;
 
   private:
     static char toHexChar(uint8_t nibble) {
         MOZ_ASSERT(nibble < 16);
         return "0123456789ABCDEF"[nibble];
     }
 
     static void byteToString(uint8_t n, char* str) {
@@ -2177,16 +2176,28 @@ class TokenStreamChars<mozilla::Utf8Unit
      * Given the non-ASCII |lead| code unit just consumed, consume the rest of
      * a non-ASCII code point.  The code point is not normalized: on success
      * |*codePoint| may be U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR.
      *
      * Report an error if an invalid code point is encountered.
      */
     MOZ_MUST_USE bool
     getNonAsciiCodePointDontNormalize(mozilla::Utf8Unit lead, char32_t* codePoint);
+
+    /**
+     * Given a just-consumed non-ASCII code unit |lead|, consume a full code
+     * point or LineTerminatorSequence (normalizing it to '\n') and store it in
+     * |*codePoint|.  Return true on success; on failure return false and leave
+     * |*codePoint| undefined.
+     *
+     * If a LineTerminatorSequence was consumed, also update line/column info.
+     *
+     * This function will change the current |sourceUnits| offset.
+     */
+    MOZ_MUST_USE bool getNonAsciiCodePoint(int32_t lead, int32_t* codePoint);
 };
 
 // TokenStream is the lexical scanner for JavaScript source text.
 //
 // It takes a buffer of CharT code units (currently only char16_t encoding
 // UTF-16, but we're adding either UTF-8 or Latin-1 single-byte text soon) and
 // linearly scans it into |Token|s.
 //