Bug 1478170 - Implement SourceUnits::peekCodePoint for UTF-8. r=arai
author Jeff Walden <jwalden@mit.edu>
Wed, 25 Jul 2018 14:51:26 -0700
changeset 428546 292bd4af4056fd252939f0b9a31ad71ce9270785
parent 428545 0ccbc10cd6a1b7ce53f852fb8f7f81cac1ecc31f
child 428547 7f427db8f6f88bf86df7adadf2cdeea98b757865
push id 34337
push user ncsoregi@mozilla.com
push date Thu, 26 Jul 2018 21:58:45 +0000
treeherder mozilla-central@8f2f847b2f9d [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers arai
bugs 1478170
milestone 63.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1478170 - Implement SourceUnits::peekCodePoint for UTF-8. r=arai
js/src/frontend/TokenStream.h
mfbt/Utf8.h
--- a/js/src/frontend/TokenStream.h
+++ b/js/src/frontend/TokenStream.h
@@ -180,16 +180,17 @@
  * TokenStreamSpecific declarer to specify this is more explicit), we couldn't.
  */
 
 #include "mozilla/ArrayUtils.h"
 #include "mozilla/Assertions.h"
 #include "mozilla/Attributes.h"
 #include "mozilla/Casting.h"
 #include "mozilla/DebugOnly.h"
+#include "mozilla/Maybe.h"
 #include "mozilla/MemoryChecking.h"
 #include "mozilla/PodOperations.h"
 #include "mozilla/TextUtils.h"
 #include "mozilla/TypeTraits.h"
 #include "mozilla/Unused.h"
 #include "mozilla/Utf8.h"
 
 #include <algorithm>
@@ -1084,16 +1085,37 @@ PeekCodePoint(const char16_t* const ptr,
     } else {
         c = unicode::UTF16Decode(lead, ptr[1]);
         len = 2;
     }
 
     return PeekedCodePoint<char16_t>(c, len);
 }
 
+inline PeekedCodePoint<mozilla::Utf8Unit>  // UTF-8 overload: peek at the code point at |ptr|.
+PeekCodePoint(const mozilla::Utf8Unit* const ptr, const mozilla::Utf8Unit* const end)
+{ // Result is none() or (code point, length in units); |ptr| is const and is not advanced.
+    if (MOZ_UNLIKELY(ptr >= end))
+        return PeekedCodePoint<mozilla::Utf8Unit>::none();  // no unit left before |end|
+    // An ASCII lead unit is returned as-is with a length of one unit.
+    const mozilla::Utf8Unit lead = ptr[0];
+    if (mozilla::IsAscii(lead))
+        return PeekedCodePoint<mozilla::Utf8Unit>(lead.toUint8(), 1);
+    // Non-ASCII lead: DecodeOneUtf8CodePoint gets |lead| plus the range just past it.
+    const mozilla::Utf8Unit* afterLead = ptr + 1;  // address is passed on; presumably advanced by the decoder -- TODO confirm
+    mozilla::Maybe<char32_t> codePoint = mozilla::DecodeOneUtf8CodePoint(lead, &afterLead, end);
+    if (codePoint.isNothing())
+        return PeekedCodePoint<mozilla::Utf8Unit>::none();  // decoder yielded no code point
+    // Length in units = distance from |ptr| to where |afterLead| points after decoding.
+    auto len = mozilla::AssertedCast<uint8_t>(mozilla::PointerRangeSize(ptr, afterLead));
+    MOZ_ASSERT(len <= 4);  // UTF-8 encodes a code point in at most four units
+
+    return PeekedCodePoint<mozilla::Utf8Unit>(codePoint.value(), len);
+}
+
 // This is the low-level interface to the JS source code buffer.  It just gets
 // raw Unicode code units -- 16-bit char16_t units of source text that are not
 // (always) full code points, and 8-bit units of UTF-8 source text soon.
 // TokenStreams functions are layered on top and do some extra stuff like
 // converting all EOL sequences to '\n', tracking the line number, and setting
 // |flags.isEOF|.  (The "raw" in "raw Unicode code units" refers to the lack of
 // EOL sequence normalization.)
 //
--- a/mfbt/Utf8.h
+++ b/mfbt/Utf8.h
@@ -193,16 +193,23 @@ public:
     return static_cast<uint8_t>(mValue);
   }
 
   // We currently don't expose |&mValue|.  |UnicodeData| sort of does, but
   // that's a somewhat separate concern, justified in different comments in
   // that other code.
 };
 
+/** Returns true iff |aUnit| is an ASCII value, as judged on its |toUint8()| form. */
+inline bool
+IsAscii(Utf8Unit aUnit)
+{
+  return IsAscii(aUnit.toUint8());  // presumably selects a non-Utf8Unit overload -- confirm
+}
+
 /**
  * Returns true if the given length-delimited memory consists of a valid UTF-8
  * string, false otherwise.
  *
  * A valid UTF-8 string contains no overlong-encoded code points (as one would
  * expect) and contains no code unit sequence encoding a UTF-16 surrogate.  The
  * string *may* contain U+0000 NULL code points.
  */