Bug 1561567 - Introduce rope-walking conversion from JS strings to UTF-8 in a byte span. r=jwalden
author Henri Sivonen <hsivonen@hsivonen.fi>
Wed, 18 Sep 2019 08:28:24 +0000
changeset 493753 6fbb5c422ee17a15cf25974156d05ab5e5331d71
parent 493752 487bbce291b0dc2d330f9e39bd5d2df2c0a84626
child 493754 ac0ff32d087b388315c47f2008c4cb07d56a6eb8
push id 95669
push user hsivonen@mozilla.com
push date Wed, 18 Sep 2019 08:30:40 +0000
treeherder autoland@c7d88625df86 [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers jwalden
bugs 1561567
milestone 71.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1561567 - Introduce rope-walking conversion from JS strings to UTF-8 in a byte span. r=jwalden Differential Revision: https://phabricator.services.mozilla.com/D41941
js/public/CharacterEncoding.h
js/src/jsapi.cpp
js/src/jsapi.h
js/src/vm/StringType.cpp
js/src/vm/StringType.h
--- a/js/public/CharacterEncoding.h
+++ b/js/public/CharacterEncoding.h
@@ -280,16 +280,20 @@ LossyUTF8CharsToNewTwoByteCharsZ(JSConte
 
 /*
  * Returns the length of the char buffer required to encode |s| as UTF8.
  * Does not include the null-terminator.
  */
 JS_PUBLIC_API size_t GetDeflatedUTF8StringLength(JSFlatString* s);
 
 /*
+ * Note: Unlike this function, JS_EncodeStringToUTF8BufferPartial in jsapi.h
+ * does not require flattening the string first. Consider using that function
+ * instead of this one.
+ *
  * Encode |src| as UTF8. The caller must either ensure |dst| has enough space
  * to encode the entire string or pass the length of the buffer as |dstlenp|,
  * in which case the function will encode characters from the string until
  * the buffer is exhausted. Does not write the null terminator.
  *
  * If |dstlenp| is provided, it will be updated to hold the number of bytes
  * written to the buffer. If |numcharsp| is provided, it will be updated to hold
  * the number of Unicode characters written to the buffer (which can be less
--- a/js/src/jsapi.cpp
+++ b/js/src/jsapi.cpp
@@ -4562,16 +4562,25 @@ JS_PUBLIC_API bool JS_EncodeStringToBuff
     const char16_t* src = linear->twoByteChars(nogc);
     for (size_t i = 0; i < writeLength; i++) {
       buffer[i] = char(src[i]);
     }
   }
   return true;
 }
 
+JS_PUBLIC_API mozilla::Maybe<mozilla::Tuple<size_t, size_t> >
+JS_EncodeStringToUTF8BufferPartial(JSContext* cx, JSString* str,
+                                   mozilla::Span<char> buffer) {
+  AssertHeapIsIdle();
+  CHECK_THREAD(cx);
+  JS::AutoCheckCannotGC nogc;  // No-GC token that encodeUTF8Partial requires.
+  return str->encodeUTF8Partial(nogc, buffer);  // Rope walk; Nothing() on OOM.
+}
+
 JS_PUBLIC_API JS::Symbol* JS::NewSymbol(JSContext* cx,
                                         HandleString description) {
   AssertHeapIsIdle();
   CHECK_THREAD(cx);
   if (description) {
     cx->check(description);
   }
 
--- a/js/src/jsapi.h
+++ b/js/src/jsapi.h
@@ -6,16 +6,17 @@
 
 /* JavaScript API. */
 
 #ifndef jsapi_h
 #define jsapi_h
 
 #include "mozilla/AlreadyAddRefed.h"
 #include "mozilla/FloatingPoint.h"
+#include "mozilla/Maybe.h"
 #include "mozilla/MemoryReporting.h"
 #include "mozilla/Range.h"
 #include "mozilla/RangedPtr.h"
 #include "mozilla/RefPtr.h"
 #include "mozilla/Utf8.h"
 #include "mozilla/Variant.h"
 
 #include <stdarg.h>
@@ -2430,16 +2431,44 @@ JS_PUBLIC_API size_t JS_GetStringEncodin
  * length parameter, the string will be cut and only length bytes will be
  * written into the buffer.
  */
 MOZ_MUST_USE JS_PUBLIC_API bool JS_EncodeStringToBuffer(JSContext* cx,
                                                         JSString* str,
                                                         char* buffer,
                                                         size_t length);
 
+/**
+ * Encode as many scalar values of the string as UTF-8 as can fit
+ * into the caller-provided buffer, replacing unpaired surrogates
+ * with the REPLACEMENT CHARACTER.
+ *
+ * If JS_StringHasLatin1Chars(str) returns true, the function
+ * is guaranteed to convert the entire string if
+ * buffer.Length() >= 2 * JS_GetStringLength(str). Otherwise,
+ * the function is guaranteed to convert the entire string if
+ * buffer.Length() >= 3 * JS_GetStringLength(str).
+ *
+ * This function does not alter the representation of |str| or
+ * any |JSString*| substring that is a constituent part of it.
+ * Returns mozilla::Nothing() on OOM, without reporting an error;
+ * some data may have been written to |buffer| when this happens.
+ *
+ * If there's no OOM, returns the number of code units read from
+ * |str| and the number of UTF-8 code units written to |buffer|.
+ *
+ * The semantics of this function match the semantics of
+ * TextEncoder.encodeInto().
+ *
+ * The function does not store an additional zero byte.
+ */
+JS_PUBLIC_API mozilla::Maybe<mozilla::Tuple<size_t, size_t> >
+JS_EncodeStringToUTF8BufferPartial(JSContext* cx, JSString* str,
+                                   mozilla::Span<char> buffer);
+
 namespace JS {
 
 JS_PUBLIC_API bool PropertySpecNameEqualsId(JSPropertySpec::Name name,
                                             HandleId id);
 
 /**
  * Create a jsid that does not need to be marked for GC.
  *
--- a/js/src/vm/StringType.cpp
+++ b/js/src/vm/StringType.cpp
@@ -12,31 +12,34 @@
 #include "mozilla/HashFunctions.h"
 #include "mozilla/MathAlgorithms.h"
 #include "mozilla/MemoryReporting.h"
 #include "mozilla/PodOperations.h"
 #include "mozilla/RangedPtr.h"
 #include "mozilla/TextUtils.h"
 #include "mozilla/TypeTraits.h"
 #include "mozilla/Unused.h"
+#include "mozilla/Utf8.h"
+#include "mozilla/Vector.h"
 
 #include <algorithm>    // std::{all_of,copy_n,enable_if,is_const,move}
 #include <type_traits>  // std::is_unsigned
 
 #include "jsfriendapi.h"
 
 #include "frontend/BytecodeCompiler.h"
 #include "gc/GCInternals.h"
 #include "gc/Marking.h"
 #include "gc/Nursery.h"
 #include "js/CharacterEncoding.h"
 #include "js/StableStringChars.h"
 #include "js/Symbol.h"
 #include "js/UbiNode.h"
 #include "util/StringBuffer.h"
+#include "util/Unicode.h"
 #include "vm/GeckoProfiler.h"
 
 #include "vm/GeckoProfiler-inl.h"
 #include "vm/JSContext-inl.h"
 #include "vm/JSObject-inl.h"
 #include "vm/Realm-inl.h"
 
 using namespace js;
@@ -124,16 +127,135 @@ JS::ubi::Node::Size JS::ubi::Concrete<JS
 
   size += str.sizeOfExcludingThis(mallocSizeOf);
 
   return size;
 }
 
 const char16_t JS::ubi::Concrete<JSString>::concreteTypeName[] = u"JSString";
 
+// Walks the rope iteratively (explicit stack, left child first) and encodes
+// each linear piece as UTF-8 into |buffer|, without flattening. A lead
+// surrogate ending a two-byte piece is held in |pendingLeadSurrogate| so it
+// can pair with a trail surrogate starting the next piece; otherwise it
+// becomes U+FFFD. Returns Some((code units read, bytes written)), or
+// Nothing() if the stack cannot grow (OOM).
+mozilla::Maybe<mozilla::Tuple<size_t, size_t> > JSString::encodeUTF8Partial(
+    const JS::AutoRequireNoGC& nogc, mozilla::Span<char> buffer) const {
+  mozilla::Vector<const JSString*, 16, SystemAllocPolicy> stack;
+  const JSString* current = this;
+  char16_t pendingLeadSurrogate = 0;  // U+0000 means no pending lead surrogate
+  size_t totalRead = 0;
+  size_t totalWritten = 0;
+
+  // Replaces the unpaired |pendingLeadSurrogate| with U+FFFD (EF BF BD).
+  // Returns false, changing nothing, when fewer than 3 bytes remain.
+  auto flushUnpairedLead = [&]() {
+    if (buffer.Length() < 3) {
+      return false;
+    }
+    buffer[0] = '\xEF';
+    buffer[1] = '\xBF';
+    buffer[2] = '\xBD';
+    buffer = buffer.From(3);
+    totalRead += 1;  // pendingLeadSurrogate
+    totalWritten += 3;
+    pendingLeadSurrogate = 0;
+    return true;
+  };
+
+  for (;;) {
+    if (current->isRope()) {
+      JSRope& rope = current->asRope();
+      if (!stack.append(rope.rightChild())) {
+        // OOM
+        return mozilla::Nothing();
+      }
+      current = rope.leftChild();
+      continue;
+    }
+
+    JSLinearString& linear = current->asLinear();
+    if (MOZ_LIKELY(linear.hasLatin1Chars())) {
+      // A Latin1 piece cannot supply a trail surrogate for a pending lead.
+      if (MOZ_UNLIKELY(pendingLeadSurrogate) && !flushUnpairedLead()) {
+        return mozilla::Some(mozilla::MakeTuple(totalRead, totalWritten));
+      }
+      auto src = mozilla::AsChars(
+          mozilla::MakeSpan(linear.latin1Chars(nogc), linear.length()));
+      size_t read;
+      size_t written;
+      mozilla::Tie(read, written) =
+          mozilla::ConvertLatin1toUtf8Partial(src, buffer);
+      buffer = buffer.From(written);
+      totalRead += read;
+      totalWritten += written;
+      if (read < src.Length()) {
+        return mozilla::Some(mozilla::MakeTuple(totalRead, totalWritten));
+      }
+    } else {
+      auto src = mozilla::MakeSpan(linear.twoByteChars(nogc), linear.length());
+      if (MOZ_UNLIKELY(pendingLeadSurrogate)) {
+        char16_t first = 0;
+        if (!src.IsEmpty()) {
+          first = src[0];
+        }
+        if (unicode::IsTrailSurrogate(first)) {
+          // Got a surrogate pair
+          if (buffer.Length() < 4) {
+            return mozilla::Some(mozilla::MakeTuple(totalRead, totalWritten));
+          }
+          uint32_t astral = unicode::UTF16Decode(pendingLeadSurrogate, first);
+          buffer[0] = char(0b1111'0000 | (astral >> 18));
+          buffer[1] = char(0b1000'0000 | ((astral >> 12) & 0b11'1111));
+          buffer[2] = char(0b1000'0000 | ((astral >> 6) & 0b11'1111));
+          buffer[3] = char(0b1000'0000 | (astral & 0b11'1111));
+          src = src.From(1);
+          buffer = buffer.From(4);
+          totalRead += 2;  // both pendingLeadSurrogate and first!
+          totalWritten += 4;
+          pendingLeadSurrogate = 0;
+        } else if (!flushUnpairedLead()) {  // unpaired surrogate
+          return mozilla::Some(mozilla::MakeTuple(totalRead, totalWritten));
+        }
+      }
+      // An empty piece (e.g. a lone trail surrogate consumed above) is not the
+      // end of the string: fall through so the remaining pieces get encoded.
+      if (!src.IsEmpty()) {
+        char16_t last = src[src.Length() - 1];
+        if (unicode::IsLeadSurrogate(last)) {
+          src = src.To(src.Length() - 1);
+          pendingLeadSurrogate = last;
+        } else {
+          MOZ_ASSERT(!pendingLeadSurrogate);
+        }
+        size_t read;
+        size_t written;
+        mozilla::Tie(read, written) =
+            mozilla::ConvertUtf16toUtf8Partial(src, buffer);
+        buffer = buffer.From(written);
+        totalRead += read;
+        totalWritten += written;
+        if (read < src.Length()) {
+          return mozilla::Some(mozilla::MakeTuple(totalRead, totalWritten));
+        }
+      }
+    }
+    if (stack.empty()) {
+      break;
+    }
+    current = stack.popCopy();
+  }
+  // The string ended on a lead surrogate; emit U+FFFD for it if it fits.
+  if (MOZ_UNLIKELY(pendingLeadSurrogate)) {
+    flushUnpairedLead();
+  }
+  return mozilla::Some(mozilla::MakeTuple(totalRead, totalWritten));
+}
+
 #if defined(DEBUG) || defined(JS_JITSPEW)
 
 template <typename CharT>
 /*static */
 void JSString::dumpChars(const CharT* s, size_t n, js::GenericPrinter& out) {
   if (n == SIZE_MAX) {
     n = 0;
     while (s[n]) {
--- a/js/src/vm/StringType.h
+++ b/js/src/vm/StringType.h
@@ -2,16 +2,17 @@
  * vim: set ts=8 sts=2 et sw=2 tw=80:
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 #ifndef vm_StringType_h
 #define vm_StringType_h
 
+#include "mozilla/Maybe.h"
 #include "mozilla/MemoryReporting.h"
 #include "mozilla/Range.h"
 #include "mozilla/TextUtils.h"
 
 #include <type_traits>  // std::is_same
 
 #include "jsapi.h"
 #include "jsfriendapi.h"
@@ -547,16 +548,36 @@ class JSString : public js::gc::CellWith
   /* Only called by the GC for strings with the AllocKind::STRING kind. */
 
   inline void finalize(JSFreeOp* fop);
 
   /* Gets the number of bytes that the chars take on the heap. */
 
   size_t sizeOfExcludingThis(mozilla::MallocSizeOf mallocSizeOf);
 
+  /* Encode as many scalar values of the string as UTF-8 as can fit
+   * into the caller-provided buffer, replacing unpaired surrogates
+   * with the REPLACEMENT CHARACTER.
+   *
+   * Returns the number of code units read from this string and the
+   * number of UTF-8 code units written to |buffer|.
+   *
+   * The semantics of this method match the semantics of
+   * TextEncoder.encodeInto().
+   *
+   * This function doesn't modify the representation -- rope, linear,
+   * flat, atom, etc. -- of this string. If this string is a rope,
+   * it also doesn't modify the representation of its left or right
+   * halves, or of the halves of those halves, and so on.
+   *
+   * Returns mozilla::Nothing on OOM.
+   */
+  mozilla::Maybe<mozilla::Tuple<size_t, size_t> > encodeUTF8Partial(
+      const JS::AutoRequireNoGC& nogc, mozilla::Span<char> buffer) const;
+
   // Make offset accessors public.
   using Base::offsetOfFlags;
   using Base::offsetOfLength;
 
  private:
   // To help avoid writing Spectre-unsafe code, we only allow MacroAssembler
   // to call the method below.
   friend class js::jit::MacroAssembler;