Bug 1498320 - Implement ScriptSource::appendSubstring for UTF-8 source text, using a newly-implemented StringBuffer::append(const Utf8Unit* units, size_t len). r=tcampbell
author Jeff Walden <jwalden@mit.edu>
Thu, 01 Nov 2018 17:34:56 -0700
changeset 446512 774a0684f72479571a325c28442cb87bd1858c8a
parent 446511 ba4bdabbdd529c9d29357073b66b36ccf23caf74
child 446513 ba6ba95b3cd24e79ea99df4adb2ab4e7be1d7d0a
push id 35042
push user aiakab@mozilla.com
push date Thu, 15 Nov 2018 09:54:38 +0000
treeherder mozilla-central@dca9c72df68b [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers tcampbell
bugs 1498320
milestone 65.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1498320 - Implement ScriptSource::appendSubstring for UTF-8 source text, using a newly-implemented StringBuffer::append(const Utf8Unit* units, size_t len). r=tcampbell
js/public/CharacterEncoding.h
js/src/util/StringBuffer.h
js/src/vm/CharacterEncoding.cpp
js/src/vm/JSScript.cpp
--- a/js/public/CharacterEncoding.h
+++ b/js/public/CharacterEncoding.h
@@ -77,16 +77,22 @@ class UTF8Chars : public mozilla::Range<
 
     UTF8Chars() : Base() {}
     UTF8Chars(char* aBytes, size_t aLength)
       : Base(reinterpret_cast<unsigned char*>(aBytes), aLength)
     {}
     UTF8Chars(const char* aBytes, size_t aLength)
       : Base(reinterpret_cast<unsigned char*>(const_cast<char*>(aBytes)), aLength)
     {}
+    UTF8Chars(mozilla::Utf8Unit* aUnits, size_t aLength)
+      : UTF8Chars(reinterpret_cast<char*>(aUnits), aLength)
+    {}
+    UTF8Chars(const mozilla::Utf8Unit* aUnits, size_t aLength)
+      : UTF8Chars(reinterpret_cast<const char*>(aUnits), aLength)
+    {}
 };
 
 /*
  * SpiderMonkey also deals directly with UTF-8 encoded text in some places.
  */
 class UTF8CharsZ : public mozilla::RangedPtr<unsigned char>
 {
     typedef mozilla::RangedPtr<unsigned char> Base;
@@ -103,16 +109,20 @@ class UTF8CharsZ : public mozilla::Range
     }
 
     UTF8CharsZ(unsigned char* aBytes, size_t aLength)
       : Base(aBytes, aLength)
     {
         MOZ_ASSERT(aBytes[aLength] == '\0');
     }
 
+    UTF8CharsZ(mozilla::Utf8Unit* aUnits, size_t aLength)
+      : UTF8CharsZ(reinterpret_cast<char*>(aUnits), aLength)
+    {}
+
     using Base::operator=;
 
     char* c_str() { return reinterpret_cast<char*>(get()); }
 };
 
 /*
  * A wrapper for a "const char*" that is encoded using UTF-8.
  * This class does not manage ownership of the data; that is left
--- a/js/src/util/StringBuffer.h
+++ b/js/src/util/StringBuffer.h
@@ -4,16 +4,17 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 #ifndef util_StringBuffer_h
 #define util_StringBuffer_h
 
 #include "mozilla/DebugOnly.h"
 #include "mozilla/MaybeOneOf.h"
+#include "mozilla/Utf8.h"
 
 #include "js/Vector.h"
 #include "vm/JSContext.h"
 
 namespace js {
 
 /*
  * String builder that eagerly checks for over-allocation past the maximum
@@ -154,16 +155,24 @@ class StringBuffer
 
     MOZ_MUST_USE bool append(const Latin1Char* begin, const Latin1Char* end) {
         return isLatin1() ? latin1Chars().append(begin, end) : twoByteChars().append(begin, end);
     }
     MOZ_MUST_USE bool append(const Latin1Char* chars, size_t len) {
         return append(chars, chars + len);
     }
 
+    /**
+     * Interpret the provided count of UTF-8 code units as UTF-8, and append
+     * the represented code points to this.  If the code units contain invalid
+     * UTF-8, leave the internal buffer in a consistent but unspecified state,
+     * report an error, and return false.
+     */
+    MOZ_MUST_USE bool append(const mozilla::Utf8Unit* units, size_t len);
+
     MOZ_MUST_USE bool append(const JS::ConstCharPtr chars, size_t len) {
         return append(chars.get(), chars.get() + len);
     }
     MOZ_MUST_USE bool appendN(Latin1Char c, size_t n) {
         return isLatin1() ? latin1Chars().appendN(c, n) : twoByteChars().appendN(c, n);
     }
 
     inline MOZ_MUST_USE bool append(JSString* str);
--- a/js/src/vm/CharacterEncoding.cpp
+++ b/js/src/vm/CharacterEncoding.cpp
@@ -3,23 +3,29 @@
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 #include "js/CharacterEncoding.h"
 
 #include "mozilla/Range.h"
 #include "mozilla/Sprintf.h"
+#include "mozilla/TextUtils.h"
+#include "mozilla/Utf8.h"
 
 #include <algorithm>
 #include <type_traits>
 
+#include "util/StringBuffer.h"
 #include "util/Unicode.h" // unicode::REPLACEMENT_CHARACTER
 #include "vm/JSContext.h"
 
+using mozilla::IsAscii;
+using mozilla::Utf8Unit;
+
 using namespace js;
 
 Latin1CharsZ
 JS::LossyTwoByteCharsToNewLatin1CharsZ(JSContext* cx,
                                        const mozilla::Range<const char16_t> tbchars)
 {
     MOZ_ASSERT(cx);
     size_t len = tbchars.length();
@@ -602,8 +608,73 @@ JS::StringIsASCII(const char* s)
     while (*s) {
         if (*s & 0x80) {
             return false;
         }
         s++;
     }
     return true;
 }
+
+bool
+StringBuffer::append(const Utf8Unit* units, size_t len)
+{
+    if (isLatin1()) {
+        Latin1CharBuffer& latin1 = latin1Chars();
+
+        while (len > 0) {
+            if (!IsAscii(*units)) {
+                break;
+            }
+
+            if (!latin1.append(units->toUnsignedChar())) {
+                return false;
+            }
+
+            ++units;
+            --len;
+        }
+        if (len == 0) {
+            return true;
+        }
+
+        // Non-ASCII doesn't *necessarily* mean we couldn't keep appending to
+        // |latin1|, but it's only possible for [U+0080, U+0100) code points,
+        // and handling the full complexity of UTF-8 only for that very small
+        // additional range isn't worth it.  Inflate to two-byte storage before
+        // appending the remaining code points.
+        if (!inflateChars()) {
+            return false;
+        }
+    }
+
+    UTF8Chars remainingUtf8(units, len);
+
+    // Determine how many UTF-16 code units are required to represent the
+    // remaining units.
+    size_t utf16Len = 0;
+    auto countInflated = [&utf16Len](char16_t c) -> LoopDisposition {
+        utf16Len++;
+        return LoopDisposition::Continue;
+    };
+    if (!InflateUTF8ToUTF16<OnUTF8Error::Throw>(cx, remainingUtf8, countInflated)) {
+        return false;
+    }
+
+    TwoByteCharBuffer& buf = twoByteChars();
+
+    size_t i = buf.length();
+    if (!buf.growByUninitialized(utf16Len)) {
+        return false;
+    }
+    MOZ_ASSERT(i + utf16Len == buf.length(),
+               "growByUninitialized assumed to increase length immediately");
+
+    char16_t* toFill = &buf[i];
+    auto appendUtf16 = [&toFill](char16_t unit) {
+        *toFill++ = unit;
+        return LoopDisposition::Continue;
+    };
+
+    MOZ_ALWAYS_TRUE(InflateUTF8ToUTF16<OnUTF8Error::Throw>(cx, remainingUtf8, appendUtf16));
+    MOZ_ASSERT(toFill == buf.end());
+    return true;
+}
--- a/js/src/vm/JSScript.cpp
+++ b/js/src/vm/JSScript.cpp
@@ -1827,27 +1827,37 @@ bool
 ScriptSource::appendSubstring(JSContext* cx, StringBuffer& buf, size_t start, size_t stop)
 {
     MOZ_ASSERT(start <= stop);
 
     size_t len = stop - start;
     UncompressedSourceCache::AutoHoldEntry holder;
 
     if (hasSourceType<Utf8Unit>()) {
-        MOZ_CRASH("for now");
-        return false;
-    } else {
-        PinnedUnits<char16_t> units(cx, this, holder, start, len);
-        if (!units.asChars()) {
+        PinnedUnits<Utf8Unit> pinned(cx, this, holder, start, len);
+        if (!pinned.get()) {
             return false;
         }
         if (len > SourceDeflateLimit && !buf.ensureTwoByteChars()) {
             return false;
         }
-        return buf.append(units.asChars(), len);
+
+        const Utf8Unit* units = pinned.get();
+        return buf.append(units, len);
+    } else {
+        PinnedUnits<char16_t> pinned(cx, this, holder, start, len);
+        if (!pinned.get()) {
+            return false;
+        }
+        if (len > SourceDeflateLimit && !buf.ensureTwoByteChars()) {
+            return false;
+        }
+
+        const char16_t* units = pinned.get();
+        return buf.append(units, len);
     }
 }
 
 JSFlatString*
 ScriptSource::functionBodyString(JSContext* cx)
 {
     MOZ_ASSERT(isFunctionBody());