Bug 1494942 - Improve AtomizeUTF8Chars performance. (r=Waldo)
author: Eric Faust <efausbmo@gmail.com>
Mon, 05 Nov 2018 17:37:51 -0800
changeset 444480 c589c495c24bee0597dcc98e22b3102e01856111
parent 444479 cbe76051498af8c9ba731d973f7eeceda52d3452
child 444481 2aaf8a7930e51d87c80d8c1824337db17b9de430
push id: 109598
push user: efaustbmo@gmail.com
push date: Tue, 06 Nov 2018 02:54:04 +0000
treeherder: mozilla-inbound@c589c495c24b [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: Waldo
bugs: 1494942
milestone: 65.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1494942 - Improve AtomizeUTF8Chars performance. (r=Waldo)
js/src/vm/CharacterEncoding.cpp
js/src/vm/JSAtom.cpp
--- a/js/src/vm/CharacterEncoding.cpp
+++ b/js/src/vm/CharacterEncoding.cpp
@@ -372,16 +372,38 @@ InflateUTF8ToUTF16(JSContext* cx, const 
             // code unit.
             i += n - 1;
         }
     }
 
     return true;
 }
 
+template <OnUTF8Error ErrorAction, typename CharT>
+static void
+CopyAndInflateUTF8IntoBuffer(JSContext* cx, const UTF8Chars src, CharT *dst, size_t outlen, bool allASCII)
+{
+    if (allASCII) {
+        size_t srclen = src.length();
+        MOZ_ASSERT(outlen == srclen);
+        for (uint32_t i = 0; i < srclen; i++) {
+            dst[i] = CharT(src[i]);
+        }
+    } else {
+        size_t j = 0;
+        auto push = [dst, &j](char16_t c) -> LoopDisposition {
+            dst[j++] = CharT(c);
+            return LoopDisposition::Continue;
+        };
+        MOZ_ALWAYS_TRUE((InflateUTF8ToUTF16<ErrorAction>(cx, src, push)));
+        MOZ_ASSERT(j == outlen);
+    }
+    dst[outlen] = CharT('\0');    // NUL char
+}
+
 template <OnUTF8Error ErrorAction, typename CharsT>
 static CharsT
 InflateUTF8StringHelper(JSContext* cx, const UTF8Chars src, size_t* outlen)
 {
     using CharT = typename CharsT::CharT;
     static_assert(std::is_same<CharT, char16_t>::value ||
                   std::is_same<CharT, Latin1Char>::value,
                   "bad CharT");
@@ -401,35 +423,20 @@ InflateUTF8StringHelper(JSContext* cx, c
     *outlen = len;
 
     CharT* dst = cx->template pod_malloc<CharT>(*outlen + 1);  // +1 for NUL
     if (!dst) {
         ReportOutOfMemory(cx);
         return CharsT();
     }
 
-    if (allASCII) {
-        size_t srclen = src.length();
-        MOZ_ASSERT(*outlen == srclen);
-        for (uint32_t i = 0; i < srclen; i++) {
-            dst[i] = CharT(src[i]);
-        }
-    } else {
-        constexpr OnUTF8Error errorMode = std::is_same<CharT, Latin1Char>::value
-            ? OnUTF8Error::InsertQuestionMark
-            : OnUTF8Error::InsertReplacementCharacter;
-        size_t j = 0;
-        auto push = [dst, &j](char16_t c) -> LoopDisposition {
-            dst[j++] = CharT(c);
-            return LoopDisposition::Continue;
-        };
-        MOZ_ALWAYS_TRUE((InflateUTF8ToUTF16<errorMode>(cx, src, push)));
-        MOZ_ASSERT(j == len);
-    }
-    dst[*outlen] = 0;    // NUL char
+    constexpr OnUTF8Error errorMode = std::is_same<CharT, Latin1Char>::value
+        ? OnUTF8Error::InsertQuestionMark
+        : OnUTF8Error::InsertReplacementCharacter;
+    CopyAndInflateUTF8IntoBuffer<errorMode>(cx, src, dst, *outlen, allASCII);
 
     return CharsT(dst, *outlen);
 }
 
 TwoByteCharsZ
 JS::UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen)
 {
     return InflateUTF8StringHelper<OnUTF8Error::Throw, TwoByteCharsZ>(cx, utf8, outlen);
@@ -450,30 +457,41 @@ JS::LossyUTF8CharsToNewTwoByteCharsZ(JSC
 
 TwoByteCharsZ
 JS::LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const JS::ConstUTF8CharsZ& utf8, size_t* outlen)
 {
     UTF8Chars chars(utf8.c_str(), strlen(utf8.c_str()));
     return InflateUTF8StringHelper<OnUTF8Error::InsertReplacementCharacter, TwoByteCharsZ>(cx, chars, outlen);
 }
 
+static void
+UpdateSmallestEncodingForChar(char16_t c, JS::SmallestEncoding* encoding)
+{
+    JS::SmallestEncoding newEncoding = JS::SmallestEncoding::ASCII;
+    if (c >= 0x80) {
+        if (c < 0x100) {
+            newEncoding = JS::SmallestEncoding::Latin1;
+        } else {
+            newEncoding = JS::SmallestEncoding::UTF16;
+        }
+    }
+    if (newEncoding > *encoding) {
+        *encoding = newEncoding;
+    }
+}
+
 JS::SmallestEncoding
 JS::FindSmallestEncoding(UTF8Chars utf8)
 {
     JS::SmallestEncoding encoding = JS::SmallestEncoding::ASCII;
     auto onChar = [&encoding](char16_t c) -> LoopDisposition {
-        if (c >= 0x80) {
-            if (c < 0x100) {
-                encoding = JS::SmallestEncoding::Latin1;
-            } else {
-                encoding = JS::SmallestEncoding::UTF16;
-                return LoopDisposition::Break;
-            }
-        }
-        return LoopDisposition::Continue;
+        UpdateSmallestEncodingForChar(c, &encoding);
+        return encoding == JS::SmallestEncoding::UTF16
+               ? LoopDisposition::Break
+               : LoopDisposition::Continue;
     };
     MOZ_ALWAYS_TRUE((InflateUTF8ToUTF16<OnUTF8Error::InsertReplacementCharacter>(
                          /* cx = */ nullptr, utf8, onChar)));
     return encoding;
 }
 
 Latin1CharsZ
 JS::UTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen)
@@ -482,16 +500,96 @@ JS::UTF8CharsToNewLatin1CharsZ(JSContext
 }
 
 Latin1CharsZ
 JS::LossyUTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen)
 {
     return InflateUTF8StringHelper<OnUTF8Error::InsertQuestionMark, Latin1CharsZ>(cx, utf8, outlen);
 }
 
+/**
+ * Atomization Helpers.
+ *
+ * These functions are extremely single-use, and are not intended for general
+ * consumption.
+ */
+
+bool
+GetUTF8AtomizationData(JSContext* cx, const JS::UTF8Chars utf8, size_t* outlen,
+                       JS::SmallestEncoding* encoding, HashNumber* hashNum)
+{
+    *outlen = 0;
+    *encoding = JS::SmallestEncoding::ASCII;
+    *hashNum = 0;
+
+    auto getMetadata = [outlen, encoding, hashNum](char16_t c) -> LoopDisposition {
+        (*outlen)++;
+        UpdateSmallestEncodingForChar(c, encoding);
+        *hashNum = mozilla::AddToHash(*hashNum, c);
+        return LoopDisposition::Continue;
+    };
+    if (!InflateUTF8ToUTF16<OnUTF8Error::Throw>(cx, utf8, getMetadata)) {
+        return false;
+    }
+
+    return true;
+}
+
+template <typename CharT>
+bool
+UTF8EqualsChars(const JS::UTF8Chars utfChars, const CharT* chars)
+{
+    size_t ind = 0;
+    bool isEqual = true;
+
+    auto checkEqual = [&isEqual, &ind, chars](char16_t c) -> LoopDisposition {
+    #ifdef DEBUG
+        JS::SmallestEncoding encoding = JS::SmallestEncoding::ASCII;
+        UpdateSmallestEncodingForChar(c, &encoding);
+        if (std::is_same<CharT, JS::Latin1Char>::value) {
+            MOZ_ASSERT(encoding <= JS::SmallestEncoding::Latin1);
+        } else if (!std::is_same<CharT, char16_t>::value) {
+            MOZ_CRASH("Invalid character type in UTF8EqualsChars");
+        }
+    #endif
+
+        if (CharT(c) != chars[ind]) {
+            isEqual = false;
+            return LoopDisposition::Break;
+        }
+
+        ind++;
+        return LoopDisposition::Continue;
+    };
+
+    // Callers must have validated |utfChars| beforehand: malformed UTF-8 crashes here.
+    InflateUTF8ToUTF16<OnUTF8Error::Crash>(/* cx = */ nullptr, utfChars, checkEqual);
+
+    return isEqual;
+}
+
+template bool UTF8EqualsChars<char16_t>(const JS::UTF8Chars, const char16_t*);
+template bool UTF8EqualsChars<JS::Latin1Char>(const JS::UTF8Chars, const JS::Latin1Char*);
+
+template <typename CharT>
+void
+InflateUTF8CharsToBufferAndTerminate(const UTF8Chars src, CharT* dst, size_t dstLen,
+                                     JS::SmallestEncoding encoding)
+{
+    CopyAndInflateUTF8IntoBuffer<OnUTF8Error::Crash>(/* cx = */ nullptr, src, dst, dstLen,
+                                                     encoding == JS::SmallestEncoding::ASCII);
+}
+
+template void
+InflateUTF8CharsToBufferAndTerminate<char16_t>(const UTF8Chars src, char16_t* dst, size_t dstLen,
+                                               JS::SmallestEncoding encoding);
+template void
+InflateUTF8CharsToBufferAndTerminate<JS::Latin1Char>(const UTF8Chars src, JS::Latin1Char* dst,
+                                                     size_t dstLen, JS::SmallestEncoding encoding);
+
 #ifdef DEBUG
 void
 JS::ConstUTF8CharsZ::validate(size_t aLength)
 {
     MOZ_ASSERT(data_);
     UTF8Chars chars(data_, aLength);
     auto nop = [](char16_t) -> LoopDisposition { return LoopDisposition::Continue; };
     InflateUTF8ToUTF16<OnUTF8Error::Crash>(/* cx = */ nullptr, chars, nop);
--- a/js/src/vm/JSAtom.cpp
+++ b/js/src/vm/JSAtom.cpp
@@ -36,47 +36,69 @@
 using namespace js;
 
 using mozilla::ArrayEnd;
 using mozilla::ArrayLength;
 using mozilla::Maybe;
 using mozilla::Nothing;
 using mozilla::RangedPtr;
 
+template <typename CharT>
+extern void InflateUTF8CharsToBufferAndTerminate(const UTF8Chars src, CharT* dst, size_t dstLen,
+                                                 JS::SmallestEncoding encoding);
+
+template <typename CharT>
+extern bool UTF8EqualsChars(const JS::UTF8Chars utf8, const CharT* chars);
+
+extern bool
+GetUTF8AtomizationData(JSContext* cx, const JS::UTF8Chars utf8, size_t* outlen, JS::SmallestEncoding* encoding,
+                       HashNumber* hashNum);
+
 struct js::AtomHasher::Lookup
 {
     union {
         const JS::Latin1Char* latin1Chars;
         const char16_t* twoByteChars;
+        const char* utf8Bytes;
     };
-    bool isLatin1;
+    enum {
+        TwoByteChar,
+        Latin1,
+        UTF8
+    } type;
     size_t length;
+    size_t byteLength;
     const JSAtom* atom; /* Optional. */
     JS::AutoCheckCannotGC nogc;
 
     HashNumber hash;
 
+    MOZ_ALWAYS_INLINE Lookup(const char* utf8Bytes, size_t byteLen, size_t length, HashNumber hash)
+      : utf8Bytes(utf8Bytes), type(UTF8), length(length), byteLength(byteLen), atom(nullptr), hash(hash)
+    {}
+
     MOZ_ALWAYS_INLINE Lookup(const char16_t* chars, size_t length)
-      : twoByteChars(chars), isLatin1(false), length(length), atom(nullptr),
+      : twoByteChars(chars), type(TwoByteChar), length(length), atom(nullptr),
         hash(mozilla::HashString(chars, length))
     {}
 
     MOZ_ALWAYS_INLINE Lookup(const JS::Latin1Char* chars, size_t length)
-      : latin1Chars(chars), isLatin1(true), length(length), atom(nullptr),
+      : latin1Chars(chars), type(Latin1), length(length), atom(nullptr),
         hash(mozilla::HashString(chars, length))
     {}
 
     inline explicit Lookup(const JSAtom* atom)
-      : isLatin1(atom->hasLatin1Chars()), length(atom->length()), atom(atom),
+      : type(atom->hasLatin1Chars() ? Latin1 : TwoByteChar), length(atom->length()), atom(atom),
         hash(atom->hash())
     {
-        if (isLatin1) {
+        if (type == Latin1) {
             latin1Chars = atom->latin1Chars(nogc);
             MOZ_ASSERT(mozilla::HashString(latin1Chars, length) == hash);
         } else {
+            MOZ_ASSERT(type == TwoByteChar);
             twoByteChars = atom->twoByteChars(nogc);
             MOZ_ASSERT(mozilla::HashString(twoByteChars, length) == hash);
         }
     }
 };
 
 inline HashNumber
 js::AtomHasher::hash(const Lookup& l)
@@ -92,27 +114,42 @@ js::AtomHasher::match(const AtomStateEnt
         return lookup.atom == key;
     }
     if (key->length() != lookup.length || key->hash() != lookup.hash) {
         return false;
     }
 
     if (key->hasLatin1Chars()) {
         const Latin1Char* keyChars = key->latin1Chars(lookup.nogc);
-        if (lookup.isLatin1) {
+        switch (lookup.type) {
+          case Lookup::Latin1:
             return mozilla::ArrayEqual(keyChars, lookup.latin1Chars, lookup.length);
+          case Lookup::TwoByteChar:
+            return EqualChars(keyChars, lookup.twoByteChars, lookup.length);
+          case Lookup::UTF8: {
+            JS::UTF8Chars utf8(lookup.utf8Bytes, lookup.byteLength);
+            return UTF8EqualsChars(utf8, keyChars);
+          }
         }
-        return EqualChars(keyChars, lookup.twoByteChars, lookup.length);
     }
 
     const char16_t* keyChars = key->twoByteChars(lookup.nogc);
-    if (lookup.isLatin1) {
+    switch (lookup.type) {
+      case Lookup::Latin1:
         return EqualChars(lookup.latin1Chars, keyChars, lookup.length);
+      case Lookup::TwoByteChar:
+        return mozilla::ArrayEqual(keyChars, lookup.twoByteChars, lookup.length);
+      case Lookup::UTF8: {
+        JS::UTF8Chars utf8(lookup.utf8Bytes, lookup.byteLength);
+        return UTF8EqualsChars(utf8, keyChars);
+      }
     }
-    return mozilla::ArrayEqual(keyChars, lookup.twoByteChars, lookup.length);
+
+    MOZ_ASSERT_UNREACHABLE("AtomHasher::match unknown type");
+    return false;
 }
 
 inline JSAtom*
 js::AtomStateEntry::asPtr(JSContext* cx) const
 {
     JSAtom* atom = asPtrUnbarriered();
     if (!cx->helperThread()) {
         JSString::readBarrier(atom);
@@ -615,29 +652,44 @@ PermanentlyAtomizeAndCopyChars(JSContext
                                const Maybe<uint32_t>& indexValue,
                                const AtomHasher::Lookup& lookup);
 
 template <typename CharT>
 MOZ_ALWAYS_INLINE static JSAtom*
 AllocateNewAtom(JSContext* cx, const CharT* tbchars, size_t length, PinningBehavior pin,
                 const Maybe<uint32_t>& indexValue, const AtomHasher::Lookup& lookup);
 
+template <typename CharT>
+MOZ_ALWAYS_INLINE
+static JSAtom*
+AtomizeAndCopyCharsFromLookup(JSContext* cx, const CharT* tbchars, size_t length, const AtomHasher::Lookup& lookup,
+                              PinningBehavior pin, const Maybe<uint32_t>& indexValue);
+
 /* |tbchars| must not point into an inline or short string. */
 template <typename CharT>
 MOZ_ALWAYS_INLINE
 static JSAtom*
 AtomizeAndCopyChars(JSContext* cx, const CharT* tbchars, size_t length, PinningBehavior pin,
                     const Maybe<uint32_t>& indexValue)
 {
     if (JSAtom* s = cx->staticStrings().lookup(tbchars, length)) {
         return s;
     }
 
     AtomHasher::Lookup lookup(tbchars, length);
+    return AtomizeAndCopyCharsFromLookup(cx, tbchars, length, lookup, pin, indexValue);
+}
 
+
+template <typename CharT>
+MOZ_ALWAYS_INLINE
+static JSAtom*
+AtomizeAndCopyCharsFromLookup(JSContext* cx, const CharT* tbchars, size_t length, const AtomHasher::Lookup& lookup,
+                              PinningBehavior pin, const Maybe<uint32_t>& indexValue)
+{
     // Try the per-Zone cache first. If we find the atom there we can avoid the
     // atoms lock, the markAtom call, and the multiple HashSet lookups below.
     // We don't use the per-Zone cache if we want a pinned atom: handling that
     // is more complicated and pinning atoms is relatively uncommon.
     Zone* zone = cx->zone();
     Maybe<AtomSet::AddPtr> zonePtr;
     if (MOZ_LIKELY(zone && pin == DoNotPinAtom)) {
         zonePtr.emplace(zone->atomCache().lookupForAdd(lookup));
@@ -805,24 +857,93 @@ PermanentlyAtomizeAndCopyChars(JSContext
     {
         ReportOutOfMemory(cx);
         return nullptr;
     }
 
     return atom;
 }
 
+struct AtomizeUTF8CharsWrapper
+{
+    JS::UTF8Chars utf8;
+    JS::SmallestEncoding encoding;
+
+    AtomizeUTF8CharsWrapper(const JS::UTF8Chars& chars, JS::SmallestEncoding minEncode)
+      : utf8(chars), encoding(minEncode)
+    { }
+};
+
+template <typename CharT>
+MOZ_ALWAYS_INLINE
+static JSFlatString*
+MakeFlatStringForAtomization(JSContext* cx, const CharT* tbchars, size_t length)
+{
+    return NewStringCopyN<NoGC>(cx, tbchars, length);
+}
+
+template<typename CharT>
+MOZ_ALWAYS_INLINE
+static JSFlatString*
+MakeUTF8AtomHelper(JSContext* cx, const AtomizeUTF8CharsWrapper* chars, size_t length)
+{
+    if (JSInlineString::lengthFits<CharT>(length)) {
+        CharT* storage;
+        JSInlineString* str = AllocateInlineString<NoGC>(cx, length, &storage);
+        if (!str) {
+            return nullptr;
+        }
+
+        InflateUTF8CharsToBufferAndTerminate(chars->utf8, storage, length, chars->encoding);
+        return str;
+    }
+
+    // MakeUTF8AtomHelper is called from deep in the Atomization path, which expects
+    // functions to fail gracefully with nullptr on OOM, without throwing.
+    //
+    // Flat strings are null-terminated. Leave room with length + 1
+    UniquePtr<CharT[], JS::FreePolicy> newStr(js_pod_malloc<CharT>(length + 1));
+    if (!newStr) {
+        return nullptr;
+    }
+
+    InflateUTF8CharsToBufferAndTerminate(chars->utf8, newStr.get(), length, chars->encoding);
+
+    JSFlatString* str = JSFlatString::new_<NoGC>(cx, newStr.get(), length);
+    if (!str) {
+        return nullptr;
+    }
+
+    mozilla::Unused << newStr.release();
+    return str;
+}
+
+template<>
+MOZ_ALWAYS_INLINE
+/* static */ JSFlatString*
+MakeFlatStringForAtomization(JSContext* cx, const AtomizeUTF8CharsWrapper* chars, size_t length)
+{
+    if (length == 0) {
+        return cx->emptyString();
+    }
+
+    if (chars->encoding == JS::SmallestEncoding::UTF16) {
+        return MakeUTF8AtomHelper<char16_t>(cx, chars, length);
+    }
+    return MakeUTF8AtomHelper<JS::Latin1Char>(cx, chars, length);
+}
+
 template <typename CharT>
 MOZ_ALWAYS_INLINE static JSAtom*
 AllocateNewAtom(JSContext* cx, const CharT* tbchars, size_t length, PinningBehavior pin,
                 const Maybe<uint32_t>& indexValue, const AtomHasher::Lookup& lookup)
 {
     AutoAllocInAtomsZone ac(cx);
 
-    JSFlatString* flat = NewStringCopyN<NoGC>(cx, tbchars, length);
+    JSFlatString* flat = MakeFlatStringForAtomization(cx, tbchars, length);
     if (!flat) {
         // Grudgingly forgo last-ditch GC. The alternative would be to release
         // the lock, manually GC here, and retry from the top. If you fix this,
         // please also fix or comment the similar case in Symbol::new_.
         ReportOutOfMemory(cx);
         return nullptr;
     }
 
@@ -914,29 +1035,32 @@ template JSAtom*
 js::AtomizeChars(JSContext* cx, const Latin1Char* chars, size_t length, PinningBehavior pin);
 
 template JSAtom*
 js::AtomizeChars(JSContext* cx, const char16_t* chars, size_t length, PinningBehavior pin);
 
 JSAtom*
 js::AtomizeUTF8Chars(JSContext* cx, const char* utf8Chars, size_t utf8ByteLength)
 {
-    // This could be optimized to hand the char16_t's directly to the JSAtom
-    // instead of making a copy. UTF8CharsToNewTwoByteCharsZ should be
-    // refactored to take an JSContext so that this function could also.
-
-    UTF8Chars utf8(utf8Chars, utf8ByteLength);
+    // Since the static strings are all ASCII, we can check them before trying anything else.
+    if (JSAtom* s = cx->staticStrings().lookup(utf8Chars, utf8ByteLength)) {
+        return s;
+    }
 
     size_t length;
-    UniqueTwoByteChars chars(JS::UTF8CharsToNewTwoByteCharsZ(cx, utf8, &length).get());
-    if (!chars) {
+    HashNumber hash;
+    JS::SmallestEncoding forCopy;
+    UTF8Chars utf8(utf8Chars, utf8ByteLength);
+    if (!GetUTF8AtomizationData(cx, utf8, &length, &forCopy, &hash)) {
         return nullptr;
     }
 
-    return AtomizeChars(cx, chars.get(), length);
+    AtomizeUTF8CharsWrapper chars(utf8, forCopy);
+    AtomHasher::Lookup lookup(utf8Chars, utf8ByteLength, length, hash);
+    return AtomizeAndCopyCharsFromLookup(cx, &chars, length, lookup, DoNotPinAtom, Nothing());
 }
 
 bool
 js::IndexToIdSlow(JSContext* cx, uint32_t index, MutableHandleId idp)
 {
     MOZ_ASSERT(index > JSID_INT_MAX);
 
     char16_t buf[UINT32_CHAR_BUFFER_LENGTH];