Bug 1319465 - Use the newer ICU string normalization API to improve performance for partially normalized strings. r=arai
author André Bargull <andre.bargull@gmail.com>
Tue, 22 Nov 2016 09:20:18 -0800
changeset 324136 061d9d4d64ca7c21431c879ad0332aa5544cfe1c
parent 324135 968fe7fa5d930f46e8b2caf2e24e316beed02f5d
child 324137 ee43b0af2a45d0bd9096f06e4aa5ea5c792c2ddc
push id 24
push user maklebus@msu.edu
push date Tue, 20 Dec 2016 03:11:33 +0000
reviewers arai
bugs 1319465
milestone 53.0a1
Bug 1319465 - Use the newer ICU string normalization API to improve performance for partially normalized strings. r=arai
config/check_spidermonkey_style.py
js/src/jsstr.cpp
--- a/config/check_spidermonkey_style.py
+++ b/config/check_spidermonkey_style.py
@@ -82,17 +82,17 @@ included_inclnames_to_ignore = set([
     'shellmoduleloader.out.h',  # generated in $OBJDIR
     'unicode/timezone.h',       # ICU
     'unicode/ucal.h',           # ICU
     'unicode/uclean.h',         # ICU
     'unicode/ucol.h',           # ICU
     'unicode/udat.h',           # ICU
     'unicode/udatpg.h',         # ICU
     'unicode/uenum.h',          # ICU
-    'unicode/unorm.h',          # ICU
+    'unicode/unorm2.h',         # ICU
     'unicode/unum.h',           # ICU
     'unicode/unumsys.h',        # ICU
     'unicode/ustring.h',        # ICU
     'unicode/utypes.h',         # ICU
     'vtune/VTuneWrapper.h'      # VTune
 ])
 
 # These files have additional constraints on where they are #included, so we
--- a/js/src/jsstr.cpp
+++ b/js/src/jsstr.cpp
@@ -32,17 +32,17 @@
 #include "jsutil.h"
 
 #include "builtin/Intl.h"
 #include "builtin/RegExp.h"
 #include "jit/InlinableNatives.h"
 #include "js/Conversions.h"
 #include "js/UniquePtr.h"
 #if ENABLE_INTL_API
-#include "unicode/unorm.h"
+#include "unicode/unorm2.h"
 #endif
 #include "vm/GlobalObject.h"
 #include "vm/Interpreter.h"
 #include "vm/Opcodes.h"
 #include "vm/Printer.h"
 #include "vm/RegExpObject.h"
 #include "vm/RegExpStatics.h"
 #include "vm/StringBuffer.h"
@@ -926,89 +926,151 @@ js::str_localeCompare(JSContext* cx, uns
         return false;
 
     args.rval().setInt32(result);
     return true;
 }
 #endif
 
 #if EXPOSE_INTL_API
-/* ES6 20140210 draft 21.1.3.12. */
+// ES2017 draft rev 45e890512fd77add72cc0ee742785f9f6f6482de
+// 21.1.3.12 String.prototype.normalize ( [ form ] )
 bool
 js::str_normalize(JSContext* cx, unsigned argc, Value* vp)
 {
     CallArgs args = CallArgsFromVp(argc, vp);
 
-    // Steps 1-3.
+    // Steps 1-2.
     RootedString str(cx, ToStringForStringFunction(cx, args.thisv()));
     if (!str)
         return false;
 
-    // Step 4.
-    UNormalizationMode form;
+    enum NormalizationForm {
+        NFC, NFD, NFKC, NFKD
+    };
+
+    NormalizationForm form;
     if (!args.hasDefined(0)) {
-        form = UNORM_NFC;
+        // Step 3.
+        form = NFC;
     } else {
-        // Steps 5-6.
+        // Step 4.
         RootedLinearString formStr(cx, ArgToRootedString(cx, args, 0));
         if (!formStr)
             return false;
 
-        // Step 7.
+        // Step 5.
         if (EqualStrings(formStr, cx->names().NFC)) {
-            form = UNORM_NFC;
+            form = NFC;
         } else if (EqualStrings(formStr, cx->names().NFD)) {
-            form = UNORM_NFD;
+            form = NFD;
         } else if (EqualStrings(formStr, cx->names().NFKC)) {
-            form = UNORM_NFKC;
+            form = NFKC;
         } else if (EqualStrings(formStr, cx->names().NFKD)) {
-            form = UNORM_NFKD;
+            form = NFKD;
         } else {
             JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, JSMSG_INVALID_NORMALIZE_FORM);
             return false;
         }
     }
 
-    // Step 8.
+    JSLinearString* linear = str->ensureLinear(cx);
+    if (!linear)
+        return false;
+
+    // Latin1 strings are already in Normalization Form C.
+    if (form == NFC && linear->hasLatin1Chars()) {
+        // Step 7.
+        args.rval().setString(str);
+        return true;
+    }
+
+    // Step 6.
     AutoStableStringChars stableChars(cx);
-    if (!str->ensureFlat(cx) || !stableChars.initTwoByte(cx, str))
+    if (!stableChars.initTwoByte(cx, linear))
         return false;
 
+    mozilla::Range<const char16_t> srcChars = stableChars.twoByteRange();
+
+    // The unorm2_getXXXInstance() methods return a shared instance which must
+    // not be deleted.
+    UErrorCode status = U_ZERO_ERROR;
+    const UNormalizer2* normalizer;
+    if (form == NFC) {
+        normalizer = unorm2_getNFCInstance(&status);
+    } else if (form == NFD) {
+        normalizer = unorm2_getNFDInstance(&status);
+    } else if (form == NFKC) {
+        normalizer = unorm2_getNFKCInstance(&status);
+    } else {
+        MOZ_ASSERT(form == NFKD);
+        normalizer = unorm2_getNFKDInstance(&status);
+    }
+    if (U_FAILURE(status)) {
+        JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, JSMSG_INTERNAL_INTL_ERROR);
+        return false;
+    }
+
+    int32_t spanLength = unorm2_spanQuickCheckYes(normalizer,
+                                                  Char16ToUChar(srcChars.begin().get()),
+                                                  srcChars.length(), &status);
+    if (U_FAILURE(status)) {
+        JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, JSMSG_INTERNAL_INTL_ERROR);
+        return false;
+    }
+    MOZ_ASSERT(0 <= spanLength && size_t(spanLength) <= srcChars.length());
+
+    // Return if the input string is already normalized.
+    if (size_t(spanLength) == srcChars.length()) {
+        // Step 7.
+        args.rval().setString(str);
+        return true;
+    }
+
     static const size_t INLINE_CAPACITY = 32;
 
-    const UChar* srcChars = Char16ToUChar(stableChars.twoByteRange().begin().get());
-    int32_t srcLen = AssertedCast<int32_t>(str->length());
     Vector<char16_t, INLINE_CAPACITY> chars(cx);
-    if (!chars.resize(INLINE_CAPACITY))
+    if (!chars.resize(Max(INLINE_CAPACITY, srcChars.length())))
         return false;
 
-    UErrorCode status = U_ZERO_ERROR;
-    int32_t size = unorm_normalize(srcChars, srcLen, form, 0,
-                                   Char16ToUChar(chars.begin()), INLINE_CAPACITY,
-                                   &status);
+    // Copy the already normalized prefix.
+    if (spanLength > 0)
+        PodCopy(chars.begin(), srcChars.begin().get(), size_t(spanLength));
+
+    mozilla::RangedPtr<const char16_t> remainingStart = srcChars.begin() + spanLength;
+    size_t remainingLength = srcChars.length() - size_t(spanLength);
+
+    int32_t size = unorm2_normalizeSecondAndAppend(normalizer, Char16ToUChar(chars.begin()),
+                                                   spanLength, chars.length(),
+                                                   Char16ToUChar(remainingStart.get()),
+                                                   remainingLength, &status);
     if (status == U_BUFFER_OVERFLOW_ERROR) {
+        MOZ_ASSERT(size >= 0);
         if (!chars.resize(size))
             return false;
         status = U_ZERO_ERROR;
 #ifdef DEBUG
         int32_t finalSize =
 #endif
-        unorm_normalize(srcChars, srcLen, form, 0,
-                        Char16ToUChar(chars.begin()), size,
-                        &status);
-        MOZ_ASSERT(size == finalSize || U_FAILURE(status), "unorm_normalize behaved inconsistently");
+        unorm2_normalizeSecondAndAppend(normalizer, Char16ToUChar(chars.begin()), spanLength,
+                                        chars.length(), Char16ToUChar(remainingStart.get()),
+                                        remainingLength, &status);
+        MOZ_ASSERT_IF(!U_FAILURE(status), size == finalSize);
     }
-    if (U_FAILURE(status))
+    if (U_FAILURE(status)) {
+        JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, JSMSG_INTERNAL_INTL_ERROR);
         return false;
-
+    }
+
+    MOZ_ASSERT(size >= 0);
     JSString* ns = NewStringCopyN<CanGC>(cx, chars.begin(), size);
     if (!ns)
         return false;
 
-    // Step 9.
+    // Step 7.
     args.rval().setString(ns);
     return true;
 }
 #endif
 
 bool
 js::str_charAt(JSContext* cx, unsigned argc, Value* vp)
 {