Bug 1135377 - Part 9: Use RegExp unicode flag in String.prototype.{match,replace,split}. r=till, f=anba
author Tooru Fujisawa <arai_a@mac.com>
Fri, 07 Aug 2015 08:13:37 +0900
changeset 277064 5b0ced0284a9e13609fad337abc442a290ee30de
parent 277063 3bd3d3ed5fe4ffc440f6e9ae2d2161481034daae
child 277065 69c495efe7991438ce7aaeabd9367bb66d0ceccb
push id 16724
push user cbook@mozilla.com
push date Mon, 21 Dec 2015 11:00:52 +0000
treeherder fx-team@3f3f0361567c [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers till
bugs 1135377
milestone 46.0a1
Bug 1135377 - Part 9: Use RegExp unicode flag in String.prototype.{match,replace,split}. r=till, f=anba
js/src/jsstr.cpp
js/src/tests/ecma_6/String/AdvanceStringIndex.js
--- a/js/src/jsstr.cpp
+++ b/js/src/jsstr.cpp
@@ -2271,16 +2271,51 @@ DoMatchLocal(JSContext* cx, const CallAr
     RootedValue rval(cx);
     if (!CreateRegExpMatchResult(cx, input, matches, &rval))
         return false;
 
     args.rval().set(rval);
     return true;
 }
 
+/*
+ * ES6 21.2.5.2.3 AdvanceStringIndex: returns the index following |index| in
+ * |input|, stepping over a whole surrogate pair when |unicode| is true.
+ * |length| is supplied by the caller; the callers here pass input->length().
+ */
+static size_t
+AdvanceStringIndex(HandleLinearString input, size_t length, size_t index, bool unicode)
+{
+    const size_t nextCodeUnit = index + 1;
+
+    /* Steps 1-3 (implicit). */
+
+    /* Step 4. */
+    if (!unicode)
+        return nextCodeUnit;
+
+    /* If input is latin1, there is no surrogate pair. */
+    if (input->hasLatin1Chars())
+        return nextCodeUnit;
+
+    /* Step 6: no code unit follows |index|, so nothing can pair with it. */
+    if (nextCodeUnit >= length)
+        return nextCodeUnit;
+
+    JS::AutoCheckCannotGC nogc;
+    const char16_t* chars = input->twoByteChars(nogc);
+
+    /* Steps 7-10: the second unit is only read when the first is a lead. */
+    bool isPair = unicode::IsLeadSurrogate(chars[index]) &&
+                  unicode::IsTrailSurrogate(chars[nextCodeUnit]);
+
+    /* Step 11. */
+    return isPair ? index + 2 : nextCodeUnit;
+}
+
 /* ES5 15.5.4.10 step 8. */
 static bool
 DoMatchGlobal(JSContext* cx, const CallArgs& args, RegExpStatics* res, HandleLinearString input,
               StringRegExpGuard& g)
 {
     // Step 8a.
     //
     // This single zeroing of "lastIndex" covers all "lastIndex" changes in the
@@ -2326,16 +2361,17 @@ DoMatchGlobal(JSContext* cx, const CallA
 
     // The loop variables from steps 8c-e aren't needed, as we use different
     // techniques from the spec to implement step 8f's loop.
 
     // Step 8f.
     ScopedMatchPairs matches(&cx->tempLifoAlloc());
     size_t charsLen = input->length();
     RegExpShared& re = g.regExp();
+    bool unicode = re.unicode();
     for (size_t searchIndex = 0; searchIndex <= charsLen; ) {
         if (!CheckForInterrupt(cx))
             return false;
 
         // Steps 8f(i-ii), minus "lastIndex" updates (see above).
         RegExpRunStatus status = re.execute(cx, input, searchIndex, &matches);
         if (status == RegExpRunStatus_Error)
             return false;
@@ -2343,17 +2379,19 @@ DoMatchGlobal(JSContext* cx, const CallA
         // Step 8f(ii).
         if (status == RegExpRunStatus_Success_NotFound)
             break;
 
         lastSuccessfulStart = searchIndex;
         MatchPair& match = matches[0];
 
         // Steps 8f(iii)(1-3).
-        searchIndex = match.isEmpty() ? match.limit + 1 : match.limit;
+        searchIndex = match.isEmpty()
+                      ? AdvanceStringIndex(input, charsLen, match.limit, unicode)
+                      : match.limit;
 
         // Step 8f(iii)(4-5).
         JSLinearString* str = NewDependentString(cx, input, match.start, match.length());
         if (!str)
             return false;
         if (!elements.append(StringValue(str)))
             return false;
     }
@@ -2609,31 +2647,34 @@ DoMatchForReplaceLocal(JSContext* cx, Re
 
     return ReplaceRegExp(cx, res, rdata);
 }
 
 static bool
 DoMatchForReplaceGlobal(JSContext* cx, RegExpStatics* res, HandleLinearString linearStr,
                         RegExpShared& re, ReplaceData& rdata, size_t* rightContextOffset)
 {
+    bool unicode = re.unicode();
     size_t charsLen = linearStr->length();
     ScopedMatchPairs matches(&cx->tempLifoAlloc());
     for (size_t count = 0, searchIndex = 0; searchIndex <= charsLen; ++count) {
         if (!CheckForInterrupt(cx))
             return false;
 
         RegExpRunStatus status = re.execute(cx, linearStr, searchIndex, &matches);
         if (status == RegExpRunStatus_Error)
             return false;
 
         if (status == RegExpRunStatus_Success_NotFound)
             break;
 
         MatchPair& match = matches[0];
-        searchIndex = match.isEmpty() ? match.limit + 1 : match.limit;
+        searchIndex = match.isEmpty()
+                      ? AdvanceStringIndex(linearStr, charsLen, match.limit, unicode)
+                      : match.limit;
         *rightContextOffset = match.limit;
 
         if (!res->updateFromMatchPairs(cx, linearStr, matches))
             return false;
 
         if (!ReplaceRegExp(cx, res, rdata))
             return false;
     }
@@ -3223,16 +3264,17 @@ StrReplaceRegexpRemove(JSContext* cx, Ha
     size_t charsLen = linearStr->length();
 
     ScopedMatchPairs matches(&cx->tempLifoAlloc());
     size_t startIndex = 0; /* Index used for iterating through the string. */
     size_t lastIndex = 0;  /* Index after last successful match. */
     size_t lazyIndex = 0;  /* Index before last successful match. */
 
     /* Accumulate StringRanges for unmatched substrings. */
+    bool unicode = re.unicode();
     while (startIndex <= charsLen) {
         if (!CheckForInterrupt(cx))
             return nullptr;
 
         RegExpRunStatus status = re.execute(cx, linearStr, startIndex, &matches);
         if (status == RegExpRunStatus_Error)
             return nullptr;
         if (status == RegExpRunStatus_Success_NotFound)
@@ -3243,17 +3285,19 @@ StrReplaceRegexpRemove(JSContext* cx, Ha
         if (size_t(match.start) > lastIndex) {
             if (!ranges.append(StringRange(lastIndex, match.start - lastIndex)))
                 return nullptr;
         }
 
         lazyIndex = lastIndex;
         lastIndex = match.limit;
 
-        startIndex = match.isEmpty() ? match.limit + 1 : match.limit;
+        startIndex = match.isEmpty()
+                     ? AdvanceStringIndex(linearStr, charsLen, match.limit, unicode)
+                     : match.limit;
 
         /* Non-global removal executes at most once. */
         if (!re.global())
             break;
     }
 
     RegExpStatics* res;
 
@@ -3626,17 +3670,17 @@ class SplitMatchResult {
     }
 };
 
 } /* anonymous namespace */
 
 template<class Matcher>
 static JSObject*
 SplitHelper(JSContext* cx, HandleLinearString str, uint32_t limit, const Matcher& splitMatch,
-            HandleObjectGroup group)
+            HandleObjectGroup group, bool unicode)
 {
     size_t strLength = str->length();
     SplitMatchResult result;
 
     /* Step 11. */
     if (strLength == 0) {
         if (!splitMatch(cx, str, 0, &result))
             return nullptr;
@@ -3691,17 +3735,17 @@ SplitHelper(JSContext* cx, HandleLinearS
         /* Step 13(c)(i). */
         size_t sepLength = result.length();
         size_t endIndex = result.endIndex();
         if (sepLength == 0 && endIndex == strLength)
             break;
 
         /* Step 13(c)(ii). */
         if (endIndex == lastEndIndex) {
-            index++;
+            index = AdvanceStringIndex(str, strLength, index, unicode);
             continue;
         }
 
         /* Step 13(c)(iii). */
         MOZ_ASSERT(lastEndIndex < endIndex);
         MOZ_ASSERT(sepLength <= strLength);
         MOZ_ASSERT(lastEndIndex + sepLength <= endIndex);
 
@@ -3920,24 +3964,24 @@ js::str_split(JSContext* cx, unsigned ar
 
     /* Steps 11-15. */
     RootedObject aobj(cx);
     if (!re.initialized()) {
         if (sepstr->length() == 0) {
             aobj = CharSplitHelper(cx, linearStr, limit, group);
         } else {
             SplitStringMatcher matcher(cx, sepstr);
-            aobj = SplitHelper(cx, linearStr, limit, matcher, group);
+            aobj = SplitHelper(cx, linearStr, limit, matcher, group, false);
         }
     } else {
         RegExpStatics* res = cx->global()->getRegExpStatics(cx);
         if (!res)
             return false;
         SplitRegExpMatcher matcher(*re, res);
-        aobj = SplitHelper(cx, linearStr, limit, matcher, group);
+        aobj = SplitHelper(cx, linearStr, limit, matcher, group, re->unicode());
     }
     if (!aobj)
         return false;
 
     /* Step 16. */
     MOZ_ASSERT(aobj->group() == group);
     args.rval().setObject(*aobj);
     return true;
@@ -3955,17 +3999,17 @@ js::str_split_string(JSContext* cx, Hand
         return nullptr;
 
     uint32_t limit = UINT32_MAX;
 
     if (linearSep->length() == 0)
         return CharSplitHelper(cx, linearStr, limit, group);
 
     SplitStringMatcher matcher(cx, linearSep);
-    return SplitHelper(cx, linearStr, limit, matcher, group);
+    return SplitHelper(cx, linearStr, limit, matcher, group, false);
 }
 
 /*
  * Python-esque sequence operations.
  */
 static bool
 str_concat(JSContext* cx, unsigned argc, Value* vp)
 {
new file mode 100644
--- /dev/null
+++ b/js/src/tests/ecma_6/String/AdvanceStringIndex.js
@@ -0,0 +1,43 @@
+var BUGNUMBER = 1135377;
+var summary = "Implement RegExp unicode flag -- AdvanceStringIndex in global match and replace.";
+
+print(BUGNUMBER + ": " + summary);
+
+// ==== String.prototype.match ====
+// With /u, an empty match advances past a whole surrogate pair, never into one.
+assertEqArray("\uD83D\uDC38\uD83D\uDC39X\uD83D\uDC3A".match(/\uD83D|X|/gu),
+              ["", "", "X", "", ""]);
+assertEqArray("\uD83D\uDC38\uD83D\uDC39X\uD83D\uDC3A".match(/\uDC38|X|/gu),
+              ["", "", "X", "", ""]);
+assertEqArray("\uD83D\uDC38\uD83D\uDC39X\uD83D\uDC3A".match(/\uD83D\uDC38|X|/gu),
+              ["\uD83D\uDC38", "", "X", "", ""]);
+
+// ==== String.prototype.replace ====
+// replace() returns a string, so results are compared with assertEq.
+// empty string replacement (optimized)
+assertEq("\uD83D\uDC38\uD83D\uDC39X\uD83D\uDC3A".replace(/\uD83D|X|/gu, ""),
+         "\uD83D\uDC38\uD83D\uDC39\uD83D\uDC3A");
+assertEq("\uD83D\uDC38\uD83D\uDC39X\uD83D\uDC3A".replace(/\uDC38|X|/gu, ""),
+         "\uD83D\uDC38\uD83D\uDC39\uD83D\uDC3A");
+assertEq("\uD83D\uDC38\uD83D\uDC39X\uD83D\uDC3A".replace(/\uD83D\uDC38|X|/gu, ""),
+         "\uD83D\uDC39\uD83D\uDC3A");
+
+// non-empty string replacement
+assertEq("\uD83D\uDC38\uD83D\uDC39X\uD83D\uDC3A".replace(/\uD83D|X|/gu, "x"),
+         "x\uD83D\uDC38x\uD83D\uDC39xx\uD83D\uDC3Ax");
+assertEq("\uD83D\uDC38\uD83D\uDC39X\uD83D\uDC3A".replace(/\uDC38|X|/gu, "x"),
+         "x\uD83D\uDC38x\uD83D\uDC39xx\uD83D\uDC3Ax");
+assertEq("\uD83D\uDC38\uD83D\uDC39X\uD83D\uDC3A".replace(/\uD83D\uDC38|X|/gu, "x"),
+         "xx\uD83D\uDC39xx\uD83D\uDC3Ax");
+
+// ==== String.prototype.split ====
+// Separators carry only the u flag; a lone surrogate must not split a pair.
+assertEqArray("\uD83D\uDC38\uD83D\uDC39X\uD83D\uDC3A".split(/\uD83D|X|/u),
+              ["\uD83D\uDC38", "\uD83D\uDC39", "\uD83D\uDC3A"]);
+assertEqArray("\uD83D\uDC38\uD83D\uDC39X\uD83D\uDC3A".split(/\uDC38|X|/u),
+              ["\uD83D\uDC38", "\uD83D\uDC39", "\uD83D\uDC3A"]);
+assertEqArray("\uD83D\uDC38\uD83D\uDC39X\uD83D\uDC3A".split(/\uD83D\uDC38|X|/u),
+              ["", "\uD83D\uDC39", "\uD83D\uDC3A"]);
+
+if (typeof reportCompare === "function")
+    reportCompare(true, true);