Bug 1135377 - Part 4: Support everything Atom in RegExp with unicode flag. r=till, f=anba
author: Tooru Fujisawa <arai_a@mac.com>
Fri, 07 Aug 2015 08:11:52 +0900
changeset 277059 803f23393bc4864f6cc342cf4f5469bb521387b1
parent 277058 4e05611fe3dd8f91320ed1d123bfc2032d11eabe
child 277060 872d04109a5ce42e20ab9466bf80809e90b157d1
push id: 16724
push user: cbook@mozilla.com
push date: Mon, 21 Dec 2015 11:00:52 +0000
treeherder: fx-team@3f3f0361567c [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: till
bugs: 1135377
milestone: 46.0a1
Bug 1135377 - Part 4: Support everything Atom in RegExp with unicode flag. r=till, f=anba
js/src/irregexp/RegExpParser.cpp
js/src/tests/ecma_6/RegExp/unicode-everything.js
--- a/js/src/irregexp/RegExpParser.cpp
+++ b/js/src/irregexp/RegExpParser.cpp
@@ -1173,16 +1173,51 @@ TrailSurrogateAtom(LifoAlloc* alloc, cha
 {
     RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
     builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(
         RegExpAssertion::NOT_AFTER_LEAD_SURROGATE));
     builder->AddCharacter(value);
     return builder->ToRegExp();
 }
 
+static inline RegExpTree*
+UnicodeEverythingAtom(LifoAlloc* alloc)
+{
+    RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
+    // Builds the tree used for '.' when the unicode flag is set, as four alternatives.
+    // Alternative 1: everything except \x0a, \x0d, \u2028 and \u2029,
+    // with the surrogate block (LeadSurrogateMin..TrailSurrogateMax) left out of the class.
+    CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
+    ranges->append(CharacterRange::Range(0x0, 0x09));
+    ranges->append(CharacterRange::Range(0x0b, 0x0c));
+    ranges->append(CharacterRange::Range(0x0e, 0x2027));
+    ranges->append(CharacterRange::Range(0x202A, unicode::LeadSurrogateMin - 1));
+    ranges->append(CharacterRange::Range(unicode::TrailSurrogateMax + 1, unicode::UTF16Max));
+    builder->AddAtom(alloc->newInfallible<RegExpCharacterClass>(ranges, false));
+
+    builder->NewAlternative();
+    // Alternative 2: a lead surrogate that is not followed by a trail surrogate.
+    builder->AddAtom(RangeAtom(alloc, unicode::LeadSurrogateMin, unicode::LeadSurrogateMax));
+    builder->AddAtom(NegativeLookahead(alloc, unicode::TrailSurrogateMin,
+                                       unicode::TrailSurrogateMax));
+
+    builder->NewAlternative();
+    // Alternative 3: a trail surrogate that does not come right after a lead surrogate.
+    builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(
+        RegExpAssertion::NOT_AFTER_LEAD_SURROGATE));
+    builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, unicode::TrailSurrogateMax));
+
+    builder->NewAlternative();
+    // Alternative 4: a lead surrogate followed by a trail surrogate, i.e. a surrogate pair.
+    builder->AddAtom(RangeAtom(alloc, unicode::LeadSurrogateMin, unicode::LeadSurrogateMax));
+    builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, unicode::TrailSurrogateMax));
+
+    return builder->ToRegExp();
+}
+
 // Disjunction ::
 //   Alternative
 //   Alternative | Disjunction
 // Alternative ::
 //   [empty]
 //   Term Alternative
 // Term ::
 //   Assertion
@@ -1270,16 +1305,20 @@ RegExpParser<CharT>::ParseDisjunction()
                 multiline_ ? RegExpAssertion::END_OF_LINE :
                 RegExpAssertion::END_OF_INPUT;
             builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(assertion_type));
             continue;
           }
           case '.': {
             Advance();
             // everything except \x0a, \x0d, \u2028 and \u2029
+            if (unicode_) {
+                builder->AddAtom(UnicodeEverythingAtom(alloc));
+                break;
+            }
             CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
             CharacterRange::AddClassEscape(alloc, '.', ranges);
             RegExpTree* atom = alloc->newInfallible<RegExpCharacterClass>(ranges, false);
             builder->AddAtom(atom);
             break;
           }
           case '(': {
             SubexpressionType subexpr_type = CAPTURE;
new file mode 100644
--- /dev/null
+++ b/js/src/tests/ecma_6/RegExp/unicode-everything.js
@@ -0,0 +1,59 @@
+var BUGNUMBER = 1135377;
+var summary = "Implement RegExp unicode flag -- everything Atom.";
+
+print(BUGNUMBER + ": " + summary);
+
+// ==== standalone ====
+// A BMP character and a character written as \u{1F438} are each matched whole.
+assertEqArray(/./u.exec("ABC"),
+              ["A"]);
+assertEqArray(/./u.exec("\u{1F438}BC"),
+              ["\u{1F438}"]);
+// Lead surrogate \uD83D first: matched as a pair only when a trail surrogate (\uDC00-\uDFFF) follows.
+assertEqArray(/./u.exec("\uD83D\uDBFF"),
+              ["\uD83D"]);
+assertEqArray(/./u.exec("\uD83D\uDC00"),
+              ["\uD83D\uDC00"]);
+assertEqArray(/./u.exec("\uD83D\uDFFF"),
+              ["\uD83D\uDFFF"]);
+assertEqArray(/./u.exec("\uD83D\uE000"),
+              ["\uD83D"]);
+assertEqArray(/./u.exec("\uD83D"),
+              ["\uD83D"]);
+assertEqArray(/./u.exec("\uD83DA"),
+              ["\uD83D"]);
+// Trail surrogate \uDC38: combined only with a lead surrogate (\uD800-\uDBFF) directly before it.
+assertEqArray(/./u.exec("\uD7FF\uDC38"),
+              ["\uD7FF"]);
+assertEqArray(/./u.exec("\uD800\uDC38"),
+              ["\uD800\uDC38"]);
+assertEqArray(/./u.exec("\uDBFF\uDC38"),
+              ["\uDBFF\uDC38"]);
+assertEqArray(/./u.exec("\uDC00\uDC38"),
+              ["\uDC00"]);
+assertEqArray(/./u.exec("\uDC38"),
+              ["\uDC38"]);
+assertEqArray(/./u.exec("A\uDC38"),
+              ["A"]);
+// Same prefixes followed by "A": when no pair forms, the match starts at the lone \uDC38.
+assertEqArray(/.A/u.exec("\uD7FF\uDC38A"),
+              ["\uDC38A"]);
+assertEqArray(/.A/u.exec("\uD800\uDC38A"),
+              ["\uD800\uDC38A"]);
+assertEqArray(/.A/u.exec("\uDBFF\uDC38A"),
+              ["\uDBFF\uDC38A"]);
+assertEqArray(/.A/u.exec("\uDC00\uDC38A"),
+              ["\uDC38A"]);
+
+// ==== leading multiple ====
+
+assertEqArray(/.*A/u.exec("\u{1F438}\u{1F438}\u{1F438}A"),
+              ["\u{1F438}\u{1F438}\u{1F438}A"]);
+
+// ==== trailing multiple ====
+
+assertEqArray(/A.*/u.exec("A\u{1F438}\u{1F438}\u{1F438}"),
+              ["A\u{1F438}\u{1F438}\u{1F438}"]);
+
+if (typeof reportCompare === "function")
+    reportCompare(true, true);