Bug 1338373 - Update word boundary in RegExp with unicode and ignoreCase flags to include U+017F and U+212A. r=till
author: Tooru Fujisawa <arai_a@mac.com>
Sat, 11 Feb 2017 11:47:57 +0900
changeset 342321 c043f1737e222180549cb754ddd2e83f0d2223bd
parent 342320 93ff4280ee796767ccbb6f24594fe776af406aa6
child 342322 3ac2e09d98cb55fb9440b0cb9267ed365293aa9a
push id: 86828
push user: arai_a@mac.com
push date: Sat, 11 Feb 2017 02:48:56 +0000
treeherder: mozilla-inbound@c043f1737e22 [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: till
bugs: 1338373
milestone: 54.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1338373 - Update word boundary in RegExp with unicode and ignoreCase flags to include U+017F and U+212A. r=till
js/src/irregexp/RegExpEngine.cpp
js/src/irregexp/RegExpEngine.h
js/src/tests/ecma_6/RegExp/unicode-ignoreCase-word-boundary.js
--- a/js/src/irregexp/RegExpEngine.cpp
+++ b/js/src/irregexp/RegExpEngine.cpp
@@ -2249,17 +2249,20 @@ BoyerMoorePositionInfo::Set(int characte
 {
     SetInterval(Interval(character, character));
 }
 
 void
 BoyerMoorePositionInfo::SetInterval(const Interval& interval)
 {
     s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
-    w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
+    if (unicode_ignore_case_)
+        w_ = AddRange(w_, kIgnoreCaseWordRanges, kIgnoreCaseWordRangeCount, interval);
+    else
+        w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
     d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
     surrogate_ =
         AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
     if (interval.to() - interval.from() >= kMapSize - 1) {
         if (map_count_ != kMapSize) {
             map_count_ = kMapSize;
             for (int i = 0; i < kMapSize; i++)
                 map_[i] = true;
@@ -2286,21 +2289,22 @@ BoyerMoorePositionInfo::SetAll()
         for (int i = 0; i < kMapSize; i++)
             map_[i] = true;
     }
 }
 
 BoyerMooreLookahead::BoyerMooreLookahead(LifoAlloc* alloc, size_t length, RegExpCompiler* compiler)
   : length_(length), compiler_(compiler), bitmaps_(*alloc)
 {
+    bool unicode_ignore_case = compiler->unicode() && compiler->ignore_case();
     max_char_ = MaximumCharacter(compiler->ascii());
 
     bitmaps_.reserve(length);
     for (size_t i = 0; i < length; i++)
-        bitmaps_.append(alloc->newInfallible<BoyerMoorePositionInfo>(alloc));
+        bitmaps_.append(alloc->newInfallible<BoyerMoorePositionInfo>(alloc, unicode_ignore_case));
 }
 
 // Find the longest range of lookahead that has the fewest number of different
 // characters that can occur at a given position.  Since we are optimizing two
 // different parameters at once this is a tradeoff.
 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
   int biggest_points = 0;
   // If more than 32 characters out of 128 can occur it is unlikely that we can
@@ -2956,25 +2960,32 @@ EmitNotInSurrogatePair(RegExpCompiler* c
 
     assembler->Bind(&ok);
     on_success->Emit(compiler, &new_trace);
 }
 
 // Check for [0-9A-Z_a-z].
 static void
 EmitWordCheck(RegExpMacroAssembler* assembler,
-              jit::Label* word, jit::Label* non_word, bool fall_through_on_word)
+              jit::Label* word, jit::Label* non_word, bool fall_through_on_word,
+              bool unicode_ignore_case)
 {
-    if (assembler->CheckSpecialCharacterClass(fall_through_on_word ? 'w' : 'W',
+    if (!unicode_ignore_case &&
+        assembler->CheckSpecialCharacterClass(fall_through_on_word ? 'w' : 'W',
                                               fall_through_on_word ? non_word : word))
     {
         // Optimized implementation available.
         return;
     }
 
+    if (unicode_ignore_case) {
+        assembler->CheckCharacter(0x017F, word);
+        assembler->CheckCharacter(0x212A, word);
+    }
+
     assembler->CheckCharacterGT('z', non_word);
     assembler->CheckCharacterLT('0', non_word);
     assembler->CheckCharacterGT('a' - 1, word);
     assembler->CheckCharacterLT('9' + 1, word);
     assembler->CheckCharacterLT('A', non_word);
     assembler->CheckCharacterLT('Z' + 1, word);
 
     if (fall_through_on_word)
@@ -3013,17 +3024,18 @@ AssertionNode::EmitBoundaryCheck(RegExpC
     bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
     if (next_is_word_character == Trace::UNKNOWN) {
         jit::Label before_non_word;
         jit::Label before_word;
         if (trace->characters_preloaded() != 1) {
             assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
         }
         // Fall through on non-word.
-        EmitWordCheck(assembler, &before_word, &before_non_word, false);
+        EmitWordCheck(assembler, &before_word, &before_non_word, false,
+                      compiler->unicode() && compiler->ignore_case());
         // Next character is not a word character.
         assembler->Bind(&before_non_word);
         jit::Label ok;
         BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
         assembler->JumpOrBacktrack(&ok);
 
         assembler->Bind(&before_word);
         BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
@@ -3053,17 +3065,18 @@ AssertionNode::BacktrackIfPrevious(RegEx
     if (new_trace.cp_offset() == 0) {
         // The start of input counts as a non-word character, so the question is
         // decided if we are at the start.
         assembler->CheckAtStart(non_word);
     }
     // We already checked that we are not at the start of input so it must be
     // OK to load the previous character.
     assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
-    EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
+    EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord,
+                  compiler->unicode() && compiler->ignore_case());
 
     assembler->Bind(&fall_through);
     on_success()->Emit(compiler, &new_trace);
 }
 
 void
 AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
                                     RegExpCompiler* compiler,
--- a/js/src/irregexp/RegExpEngine.h
+++ b/js/src/irregexp/RegExpEngine.h
@@ -1190,23 +1190,24 @@ ContainedInLattice
 AddRange(ContainedInLattice a,
          const int* ranges,
          int ranges_size,
          Interval new_range);
 
 class BoyerMoorePositionInfo
 {
   public:
-    explicit BoyerMoorePositionInfo(LifoAlloc* alloc)
+    explicit BoyerMoorePositionInfo(LifoAlloc* alloc, bool unicode_ignore_case)
       : map_(*alloc),
         map_count_(0),
         w_(kNotYet),
         s_(kNotYet),
         d_(kNotYet),
-        surrogate_(kNotYet)
+        surrogate_(kNotYet),
+        unicode_ignore_case_(unicode_ignore_case)
     {
         map_.reserve(kMapSize);
         for (int i = 0; i < kMapSize; i++)
             map_.append(false);
     }
 
     bool& at(int i) { return map_[i]; }
 
@@ -1223,16 +1224,19 @@ class BoyerMoorePositionInfo
 
   private:
     InfallibleVector<bool, 0> map_;
     int map_count_;  // Number of set bits in the map.
     ContainedInLattice w_;  // The \w character class.
     ContainedInLattice s_;  // The \s character class.
     ContainedInLattice d_;  // The \d character class.
     ContainedInLattice surrogate_;  // Surrogate UTF-16 code units.
+
+    // True if the RegExp has unicode and ignoreCase flags.
+    bool unicode_ignore_case_;
 };
 
 typedef InfallibleVector<BoyerMoorePositionInfo*, 1> BoyerMoorePositionInfoVector;
 
 class BoyerMooreLookahead
 {
   public:
     BoyerMooreLookahead(LifoAlloc* alloc, size_t length, RegExpCompiler* compiler);
new file mode 100644
--- /dev/null
+++ b/js/src/tests/ecma_6/RegExp/unicode-ignoreCase-word-boundary.js
@@ -0,0 +1,25 @@
+var BUGNUMBER = 1338373;
+var summary = "Word boundary should match U+017F and U+212A in unicode+ignoreCase.";
+// \b against a lone U+017F: expected to match only when both the i and u flags are set.
+assertEq(/\b/iu.test('\u017F'), true);
+assertEq(/\b/i.test('\u017F'), false);
+assertEq(/\b/u.test('\u017F'), false);
+assertEq(/\b/.test('\u017F'), false);
+// \b against a lone U+212A: same expectation, a match only with both i and u.
+assertEq(/\b/iu.test('\u212A'), true);
+assertEq(/\b/i.test('\u212A'), false);
+assertEq(/\b/u.test('\u212A'), false);
+assertEq(/\b/.test('\u212A'), false);
+// \B is the complement of \b, so every expectation for U+017F is inverted.
+assertEq(/\B/iu.test('\u017F'), false);
+assertEq(/\B/i.test('\u017F'), true);
+assertEq(/\B/u.test('\u017F'), true);
+assertEq(/\B/.test('\u017F'), true);
+// \B against a lone U+212A: inverted expectations, no match only with both i and u.
+assertEq(/\B/iu.test('\u212A'), false);
+assertEq(/\B/i.test('\u212A'), true);
+assertEq(/\B/u.test('\u212A'), true);
+assertEq(/\B/.test('\u212A'), true);
+// Call reportCompare only when the running environment defines it as a function.
+if (typeof reportCompare === "function")
+    reportCompare(true, true);