Bug 1320121 - Generate irregexp character tables with make_unicode.py. r=arai
author André Bargull <andre.bargull@gmail.com>
Wed, 30 Nov 2016 13:18:23 -0800
changeset 325034 2b38638df04ddb0d688909b0286bf5452c879c18
parent 325033 7de007e9921283029004cb7a274eb6a5b27a6f56
child 325035 6d05dfdbfac25ffb73193b11f1c951f66bee4544
push id 31026
push user cbook@mozilla.com
push date Fri, 02 Dec 2016 08:24:04 +0000
treeherder mozilla-central@f65ad27efe83 [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers arai
bugs 1320121
milestone 53.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1320121 - Generate irregexp character tables with make_unicode.py. r=arai
js/src/irregexp/RegExpCharacters-inl.h
js/src/irregexp/RegExpCharacters.cpp
js/src/irregexp/RegExpCharacters.h
js/src/irregexp/RegExpEngine.cpp
js/src/moz.build
js/src/vm/make_unicode.py
new file mode 100644
--- /dev/null
+++ b/js/src/irregexp/RegExpCharacters-inl.h
@@ -0,0 +1,40 @@
+/* Generated by make_unicode.py DO NOT MODIFY */
+/* Unicode version: 9.0.0 */
+#ifndef V8_JSREGEXPCHARACTERS_INL_H_
+#define V8_JSREGEXPCHARACTERS_INL_H_
+
+namespace js {
+
+namespace irregexp {
+
+static inline bool
+RangeContainsLatin1Equivalents(CharacterRange range, bool unicode)
+{
+    if (unicode) {
+        // "LATIN SMALL LETTER LONG S" case folds to "LATIN SMALL LETTER S".
+        if (range.Contains(0x017F))
+            return true;
+        // "LATIN CAPITAL LETTER SHARP S" case folds to "LATIN SMALL LETTER SHARP S".
+        if (range.Contains(0x1E9E))
+            return true;
+        // "KELVIN SIGN" case folds to "LATIN SMALL LETTER K".
+        if (range.Contains(0x212A))
+            return true;
+        // "ANGSTROM SIGN" case folds to "LATIN SMALL LETTER A WITH RING ABOVE".
+        if (range.Contains(0x212B))
+            return true;
+    }
+
+    // "GREEK CAPITAL LETTER MU" case maps to "MICRO SIGN".
+    // "GREEK SMALL LETTER MU" case maps to "MICRO SIGN".
+    if (range.Contains(0x039C) || range.Contains(0x03BC))
+        return true;
+    // "LATIN CAPITAL LETTER Y WITH DIAERESIS" case maps to "LATIN SMALL LETTER Y WITH DIAERESIS".
+    if (range.Contains(0x0178))
+        return true;
+    return false;
+}
+
+} } // namespace js::irregexp
+
+#endif // V8_JSREGEXPCHARACTERS_INL_H_
new file mode 100644
--- /dev/null
+++ b/js/src/irregexp/RegExpCharacters.cpp
@@ -0,0 +1,135 @@
+/* Generated by make_unicode.py DO NOT MODIFY */
+/* Unicode version: 9.0.0 */
+#include "irregexp/RegExpCharacters.h"
+
+#include "mozilla/Assertions.h"
+
+char16_t
+js::irregexp::ConvertNonLatin1ToLatin1(char16_t c, bool unicode)
+{
+    MOZ_ASSERT(c > 0xFF, "Character mustn't be Latin1");
+    if (unicode) {
+        // "LATIN SMALL LETTER LONG S" case folds to "LATIN SMALL LETTER S".
+        if (c == 0x017F)
+            return 0x73;
+        // "LATIN CAPITAL LETTER SHARP S" case folds to "LATIN SMALL LETTER SHARP S".
+        if (c == 0x1E9E)
+            return 0xDF;
+        // "KELVIN SIGN" case folds to "LATIN SMALL LETTER K".
+        if (c == 0x212A)
+            return 0x6B;
+        // "ANGSTROM SIGN" case folds to "LATIN SMALL LETTER A WITH RING ABOVE".
+        if (c == 0x212B)
+            return 0xE5;
+    }
+
+    // "GREEK CAPITAL LETTER MU" case maps to "MICRO SIGN".
+    // "GREEK SMALL LETTER MU" case maps to "MICRO SIGN".
+    if (c == 0x039C || c == 0x03BC)
+        return 0xB5;
+    // "LATIN CAPITAL LETTER Y WITH DIAERESIS" case maps to "LATIN SMALL LETTER Y WITH DIAERESIS".
+    if (c == 0x0178)
+        return 0xFF;
+    return 0;
+}
+
+const int js::irregexp::kSpaceRanges[] = {
+    0x0009, 0x000D + 1, // CHARACTER TABULATION..CARRIAGE RETURN (CR)
+    0x0020, 0x0020 + 1, // SPACE
+    0x00A0, 0x00A0 + 1, // NO-BREAK SPACE
+    0x1680, 0x1680 + 1, // OGHAM SPACE MARK
+    0x2000, 0x200A + 1, // EN QUAD..HAIR SPACE
+    0x2028, 0x2029 + 1, // LINE SEPARATOR..PARAGRAPH SEPARATOR
+    0x202F, 0x202F + 1, // NARROW NO-BREAK SPACE
+    0x205F, 0x205F + 1, // MEDIUM MATHEMATICAL SPACE
+    0x3000, 0x3000 + 1, // IDEOGRAPHIC SPACE
+    0xFEFF, 0xFEFF + 1, // ZERO WIDTH NO-BREAK SPACE
+    0xFFFF + 1
+};
+const int js::irregexp::kSpaceRangeCount = 21;
+
+const int js::irregexp::kSpaceAndSurrogateRanges[] = {
+    0x0009, 0x000D + 1, // CHARACTER TABULATION..CARRIAGE RETURN (CR)
+    0x0020, 0x0020 + 1, // SPACE
+    0x00A0, 0x00A0 + 1, // NO-BREAK SPACE
+    0x1680, 0x1680 + 1, // OGHAM SPACE MARK
+    0x2000, 0x200A + 1, // EN QUAD..HAIR SPACE
+    0x2028, 0x2029 + 1, // LINE SEPARATOR..PARAGRAPH SEPARATOR
+    0x202F, 0x202F + 1, // NARROW NO-BREAK SPACE
+    0x205F, 0x205F + 1, // MEDIUM MATHEMATICAL SPACE
+    0x3000, 0x3000 + 1, // IDEOGRAPHIC SPACE
+    0xD800, 0xDFFF + 1, // <Lead Surrogate Min>..<Trail Surrogate Max>
+    0xFEFF, 0xFEFF + 1, // ZERO WIDTH NO-BREAK SPACE
+    0xFFFF + 1
+};
+const int js::irregexp::kSpaceAndSurrogateRangeCount = 23;
+
+const int js::irregexp::kWordRanges[] = {
+    0x0030, 0x0039 + 1, // DIGIT ZERO..DIGIT NINE
+    0x0041, 0x005A + 1, // LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
+    0x005F, 0x005F + 1, // LOW LINE
+    0x0061, 0x007A + 1, // LATIN SMALL LETTER A..LATIN SMALL LETTER Z
+    0xFFFF + 1
+};
+const int js::irregexp::kWordRangeCount = 9;
+
+const int js::irregexp::kIgnoreCaseWordRanges[] = {
+    0x0030, 0x0039 + 1, // DIGIT ZERO..DIGIT NINE
+    0x0041, 0x005A + 1, // LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
+    0x005F, 0x005F + 1, // LOW LINE
+    0x0061, 0x007A + 1, // LATIN SMALL LETTER A..LATIN SMALL LETTER Z
+    0x017F, 0x017F + 1, // LATIN SMALL LETTER LONG S
+    0x212A, 0x212A + 1, // KELVIN SIGN
+    0xFFFF + 1
+};
+const int js::irregexp::kIgnoreCaseWordRangeCount = 13;
+
+const int js::irregexp::kWordAndSurrogateRanges[] = {
+    0x0030, 0x0039 + 1, // DIGIT ZERO..DIGIT NINE
+    0x0041, 0x005A + 1, // LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
+    0x005F, 0x005F + 1, // LOW LINE
+    0x0061, 0x007A + 1, // LATIN SMALL LETTER A..LATIN SMALL LETTER Z
+    0xD800, 0xDFFF + 1, // <Lead Surrogate Min>..<Trail Surrogate Max>
+    0xFFFF + 1
+};
+const int js::irregexp::kWordAndSurrogateRangeCount = 11;
+
+const int js::irregexp::kNegatedIgnoreCaseWordAndSurrogateRanges[] = {
+    0x0000, 0x002F + 1, // NULL..SOLIDUS
+    0x003A, 0x0040 + 1, // COLON..COMMERCIAL AT
+    0x005B, 0x005E + 1, // LEFT SQUARE BRACKET..CIRCUMFLEX ACCENT
+    0x0060, 0x0060 + 1, // GRAVE ACCENT
+    0x007B, 0x017E + 1, // LEFT CURLY BRACKET..LATIN SMALL LETTER Z WITH CARON
+    0x0180, 0x2129 + 1, // LATIN SMALL LETTER B WITH STROKE..TURNED GREEK SMALL LETTER IOTA
+    0x212B, 0xD7FF + 1, // ANGSTROM SIGN..<Unused>
+    0xE000, 0xFFFF + 1, // Private Use..<Unused>
+    0xFFFF + 1
+};
+const int js::irregexp::kNegatedIgnoreCaseWordAndSurrogateRangeCount = 17;
+
+const int js::irregexp::kDigitRanges[] = {
+    0x0030, 0x0039 + 1, // DIGIT ZERO..DIGIT NINE
+    0xFFFF + 1
+};
+const int js::irregexp::kDigitRangeCount = 3;
+
+const int js::irregexp::kDigitAndSurrogateRanges[] = {
+    0x0030, 0x0039 + 1, // DIGIT ZERO..DIGIT NINE
+    0xD800, 0xDFFF + 1, // <Lead Surrogate Min>..<Trail Surrogate Max>
+    0xFFFF + 1
+};
+const int js::irregexp::kDigitAndSurrogateRangeCount = 5;
+
+const int js::irregexp::kSurrogateRanges[] = {
+    0xD800, 0xDFFF + 1, // <Lead Surrogate Min>..<Trail Surrogate Max>
+    0xFFFF + 1
+};
+const int js::irregexp::kSurrogateRangeCount = 3;
+
+const int js::irregexp::kLineTerminatorRanges[] = {
+    0x000A, 0x000A + 1, // LINE FEED (LF)
+    0x000D, 0x000D + 1, // CARRIAGE RETURN (CR)
+    0x2028, 0x2029 + 1, // LINE SEPARATOR..PARAGRAPH SEPARATOR
+    0xFFFF + 1
+};
+const int js::irregexp::kLineTerminatorRangeCount = 7;
new file mode 100644
--- /dev/null
+++ b/js/src/irregexp/RegExpCharacters.h
@@ -0,0 +1,90 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
+ * vim: set ts=8 sts=4 et sw=4 tw=99: */
+
+// Copyright 2012 the V8 project authors. All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+//       copyright notice, this list of conditions and the following
+//       disclaimer in the documentation and/or other materials provided
+//       with the distribution.
+//     * Neither the name of Google Inc. nor the names of its
+//       contributors may be used to endorse or promote products derived
+//       from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef V8_JSREGEXPCHARACTERS_H_
+#define V8_JSREGEXPCHARACTERS_H_
+
+namespace js {
+
+namespace irregexp {
+
+char16_t
+ConvertNonLatin1ToLatin1(char16_t c, bool unicode);
+
+// -------------------------------------------------------------------
+// CharacterRange
+
+// The ranges have inclusive from and exclusive to.
+
+// This covers \s as defined in ES2016, 21.2.2.12 CharacterClassEscape,
+// which includes WhiteSpace (11.2) and LineTerminator (11.3) values.
+extern const int kSpaceRanges[];
+extern const int kSpaceRangeCount;
+
+// Characters in \s and additionally all surrogate characters.
+extern const int kSpaceAndSurrogateRanges[];
+extern const int kSpaceAndSurrogateRangeCount;
+
+// This covers \w as defined in ES2016, 21.2.2.12 CharacterClassEscape.
+extern const int kWordRanges[];
+extern const int kWordRangeCount;
+
+// Characters which case-fold to characters in \w.
+extern const int kIgnoreCaseWordRanges[];
+extern const int kIgnoreCaseWordRangeCount;
+
+// Characters in \w and additionally all surrogate characters.
+extern const int kWordAndSurrogateRanges[];
+extern const int kWordAndSurrogateRangeCount;
+
+// All characters excluding those which case-fold to \w and excluding all
+// surrogate characters.
+extern const int kNegatedIgnoreCaseWordAndSurrogateRanges[];
+extern const int kNegatedIgnoreCaseWordAndSurrogateRangeCount;
+
+// This covers \d as defined in ES2016, 21.2.2.12 CharacterClassEscape.
+extern const int kDigitRanges[];
+extern const int kDigitRangeCount;
+
+// Characters in \d and additionally all surrogate characters.
+extern const int kDigitAndSurrogateRanges[];
+extern const int kDigitAndSurrogateRangeCount;
+
+// The range of all surrogate characters.
+extern const int kSurrogateRanges[];
+extern const int kSurrogateRangeCount;
+
+// Line terminators as defined in ES2016, 11.3 LineTerminator.
+extern const int kLineTerminatorRanges[];
+extern const int kLineTerminatorRangeCount;
+
+} } // namespace js::irregexp
+
+#endif // V8_JSREGEXPCHARACTERS_H_
--- a/js/src/irregexp/RegExpEngine.cpp
+++ b/js/src/irregexp/RegExpEngine.cpp
@@ -26,19 +26,22 @@
 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "irregexp/RegExpEngine.h"
 
 #include "irregexp/NativeRegExpMacroAssembler.h"
+#include "irregexp/RegExpCharacters.h"
 #include "irregexp/RegExpMacroAssembler.h"
 #include "jit/JitCommon.h"
 
+#include "irregexp/RegExpCharacters-inl.h"
+
 using namespace js;
 using namespace js::irregexp;
 
 using mozilla::ArrayLength;
 using mozilla::DebugOnly;
 using mozilla::Maybe;
 
 #define DEFINE_ACCEPT(Type)                                          \
@@ -55,71 +58,16 @@ void LoopChoiceNode::Accept(NodeVisitor*
 static const int kMaxLookaheadForBoyerMoore = 8;
 
 RegExpNode::RegExpNode(LifoAlloc* alloc)
   : replacement_(nullptr), trace_count_(0), alloc_(alloc)
 {
     bm_info_[0] = bm_info_[1] = nullptr;
 }
 
-// -------------------------------------------------------------------
-// CharacterRange
-
-// The '2' variant has inclusive from and exclusive to.
-// This covers \s as defined in ES2016, 21.2.2.12 CharacterClassEscape,
-// which include WhiteSpace (11.2) or LineTerminator (11.3) values.
-static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1,
-    0x00A0, 0x00A1, 0x1680, 0x1681, 0x2000, 0x200B,
-    0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001,
-    0xFEFF, 0xFF00, 0x10000 };
-static const int kSpaceRangeCount = ArrayLength(kSpaceRanges);
-
-static const int kSpaceAndSurrogateRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1,
-    0x00A0, 0x00A1, 0x1680, 0x1681, 0x2000, 0x200B,
-    0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001,
-    unicode::LeadSurrogateMin, unicode::TrailSurrogateMax + 1,
-    0xFEFF, 0xFF00, 0x10000 };
-static const int kSpaceAndSurrogateRangeCount = ArrayLength(kSpaceAndSurrogateRanges);
-static const int kWordRanges[] = {
-    '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 };
-static const int kWordRangeCount = ArrayLength(kWordRanges);
-static const int kIgnoreCaseWordRanges[] = {
-    '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1,
-    0x017F, 0x017F + 1, 0x212A, 0x212A + 1,
-    0x10000 };
-static const int kIgnoreCaseWordCount = ArrayLength(kIgnoreCaseWordRanges);
-static const int kWordAndSurrogateRanges[] = {
-    '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1,
-    unicode::LeadSurrogateMin, unicode::TrailSurrogateMax + 1,
-    0x10000 };
-static const int kWordAndSurrogateRangeCount = ArrayLength(kWordAndSurrogateRanges);
-static const int kNegatedIgnoreCaseWordAndSurrogateRanges[] = {
-    0, '0', '9' + 1, 'A',
-    'Z' + 1, '_', '_' + 1, 'a',
-    'z' + 1, 0x017F,
-    0x017F + 1, 0x212A,
-    0x212A + 1, unicode::LeadSurrogateMin,
-    unicode::TrailSurrogateMax + 1, 0x10000,
-    0x10000 };
-static const int kNegatedIgnoreCaseWordAndSurrogateRangeCount =
-    ArrayLength(kNegatedIgnoreCaseWordAndSurrogateRanges);
-static const int kDigitRanges[] = { '0', '9' + 1, 0x10000 };
-static const int kDigitRangeCount = ArrayLength(kDigitRanges);
-static const int kDigitAndSurrogateRanges[] = {
-    '0', '9' + 1,
-    unicode::LeadSurrogateMin, unicode::TrailSurrogateMax + 1,
-    0x10000 };
-static const int kDigitAndSurrogateRangeCount = ArrayLength(kDigitAndSurrogateRanges);
-static const int kSurrogateRanges[] = {
-    unicode::LeadSurrogateMin, unicode::TrailSurrogateMax + 1,
-    0x10000 };
-static const int kSurrogateRangeCount = ArrayLength(kSurrogateRanges);
-static const int kLineTerminatorRanges[] = { 0x000A, 0x000B, 0x000D, 0x000E,
-    0x2028, 0x202A, 0x10000 };
-static const int kLineTerminatorRangeCount = ArrayLength(kLineTerminatorRanges);
 static const int kMaxOneByteCharCode = 0xff;
 static const int kMaxUtf16CodeUnit = 0xffff;
 
 static char16_t
 MaximumCharacter(bool ascii)
 {
     return ascii ? kMaxOneByteCharCode : kMaxUtf16CodeUnit;
 }
@@ -207,17 +155,17 @@ CharacterRange::AddClassEscapeUnicode(Li
       case 'd':
         return AddClassEscape(alloc, type, ranges);
         break;
       case 'S':
         AddClassNegated(kSpaceAndSurrogateRanges, kSpaceAndSurrogateRangeCount, ranges);
         break;
       case 'w':
         if (ignore_case)
-            AddClass(kIgnoreCaseWordRanges, kIgnoreCaseWordCount, ranges);
+            AddClass(kIgnoreCaseWordRanges, kIgnoreCaseWordRangeCount, ranges);
         else
             AddClassEscape(alloc, type, ranges);
         break;
       case 'W':
         if (ignore_case) {
             AddClass(kNegatedIgnoreCaseWordAndSurrogateRanges,
                      kNegatedIgnoreCaseWordAndSurrogateRangeCount, ranges);
         } else {
@@ -227,43 +175,16 @@ CharacterRange::AddClassEscapeUnicode(Li
       case 'D':
         AddClassNegated(kDigitAndSurrogateRanges, kDigitAndSurrogateRangeCount, ranges);
         break;
       default:
         MOZ_CRASH("Bad type!");
     }
 }
 
-#define FOR_EACH_NON_ASCII_TO_ASCII_FOLDING(macro)      \
-    /* LATIN CAPITAL LETTER Y WITH DIAERESIS */         \
-    macro(0x0178, 0x00FF)                               \
-    /* LATIN SMALL LETTER LONG S */                     \
-    macro(0x017F, 0x0073)                               \
-    /* LATIN CAPITAL LETTER SHARP S */                  \
-    macro(0x1E9E, 0x00DF)                               \
-    /* KELVIN SIGN */                                   \
-    macro(0x212A, 0x006B)                               \
-    /* ANGSTROM SIGN */                                 \
-    macro(0x212B, 0x00E5)
-
-// We need to check for the following characters: 0x39c 0x3bc 0x178.
-static inline bool
-RangeContainsLatin1Equivalents(CharacterRange range, bool unicode)
-{
-    /* TODO(dcarney): this could be a lot more efficient. */
-    if (unicode) {
-#define CHECK_RANGE(C, F) \
-        if (range.Contains(C)) return true;
-FOR_EACH_NON_ASCII_TO_ASCII_FOLDING(CHECK_RANGE)
-#undef CHECK_RANGE
-    }
-
-    return range.Contains(0x39c) || range.Contains(0x3bc) || range.Contains(0x178);
-}
-
 static bool
 RangesContainLatin1Equivalents(const CharacterRangeVector& ranges, bool unicode)
 {
     for (size_t i = 0; i < ranges.length(); i++) {
         // TODO(dcarney): this could be a lot more efficient.
         if (RangeContainsLatin1Equivalents(ranges[i], unicode))
             return true;
     }
@@ -330,17 +251,17 @@ GetCaseIndependentLetters(char16_t chara
     char16_t other1 = others.other1();
     char16_t other2 = others.other2();
     char16_t other3 = others.other3();
 
     // ES 2017 draft 996af87b7072b3c3dd2b1def856c66f456102215 21.2.4.2
     // step 3.g.
     // The standard requires that non-ASCII characters cannot have ASCII
     // character codes in their equivalence class, even though this
-    // situation occurs multiple times in the unicode tables.
+    // situation occurs multiple times in the Unicode tables.
     static const unsigned kMaxAsciiCharCode = 127;
     if (upper <= kMaxAsciiCharCode) {
         if (character > kMaxAsciiCharCode) {
             // If Canonicalize(character) == character, all other characters
             // should be ignored.
             return GetCaseIndependentLetters(character, ascii_subject, unicode,
                                              &character, 1, letters);
         }
@@ -359,41 +280,16 @@ GetCaseIndependentLetters(char16_t chara
         other1,
         other2,
         other3
     };
     return GetCaseIndependentLetters(character, ascii_subject, unicode,
                                      choices, ArrayLength(choices), letters);
 }
 
-static char16_t
-ConvertNonLatin1ToLatin1(char16_t c, bool unicode)
-{
-    MOZ_ASSERT(c > kMaxOneByteCharCode);
-    if (unicode) {
-        switch (c) {
-#define CONVERT(C, F) case C: return F;
-FOR_EACH_NON_ASCII_TO_ASCII_FOLDING(CONVERT)
-#undef CONVERT
-        }
-    }
-
-    switch (c) {
-      // This are equivalent characters in unicode.
-      case 0x39c:
-      case 0x3bc:
-        return 0xb5;
-      // This is an uppercase of a Latin-1 character
-      // outside of Latin-1.
-      case 0x178:
-        return 0xff;
-    }
-    return 0;
-}
-
 void
 CharacterRange::AddCaseEquivalents(bool is_ascii, bool unicode, CharacterRangeVector* ranges)
 {
     char16_t bottom = from();
     char16_t top = to();
 
     if (is_ascii && !RangeContainsLatin1Equivalents(*this, unicode)) {
         if (bottom > kMaxOneByteCharCode)
--- a/js/src/moz.build
+++ b/js/src/moz.build
@@ -192,16 +192,17 @@ UNIFIED_SOURCES += [
     'gc/Nursery.cpp',
     'gc/RootMarking.cpp',
     'gc/Statistics.cpp',
     'gc/Tracer.cpp',
     'gc/Verifier.cpp',
     'gc/Zone.cpp',
     'irregexp/NativeRegExpMacroAssembler.cpp',
     'irregexp/RegExpAST.cpp',
+    'irregexp/RegExpCharacters.cpp',
     'irregexp/RegExpEngine.cpp',
     'irregexp/RegExpInterpreter.cpp',
     'irregexp/RegExpMacroAssembler.cpp',
     'irregexp/RegExpParser.cpp',
     'irregexp/RegExpStack.cpp',
     'jit/AliasAnalysis.cpp',
     'jit/AliasAnalysisShared.cpp',
     'jit/AlignmentMaskAnalysis.cpp',
--- a/js/src/vm/make_unicode.py
+++ b/js/src/vm/make_unicode.py
@@ -128,16 +128,27 @@ def read_derived_core_properties(derived
         char_property = row[1].strip()
         if '..' not in char_range:
             yield (int(char_range, 16), char_property)
         else:
             [start, end] = char_range.split('..')
             for char in range(int(start, 16), int(end, 16) + 1):
                 yield (char, char_property)
 
+def int_ranges(ints):
+    """ Yields consecutive ranges (inclusive) from integer values. """
+    from itertools import tee, izip_longest
+
+    (a, b) = tee(sorted(ints))
+    start = next(b)
+    for (curr, succ) in izip_longest(a, b):
+        if curr + 1 != succ:
+            yield (start, curr)
+            start = succ
+
 def utf16_encode(code):
     NonBMPMin = 0x10000
     LeadSurrogateMin = 0xD800
     TrailSurrogateMin = 0xDC00
 
     lead = (code - NonBMPMin) / 1024 + LeadSurrogateMin
     trail = ((code - NonBMPMin) % 1024) + TrailSurrogateMin
 
@@ -853,16 +864,214 @@ def splitbins(t):
     dump(t1, t2, shift, bytes)
 
     # exhaustively verify that the decomposition is correct
     mask = 2**shift - 1
     for i in range(len(t)):
         assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
     return best
 
+def make_irregexp_tables(version,
+                         table, index,
+                         folding_table, folding_index,
+                         test_table):
+    import string
+    from functools import partial
+    from itertools import chain, ifilter, imap
+
+    MAX_ASCII = 0x7F
+    MAX_LATIN1 = 0xFF
+    LEAD_SURROGATE_MIN = 0xD800
+    TRAIL_SURROGATE_MAX = 0xDFFF
+
+    def hex2(n):
+        assert 0 <= n and n < 16**2
+        return '0x{:02X}'.format(n)
+
+    def hex4(n):
+        assert 0 <= n and n < 16**4
+        return '0x{:04X}'.format(n)
+
+    def uhex4(n):
+        assert 0 <= n and n < 16**4
+        return 'U+{:04X}'.format(n)
+
+    def case_info(code):
+        assert 0 <= code and code <= MAX_BMP
+        (upper, lower, flags) = table[index[code]]
+        return ((code + upper) & 0xffff, (code + lower) & 0xffff, flags)
+
+    def is_space(code):
+        (_, _, flags) = case_info(code)
+        return bool(flags & FLAG_SPACE)
+
+    def to_upper(code):
+        (upper, _, _) = case_info(code)
+        return upper
+
+    def casefold(code):
+        assert 0 <= code and code <= MAX_BMP
+        (folding, _, _, _) = folding_table[folding_index[code]]
+        return (code + folding) & 0xffff
+
+    def casefolds_to_ascii(code):
+        return casefold(code) <= MAX_ASCII
+
+    def casefolds_to_latin1(code):
+        return casefold(code) <= MAX_LATIN1
+
+    def casemaps_to_nonlatin1(code):
+        upper = to_upper(code)
+        return upper > MAX_LATIN1
+
+    def char_name(code):
+        assert 0 <= code and code <= MAX_BMP
+        if code not in test_table:
+            return '<Unused>'
+        if code == LEAD_SURROGATE_MIN:
+            return '<Lead Surrogate Min>'
+        if code == TRAIL_SURROGATE_MAX:
+            return '<Trail Surrogate Max>'
+        (_, _, name, alias) = test_table[code]
+        return name if not name.startswith('<') else alias
+
+    def write_character_range(println, name, characters):
+        char_ranges = list(int_ranges(characters))
+        println('')
+        println('const int js::irregexp::k{}Ranges[] = {{'.format(name))
+        for (start, end) in char_ranges:
+            s_name = char_name(start)
+            e_name = char_name(end)
+            println('    {}, {} + 1, // {}'.format(hex4(start), hex4(end),
+                                                               '{}..{}'.format(s_name, e_name)
+                                                               if start != end else s_name))
+        println('    {} + 1'.format(hex4(MAX_BMP)))
+        println('};')
+        println('const int js::irregexp::k{}RangeCount = {};'.format(name,
+                                                                     len(char_ranges) * 2 + 1))
+
+    def write_character_test(println, test, consequent, default):
+        # Latin1 characters which, when case-mapped through
+        # String.prototype.toUpperCase(), canonicalize to a non-Latin1 character.
+        # ES2017, §21.2.2.8.2 Runtime Semantics: Canonicalize
+        casemapped_to_nonlatin1 = ifilter(casemaps_to_nonlatin1, xrange(0, MAX_LATIN1 + 1))
+
+        def casemap_closure(ch):
+            upper = to_upper(ch)
+            return (ch, [c for c in xrange(MAX_LATIN1 + 1, MAX_BMP + 1) if upper == to_upper(c)])
+
+        # Mapping from Latin1 characters to the list of case map equivalent
+        # non-Latin1 characters.
+        casemap_for_latin1 = dict(chain(imap(casemap_closure, casemapped_to_nonlatin1)))
+
+        # Non-latin1 characters which, when Unicode case-folded, canonicalize to
+        # a Latin1 character.
+        # ES2017, §21.2.2.8.2 Runtime Semantics: Canonicalize
+        casefolded_to_latin1 = ifilter(casefolds_to_latin1, xrange(MAX_LATIN1 + 1, MAX_BMP + 1))
+
+        println('    if (unicode) {')
+        for ch in casefolded_to_latin1:
+            casefolded = casefold(ch)
+            # Skip if also handled below for case mapping.
+            if casefolded in casemap_for_latin1 and ch in casemap_for_latin1[casefolded]:
+                continue
+            println('        // "{}" case folds to "{}".'.format(char_name(ch),
+                                                                 char_name(casefolded)))
+            println('        if ({})'.format(test(ch)))
+            println('            return {};'.format(consequent(casefolded)))
+        println('    }')
+        println('')
+        for (ch, casemapped_chars) in casemap_for_latin1.iteritems():
+            for casemapped in casemapped_chars:
+                println('    // "{}" case maps to "{}".'.format(char_name(casemapped),
+                                                                char_name(ch)))
+            println('    if ({})'.format(' || '.join(imap(test, casemapped_chars))))
+            println('        return {};'.format(consequent(ch)))
+        println('    return {};'.format(default))
+
+    with io.open('../irregexp/RegExpCharacters-inl.h', 'wb') as chars_file:
+        write = partial(print, file=chars_file, sep='', end='')
+        println = partial(write, end='\n')
+
+        write(warning_message)
+        write(unicode_version_message.format(version))
+
+        println('#ifndef V8_JSREGEXPCHARACTERS_INL_H_')
+        println('#define V8_JSREGEXPCHARACTERS_INL_H_')
+        println('')
+        println('namespace js {')
+        println('')
+        println('namespace irregexp {')
+        println('')
+
+        println('static inline bool')
+        println('RangeContainsLatin1Equivalents(CharacterRange range, bool unicode)')
+        println('{')
+        write_character_test(println, lambda ch: 'range.Contains({})'.format(hex4(ch)),
+                             lambda _: 'true', 'false')
+        println('}')
+
+        println('')
+        println('} } // namespace js::irregexp')
+        println('')
+        println('#endif // V8_JSREGEXPCHARACTERS_INL_H_')
+
+    with io.open('../irregexp/RegExpCharacters.cpp', 'wb') as chars_file:
+        write = partial(print, file=chars_file, sep='', end='')
+        println = partial(write, end='\n')
+        character_range = partial(write_character_range, println)
+
+        # Characters in \s, 21.2.2.12 CharacterClassEscape.
+        space_chars = filter(is_space, xrange(0, MAX_BMP + 1))
+
+        # Characters in \d, 21.2.2.12 CharacterClassEscape.
+        digit_chars = map(ord, string.digits)
+        assert all(ch <= MAX_ASCII for ch in digit_chars)
+
+        # Characters in \w, 21.2.2.12 CharacterClassEscape.
+        word_chars = map(ord, string.digits + string.ascii_letters + '_')
+        assert all(ch <= MAX_ASCII for ch in word_chars)
+
+        # Characters which case-fold to characters in \w.
+        ignorecase_word_chars = (word_chars +
+                                filter(casefolds_to_ascii, xrange(MAX_ASCII + 1, MAX_BMP + 1)))
+
+        # Surrogate characters.
+        surrogate_chars = range(LEAD_SURROGATE_MIN, TRAIL_SURROGATE_MAX + 1)
+
+        write(warning_message)
+        write(unicode_version_message.format(version))
+        println('#include "irregexp/RegExpCharacters.h"')
+        println('')
+        println('#include "mozilla/Assertions.h"')
+        println('')
+
+        println('char16_t')
+        println('js::irregexp::ConvertNonLatin1ToLatin1(char16_t c, bool unicode)')
+        println('{')
+        println('    MOZ_ASSERT(c > {}, "Character mustn\'t be Latin1");'.format(hex2(MAX_LATIN1)))
+        write_character_test(println, lambda ch: 'c == {}'.format(hex4(ch)), hex2, '0')
+        println('}')
+
+        character_range('Space', space_chars)
+        character_range('SpaceAndSurrogate', space_chars + surrogate_chars)
+
+        character_range('Word', word_chars)
+        character_range('IgnoreCaseWord', ignorecase_word_chars)
+        character_range('WordAndSurrogate', word_chars + surrogate_chars)
+        character_range('NegatedIgnoreCaseWordAndSurrogate',
+                        set(xrange(0, MAX_BMP + 1)) - set(ignorecase_word_chars + surrogate_chars))
+
+        character_range('Digit', digit_chars)
+        character_range('DigitAndSurrogate', digit_chars + surrogate_chars)
+
+        character_range('Surrogate', surrogate_chars)
+
+        character_range('LineTerminator', line_terminator)
+
 def update_unicode(args):
     import urllib2
 
     version = args.version
     if version is not None:
         baseurl = 'http://unicode.org/Public'
         if version == 'UNIDATA':
             url = '%s/%s' % (baseurl, version)
@@ -924,16 +1133,20 @@ def update_unicode(args):
                       table, index,
                       same_upper_table, same_upper_index,
                       folding_table, folding_index,
                       non_bmp_space_set,
                       non_bmp_id_start_set, non_bmp_id_cont_set)
     make_non_bmp_file(unicode_version,
                       non_bmp_lower_map, non_bmp_upper_map,
                       non_bmp_folding_map, non_bmp_rev_folding_map)
+    make_irregexp_tables(unicode_version,
+                         table, index,
+                         folding_table, folding_index,
+                         test_table)
 
     make_bmp_mapping_test(unicode_version, test_table)
     make_non_bmp_mapping_test(unicode_version, non_bmp_upper_map, non_bmp_lower_map)
     make_space_test(unicode_version, test_space_table)
     make_regexp_space_test(unicode_version, test_space_table)
     make_icase_test(unicode_version, folding_tests)
 
 if __name__ == '__main__':