Bug 1157277 - Part 1: Generate macros for non-BMP lowercase/uppercase/folding with make_unicode.py. r=till
authorTooru Fujisawa <arai_a@mac.com>
Wed, 20 Jul 2016 14:11:35 +0900
changeset 345815 0a822e7f78cbb140a6172b134ba05b4c0a83e6e4
parent 345814 87c0b92cae4244ff7f0156f1fc90cb161f9243e9
child 345816 de1cf380b1d55c91ce5bd7c07f917510fa98a55e
push id6389
push userraliiev@mozilla.com
push dateMon, 19 Sep 2016 13:38:22 +0000
treeherdermozilla-beta@01d67bfe6c81 [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewerstill
bugs1157277
milestone50.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1157277 - Part 1: Generate macros for non-BMP lowercase/uppercase/folding with make_unicode.py. r=till
js/src/irregexp/RegExpParser.cpp
js/src/vm/Unicode.h
js/src/vm/UnicodeNonBMP.h
js/src/vm/make_unicode.py
--- a/js/src/irregexp/RegExpParser.cpp
+++ b/js/src/irregexp/RegExpParser.cpp
@@ -868,16 +868,17 @@ UnicodeRangesAtom(LifoAlloc* alloc,
                   bool ignore_case)
 {
     // Calculate case folding for non-BMP first and negate the range if needed.
     if (ignore_case) {
         WideCharRangeVector* tmp_wide_ranges = nullptr;
 #define CALL_CALC(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF) \
         CalculateCaseInsensitiveRanges(alloc, FROM, TO, DIFF, wide_ranges, &tmp_wide_ranges);
         FOR_EACH_NON_BMP_CASE_FOLDING(CALL_CALC)
+        FOR_EACH_NON_BMP_REV_CASE_FOLDING(CALL_CALC)
 #undef CALL_CALC
 
         if (tmp_wide_ranges) {
             for (size_t i = 0; i < tmp_wide_ranges->length(); i++)
                 wide_ranges->append((*tmp_wide_ranges)[i]);
         }
     }
 
@@ -1309,16 +1310,17 @@ CaseFoldingSurrogatePairAtom(LifoAlloc* 
 static inline RegExpTree*
 SurrogatePairAtom(LifoAlloc* alloc, char16_t lead, char16_t trail, bool ignore_case)
 {
     if (ignore_case) {
 #define CALL_ATOM(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF) \
         if (lead == LEAD &&trail >= TRAIL_FROM && trail <= TRAIL_TO) \
             return CaseFoldingSurrogatePairAtom(alloc, lead, trail, DIFF);
         FOR_EACH_NON_BMP_CASE_FOLDING(CALL_ATOM)
+        FOR_EACH_NON_BMP_REV_CASE_FOLDING(CALL_ATOM)
 #undef CALL_ATOM
     }
 
     RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
     builder->AddCharacter(lead);
     builder->AddCharacter(trail);
     return builder->ToRegExp();
 }
--- a/js/src/vm/Unicode.h
+++ b/js/src/vm/Unicode.h
@@ -3,16 +3,17 @@
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 #ifndef vm_Unicode_h
 #define vm_Unicode_h
 
 #include "jspubtd.h"
+#include "vm/UnicodeNonBMP.h"
 
 extern const bool js_isidstart[];
 extern const bool js_isident[];
 extern const bool js_isspace[];
 
 namespace js {
 namespace unicode {
 
@@ -319,17 +320,9 @@ UTF16Decode(size_t lead, size_t trail)
     MOZ_ASSERT(IsTrailSurrogate(trail));
 
     return (lead - LeadSurrogateMin) * 1024 + (trail - TrailSurrogateMin) + NonBMPMin;
 }
 
 } /* namespace unicode */
 } /* namespace js */
 
-#define FOR_EACH_NON_BMP_CASE_FOLDING(macro)                            \
-    macro(0x10400, 0x10427, 0xD801, 0xDC00, 0xDC27, 0x28)               \
-    macro(0x10428, 0x1044F, 0xD801, 0xDC28, 0xDC4F, -0x28)              \
-    macro(0x10C80, 0x10CB2, 0xD803, 0xDC80, 0xDCB2, 0x40)               \
-    macro(0x10CC0, 0x10CF2, 0xD803, 0xDCC0, 0xDCF2, -0x40)              \
-    macro(0x118A0, 0x118bf, 0xD806, 0xDCA0, 0xDCBF, 0x20)               \
-    macro(0x118C0, 0x118df, 0xD806, 0xDCC0, 0xDCDF, -0x20)
-
 #endif /* vm_Unicode_h */
new file mode 100644
--- /dev/null
+++ b/js/src/vm/UnicodeNonBMP.h
@@ -0,0 +1,28 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
+ * vim: set ts=8 sts=4 et sw=4 tw=99:
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/* Generated by make_unicode.py DO NOT MODIFY */
+
+#ifndef vm_UnicodeNonBMP_h
+#define vm_UnicodeNonBMP_h
+
+#define FOR_EACH_NON_BMP_LOWERCASE(macro) \
+    macro(0x10400, 0x10427, 0xd801, 0xdc00, 0xdc27, 40)
+
+#define FOR_EACH_NON_BMP_UPPERCASE(macro) \
+    macro(0x10428, 0x1044f, 0xd801, 0xdc28, 0xdc4f, -40)
+
+#define FOR_EACH_NON_BMP_CASE_FOLDING(macro) \
+    macro(0x10400, 0x10427, 0xd801, 0xdc00, 0xdc27, 40) \
+    macro(0x10c80, 0x10cb2, 0xd803, 0xdc80, 0xdcb2, 64) \
+    macro(0x118a0, 0x118bf, 0xd806, 0xdca0, 0xdcbf, 32)
+
+#define FOR_EACH_NON_BMP_REV_CASE_FOLDING(macro) \
+    macro(0x10428, 0x1044f, 0xd801, 0xdc28, 0xdc4f, -40) \
+    macro(0x10cc0, 0x10cf2, 0xd803, 0xdcc0, 0xdcf2, -64) \
+    macro(0x118c0, 0x118df, 0xd806, 0xdcc0, 0xdcdf, -32)
+
+#endif /* vm_UnicodeNonBMP_h */
--- a/js/src/vm/make_unicode.py
+++ b/js/src/vm/make_unicode.py
@@ -90,64 +90,116 @@ def read_case_folding(case_folding):
             continue
         row = line.split('; ')
         if row[1] in ['F', 'T']:
             continue
         row[0] = int(row[0], 16)
         row[2] = int(row[2], 16)
         yield row
 
+def utf16_encode(code):
+    NonBMPMin = 0x10000  # first code point outside the BMP
+    LeadSurrogateMin = 0xD800
+    TrailSurrogateMin = 0xDC00
+    # Returns (lead, trail): the UTF-16 surrogate pair for the non-BMP code point `code`.
+    lead = (code - NonBMPMin) // 1024 + LeadSurrogateMin  # `//` keeps this an int on Python 3 too
+    trail = ((code - NonBMPMin) % 1024) + TrailSurrogateMin
+
+    return lead, trail
+
+def make_non_bmp_convert_macro(out_file, name, convert_map):
+    # Write "#define FOR_EACH_NON_BMP_<name>(macro)" to out_file: one row per run of convert_map
+    # (code point -> converted code point), as macro(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF).
+    convert_list = []
+    entry = None
+    for code in sorted(convert_map.keys()):
+        lead = utf16_encode(code)[0]
+        diff = convert_map[code] - code
+        # A run grows only while code points are contiguous, share one delta and
+        # share one lead surrogate, since each row carries a single LEAD value.
+        if (entry and code == entry['code'] + entry['length'] and
+                diff == entry['diff'] and lead == entry['lead']):
+            entry['length'] += 1
+            continue
+        entry = { 'code': code, 'diff': diff, 'length': 1, 'lead': lead }
+        convert_list.append(entry)
+
+    lines = []
+    for entry in convert_list:
+        from_code = entry['code']
+        to_code = entry['code'] + entry['length'] - 1
+        diff = entry['diff']
+        from_lead, from_trail = utf16_encode(from_code)
+        to_lead, to_trail = utf16_encode(to_code)
+        assert from_lead == to_lead  # guaranteed by the per-lead run splitting above
+        lines.append('    macro(0x{:x}, 0x{:x}, 0x{:x}, 0x{:x}, 0x{:x}, {:d})'.format(
+            from_code, to_code, from_lead, from_trail, to_trail, diff))
+
+    out_file.write('#define FOR_EACH_NON_BMP_{}(macro) \\\n'.format(name))
+    out_file.write(' \\\n'.join(lines))
+    out_file.write('\n')
+
 def generate_unicode_stuff(unicode_data, case_folding,
-                           data_file, test_mapping, test_space, test_icase):
+                           data_file, non_bmp_file,
+                           test_mapping, test_space, test_icase):
     dummy = (0, 0, 0)
     table = [dummy]
     cache = {dummy: 0}
     index = [0] * (MAX + 1)
     folding_map = {}
     rev_folding_map = {}
     folding_dummy = (0, 0, 0, 0)
     folding_table = [folding_dummy]
     folding_cache = {folding_dummy: 0}
     folding_index = [0] * (MAX + 1)
     test_table = {}
     test_space_table = []
     folding_tests = []
     folding_codes = set()
 
+    non_bmp_lower_map = {}
+    non_bmp_upper_map = {}
+    non_bmp_folding_map = {}
+    non_bmp_rev_folding_map = {}
+
     for row in read_unicode_data(unicode_data):
         code = row[0]
         name = row[1]
         category = row[2]
         alias = row[-5]
         uppercase = row[-3]
         lowercase = row[-2]
         flags = 0
 
-        if code > MAX:
-            break
-
-        # we combine whitespace and lineterminators because in pratice we don't need them separated
-        if category == 'Zs' or code in whitespace or code in line_terminator:
-            flags |= FLAG_SPACE
-            test_space_table.append(code)
-        if category in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl']: # $ 7.6 (UnicodeLetter)
-            flags |= FLAG_LETTER
-        if category in ['Mn', 'Mc', 'Nd', 'Pc'] or code == ZWNJ or code == ZWJ: # $ 7.6 (IdentifierPart)
-            flags |= FLAG_IDENTIFIER_PART
-
         if uppercase:
             upper = int(uppercase, 16)
         else:
             upper = code
 
         if lowercase:
             lower = int(lowercase, 16)
         else:
             lower = code
 
+        if code > MAX:
+            if code != lower:
+                non_bmp_lower_map[code] = lower
+            if code != upper:
+                non_bmp_upper_map[code] = upper
+            continue
+
+        # we combine whitespace and lineterminators because in practice we don't need them separated
+        if category == 'Zs' or code in whitespace or code in line_terminator:
+            flags |= FLAG_SPACE
+            test_space_table.append(code)
+        if category in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl']: # $ 7.6 (UnicodeLetter)
+            flags |= FLAG_LETTER
+        if category in ['Mn', 'Mc', 'Nd', 'Pc'] or code == ZWNJ or code == ZWJ: # $ 7.6 (IdentifierPart)
+            flags |= FLAG_IDENTIFIER_PART
+
         test_table[code] = (upper, lower, name, alias)
 
         up_d = upper - code
         low_d = lower - code
 
         assert up_d > -65535 and up_d < 65535
         assert low_d > -65535 and low_d < 65535
 
@@ -163,16 +215,20 @@ def generate_unicode_stuff(unicode_data,
             table.append(item)
         index[code] = i
 
     for row in read_case_folding(case_folding):
         code = row[0]
         mapping = row[2]
         folding_map[code] = mapping
 
+        if code > MAX:
+            non_bmp_folding_map[code] = mapping
+            non_bmp_rev_folding_map[mapping] = code
+
         if mapping not in rev_folding_map:
             rev_folding_map[mapping] = [code]
         else:
             rev_folding_map[mapping].append(code)
 
         folding_codes.add(code)
         folding_codes.add(mapping)
 
@@ -216,16 +272,41 @@ def generate_unicode_stuff(unicode_data,
 
         i = folding_cache.get(item)
         if i is None:
             assert item not in folding_table
             folding_cache[item] = i = len(folding_table)
             folding_table.append(item)
         folding_index[code] = i
 
+    non_bmp_file.write("""/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
+ * vim: set ts=8 sts=4 et sw=4 tw=99:
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/* Generated by make_unicode.py DO NOT MODIFY */
+
+#ifndef vm_UnicodeNonBMP_h
+#define vm_UnicodeNonBMP_h
+
+""")
+
+    make_non_bmp_convert_macro(non_bmp_file, 'LOWERCASE', non_bmp_lower_map)
+    non_bmp_file.write('\n')
+    make_non_bmp_convert_macro(non_bmp_file, 'UPPERCASE', non_bmp_upper_map)
+    non_bmp_file.write('\n')
+    make_non_bmp_convert_macro(non_bmp_file, 'CASE_FOLDING', non_bmp_folding_map)
+    non_bmp_file.write('\n')
+    make_non_bmp_convert_macro(non_bmp_file, 'REV_CASE_FOLDING', non_bmp_rev_folding_map)
+
+    non_bmp_file.write("""
+#endif /* vm_UnicodeNonBMP_h */
+""")
+
     test_mapping.write('/* Generated by make_unicode.py DO NOT MODIFY */\n')
     test_mapping.write(public_domain)
     test_mapping.write('var mapping = [\n')
     for code in range(0, MAX + 1):
         entry = test_table.get(code)
 
         if entry:
             upper, lower, name, alias = entry
@@ -507,11 +588,12 @@ if __name__ == '__main__':
         reader.close()
         case_folding = open('CaseFolding.txt', 'w+')
         case_folding.write(data)
         case_folding.seek(0)
 
     print('Generating...')
     generate_unicode_stuff(unicode_data, case_folding,
         open('Unicode.cpp', 'w'),
+        open('UnicodeNonBMP.h', 'w'),
         open('../tests/ecma_5/String/string-upper-lower-mapping.js', 'w'),
         open('../tests/ecma_5/String/string-space-trim.js', 'w'),
         open('../tests/ecma_6/RegExp/unicode-ignoreCase.js', 'w'))