Bug 1157277 - Part 1: Generate macros for non-BMP lowercase/uppercase/folding with make_unicode.py. r=till
author Tooru Fujisawa <arai_a@mac.com>
Wed, 20 Jul 2016 14:11:35 +0900
changeset 330849 0a822e7f78cbb140a6172b134ba05b4c0a83e6e4
parent 330848 87c0b92cae4244ff7f0156f1fc90cb161f9243e9
child 330850 de1cf380b1d55c91ce5bd7c07f917510fa98a55e
push id 9858
push user jlund@mozilla.com
push date Mon, 01 Aug 2016 14:37:10 +0000
treeherder mozilla-aurora@203106ef6cb6 [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers till
bugs 1157277
milestone 50.0a1
Bug 1157277 - Part 1: Generate macros for non-BMP lowercase/uppercase/folding with make_unicode.py. r=till
js/src/irregexp/RegExpParser.cpp
js/src/vm/Unicode.h
js/src/vm/UnicodeNonBMP.h
js/src/vm/make_unicode.py
--- a/js/src/irregexp/RegExpParser.cpp
+++ b/js/src/irregexp/RegExpParser.cpp
@@ -868,16 +868,17 @@ UnicodeRangesAtom(LifoAlloc* alloc,
                   bool ignore_case)
 {
     // Calculate case folding for non-BMP first and negate the range if needed.
     if (ignore_case) {
         WideCharRangeVector* tmp_wide_ranges = nullptr;
 #define CALL_CALC(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF) \
         CalculateCaseInsensitiveRanges(alloc, FROM, TO, DIFF, wide_ranges, &tmp_wide_ranges);
         FOR_EACH_NON_BMP_CASE_FOLDING(CALL_CALC)
+        FOR_EACH_NON_BMP_REV_CASE_FOLDING(CALL_CALC)
 #undef CALL_CALC
 
         if (tmp_wide_ranges) {
             for (size_t i = 0; i < tmp_wide_ranges->length(); i++)
                 wide_ranges->append((*tmp_wide_ranges)[i]);
         }
     }
 
@@ -1309,16 +1310,17 @@ CaseFoldingSurrogatePairAtom(LifoAlloc* 
 // Build the RegExpTree for one lead/trail surrogate pair.
 //
 // When ignore_case is set and the pair falls inside one of the ranges listed
 // by FOR_EACH_NON_BMP_CASE_FOLDING or FOR_EACH_NON_BMP_REV_CASE_FOLDING, the
 // result comes from CaseFoldingSurrogatePairAtom, called with that range's
 // DIFF. Otherwise the two code units are added to a fresh RegExpBuilder
 // unchanged.
 static inline RegExpTree*
 SurrogatePairAtom(LifoAlloc* alloc, char16_t lead, char16_t trail, bool ignore_case)
 {
     if (ignore_case) {
         // CALL_ATOM is expanded once per table entry; each expansion tests
         // whether the pair lies in that entry's lead/trail range and returns
         // early on the first match. FROM and TO are unused by this expansion.
 #define CALL_ATOM(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF) \
         if (lead == LEAD &&trail >= TRAIL_FROM && trail <= TRAIL_TO) \
             return CaseFoldingSurrogatePairAtom(alloc, lead, trail, DIFF);
         FOR_EACH_NON_BMP_CASE_FOLDING(CALL_ATOM)
+        FOR_EACH_NON_BMP_REV_CASE_FOLDING(CALL_ATOM)
 #undef CALL_ATOM
     }
 
     // Reached when ignore_case is false or no table entry matched.
     RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
     builder->AddCharacter(lead);
     builder->AddCharacter(trail);
     return builder->ToRegExp();
 }
--- a/js/src/vm/Unicode.h
+++ b/js/src/vm/Unicode.h
@@ -3,16 +3,17 @@
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 #ifndef vm_Unicode_h
 #define vm_Unicode_h
 
 #include "jspubtd.h"
+#include "vm/UnicodeNonBMP.h"
 
 extern const bool js_isidstart[];
 extern const bool js_isident[];
 extern const bool js_isspace[];
 
 namespace js {
 namespace unicode {
 
@@ -319,17 +320,9 @@ UTF16Decode(size_t lead, size_t trail)
     MOZ_ASSERT(IsTrailSurrogate(trail));
 
     return (lead - LeadSurrogateMin) * 1024 + (trail - TrailSurrogateMin) + NonBMPMin;
 }
 
 } /* namespace unicode */
 } /* namespace js */
 
-#define FOR_EACH_NON_BMP_CASE_FOLDING(macro)                            \
-    macro(0x10400, 0x10427, 0xD801, 0xDC00, 0xDC27, 0x28)               \
-    macro(0x10428, 0x1044F, 0xD801, 0xDC28, 0xDC4F, -0x28)              \
-    macro(0x10C80, 0x10CB2, 0xD803, 0xDC80, 0xDCB2, 0x40)               \
-    macro(0x10CC0, 0x10CF2, 0xD803, 0xDCC0, 0xDCF2, -0x40)              \
-    macro(0x118A0, 0x118bf, 0xD806, 0xDCA0, 0xDCBF, 0x20)               \
-    macro(0x118C0, 0x118df, 0xD806, 0xDCC0, 0xDCDF, -0x20)
-
 #endif /* vm_Unicode_h */
new file mode 100644
--- /dev/null
+++ b/js/src/vm/UnicodeNonBMP.h
@@ -0,0 +1,28 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
+ * vim: set ts=8 sts=4 et sw=4 tw=99:
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/* Generated by make_unicode.py DO NOT MODIFY */
+
+#ifndef vm_UnicodeNonBMP_h
+#define vm_UnicodeNonBMP_h
+
+#define FOR_EACH_NON_BMP_LOWERCASE(macro) \
+    macro(0x10400, 0x10427, 0xd801, 0xdc00, 0xdc27, 40)
+
+#define FOR_EACH_NON_BMP_UPPERCASE(macro) \
+    macro(0x10428, 0x1044f, 0xd801, 0xdc28, 0xdc4f, -40)
+
+#define FOR_EACH_NON_BMP_CASE_FOLDING(macro) \
+    macro(0x10400, 0x10427, 0xd801, 0xdc00, 0xdc27, 40) \
+    macro(0x10c80, 0x10cb2, 0xd803, 0xdc80, 0xdcb2, 64) \
+    macro(0x118a0, 0x118bf, 0xd806, 0xdca0, 0xdcbf, 32)
+
+#define FOR_EACH_NON_BMP_REV_CASE_FOLDING(macro) \
+    macro(0x10428, 0x1044f, 0xd801, 0xdc28, 0xdc4f, -40) \
+    macro(0x10cc0, 0x10cf2, 0xd803, 0xdcc0, 0xdcf2, -64) \
+    macro(0x118c0, 0x118df, 0xd806, 0xdcc0, 0xdcdf, -32)
+
+#endif /* vm_UnicodeNonBMP_h */
--- a/js/src/vm/make_unicode.py
+++ b/js/src/vm/make_unicode.py
@@ -90,64 +90,116 @@ def read_case_folding(case_folding):
             continue
         row = line.split('; ')
         if row[1] in ['F', 'T']:
             continue
         row[0] = int(row[0], 16)
         row[2] = int(row[2], 16)
         yield row
 
+def utf16_encode(code):
+    """Split a code point into its UTF-16 surrogate pair.
+
+    code is an integer code point; the arithmetic is only meaningful for
+    values at or above NonBMPMin (0x10000).
+
+    Returns a (lead, trail) tuple of integer UTF-16 code units.
+    """
+    NonBMPMin = 0x10000
+    LeadSurrogateMin = 0xD800
+    TrailSurrogateMin = 0xDC00
+
+    # Floor division keeps the lead surrogate an int on Python 3 as well;
+    # true division would yield a float, which the '{:x}' format rejects.
+    lead = (code - NonBMPMin) // 1024 + LeadSurrogateMin
+    trail = ((code - NonBMPMin) % 1024) + TrailSurrogateMin
+
+    return lead, trail
+
+def make_non_bmp_convert_macro(out_file, name, convert_map):
+    """Write the FOR_EACH_NON_BMP_<name> macro definition to out_file.
+
+    convert_map maps a code point to its converted code point.  Consecutive
+    code points that share the same offset (converted - code) are merged
+    into a single macro(...) entry of the form
+    (from, to, lead, trail_from, trail_to, diff).
+
+    Every merged run must fit under a single lead surrogate; this is
+    checked with an assertion.
+    """
+    # Each run is a mutable [first code point, offset, run length] triple.
+    runs = []
+    for code in sorted(convert_map):
+        diff = convert_map[code] - code
+
+        if runs:
+            current = runs[-1]
+            # Extend the current run only when this code point directly
+            # follows it and is shifted by the same offset.
+            if code == current[0] + current[2] and diff == current[1]:
+                current[2] += 1
+                continue
+
+        runs.append([code, diff, 1])
+
+    template = '    macro(0x{:x}, 0x{:x}, 0x{:x}, 0x{:x}, 0x{:x}, {:d})'
+    lines = []
+    for from_code, diff, length in runs:
+        to_code = from_code + length - 1
+
+        from_lead, from_trail = utf16_encode(from_code)
+        to_lead, to_trail = utf16_encode(to_code)
+
+        # The macro carries one lead surrogate per entry, so a run may not
+        # cross into the next lead surrogate.
+        assert from_lead == to_lead
+
+        lines.append(template.format(
+            from_code, to_code, from_lead, from_trail, to_trail, diff))
+
+    out_file.write('#define FOR_EACH_NON_BMP_{}(macro) \\\n'.format(name))
+    out_file.write(' \\\n'.join(lines))
+    out_file.write('\n')
+
 def generate_unicode_stuff(unicode_data, case_folding,
-                           data_file, test_mapping, test_space, test_icase):
+                           data_file, non_bmp_file,
+                           test_mapping, test_space, test_icase):
     dummy = (0, 0, 0)
     table = [dummy]
     cache = {dummy: 0}
     index = [0] * (MAX + 1)
     folding_map = {}
     rev_folding_map = {}
     folding_dummy = (0, 0, 0, 0)
     folding_table = [folding_dummy]
     folding_cache = {folding_dummy: 0}
     folding_index = [0] * (MAX + 1)
     test_table = {}
     test_space_table = []
     folding_tests = []
     folding_codes = set()
 
+    non_bmp_lower_map = {}
+    non_bmp_upper_map = {}
+    non_bmp_folding_map = {}
+    non_bmp_rev_folding_map = {}
+
     for row in read_unicode_data(unicode_data):
         code = row[0]
         name = row[1]
         category = row[2]
         alias = row[-5]
         uppercase = row[-3]
         lowercase = row[-2]
         flags = 0
 
-        if code > MAX:
-            break
-
-        # we combine whitespace and lineterminators because in pratice we don't need them separated
-        if category == 'Zs' or code in whitespace or code in line_terminator:
-            flags |= FLAG_SPACE
-            test_space_table.append(code)
-        if category in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl']: # $ 7.6 (UnicodeLetter)
-            flags |= FLAG_LETTER
-        if category in ['Mn', 'Mc', 'Nd', 'Pc'] or code == ZWNJ or code == ZWJ: # $ 7.6 (IdentifierPart)
-            flags |= FLAG_IDENTIFIER_PART
-
         if uppercase:
             upper = int(uppercase, 16)
         else:
             upper = code
 
         if lowercase:
             lower = int(lowercase, 16)
         else:
             lower = code
 
+        if code > MAX:
+            if code != lower:
+                non_bmp_lower_map[code] = lower
+            if code != upper:
+                non_bmp_upper_map[code] = upper
+            continue
+
+        # we combine whitespace and lineterminators because in practice we don't need them separated
+        if category == 'Zs' or code in whitespace or code in line_terminator:
+            flags |= FLAG_SPACE
+            test_space_table.append(code)
+        if category in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl']: # $ 7.6 (UnicodeLetter)
+            flags |= FLAG_LETTER
+        if category in ['Mn', 'Mc', 'Nd', 'Pc'] or code == ZWNJ or code == ZWJ: # $ 7.6 (IdentifierPart)
+            flags |= FLAG_IDENTIFIER_PART
+
         test_table[code] = (upper, lower, name, alias)
 
         up_d = upper - code
         low_d = lower - code
 
         assert up_d > -65535 and up_d < 65535
         assert low_d > -65535 and low_d < 65535
 
@@ -163,16 +215,20 @@ def generate_unicode_stuff(unicode_data,
             table.append(item)
         index[code] = i
 
     for row in read_case_folding(case_folding):
         code = row[0]
         mapping = row[2]
         folding_map[code] = mapping
 
+        if code > MAX:
+            non_bmp_folding_map[code] = mapping
+            non_bmp_rev_folding_map[mapping] = code
+
         if mapping not in rev_folding_map:
             rev_folding_map[mapping] = [code]
         else:
             rev_folding_map[mapping].append(code)
 
         folding_codes.add(code)
         folding_codes.add(mapping)
 
@@ -216,16 +272,41 @@ def generate_unicode_stuff(unicode_data,
 
         i = folding_cache.get(item)
         if i is None:
             assert item not in folding_table
             folding_cache[item] = i = len(folding_table)
             folding_table.append(item)
         folding_index[code] = i
 
+    non_bmp_file.write("""/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
+ * vim: set ts=8 sts=4 et sw=4 tw=99:
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/* Generated by make_unicode.py DO NOT MODIFY */
+
+#ifndef vm_UnicodeNonBMP_h
+#define vm_UnicodeNonBMP_h
+
+""")
+
+    make_non_bmp_convert_macro(non_bmp_file, 'LOWERCASE', non_bmp_lower_map)
+    non_bmp_file.write('\n')
+    make_non_bmp_convert_macro(non_bmp_file, 'UPPERCASE', non_bmp_upper_map)
+    non_bmp_file.write('\n')
+    make_non_bmp_convert_macro(non_bmp_file, 'CASE_FOLDING', non_bmp_folding_map)
+    non_bmp_file.write('\n')
+    make_non_bmp_convert_macro(non_bmp_file, 'REV_CASE_FOLDING', non_bmp_rev_folding_map)
+
+    non_bmp_file.write("""
+#endif /* vm_UnicodeNonBMP_h */
+""")
+
     test_mapping.write('/* Generated by make_unicode.py DO NOT MODIFY */\n')
     test_mapping.write(public_domain)
     test_mapping.write('var mapping = [\n')
     for code in range(0, MAX + 1):
         entry = test_table.get(code)
 
         if entry:
             upper, lower, name, alias = entry
@@ -507,11 +588,12 @@ if __name__ == '__main__':
         reader.close()
         case_folding = open('CaseFolding.txt', 'w+')
         case_folding.write(data)
         case_folding.seek(0)
 
     print('Generating...')
     generate_unicode_stuff(unicode_data, case_folding,
         open('Unicode.cpp', 'w'),
+        open('UnicodeNonBMP.h', 'w'),
         open('../tests/ecma_5/String/string-upper-lower-mapping.js', 'w'),
         open('../tests/ecma_5/String/string-space-trim.js', 'w'),
         open('../tests/ecma_6/RegExp/unicode-ignoreCase.js', 'w'))