Bug 1314037 - Part 2: Support extended Unicode escape sequences in identifiers. r=arai
authorAndré Bargull <andre.bargull@gmail.com>
Mon, 31 Oct 2016 09:58:08 -0700
changeset 351332 900466e640ca1bb85d1f01a20ee42645e03d2c7f
parent 351331 80bedfc21ee648de2878922b51e6d503d5716899
child 351333 b9cd3d1a34e0e122d4c25b3b2d1936d7980a84cb
push id6795
push userjlund@mozilla.com
push dateMon, 23 Jan 2017 14:19:46 +0000
treeherdermozilla-esr52@76101b503191 [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersarai
bugs1314037
milestone52.0a1
Bug 1314037 - Part 2: Support extended Unicode escape sequences in identifiers. r=arai
js/src/frontend/TokenStream.cpp
js/src/frontend/TokenStream.h
js/src/tests/ecma_6/Syntax/identifiers-with-extended-unicode-escape.js
--- a/js/src/frontend/TokenStream.cpp
+++ b/js/src/frontend/TokenStream.cpp
@@ -752,52 +752,106 @@ TokenStream::reportAsmJSError(uint32_t o
     unsigned flags = options().throwOnAsmJSValidationFailureOption
                      ? JSREPORT_ERROR
                      : JSREPORT_WARNING;
     reportCompileErrorNumberVA(offset, flags, errorNumber, args);
     va_end(args);
 }
 
 // We have encountered a '\': check for a Unicode escape sequence after it.
-// Return 'true' and the character code point (by value) if we found a
-// Unicode escape sequence.  Otherwise, return 'false'.  In both cases, do not
-// advance along the buffer.
-bool
+// Return the length of the escape sequence and the character code point (by
+// value) if we found a Unicode escape sequence.  Otherwise, return 0.  In both
+// cases, do not advance along the buffer.
+uint32_t
 TokenStream::peekUnicodeEscape(uint32_t* codePoint)
 {
-    char16_t cp[5];
+    int32_t c = getCharIgnoreEOL();
+    if (c != 'u') {
+        ungetCharIgnoreEOL(c);
+        return 0;
+    }
 
-    if (peekChars(5, cp) && cp[0] == 'u' &&
-        JS7_ISHEX(cp[1]) && JS7_ISHEX(cp[2]) &&
-        JS7_ISHEX(cp[3]) && JS7_ISHEX(cp[4]))
+    char16_t cp[3];
+    uint32_t length;
+    c = getCharIgnoreEOL();
+    if (JS7_ISHEX(c) && peekChars(3, cp) &&
+        JS7_ISHEX(cp[0]) && JS7_ISHEX(cp[1]) && JS7_ISHEX(cp[2]))
     {
-        *codePoint = (((((JS7_UNHEX(cp[1]) << 4)
-                + JS7_UNHEX(cp[2])) << 4)
-              + JS7_UNHEX(cp[3])) << 4)
-            + JS7_UNHEX(cp[4]);
-        return true;
+        *codePoint = (JS7_UNHEX(c) << 12) |
+                     (JS7_UNHEX(cp[0]) << 8) |
+                     (JS7_UNHEX(cp[1]) << 4) |
+                     JS7_UNHEX(cp[2]);
+        length = 5;
+    } else if (c == '{') {
+        length = peekExtendedUnicodeEscape(codePoint);
+    } else {
+        length = 0;
     }
-    return false;
+
+    ungetCharIgnoreEOL(c);
+    ungetCharIgnoreEOL('u');
+    return length;
 }
 
-bool
+uint32_t
+TokenStream::peekExtendedUnicodeEscape(uint32_t* codePoint)
+{
+    // The opening brace character was already read.
+    int32_t c = getCharIgnoreEOL();
+
+    // Skip leading zeros.
+    uint32_t leadingZeros = 0;
+    while (c == '0') {
+        leadingZeros++;
+        c = getCharIgnoreEOL();
+    }
+
+    char16_t cp[6];
+    size_t i = 0;
+    uint32_t code = 0;
+    while (JS7_ISHEX(c) && i < 6) {
+        cp[i++] = c;
+        code = code << 4 | JS7_UNHEX(c);
+        c = getCharIgnoreEOL();
+    }
+
+    uint32_t length;
+    if (c == '}' && (leadingZeros > 0 || i > 0) && code <= unicode::NonBMPMax) {
+        *codePoint = code;
+        length = leadingZeros + i + 3;
+    } else {
+        length = 0;
+    }
+
+    ungetCharIgnoreEOL(c);
+    while (i--)
+        ungetCharIgnoreEOL(cp[i]);
+    while (leadingZeros--)
+        ungetCharIgnoreEOL('0');
+
+    return length;
+}
+
+uint32_t
 TokenStream::matchUnicodeEscapeIdStart(uint32_t* codePoint)
 {
-    if (peekUnicodeEscape(codePoint) && unicode::IsIdentifierStart(*codePoint)) {
-        skipChars(5);
-        return true;
+    uint32_t length = peekUnicodeEscape(codePoint);
+    if (length > 0 && unicode::IsIdentifierStart(*codePoint)) {
+        skipChars(length);
+        return length;
     }
-    return false;
+    return 0;
 }
 
 bool
 TokenStream::matchUnicodeEscapeIdent(uint32_t* codePoint)
 {
-    if (peekUnicodeEscape(codePoint) && unicode::IsIdentifierPart(*codePoint)) {
-        skipChars(5);
+    uint32_t length = peekUnicodeEscape(codePoint);
+    if (length > 0 && unicode::IsIdentifierPart(*codePoint)) {
+        skipChars(length);
         return true;
     }
     return false;
 }
 
 // Helper function which returns true if the first length(q) characters in p are
 // the same as the characters in q.
 static bool
@@ -1419,23 +1473,25 @@ TokenStream::getTokenInternal(TokenKind*
 
       case '+':
         if (matchChar('+'))
             tp->type = TOK_INC;
         else
             tp->type = matchChar('=') ? TOK_ADDASSIGN : TOK_ADD;
         goto out;
 
-      case '\\':
-        hadUnicodeEscape = matchUnicodeEscapeIdStart(&qc);
-        if (hadUnicodeEscape) {
-            identStart = userbuf.addressOfNextRawChar() - 6;
+      case '\\': {
+        uint32_t escapeLength = matchUnicodeEscapeIdStart(&qc);
+        if (escapeLength > 0) {
+            identStart = userbuf.addressOfNextRawChar() - escapeLength - 1;
+            hadUnicodeEscape = true;
             goto identifier;
         }
         goto badchar;
+      }
 
       case '|':
         if (matchChar('|'))
             tp->type = TOK_OR;
         else
             tp->type = matchChar('=') ? TOK_BITORASSIGN : TOK_BITOR;
         goto out;
 
--- a/js/src/frontend/TokenStream.h
+++ b/js/src/frontend/TokenStream.h
@@ -943,18 +943,19 @@ class MOZ_STACK_CLASS TokenStream
     MOZ_MUST_USE bool getBracedUnicode(uint32_t* code);
     MOZ_MUST_USE bool getStringOrTemplateToken(int untilChar, Token** tp);
 
     int32_t getChar();
     int32_t getCharIgnoreEOL();
     void ungetChar(int32_t c);
     void ungetCharIgnoreEOL(int32_t c);
     Token* newToken(ptrdiff_t adjust);
-    bool peekUnicodeEscape(uint32_t* codePoint);
-    bool matchUnicodeEscapeIdStart(uint32_t* codePoint);
+    uint32_t peekUnicodeEscape(uint32_t* codePoint);
+    uint32_t peekExtendedUnicodeEscape(uint32_t* codePoint);
+    uint32_t matchUnicodeEscapeIdStart(uint32_t* codePoint);
     bool matchUnicodeEscapeIdent(uint32_t* codePoint);
     bool peekChars(int n, char16_t* cp);
 
     MOZ_MUST_USE bool getDirectives(bool isMultiline, bool shouldWarnDeprecated);
     MOZ_MUST_USE bool getDirective(bool isMultiline, bool shouldWarnDeprecated,
                                    const char* directive, int directiveLength,
                                    const char* errorMsgPragma,
                                    UniquePtr<char16_t[], JS::FreePolicy>* destination);
new file mode 100644
--- /dev/null
+++ b/js/src/tests/ecma_6/Syntax/identifiers-with-extended-unicode-escape.js
@@ -0,0 +1,229 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+// Simple cases, not using eval.
+{
+    let \u{61} = 123;
+    assertEq(a, 123);
+
+    let \u{6A} = 123;
+    assertEq(j, 123);
+
+    let a\u{62} = 456;
+    assertEq(ab, 456);
+
+    let \u{63}\u{6b} = 789;
+    assertEq(ck, 789);
+}
+
+const leadingZeros = [0, 1, 2, 3, 4, 100].map(c => "0".repeat(c));
+
+
+// From DerivedCoreProperties.txt (Unicode 9):
+// Derived Property: ID_Start
+//  Characters that can start an identifier.
+//  Generated from:
+//      Lu + Ll + Lt + Lm + Lo + Nl
+//    + Other_ID_Start
+//    - Pattern_Syntax
+//    - Pattern_White_Space
+const idStart = [
+    0x0041,     // LATIN CAPITAL LETTER A, Gc=Lu
+    0x006A,     // LATIN SMALL LETTER J, Gc=Ll
+    0x00C9,     // LATIN CAPITAL LETTER E WITH ACUTE, Gc=Lu
+    0x00FF,     // LATIN SMALL LETTER Y WITH DIAERESIS, Gc=Ll
+    0x01C5,     // LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON, Gc=Lt
+    0x0294,     // LATIN LETTER GLOTTAL STOP, Gc=Lo
+    0x037A,     // GREEK YPOGEGRAMMENI, Gc=Lm
+    0x16EE,     // RUNIC ARLAUG SYMBOL, Gc=Nl
+    0xFF70,     // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK, Gc=Lm
+];
+
+const idStartSupplemental = [
+    0x10140,    // GREEK ACROPHONIC ATTIC ONE QUARTER, Gc=Nl
+    0x10300,    // OLD ITALIC LETTER A, Gc=Lo
+    0x10400,    // DESERET CAPITAL LETTER LONG I, Gc=Lu
+    0x10430,    // DESERET SMALL LETTER SHORT A, Gc=Ll
+    0x16B40,    // PAHAWH HMONG SIGN VOS SEEV, Gc=Lm
+];
+
+// From PropList.txt (Unicode 9):
+const otherIdStart = [
+    // Enable the following lines when Bug 1282724 is fixed.
+    // 0x1885,     // MONGOLIAN LETTER ALI GALI BALUDA, Gc=Mn
+    // 0x1886,     // MONGOLIAN LETTER ALI GALI THREE BALUDA, Gc=Mn
+    0x2118,     // SCRIPT CAPITAL P, Gc=Sm
+    0x212E,     // ESTIMATED SYMBOL, Gc=So
+    0x309B,     // KATAKANA-HIRAGANA VOICED SOUND MARK, Gc=Sk
+    0x309C,     // KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK, Gc=Sk
+];
+
+// Remove this list when we support Unicode 9 (Bug 1282724).
+const otherIdStart_Unicode9 = [
+    0x1885,     // MONGOLIAN LETTER ALI GALI BALUDA, Gc=Mn
+    0x1886,     // MONGOLIAN LETTER ALI GALI THREE BALUDA, Gc=Mn
+];
+
+// From DerivedCoreProperties.txt (Unicode 9):
+// Derived Property: ID_Continue
+//  Characters that can continue an identifier.
+//  Generated from:
+//      ID_Start
+//    + Mn + Mc + Nd + Pc
+//    + Other_ID_Continue
+//    - Pattern_Syntax
+//    - Pattern_White_Space
+const idContinue = [
+    0x0030,     // DIGIT ZERO, Gc=Nd
+    0x0300,     // COMBINING GRAVE ACCENT, Gc=Mn
+    0x0660,     // ARABIC-INDIC DIGIT ZERO, Gc=Nd
+    0x0903,     // DEVANAGARI SIGN VISARGA, Gc=Mc
+    0xFF10,     // FULLWIDTH DIGIT ZERO, Gc=Nd
+    0xFF3F,     // FULLWIDTH LOW LINE, Gc=Pc
+];
+
+const idContinueSupplemental = [
+    0x101FD,    // PHAISTOS DISC SIGN COMBINING OBLIQUE STROKE, Gc=Mn
+    0x104A0,    // OSMANYA DIGIT ZERO, Gc=Nd
+    0x11000,    // BRAHMI SIGN CANDRABINDU, Gc=Mc
+];
+
+// From PropList.txt (Unicode 9):
+const otherIdContinue = [
+    0x00B7,     // MIDDLE DOT, Gc=Po
+    0x0387,     // GREEK ANO TELEIA, Gc=Po
+    0x1369,     // ETHIOPIC DIGIT ONE, Gc=No
+    0x136A,     // ETHIOPIC DIGIT TWO, Gc=No
+    0x136B,     // ETHIOPIC DIGIT THREE, Gc=No
+    0x136C,     // ETHIOPIC DIGIT FOUR, Gc=No
+    0x136D,     // ETHIOPIC DIGIT FIVE, Gc=No
+    0x136E,     // ETHIOPIC DIGIT SIX, Gc=No
+    0x136F,     // ETHIOPIC DIGIT SEVEN, Gc=No
+    0x1370,     // ETHIOPIC DIGIT EIGHT, Gc=No
+    0x1371,     // ETHIOPIC DIGIT NINE, Gc=No
+    0x19DA,     // NEW TAI LUE THAM DIGIT ONE, Gc=No
+];
+
+for (let ident of [...idStart, ...otherIdStart_Unicode9]) {
+    for (let count of leadingZeros) {
+        let zeros = "0".repeat(count);
+        eval(`
+            let \\u{${zeros}${ident.toString(16)}} = 123;
+            assertEq(${String.fromCodePoint(ident)}, 123);
+        `);
+    }
+}
+
+// Move this to the loop above when Bug 917436 is fixed.
+for (let ident of [...idStartSupplemental, ...otherIdStart]) {
+    for (let zeros of leadingZeros) {
+        assertThrowsInstanceOf(() => eval(`\\u{${zeros}${ident.toString(16)}}`), SyntaxError);
+    }
+}
+
+for (let ident of [...idContinue, ...idContinueSupplemental, ...otherIdContinue]) {
+    for (let zeros of leadingZeros) {
+        assertThrowsInstanceOf(() => eval(`\\u{${zeros}${ident.toString(16)}}`), SyntaxError);
+    }
+}
+
+for (let ident of [...idStart, ...otherIdStart_Unicode9, ...idContinue]) {
+    for (let zeros of leadingZeros) {
+        eval(`
+            let A\\u{${zeros}${ident.toString(16)}} = 123;
+            assertEq(${String.fromCodePoint(0x41, ident)}, 123);
+        `);
+    }
+}
+
+// Move this to the loop above when Bug 917436 is fixed.
+for (let ident of [...idStartSupplemental, ...otherIdStart, ...idContinueSupplemental, ...otherIdContinue]) {
+    for (let zeros of leadingZeros) {
+        assertThrowsInstanceOf(() => eval(`\\u{${zeros}${ident.toString(16)}}`), SyntaxError);
+    }
+}
+
+
+const notIdentifiers = [
+    0x0000,     // NULL, Gc=Cc
+    0x000A,     // LINE FEED (LF), Gc=Cc
+    0x005E,     // CIRCUMFLEX ACCENT, Gc=Sk
+    0x00B1,     // PLUS-MINUS SIGN, Gc=Sm
+    0xFF61,     // HALFWIDTH IDEOGRAPHIC FULL STOP, Gc=Po
+    0x10061,    // Not assigned.
+    0x10100,    // AEGEAN WORD SEPARATOR LINE, Gc=Po
+    0x100061,   // <Plane 16 Private Use>, Gc=Co
+];
+
+for (let ident of notIdentifiers) {
+    for (let zeros of leadingZeros) {
+        assertThrowsInstanceOf(() => eval(`\\u{${zeros}${ident.toString(16)}}`), SyntaxError);
+    }
+}
+
+
+const incompleteEscapes = [
+    "\\u{",
+    "\\u{6",
+    "\\u{61",
+    "\\u{061",
+    "\\u{0061",
+    "\\u{00061",
+    "\\u{000061",
+    "\\u{0000061",
+
+    "\\u}",
+];
+for (let invalid of incompleteEscapes) {
+    // Ends with EOF.
+    assertThrowsInstanceOf(() => eval(invalid), SyntaxError);
+
+    // Ends with EOL.
+    assertThrowsInstanceOf(() => eval(invalid + "\n"), SyntaxError);
+
+    // Ends with space.
+    assertThrowsInstanceOf(() => eval(invalid + " "), SyntaxError);
+}
+
+
+const invalidEscapes = [
+    // Empty escape.
+    "",
+
+    // Not hexadecimal characters.
+    "\0",
+    "G",
+    "Z",
+    "\uFFFF",
+    "\uDBFF\uDFFF",
+
+    // Has space characters.
+    " 61",
+    "61 ",
+
+    // Has newline characters.
+    "\n61",
+    "61\n",
+
+    // Exceeds 0x10FFFF, six characters.
+    "110000",
+    "110001",
+    "fffffe",
+    "ffffff",
+
+    // Exceeds 0x10FFFF, more than six characters.
+    "10ffff0",
+    "10ffffabcdef",
+];
+
+for (let invalid of invalidEscapes) {
+    for (let zeros of leadingZeros) {
+        assertThrowsInstanceOf(() => eval(`\\u{${zeros}${invalid}}`), SyntaxError);
+        assertThrowsInstanceOf(() => eval(`var \\u{${zeros}${invalid}}`), SyntaxError);
+    }
+}
+
+
+if (typeof reportCompare === "function")
+    reportCompare(0, 0, "ok");