Bug 1673440 - Report actual illegal character when parsing JS. r=arai
authorTom Schuster <evilpies@gmail.com>
Tue, 27 Oct 2020 15:33:27 +0000
changeset 554708 e14ca19d32bac256b044cdf97a5a341333f023c6
parent 554707 42bf476bba9a12cfad1a25f9269d2b3113a2d8b2
child 554709 5362c99dfe66bb3bf7077415e99e2a45d5ac4fa3
push id37898
push userabutkovits@mozilla.com
push dateWed, 28 Oct 2020 09:24:21 +0000
treeherdermozilla-central@83bf4fd3b1fb [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersarai
bugs1673440
milestone84.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1673440 - Report actual illegal character when parsing JS. r=arai Differential Revision: https://phabricator.services.mozilla.com/D94767
js/public/friend/ErrorNumbers.msg
js/src/frontend/TokenStream.cpp
js/src/frontend/TokenStream.h
js/src/jit-test/tests/parser/syntax-error-illegal-character.js
js/src/jsapi-tests/testPrintError.cpp
js/src/tests/non262/extensions/regress-274152.js
--- a/js/public/friend/ErrorNumbers.msg
+++ b/js/public/friend/ErrorNumbers.msg
@@ -283,17 +283,17 @@ MSG_DEF(JSMSG_FOR_AWAIT_OUTSIDE_ASYNC, 0
 MSG_DEF(JSMSG_FROM_AFTER_IMPORT_CLAUSE, 0, JSEXN_SYNTAXERR, "missing keyword 'from' after import clause")
 MSG_DEF(JSMSG_FROM_AFTER_EXPORT_STAR,  0, JSEXN_SYNTAXERR, "missing keyword 'from' after export *")
 MSG_DEF(JSMSG_GARBAGE_AFTER_INPUT,     2, JSEXN_SYNTAXERR, "unexpected garbage after {0}, starting with {1}")
 MSG_DEF(JSMSG_IDSTART_AFTER_NUMBER,    0, JSEXN_SYNTAXERR, "identifier starts immediately after numeric literal")
 MSG_DEF(JSMSG_BAD_ESCAPE,              0, JSEXN_SYNTAXERR, "invalid escape sequence")
 MSG_DEF(JSMSG_MISSING_PRIVATE_NAME,    0, JSEXN_SYNTAXERR, "'#' not followed by identifier")
 MSG_DEF(JSMSG_PRIVATE_DELETE,          0, JSEXN_SYNTAXERR, "private fields can't be deleted")
 MSG_DEF(JSMSG_MISSING_PRIVATE_DECL,    1, JSEXN_SYNTAXERR, "reference to undeclared private field or method {0}")
-MSG_DEF(JSMSG_ILLEGAL_CHARACTER,       0, JSEXN_SYNTAXERR, "illegal character")
+MSG_DEF(JSMSG_ILLEGAL_CHARACTER,       1, JSEXN_SYNTAXERR, "illegal character {0}")
 MSG_DEF(JSMSG_IMPORT_META_OUTSIDE_MODULE, 0, JSEXN_SYNTAXERR, "import.meta may only appear in a module")
 MSG_DEF(JSMSG_IMPORT_DECL_AT_TOP_LEVEL, 0, JSEXN_SYNTAXERR, "import declarations may only appear at top level of a module")
 MSG_DEF(JSMSG_OF_AFTER_FOR_LOOP_DECL,  0, JSEXN_SYNTAXERR, "a declaration in the head of a for-of loop can't have an initializer")
 MSG_DEF(JSMSG_IN_AFTER_LEXICAL_FOR_DECL,0,JSEXN_SYNTAXERR, "a lexical declaration in the head of a for-in loop can't have an initializer")
 MSG_DEF(JSMSG_INVALID_FOR_IN_DECL_WITH_INIT,0,JSEXN_SYNTAXERR,"for-in loop head declarations may not have initializers")
 MSG_DEF(JSMSG_INVALID_ID,              1, JSEXN_SYNTAXERR, "{0} is an invalid identifier")
 MSG_DEF(JSMSG_SEPARATOR_IN_ZERO_PREFIXED_NUMBER, 0, JSEXN_SYNTAXERR, "numeric separators '_' are not allowed in numbers that start with '0'")
 MSG_DEF(JSMSG_LABEL_NOT_FOUND,         0, JSEXN_SYNTAXERR, "label not found")
--- a/js/src/frontend/TokenStream.cpp
+++ b/js/src/frontend/TokenStream.cpp
@@ -31,16 +31,17 @@
 #include "jsexn.h"
 #include "jsnum.h"
 
 #include "frontend/BytecodeCompiler.h"
 #include "frontend/Parser.h"
 #include "frontend/ParserAtom.h"
 #include "frontend/ReservedWords.h"
 #include "js/CharacterEncoding.h"
+#include "js/Printf.h"       // JS_smprintf
 #include "js/RegExpFlags.h"  // JS::RegExpFlags
 #include "js/UniquePtr.h"
 #include "util/StringBuffer.h"
 #include "util/Unicode.h"
 #include "vm/FrameIter.h"  // js::{,NonBuiltin}FrameIter
 #include "vm/HelperThreads.h"
 #include "vm/JSAtom.h"
 #include "vm/JSContext.h"
@@ -1824,16 +1825,27 @@ bool TokenStreamSpecific<Unit, AnyCharsA
     // Add a line of context from this TokenStream to help with debugging.
     return internalComputeLineOfContext(err, offset);
   }
 
   // We can't fill in any more here.
   return true;
 }
 
+template <typename Unit, class AnyCharsAccess>
+void TokenStreamSpecific<Unit, AnyCharsAccess>::reportIllegalCharacter(
+    int32_t cp) {
+  UniqueChars display = JS_smprintf("U+%04X", cp);
+  if (!display) {
+    ReportOutOfMemory(anyCharsAccess().cx);
+    return;
+  }
+  error(JSMSG_ILLEGAL_CHARACTER, display.get());
+}
+
 // We have encountered a '\': check for a Unicode escape sequence after it.
 // Return the length of the escape sequence and the encoded code point (by
 // value) if we found a Unicode escape sequence, and skip all code units
 // involed.  Otherwise, return 0 and don't advance along the buffer.
 template <typename Unit, class AnyCharsAccess>
 uint32_t GeneralTokenStreamChars<Unit, AnyCharsAccess>::matchUnicodeEscape(
     uint32_t* codePoint) {
   MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
@@ -2830,17 +2842,17 @@ MOZ_MUST_USE bool TokenStreamSpecific<Un
                    "IdentifierStart must guarantee !IsLineTerminator "
                    "or else we'll fail to maintain line-info/flags "
                    "for EOL here");
 
         return identifierName(start, identStart, IdentifierEscapes::None,
                               modifier, NameVisibility::Public, ttp);
       }
 
-      error(JSMSG_ILLEGAL_CHARACTER);
+      reportIllegalCharacter(cp);
       return badToken();
     }  // !isAsciiCodePoint(unit)
 
     consumeKnownCodeUnit(unit);
 
     // Get the token kind, based on the first char.  The ordering of c1kind
     // comparison is based on the frequency of tokens in real code:
     // Parsemark (which represents typical JS code on the web) and the
@@ -3342,17 +3354,17 @@ MOZ_MUST_USE bool TokenStreamSpecific<Un
               matchCodeUnit('=') ? TokenKind::SubAssign : TokenKind::Sub;
         }
         break;
 
       default:
         // We consumed a bad ASCII code point/unit.  Put it back so the
         // error location is the bad code point.
         ungetCodeUnit(unit);
-        error(JSMSG_ILLEGAL_CHARACTER);
+        reportIllegalCharacter(unit);
         return badToken();
     }  // switch (AssertedCast<uint8_t>(CodeUnitValue(toUnit(unit))))
 
     MOZ_ASSERT(simpleKind != TokenKind::Limit,
                "switch-statement should have set |simpleKind| before "
                "breaking");
 
     newSimpleToken(simpleKind, start, modifier, ttp);
--- a/js/src/frontend/TokenStream.h
+++ b/js/src/frontend/TokenStream.h
@@ -2560,16 +2560,18 @@ class MOZ_STACK_CLASS TokenStreamSpecifi
         errorAt(offset, JSMSG_UNICODE_OVERFLOW, "escape sequence");
         return;
       case InvalidEscapeType::Octal:
         errorAt(offset, JSMSG_DEPRECATED_OCTAL);
         return;
     }
   }
 
+  void reportIllegalCharacter(int32_t cp);
+
   MOZ_MUST_USE bool putIdentInCharBuffer(const Unit* identStart);
 
   using IsIntegerUnit = bool (*)(int32_t);
   MOZ_MUST_USE MOZ_ALWAYS_INLINE bool matchInteger(IsIntegerUnit isIntegerUnit,
                                                    int32_t* nextUnit);
   MOZ_MUST_USE MOZ_ALWAYS_INLINE bool matchIntegerAfterFirstDigit(
       IsIntegerUnit isIntegerUnit, int32_t* nextUnit);
 
--- a/js/src/jit-test/tests/parser/syntax-error-illegal-character.js
+++ b/js/src/jit-test/tests/parser/syntax-error-illegal-character.js
@@ -1,14 +1,20 @@
 load(libdir + "syntax.js");
 
-var JSMSG_ILLEGAL_CHARACTER = "illegal character";
-
-var postfixes = [
-  "@",
-];
+function check_syntax_error_at(e, code, name) {
+  assertEq(e instanceof SyntaxError, true, name + ": " + code);
+  assertEq(e.message, "illegal character U+0040", name + ": " + code);
+}
+test_syntax(["@"], check_syntax_error_at, false);
 
-function check_syntax_error(e, code, name) {
+function check_syntax_error_ellipsis(e, code, name) {
   assertEq(e instanceof SyntaxError, true, name + ": " + code);
-  assertEq(e.message, JSMSG_ILLEGAL_CHARACTER, name + ": " + code);
+  assertEq(e.message, "illegal character U+2026", name + ": " + code);
 }
+test_syntax(["…"], check_syntax_error_ellipsis, false);
 
-test_syntax(postfixes, check_syntax_error, false);
+function check_syntax_error_clown(e, code, name) {
+  assertEq(e instanceof SyntaxError, true, name + ": " + code);
+  assertEq(e.message, "illegal character U+1F921", name + ": " + code);
+}
+test_syntax(["🤡"], check_syntax_error_clown, false);
+
--- a/js/src/jsapi-tests/testPrintError.cpp
+++ b/js/src/jsapi-tests/testPrintError.cpp
@@ -109,17 +109,17 @@ BEGIN_TEST(testPrintError_UTF16CodePoint
   JS::ExceptionStack exnStack(cx);
   CHECK(JS::StealPendingExceptionStack(cx, &exnStack));
 
   JS::ErrorReportBuilder builder(cx);
   CHECK(builder.init(cx, exnStack, JS::ErrorReportBuilder::NoSideEffects));
   JS::PrintError(cx, buf.stream(), builder, false);
 
   CHECK(buf.contains(
-      "testPrintError_UTF16CodePoints.js:3:4 SyntaxError: illegal character:\n"
+      "testPrintError_UTF16CodePoints.js:3:4 SyntaxError: illegal character U+1F32F:\n"
       "testPrintError_UTF16CodePoints.js:3:4 " BURRITO "`; " BURRITO
       "; } f();\n"
       "testPrintError_UTF16CodePoints.js:3:4 .....^\n"));
 
   return true;
 }
 END_TEST(testPrintError_UTF16CodePoints)
 
--- a/js/src/tests/non262/extensions/regress-274152.js
+++ b/js/src/tests/non262/extensions/regress-274152.js
@@ -26,20 +26,23 @@ function test()
                             '\u0601', 
                             '\u0602', 
                             '\u0603', 
                             '\u06DD', 
                             '\u070F'];
 
   for (var i = 0; i < formatcontrolchars.length; i++)
   {
+    var char = formatcontrolchars[i];
+
     try
     {
-      eval("hi" + formatcontrolchars[i] + "there = 'howdie';");
+      eval("hi" + char + "there = 'howdie';");
     }
     catch(ex)
     {
       actual = ex + '';
     }
 
-    reportCompare(expect, actual, summary + ': ' + i);
+    var hex = char.codePointAt(0).toString(16).toUpperCase().padStart(4, '0');
+    reportCompare(`${expect} U+${hex}`, actual, summary + ': ' + i);
   }
 }