Bug 1506587 - remove unwanted characters from headers. r+a=jorgk
authorMagnus Melin <mkmelin+mozilla@iki.fi>
Thu, 18 Jul 2019 21:33:16 +0200
changeset 35252 8ed355a905feffd822c5a543344018f1c38566b6
parent 35251 0cb3f49224430ab6e0d656f22ff07ec97d3ba8ce
child 35253 cc0eaa1cd45a029e1d316d4550f824b4a9ffc25d
push id2470
push usermozilla@jorgk.com
push dateTue, 23 Jul 2019 08:41:09 +0000
treeherdercomm-beta@cc0eaa1cd45a [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
bugs1506587
Bug 1506587 - remove unwanted characters from headers. r+a=jorgk * * * Bug 1506587 - Follow-up: add comment. rs=comment-only DONTBUILD
mailnews/compose/test/unit/test_splitRecipients.js
mailnews/mime/jsmime/jsmime.js
mailnews/mime/jsmime/test/test_header.js
--- a/mailnews/compose/test/unit/test_splitRecipients.js
+++ b/mailnews/compose/test/unit/test_splitRecipients.js
@@ -59,17 +59,17 @@ var splitRecipientsTests = [
     recipients: "a@xxx.invalid; B <b@xxx.invalid>",
     emailAddressOnly: false,
     count: 2,
     result: [ "a@xxx.invalid", "B <b@xxx.invalid>" ],
   }, {
     recipients: '"A " <a@xxx.invalid>; b@xxx.invalid',
     emailAddressOnly: false,
     count: 2,
-    result: [ "A  <a@xxx.invalid>", "b@xxx.invalid" ],
+    result: [ "A <a@xxx.invalid>", "b@xxx.invalid" ],
   }, {
     recipients: "A <a@xxx.invalid>; B <b@xxx.invalid>",
     emailAddressOnly: false,
     count: 2,
     result: [ "A <a@xxx.invalid>", "B <b@xxx.invalid>" ],
   }, {
     recipients: "A (this: is, a comment;) <a.invalid>; g:   (this: is, <a> comment;) C <c.invalid>, d.invalid;",
     emailAddressOnly: false,
@@ -111,12 +111,13 @@ function run_test() {
     print("Test: " + splitRecipientsTests[part].recipients);
     var result = fields.splitRecipients(splitRecipientsTests[part].recipients,
                                         splitRecipientsTests[part].emailAddressOnly,
                                         count);
 
     Assert.equal(splitRecipientsTests[part].count, count.value);
     Assert.equal(splitRecipientsTests[part].count, result.length);
 
-    for (var item = 0; item < count.value; ++item)
+    for (var item = 0; item < count.value; ++item) {
       Assert.equal(splitRecipientsTests[part].result[item], result[item]);
+    }
   }
 }
--- a/mailnews/mime/jsmime/jsmime.js
+++ b/mailnews/mime/jsmime/jsmime.js
@@ -378,18 +378,60 @@ var headerparser = {};
  */
 /* eslint-disable complexity */
 function getHeaderTokens(value, delimiters, opts) {
   // The array of parsed tokens. This method used to be a generator, but it
   // appears that generators are poorly optimized in current engines, so it was
   // converted to not be one.
   let tokenList = [];
 
-  // Represents a non-delimiter token
+  // Represents a non-delimiter token.
   function Token(token) {
+    // Replace problematic characters so we don't get unexpected behavior
+    // down the line. These fall into a few categories:
+    // A) "Separator, space" (Zs),
+    // B) "Mark, Nonspacing" (Mn)
+    // C) "Other, Control" (Cc)
+    // D) "Other, Format" (Cf)
+    // Unfortuantely, no support for the needed regexp Unicode property escapes
+    // in our engine. So we need to hand-roll it. Used the regexpu tool for
+    // that: https://mothereff.in/regexpu.
+    // This should be updated regularly, to take into account new additions
+    // to the unicode standard. Last updated July 2019.
+    // For a full list of categories, see http://unicode.org/Public//5.0.0/ucd/UCD.html.
+
+    // -- case A: /\p{Zs}/u
+    // https://www.fileformat.info/info/unicode/category/Zs/list.htm
+    // https://mothereff.in/regexpu#input=/\p{Zs}/u&unicodePropertyEscape=1
+    token = token.replace(/[\xA0\u1680\u2000-\u200A\u202F\u205F\u3000]/g, " ");
+
+    // -- case B: /\p{Mn}/u
+    // https://www.fileformat.info/info/unicode/category/Mn/list.htm
+    // https://mothereff.in/regexpu#input=/\p{Mn}/u&unicodePropertyEscape=1
+    // This is a bit more complicated as some of them could be "real", so we'll
+    // only remove the ones that are known to show as blank.
+    token = token.replace(/[\u034F\u17B4\u17B5\u180B-\u180D\uFE00-\uFE0F]/g, "");
+    // \uE0100-\uE01EF need to be written using their surrogate code point pairs
+    // until extended Unicode escapes are supported in regexps.
+    // https://www.fileformat.info/info/unicode/char/e0100/index.htm says \uDB40\uDD00.
+    // https://www.fileformat.info/info/unicode/char/e01ef/index.htm says \uDB40\uDDEF.
+    token = token.replace(/\uDB40[\uDD00-\uDDEF]/g, "");
+
+    // -- case C: /\p{Cc}/u, except Tab/LF/CR
+    // https://www.fileformat.info/info/unicode/category/Cc/list.htm
+    // https://mothereff.in/regexpu#input=/\p{Cc}/u&unicodePropertyEscape=1
+    // eslint-disable-next-line no-control-regex
+    token = token.replace(/(?![\t\n\r])[\0-\x1F\x7F-\x9F]/g, "");
+
+    // -- case D: /\p{Cf}/u
+    // https://www.fileformat.info/info/unicode/category/Cf/list.htm
+    // https://mothereff.in/regexpu#input=/\p{Cf}/u&unicodePropertyEscape=1
+    // Remove all of these except for \u0600-\u0605.
+    token = token.replace(/(?:[\xAD\u061C\u06DD\u070F\u08E2\u180E\u200B-\u200F\u202A-\u202E\u2060-\u2064\u2066-\u206F\uFEFF\uFFF9-\uFFFB]|\uD804[\uDCBD\uDCCD]|\uD80D[\uDC30-\uDC38]|\uD82F[\uDCA0-\uDCA3]|\uD834[\uDD73-\uDD7A]|\uDB40[\uDC01\uDC20-\uDC7F])/g, "");
+
     // Unescape all quoted pairs. Any trailing \ is deleted.
     this.token = token.replace(/\\(.?)/g, "$1");
   }
   Token.prototype.toString = function() { return this.token; };
 
   // The start of the current token (e.g., atoms, strings)
   let tokenStart = undefined;
   // The set of whitespace characters, as defined by RFC 5322
@@ -818,18 +860,19 @@ function parseAddressingHeader(header, d
   function addToAddrList(displayName, addrSpec) {
     // Keep the local-part quoted if it needs to be.
     let lp = addrSpec.substring(0, addrSpec.lastIndexOf("@"));
     if (/[ !()<>\[\]:;@\\,"]/.exec(lp) !== null) {
       addrSpec = '"' + lp.replace(/([\\"])/g, "\\$1") + '"' +
                  addrSpec.substring(addrSpec.lastIndexOf("@"));
     }
 
-    // Replace consecutive whitespace in the name with a single whitespace.
-    displayName = displayName.replace(/\s\s+/g, " ");
+    // Replace all whitespace characters with a single whitespace,
+    // to avoid consecutive whitespace and also to normalize tabs and newlines.
+    displayName = displayName.replace(/\s+/g, " ").trim();
 
     if (displayName === "" && lastComment !== "") {
       // Take last comment content as the display-name.
       let offset = lastComment[0] === " " ? 2 : 1;
       displayName = lastComment.substr(offset, lastComment.length - offset - 1);
     }
     if (displayName !== "" || addrSpec !== "") {
       addrlist.push({name: displayName, email: addrSpec});
@@ -924,17 +967,17 @@ function parseAddressingHeader(header, d
       addrlist = [];
       groupName = "";
     } else {
       // This is either comment content, a quoted-string, or some span of
       // dots and atoms.
 
       // Ignore the needs space if we're a "close" delimiter token.
       let spacedToken = token;
-      if (needsSpace && token.toString()[0] != ".")
+      if (needsSpace && (token.toString().length > 0) && token.toString()[0] != ".")
         spacedToken = " " + spacedToken;
 
       // Which field do we add this data to?
       if (inComment) {
         comment += spacedToken;
       } else if (inAngle) {
         address += spacedToken;
       } else {
--- a/mailnews/mime/jsmime/test/test_header.js
+++ b/mailnews/mime/jsmime/test/test_header.js
@@ -256,17 +256,16 @@ suite("headerparser", function() {
       ["Undisclosed recipients:;",
         [{name: "Undisclosed recipients", group: []}]],
       ["\" \"@a a;b",
         [{name: "", email: "\" \"@a a"},
          {name: "b", email: ""}]],
       ["Undisclosed recipients:;\0:; foo <ghj@veryveryveryverylongveryveryver" +
         "yveryinvalidaddress.invalid>",
         [{name: "Undisclosed recipients", group: []},
-         {name: "\0", group: []},
          {name: "foo", email: "ghj@veryveryveryverylongveryveryveryveryinvali" +
            "daddress.invalid"}]],
       // XXX: test_nsIMsgHeaderParser2 has an empty one here...
       ["<a;a@invalid",
         [{name: "", email: "a"}, {name: "", email: "a@invalid"}]],
       ["me@foo.invalid", [{name: "", email: "me@foo.invalid"}]],
       ["me@foo.invalid, me2@foo.invalid",
         [{name: "", email: "me@foo.invalid"},
@@ -296,17 +295,17 @@ suite("headerparser", function() {
         [{name: "Undisclosed recipients", group: []}]],
       ["a@xxx.invalid; b@xxx.invalid",
         [{name: "", email: "a@xxx.invalid"},
          {name: "", email: "b@xxx.invalid"}]],
       ["a@xxx.invalid; B <b@xxx.invalid>",
         [{name: "", email: "a@xxx.invalid"},
          {name: "B", email: "b@xxx.invalid"}]],
       ['"A " <a@xxx.invalid>; b@xxx.invalid',
-        [{name: "A ", email: "a@xxx.invalid"},
+        [{name: "A", email: "a@xxx.invalid"},
          {name: "", email: "b@xxx.invalid"}]],
       ["A <a@xxx.invalid>; B <b@xxx.invalid>",
         [{name: "A", email: "a@xxx.invalid"},
          {name: "B", email: "b@xxx.invalid"}]],
       ["A (this: is, a comment;) <a.invalid>; g:   (this: is, <a> comment;) C" +
         "<c.invalid>, d.invalid;",
         [{name: "A (this: is, a comment;)", email: "a.invalid"},
          {name: "g", group: [
@@ -355,19 +354,25 @@ suite("headerparser", function() {
         [{name: "(c9(c10)c11)", email: "a@b.d"}]],
       ["(c3)a(c4)@(c5)b(c6).(c7)d(c8)(c9(c10)c11)",
         [{name: "c9(c10)c11", email: "a@b.d"}]],
       ["(c1)n(c2) <(c3)a(c4)@(c5)b(c6).(c7)d(c8)> (c9(c10)c11)(c12)",
         [{name: "(c1) n (c2) (c9(c10)c11) (c12)", email: "a@b.d"}]],
       ["<(c3)a(c4)@(c5)b(c6).(c7)d(c8)> (c9(c10)c11)(c12)",
         [{name: "(c9(c10)c11) (c12)", email: "a@b.d"}]],
       ["(c3)a(c4)@(c5)b(c6).(c7)d(c8)(c9(c10)c11)(c12)", [{name: "c12", email: "a@b.d"}]],
-      // Collapse extraneous whitespace.
-      ["Friend \"<friend@huhu.com>\"                                \t <ws@example.com>",
+      // Collapse extraneous whitespace and make sure unexpected characters aren't there.
+      ["Friend \"<friend@huhu.com>\" \t \t  \u00A0\u00A0\u2003 \u00AD \x20\u200B\x20\u200B\x20 \t \u034F \u2028\ \uDB40\uDD01 \t <ws@example.com>",
         [{name: "Friend <friend@huhu.com>", email: "ws@example.com"}]],
+      // Collapse multiple "special" spaces like NBSP (\u00A0), EM space (\u2003), etc.
+      ["Foe \u00A0\u00A0\u2003 A <foe@example.com>",
+        [{name: "Foe A", email: "foe@example.com"}]],
+      // Remove tabs.
+      ["Tabby \t \t A\t\tB <tab@example.com>",
+        [{name: "Tabby A B", email: "tab@example.com"}]],
     ];
     header_tests.forEach(function(data) {
       arrayTest(data, function() {
         assert.deepEqual(headerparser.parseAddressingHeader(data[0], false),
           data[1]);
       });
     });
   });