Bug 1374149 - control streaming while decoding better. r=emk
author Jorg K <jorgk@jorgk.com>
Sat, 01 Jul 2017 09:31:55 +0200
changeset 28400 78a4fd6d79b50c334acfbae9220a92dc384be321
parent 28399 a78c4ac9f53cbb73b6dede90c3a5bb8f80e267af
child 28401 9b1083e627bce58e8cd029bd5a46e4a8bc203d44
push id 1986
push user clokep@gmail.com
push date Wed, 02 Aug 2017 14:43:31 +0000
treeherder comm-beta@b51c9adf2c9e [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers emk
bugs 1374149
Bug 1374149 - control streaming while decoding better. r=emk
mailnews/mime/jsmime/jsmime.js
mailnews/mime/jsmime/test/test_header.js
--- a/mailnews/mime/jsmime/jsmime.js
+++ b/mailnews/mime/jsmime/jsmime.js
@@ -610,17 +610,17 @@ function decodeRFC2047Words(headerValue)
   // 2047 token with the same charset.
   let lastCharset = '', currentDecoder = undefined;
 
   /**
    * Decode a single RFC 2047 token. This function is inline so that we can
    * easily close over the lastCharset/currentDecoder variables, needed for
    * handling bad RFC 2047 productions properly.
    */
-  function decode2047Token(token) {
+  function decode2047Token(token, isLastToken) {
     let tokenParts = token.split("?");
 
     // If it's obviously not a valid token, return false immediately.
     if (tokenParts.length != 5 || tokenParts[4] != '=')
       return false;
 
     // The charset parameter is defined in RFC 2231 to be charset or
     // charset*language. We only care about the charset here, so ignore any
@@ -654,16 +654,17 @@ function decodeRFC2047Words(headerValue)
       // malformed to begin with, so stripping the = and following input in that
       // case should not be an important loss.
       buffer = mimeutils.decode_qp(text.replace(/_/g, ' '), false)[0];
     } else {
       return false;
     }
 
     // Make the buffer be a typed array for what follows
+    let stringBuffer = buffer;
     buffer = mimeutils.stringToTypedArray(buffer);
 
     // If we cannot reuse the last decoder, flush out whatever remains.
     var output = '';
     if (charset != lastCharset && currentDecoder) {
       output += currentDecoder.decode();
       currentDecoder = null;
     }
@@ -678,28 +679,45 @@ function decodeRFC2047Words(headerValue)
         return false;
       }
     }
 
     // Convert this token with the buffer. Note the stream parameter--although
     // RFC 2047 tokens aren't supposed to break in the middle of a multibyte
     // character, a lot of software messes up and does so because it's hard not
     // to (see headeremitter.js for exactly how hard!).
-    return output + currentDecoder.decode(buffer, {stream: true});
+    // We must not stream ISO-2022-JP if the buffer switches back to
+    // the ASCII state, that is, ends in "ESC(B".
+    // Also, we shouldn't do streaming on the last token.
+    let doStreaming;
+    if (isLastToken ||
+        (charset.toUpperCase() == "ISO-2022-JP" &&
+         stringBuffer.endsWith("\x1B(B")))
+      doStreaming = {stream: false};
+    else
+      doStreaming = {stream: true};
+    return output + currentDecoder.decode(buffer, doStreaming);
   }
 
   // The first step of decoding is to split the string into RFC 2047 and
   // non-RFC 2047 tokens. RFC 2047 tokens look like the following:
   // =?charset?c?text?=, where c is one of B, b, Q, and q. The split regex does
   // some amount of semantic checking, so that malformed RFC 2047 tokens will
   // get ignored earlier.
   let components = headerValue.split(/(=\?[^?]*\?[BQbq]\?[^?]*\?=)/);
+
+  // Find last RFC 2047 token.
+  let lastRFC2047Index = -1;
+  for (let i = 0; i < components.length; i++) {
+    if (components[i].substring(0, 2) == "=?")
+      lastRFC2047Index = i;
+  }
   for (let i = 0; i < components.length; i++) {
     if (components[i].substring(0, 2) == "=?") {
-      let decoded = decode2047Token(components[i]);
+      let decoded = decode2047Token(components[i], i == lastRFC2047Index);
       if (decoded !== false) {
         // If 2047 decoding succeeded for this bit, rewrite the original value
         // with the proper decoding.
         components[i] = decoded;
 
         // We're done processing, so continue to the next link.
         continue;
       }
--- a/mailnews/mime/jsmime/test/test_header.js
+++ b/mailnews/mime/jsmime/test/test_header.js
@@ -621,16 +621,21 @@ suite('headerparser', function () {
         "Re: [Kitchen Nightmares] Meow! Gordon Ramsay Is =?ISO-8859-1?B?UEgR " +
         "lqZ VuIEhlYWQgVH rbGeOIFNob BJc RP2JzZXNzZW?= With My =?ISO-8859-1?B" +
         "?SHVzYmFuZ JzX0JhbGxzL JfU2F5c19BbXiScw==?= Baking Company Owner"],
       ["=?us-ascii?Q?=09Edward_Rosten?=", "\tEdward Rosten"],
       ["=?us-ascii?Q?=3D=3FUTF-8=3FQ=3Ff=3DC3=3DBCr=3F=3D?=",
         "=?UTF-8?Q?f=C3=BCr?="],
       // We don't decode unrecognized charsets (This one is actually UTF-8).
       ["=??B?Sy4gSC4gdm9uIFLDvGRlbg==?=", "=??B?Sy4gSC4gdm9uIFLDvGRlbg==?="],
+
+      // Test for bug 1374149 with ISO-2022-JP where we shouldn't stream
+      // if the first token ends in ESC(B.
+      // GyRCJCIbKEI= is the base64 encoding of ESC$B$"ESC(B.
+      ["=?ISO-2022-JP?B?GyRCJCIbKEI=?==?ISO-2022-JP?B?GyRCJCIbKEI=?=", "ああ"],
     ];
     header_tests.forEach(function (data) {
       arrayTest(data, function () {
         assert.deepEqual(headerparser.decodeRFC2047Words(data[0]), data[1]);
       });
     });
   });
   suite('8-bit header processing', function () {