Bug 1515254 - Make mbox->maildir conversion 8-bit safe. r=mkmelin
author Ben Campbell <benc@thunderbird.net>
Wed, 25 Mar 2020 12:45:06 +0200
changeset 29064 b8cc5965aca01d602b082a37414fbb30d432e307
parent 29063 47ff041ec6965ada4b00104269a5954400cb8dde
child 29065 4a0bd51ee9c0627be85fbe59821f442121c0c30f
push id 17184
push user mkmelin@iki.fi
push date Wed, 25 Mar 2020 10:47:45 +0000
treeherder comm-central@8ca637b20336 [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers mkmelin
bugs 1515254
Bug 1515254 - Make mbox->maildir conversion 8-bit safe. r=mkmelin
mailnews/base/test/unit/test_mailstoreConverter.js
mailnews/base/util/converterWorker.js
--- a/mailnews/base/test/unit/test_mailstoreConverter.js
+++ b/mailnews/base/test/unit/test_mailstoreConverter.js
@@ -21,29 +21,27 @@ let testEmails = [
   "../../../data/04-HTML+attachment.eml",
   "../../../data/05-HTML+embedded-image.eml",
   "../../../data/06-plaintext+HMTL.eml",
   "../../../data/07-plaintext+(HTML+embedded-image).eml",
   "../../../data/08-plaintext+HTML+attachment.eml",
   "../../../data/09-(HTML+embedded-image)+attachment.eml",
   "../../../data/10-plaintext+(HTML+embedded-image)+attachment.eml",
 
-  // XXX TODO: 12 and 20 get screwed up! Bug 1515254
-
   // Bodies with non-ASCII characters in UTF-8 and other charsets.
   "../../../data/11-plaintext.eml",
-  // "../../../data/12-plaintext+attachment.eml",  // using ISO-8859-7 (Greek)
+  "../../../data/12-plaintext+attachment.eml", // using ISO-8859-7 (Greek)
   "../../../data/13-HTML.eml",
   "../../../data/14-HTML+attachment.eml",
   "../../../data/15-HTML+embedded-image.eml",
   "../../../data/16-plaintext+HMTL.eml", // text part is base64 encoded
   "../../../data/17-plaintext+(HTML+embedded-image).eml", // HTML part is base64 encoded
   "../../../data/18-plaintext+HTML+attachment.eml",
   "../../../data/19-(HTML+embedded-image)+attachment.eml",
-  // "../../../data/20-plaintext+(HTML+embedded-image)+attachment.eml",  // using windows-1252
+  "../../../data/20-plaintext+(HTML+embedded-image)+attachment.eml", // using windows-1252
 
   // Bodies with non-ASCII characters in UTF-8 and other charsets, all encoded with quoted printable.
   "../../../data/21-plaintext.eml",
   "../../../data/22-plaintext+attachment.eml", // using ISO-8859-7 (Greek)
   "../../../data/23-HTML.eml",
   "../../../data/24-HTML+attachment.eml",
   "../../../data/25-HTML+embedded-image.eml",
   "../../../data/26-plaintext+HMTL.eml", // text part is base64 encoded
--- a/mailnews/base/util/converterWorker.js
+++ b/mailnews/base/util/converterWorker.js
@@ -166,20 +166,17 @@ function maildirToMBox(maildir, mboxFile
 function mboxToMaildir(mboxPath, maildirPath, progressFn) {
   // Create the maildir structure.
   OS.File.makeDir(maildirPath);
   let curDirPath = OS.Path.join(maildirPath, "cur");
   let tmpDirPath = OS.Path.join(maildirPath, "tmp");
   OS.File.makeDir(curDirPath);
   OS.File.makeDir(tmpDirPath);
 
-  let decoder = new TextDecoder();
-  let encoder = new TextEncoder();
-
-  const CHUNK_SIZE = 10000000;
+  const CHUNK_SIZE = 1000000;
   // SAFE_MARGIN is how much to keep back between chunks in order to
   // cope with separator lines which might span chunks.
   const SAFE_MARGIN = 100;
 
   // A regexp to match mbox separator lines.
   // We support lines like:
   // "From "
   // "From MAILER-DAEMON Fri Jul  8 12:08:34 2011"
@@ -191,42 +188,85 @@ function mboxToMaildir(mboxPath, maildir
   // it can be removed from the input.
   let sepRE = /^((?:From \r?\n)|(?:From [\S]+ \S{3} \S{3} [ \d]\d \d\d:\d\d:\d\d \d{4}\r?\n))[\x21-\x7E]+:/gm;
 
   // Use timestamp as starting name for output messages, incrementing
   // by one for each.
   let ident = Date.now();
   let outFile = null;
 
-  let writeToMsg = function(text) {
+  /**
+   * Helper. Convert a string into a Uint8Array, using no encoding. The low
+   * byte of each 16 bit character will be used, the high byte discarded.
+   *
+   * @param {string} str - Input string with chars in 0-255 range.
+   * @returns {Uint8Array} The output bytes.
+   */
+  let stringToBytes = function(str) {
+    var bytes = new Uint8Array(str.length);
+    for (let i = 0; i < str.length; i++) {
+      bytes[i] = str.charCodeAt(i);
+    }
+    return bytes;
+  };
+
+  /**
+   * Helper. Convert a Uint8Array directly into a string, using each byte
+   * directly as a character code. So all characters in the resulting string
+   * will range from 0 to 255, even though they are 16 bit values.
+   *
+   * @param {Uint8Array} bytes - The bytes to convert.
+   * @returns {string} The byte values in string form.
+   */
+  let bytesToString = function(bytes) {
+    return bytes.reduce(function(str, b) {
+      return str + String.fromCharCode(b);
+    }, "");
+  };
+
+  /**
+   * Helper. Write out a block of bytes to the current message file, starting
+   * a new file if required.
+   *
+   * @param {string} str - The bytes to append (as chars in range 0-255).
+   */
+  let writeToMsg = function(str) {
     if (!outFile) {
       let outPath = OS.Path.join(curDirPath, ident.toString() + ".eml");
       ident += 1;
       outFile = OS.File.open(outPath, { write: true, create: true }, {});
     }
-    let raw = encoder.encode(text);
+    // We know that str is really raw 8-bit data, not UTF-16. So we can
+    // discard the upper byte and just keep the low byte of each char.
+    let raw = stringToBytes(str);
     outFile.write(raw);
-    // for mbox->maildir conversion, progress measured in bytes
+    // For mbox->maildir conversion, progress is measured in bytes.
     progressFn(raw.byteLength);
   };
 
+  /**
+   * Helper. Close the current message file, if any.
+   */
   let closeExistingMsg = function() {
     if (outFile) {
       outFile.close();
       outFile = null;
     }
   };
 
   let mboxFile = OS.File.open(mboxPath);
   let buf = "";
   let eof = false;
   while (!eof) {
-    let raw = mboxFile.read(CHUNK_SIZE);
-    buf = buf + decoder.decode(raw);
-    eof = raw.byteLength < CHUNK_SIZE;
+    let rawBytes = mboxFile.read(CHUNK_SIZE);
+    // We're using JavaScript strings (which hold 16-bit characters) to store
+    // 8-bit data. This is awkward, but is faster than trying to operate
+    // directly upon Uint8Arrays, as JavaScript strings are heavily optimised.
+    buf += bytesToString(rawBytes);
+    eof = rawBytes.byteLength < CHUNK_SIZE;
 
     let pos = 0;
     sepRE.lastIndex = 0; // start at beginning of buf
     let m = null;
     while ((m = sepRE.exec(buf)) !== null) {
       // Output everything up to the line separator.
       if (m.index > pos) {
         writeToMsg(buf.substring(pos, m.index));