Bug 1434020 - make body search also look in bodies of attached messages. r=aceman
author: Jorg K <jorgk@jorgk.com>
date: Tue, 06 Feb 2018 01:33:38 +0100
changeset: 31048 c6a63bb68388e85549f7d8d7755da805df78aa11
parent: 31047 63f09d10244cd7100cb5955a17993160fa180937
child: 31049 08ae795fa05906491189f056ea3c4d8fca0dfe83
push id: 383
push user: clokep@gmail.com
push date: Mon, 07 May 2018 21:52:48 +0000
reviewers: aceman
bugs: 1434020
Bug 1434020 - make body search also look in bodies of attached messages. r=aceman
mailnews/base/search/public/nsMsgBodyHandler.h
mailnews/base/search/src/nsMsgBodyHandler.cpp
mailnews/base/test/unit/test_searchBody.js
mailnews/test/data/multipart-message-1.eml
mailnews/test/data/multipart-message-2.eml
mailnews/test/data/multipart-message-3.eml
mailnews/test/data/multipart-message-4.eml
--- a/mailnews/base/search/public/nsMsgBodyHandler.h
+++ b/mailnews/base/search/public/nsMsgBodyHandler.h
@@ -92,16 +92,17 @@ protected:
   bool m_stripHeaders;    // true if we're supposed to strip of message headers
   bool m_stripHtml;       // true if we're supposed to strip off HTML tags
   bool m_pastMsgHeaders;  // true if we've already skipped over the message headers
   bool m_pastPartHeaders; // true if we've already skipped over the part headers
   bool m_partIsHtml;      // true if the Content-type header claims text/html
   bool m_base64part;      // true if the current part is in base64
   bool m_isMultipart;     // true if the message is a multipart/* message
   bool m_partIsText;      // true if the current part is text/*
+  bool m_inMessageAttachment; // true if current part is message/*
 
   nsTArray<nsCString> m_boundaries;  // The boundary strings to look for
   nsCString m_partCharset; // The charset found in the part
 
   // See implementation for comments
   int32_t ApplyTransformations (const nsCString &line, int32_t length,
                                 bool &returnThisLine, nsCString &buf);
   void SniffPossibleMIMEHeader (const nsCString &line);
--- a/mailnews/base/search/src/nsMsgBodyHandler.cpp
+++ b/mailnews/base/search/src/nsMsgBodyHandler.cpp
@@ -73,19 +73,20 @@ void nsMsgBodyHandler::Initialize()
 // common initialization code regardless of what body type we are handling...
 {
   // Default transformations for local message search and MAPI access
   m_stripHeaders = true;
   m_stripHtml = true;
   m_partIsHtml = false;
   m_base64part = false;
   m_isMultipart = false;
-  m_partIsText = true; // default is text/plain
+  m_partIsText = true; // Default is text/plain, maybe proven otherwise later.
   m_pastMsgHeaders = false;
   m_pastPartHeaders = false;
+  m_inMessageAttachment = false;
   m_headerBytesRead = 0;
 }
 
 nsMsgBodyHandler::~nsMsgBodyHandler()
 {
 }
 
 int32_t nsMsgBodyHandler::GetNextLine (nsCString &buf, nsCString &charset)
@@ -236,18 +237,27 @@ int32_t nsMsgBodyHandler::ApplyTransform
       eatThisLine = true;
 
     // We have already grabbed all worthwhile information from the headers,
     // so there is no need to keep track of the current lines
     buf.Assign(line);
 
     SniffPossibleMIMEHeader(buf);
 
-    m_pastPartHeaders = buf.IsEmpty() || buf.First() == '\r' ||
-      buf.First() == '\n';
+    if (buf.IsEmpty() || buf.First() == '\r' || buf.First() == '\n') {
+      if (!m_inMessageAttachment) {
+        m_pastPartHeaders = true;
+      } else {
+        // We're in a message attachment and have just read past the
+        // part header for the attached message. We now need to read
+        // the message headers and any part headers.
+        // We can now forget about the special handling of attached messages.
+        m_inMessageAttachment = false;
+      }
+    }
 
     // We set m_pastMsgHeaders to 'true' only once.
     if (m_pastPartHeaders)
       m_pastMsgHeaders = true;
 
     return length;
   }
 
@@ -374,43 +384,42 @@ void nsMsgBodyHandler::SniffPossibleMIME
 
   if (StringBeginsWith(lowerCaseLine, NS_LITERAL_CSTRING("content-type:")))
   {
     if (lowerCaseLine.Find("text/html", /* ignoreCase = */ true) != -1)
     {
       m_partIsText = true;
       m_partIsHtml = true;
     }
-    // Strenuous edge case: a message/rfc822 is equivalent to the content type
-    // of whatever the message is. Headers should be ignored here. Even more
-    // strenuous are message/partial and message/external-body, where the first
-    // case requires reassembly across messages and the second is actually an
-    // external source. And of course, there are other message types to handle.
-    // RFC 3798 complicates things with the message/disposition-notification
-    // MIME type. message/rfc822 is best treated as a multipart with no proper
-    // boundary; since we only use boundaries for retriggering the headers,
-    // the lack of one can safely be ignored.
-    else if (lowerCaseLine.Find("multipart/", /* ignoreCase = */ true) != -1 ||
-             lowerCaseLine.Find("message/", /* ignoreCase = */ true) != -1)
+    else if (lowerCaseLine.Find("multipart/", /* ignoreCase = */ true) != -1)
     {
       if (m_isMultipart)
       {
         // Nested multipart, get ready for new headers.
         m_base64part = false;
         m_pastPartHeaders = false;
         m_partIsHtml = false;
         m_partIsText = false;
       }
       m_isMultipart = true;
       m_partCharset.Truncate();
     }
+    else if (lowerCaseLine.Find("message/", /* ignoreCase = */ true) != -1)
+    {
+      // Initialise again.
+      m_base64part = false;
+      m_pastPartHeaders = false;
+      m_partIsHtml = false;
+      m_partIsText = true;  // Default is text/plain, maybe proven otherwise later.
+      m_inMessageAttachment = true;
+    }
     else if (lowerCaseLine.Find("text/", /* ignoreCase = */ true) != -1)
       m_partIsText = true;
     else if (lowerCaseLine.Find("text/", /* ignoreCase = */ true) == -1)
-      m_partIsText = false; // We have disproved our assumption
+      m_partIsText = false; // We have disproven our assumption.
   }
 
   int32_t start;
   if (m_isMultipart &&
       (start = lowerCaseLine.Find("boundary=", /* ignoreCase = */ true)) != -1)
   {
     start += 9;  // strlen("boundary=")
     if (line[start] == '\"')
--- a/mailnews/base/test/unit/test_searchBody.js
+++ b/mailnews/base/test/unit/test_searchBody.js
@@ -65,17 +65,23 @@ var Files =
   "../../../data/22-plaintext+attachment.eml",  // using ISO-8859-7 (Greek)
   "../../../data/23-HTML.eml",
   "../../../data/24-HTML+attachment.eml",
   "../../../data/25-HTML+embedded-image.eml",
   "../../../data/26-plaintext+HMTL.eml",                   // text part is base64 encoded
   "../../../data/27-plaintext+(HTML+embedded-image).eml",  // HTML part is base64 encoded
   "../../../data/28-plaintext+HTML+attachment.eml",
   "../../../data/29-(HTML+embedded-image)+attachment.eml",
-  "../../../data/30-plaintext+(HTML+embedded-image)+attachment.eml"  // using windows-1252
+  "../../../data/30-plaintext+(HTML+embedded-image)+attachment.eml",  // using windows-1252
+
+  // Messages with message attachments, Content-Type: message/rfc822.
+  "../../../data/multipart-message-1.eml",  // plaintext, has "bodyOfAttachedMessagePlain"
+  "../../../data/multipart-message-2.eml",  // plaintext, base64, non-ASCII, has "bodyOfAttachedMessagePläin"
+  "../../../data/multipart-message-3.eml",  // plaintext+HTML, non-ASCII in plaintext, has "bodyOfAttachedMessagePläin"
+  "../../../data/multipart-message-4.eml",  // plaintext+HTML, non-ASCII in HTML, has "bodyOfAttachedMessägeHTML"
 ]
 var Tests =
 [
   /* Translate Base64 messages */
   // "World!" is contained in three messages, but in bug132340 it's not in a text
   // part and should not be found.
   { value: "World!", op: Contains, count: 2 },
   /* Don't match the base64 text */
@@ -113,16 +119,24 @@ var Tests =
 
   // Messages 21 and 23 to 30 contain "höhö" once.
   { value: "höhö", op: Contains, count: 9 },
   // Message 22 contains Καλημέρα (good morning in Greek).
   { value: "Καλημέρα", op: Contains, count: 1 },
 
   // Messages 16, 17, 18, 20 contain "hähä" in the plaintext part.
   { value: "hähä", op: Contains, count: 4 },
+
+  // The four messages with message/rfc822 attachment contain "bodyOfAttachedMessagePlain"
+  // or "bodyOfAttachedMessagePläin" in the plaintext part and "bodyOfAttachedMessageHTML"
+  // or "bodyOfAttachedMessägeHTML" in the HTML part.
+  { value: "bodyOfAttachedMessagePlain", op: Contains, count: 2 },
+  { value: "bodyOfAttachedMessagePläin", op: Contains, count: 2 },
+  { value: "bodyOfAttachedMessageHTML", op: Contains, count: 1 },
+  { value: "bodyOfAttachedMessägeHTML", op: Contains, count: 1 },
 ];
 
 function fixFile(file) {
   var fstream = Cc["@mozilla.org/network/file-input-stream;1"]
                   .createInstance(Ci.nsIFileInputStream);
   fstream.init(file, -1, -1, Ci.nsIFileInputStream.CLOSE_ON_EOF);
   var sstream = Cc["@mozilla.org/scriptableinputstream;1"]
                   .createInstance(Ci.nsIScriptableInputStream);
new file mode 100644
--- /dev/null
+++ b/mailnews/test/data/multipart-message-1.eml
@@ -0,0 +1,42 @@
+To: test@example.com
+From: test@example.com
+Subject: Test message with attached message
+Message-ID: <8259dd8e-2293-8765-e720-61dfcd10a6f3@example.com>
+Date: Sat, 30 Dec 2017 19:12:38 +0100
+User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101
+ Thunderbird/59.0a1
+MIME-Version: 1.0
+Content-Type: multipart/mixed;
+ boundary="------------9B410E80D6DA0868F068B0E4"
+Content-Language: en-GB
+
+This is a multi-part message in MIME format.
+--------------9B410E80D6DA0868F068B0E4
+Content-Type: text/plain; charset=windows-1252; format=flowed
+Content-Transfer-Encoding: 7bit
+
+This is a test message with an attached message.
+
+
+--------------9B410E80D6DA0868F068B0E4
+Content-Type: message/rfc822;
+ name="attached-message.eml"
+Content-Transfer-Encoding: 7bit
+Content-Disposition: attachment;
+ filename="attached-message.eml"
+
+To: test@example.com
+From: test@example.com
+Subject: Attached message (plaintext)
+Message-ID: <8259dd8e-2293-8765-e720-61dfcd10a6f4@example.com>
+Date: Sat, 30 Dec 2017 19:12:38 +0100
+User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101
+ Thunderbird/59.0a1
+MIME-Version: 1.0
+Content-Type: text/plain; charset=utf-8; format=flowed
+Content-Transfer-Encoding: 8bit
+
+Here is the body of the attached message. Search for bodyOfAttachedMessagePlain.
+
+
+--------------9B410E80D6DA0868F068B0E4--
new file mode 100644
--- /dev/null
+++ b/mailnews/test/data/multipart-message-2.eml
@@ -0,0 +1,42 @@
+To: test@example.com
+From: test@example.com
+Subject: Test message with attached message
+Message-ID: <8259dd8e-2293-8765-e720-61dfcd10a6f3@example.com>
+Date: Sat, 30 Dec 2017 19:12:38 +0100
+User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101
+ Thunderbird/59.0a1
+MIME-Version: 1.0
+Content-Type: multipart/mixed;
+ boundary="------------9B410E80D6DA0868F068B0E4"
+Content-Language: en-GB
+
+This is a multi-part message in MIME format.
+--------------9B410E80D6DA0868F068B0E4
+Content-Type: text/plain; charset=windows-1252; format=flowed
+Content-Transfer-Encoding: 7bit
+
+This is a test message with an attached message.
+
+
+--------------9B410E80D6DA0868F068B0E4
+Content-Type: message/rfc822;
+ name="attached-message.eml"
+Content-Transfer-Encoding: 7bit
+Content-Disposition: attachment;
+ filename="attached-message.eml"
+
+To: test@example.com
+From: test@example.com
+Subject: Attached message (plaintext, base64 encoded)
+Message-ID: <8259dd8e-2293-8765-e720-61dfcd10a6f4@example.com>
+Date: Sat, 30 Dec 2017 19:12:38 +0100
+User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101
+ Thunderbird/59.0a1
+MIME-Version: 1.0
+Content-Type: text/plain; charset=utf-8; format=flowed
+Content-Transfer-Encoding: base64
+
+SGVyZSBpcyB0aGUgYm9keSBvZiB0aGUgYXR0YWNoZWQgbWVzc2FnZS4gU2VhcmNoIGZvciBib2R5T2ZBdHRhY2hlZE1lc3NhZ2VQbMOkaW4u
+
+
+--------------9B410E80D6DA0868F068B0E4--
new file mode 100644
--- /dev/null
+++ b/mailnews/test/data/multipart-message-3.eml
@@ -0,0 +1,57 @@
+To: test@example.com
+From: test@example.com
+Subject: Test message with attached message
+Message-ID: <8259dd8e-2293-8765-e720-61dfcd10a6f3@example.com>
+Date: Sat, 30 Dec 2017 19:12:38 +0100
+User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101
+ Thunderbird/59.0a1
+MIME-Version: 1.0
+Content-Type: multipart/mixed;
+ boundary="------------9B410E80D6DA0868F068B0E4"
+Content-Language: en-GB
+
+This is a multi-part message in MIME format.
+--------------9B410E80D6DA0868F068B0E4
+Content-Type: text/plain; charset=windows-1252; format=flowed
+Content-Transfer-Encoding: 7bit
+
+This is a test message with an attached message.
+
+
+--------------9B410E80D6DA0868F068B0E4
+Content-Type: message/rfc822;
+ name="attached-message.eml"
+Content-Transfer-Encoding: 7bit
+Content-Disposition: attachment;
+ filename="attached-message.eml"
+
+To: test@example.com
+From: test@example.com
+Subject: Attached message (plaintext + HTML)
+Message-ID: <a30f750d-d56c-8a52-971c-f95a131e8332@example.com>
+Date: Sat, 30 Dec 2017 19:31:21 +0100
+User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101
+ Thunderbird/59.0a1
+MIME-Version: 1.0
+Content-Type: multipart/alternative;
+ boundary="------------FAB286B8794CC63C0A0FD1BB"
+Content-Language: de-DE
+
+This is a multi-part message in MIME format.
+--------------FAB286B8794CC63C0A0FD1BB
+Content-Type: text/plain; charset=UTF-8; format=flowed
+Content-Transfer-Encoding: 8bit
+
+Here is the body of the attached message. Search for bodyOfAttachedMessagePläin.
+
+
+--------------FAB286B8794CC63C0A0FD1BB
+Content-Type: text/html; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+<body>Here is the body of the attached message. Search for bodyOfAttachedMessageHTML.</body>
+
+--------------FAB286B8794CC63C0A0FD1BB--
+
+
+--------------9B410E80D6DA0868F068B0E4--
new file mode 100644
--- /dev/null
+++ b/mailnews/test/data/multipart-message-4.eml
@@ -0,0 +1,57 @@
+To: test@example.com
+From: test@example.com
+Subject: Test message with attached message
+Message-ID: <8259dd8e-2293-8765-e720-61dfcd10a6f3@example.com>
+Date: Sat, 30 Dec 2017 19:12:38 +0100
+User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101
+ Thunderbird/59.0a1
+MIME-Version: 1.0
+Content-Type: multipart/mixed;
+ boundary="------------9B410E80D6DA0868F068B0E4"
+Content-Language: en-GB
+
+This is a multi-part message in MIME format.
+--------------9B410E80D6DA0868F068B0E4
+Content-Type: text/plain; charset=windows-1252; format=flowed
+Content-Transfer-Encoding: 7bit
+
+This is a test message with an attached message.
+
+
+--------------9B410E80D6DA0868F068B0E4
+Content-Type: message/rfc822;
+ name="attached-message.eml"
+Content-Transfer-Encoding: 7bit
+Content-Disposition: attachment;
+ filename="attached-message.eml"
+
+To: test@example.com
+From: test@example.com
+Subject: Attached message (plaintext + HTML, both base64 encoded)
+Message-ID: <a30f750d-d56c-8a52-971c-f95a131e8332@example.com>
+Date: Sat, 30 Dec 2017 19:31:21 +0100
+User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101
+ Thunderbird/59.0a1
+MIME-Version: 1.0
+Content-Type: multipart/alternative;
+ boundary="------------FAB286B8794CC63C0A0FD1BB"
+Content-Language: de-DE
+
+This is a multi-part message in MIME format.
+--------------FAB286B8794CC63C0A0FD1BB
+Content-Type: text/plain; charset=windows-1252; format=flowed
+Content-Transfer-Encoding: base64
+
+SGVyZSBpcyB0aGUgYm9keSBvZiB0aGUgYXR0YWNoZWQgbWVzc2FnZS4gU2VhcmNoIGZvciBib2R5T2ZBdHRhY2hlZE1lc3NhZ2VQbGFpbi4=
+
+
+--------------FAB286B8794CC63C0A0FD1BB
+Content-Type: text/html; charset=utf-8
+Content-Transfer-Encoding: base64
+
+PGJvZHk+SGVyZSBpcyB0aGUgYm9keSBvZiB0aGUgYXR0YWNoZWQgbWVzc2FnZS4gU2VhcmNoIGZvciBib2R5T2ZBdHRhY2hlZE1lc3PDpGdlSFRNTC48L2JvZHk+
+
+--------------FAB286B8794CC63C0A0FD1BB--
+
+
+--------------9B410E80D6DA0868F068B0E4--