Bug 1516320 - Fix body search for QP-encoded messages using ISO-2022-JP. r=darktrojan a=jorgk
author Jorg K <jorgk@jorgk.com>
Wed, 26 Dec 2018 22:56:53 +0100
changeset 33907 e61569181ade20e11058207d79b10ee4647f45de
parent 33906 a69547fa6bb2c84e2f450944981db81d82752cf5
child 33908 81e801fbbdbd9df6bc9c8a52db95e697179263b8
push id 388
push user clokep@gmail.com
push date Mon, 28 Jan 2019 20:54:56 +0000
reviewers darktrojan, jorgk
bugs 1516320
Bug 1516320 - Fix body search for QP-encoded messages using ISO-2022-JP. r=darktrojan a=jorgk
mailnews/base/search/public/nsMsgBodyHandler.h
mailnews/base/search/src/nsMsgBodyHandler.cpp
mailnews/base/search/src/nsMsgSearchTerm.cpp
mailnews/base/test/unit/test_searchBody.js
mailnews/test/data/iso-2022-jp-not-qp.eml
mailnews/test/data/iso-2022-jp-qp.eml
--- a/mailnews/base/search/public/nsMsgBodyHandler.h
+++ b/mailnews/base/search/public/nsMsgBodyHandler.h
@@ -31,16 +31,17 @@ public:
     const char * headers /* NULL terminated list of headers */,
     uint32_t headersSize, bool ForFilters);
 
   virtual ~nsMsgBodyHandler();
 
   // Returns next message line in buf and the applicable charset, if found.
   // The return value is the length of 'buf' or -1 for EOF.
   int32_t GetNextLine(nsCString &buf, nsCString &charset);
+  bool IsQP() { return m_partIsQP; }  // true if the sniffed Content-Transfer-Encoding header claims quoted-printable
 
   // Transformations
   void SetStripHeaders (bool strip) { m_stripHeaders = strip; }
 
 protected:
   void Initialize();  // common initialization code
 
   // filter related methods. For filtering we always use the headers
@@ -86,16 +87,17 @@ protected:
   nsCOMPtr<nsIMsgDBHdr> m_msgHdr;
   nsCOMPtr<nsIMsgDatabase> m_db;
 
   // Transformations
   // With the exception of m_isMultipart, these all apply to the various parts
   bool m_stripHeaders;    // true if we're supposed to strip of message headers
   bool m_pastMsgHeaders;  // true if we've already skipped over the message headers
   bool m_pastPartHeaders; // true if we've already skipped over the part headers
+  bool m_partIsQP;        // true if the Content-Transfer-Encoding header claims quoted-printable
   bool m_partIsHtml;      // true if the Content-type header claims text/html
   bool m_base64part;      // true if the current part is in base64
   bool m_isMultipart;     // true if the message is a multipart/* message
   bool m_partIsText;      // true if the current part is text/*
   bool m_inMessageAttachment; // true if current part is message/*
 
   nsTArray<nsCString> m_boundaries;  // The boundary strings to look for
   nsCString m_partCharset; // The charset found in the part
--- a/mailnews/base/search/src/nsMsgBodyHandler.cpp
+++ b/mailnews/base/search/src/nsMsgBodyHandler.cpp
@@ -384,16 +384,19 @@ void nsMsgBodyHandler::SniffPossibleMIME
 {
   // Some parts of MIME are case-sensitive and other parts are case-insensitive;
   // specifically, the headers are all case-insensitive and the values we care
   // about are also case-insensitive, with the sole exception of the boundary
   // string, so we can't just take the input line and make it lower case.
   nsCString lowerCaseLine(line);
   ToLowerCase(lowerCaseLine);
 
+  if (StringBeginsWith(lowerCaseLine, NS_LITERAL_CSTRING("content-transfer-encoding:")))
+    m_partIsQP = lowerCaseLine.Find("quoted-printable", /* ignoreCase = */ true) != -1;
+
   if (StringBeginsWith(lowerCaseLine, NS_LITERAL_CSTRING("content-type:")))
   {
     if (lowerCaseLine.Find("text/html", /* ignoreCase = */ true) != -1)
     {
       m_partIsText = true;
       m_partIsHtml = true;
     }
     else if (lowerCaseLine.Find("multipart/", /* ignoreCase = */ true) != -1)
--- a/mailnews/base/search/src/nsMsgSearchTerm.cpp
+++ b/mailnews/base/search/src/nsMsgSearchTerm.cpp
@@ -939,36 +939,25 @@ nsresult nsMsgSearchTerm::MatchBody(nsIM
   uint32_t lines = 0;
 
   // Change the sense of the loop so we don't bail out prematurely
   // on negative terms. i.e. opDoesntContain must look at all lines
   bool boolContinueLoop;
   GetMatchAllBeforeDeciding(&boolContinueLoop);
   result = boolContinueLoop;
 
-  // If there's a '=' in the search term, then we're not going to do
-  // quoted printable decoding. Otherwise we assume everything is
-  // quoted printable. Obviously everything isn't quoted printable, but
-  // since we don't have a MIME parser handy, and we want to err on the
-  // side of too many hits rather than not enough, we'll assume in that
-  // general direction. Blech. ### FIX ME
-  // bug fix #314637: for stateful charsets like ISO-2022-JP, we don't
-  // want to decode quoted printable since it contains '='.
-  bool isQuotedPrintable = !nsMsgI18Nstateful_charset(folderCharset) &&
-    (m_value.utf8String.FindChar('=') == kNotFound);
-
   nsCString compare;
   nsCString charset;
   while (!endOfFile && result == boolContinueLoop)
   {
     if (bodyHan->GetNextLine(buf, charset) >= 0)
     {
       bool softLineBreak = false;
       // Do in-place decoding of quoted printable
-      if (isQuotedPrintable)
+      if (bodyHan->IsQP())
       {
         softLineBreak = StringEndsWith(buf, NS_LITERAL_CSTRING("="));
         MsgStripQuotedPrintable(buf);
         // If soft line break, chop off the last char as well.
         size_t bufLength = buf.Length();
         if ((bufLength > 0) && softLineBreak)
           buf.SetLength(bufLength - 1);
       }
--- a/mailnews/base/test/unit/test_searchBody.js
+++ b/mailnews/base/test/unit/test_searchBody.js
@@ -75,16 +75,23 @@ var Files =
   "../../../data/29-(HTML+embedded-image)+attachment.eml",
   "../../../data/30-plaintext+(HTML+embedded-image)+attachment.eml",  // using windows-1252
 
   // Messages with message attachments, Content-Type: message/rfc822.
   "../../../data/multipart-message-1.eml",  // plaintext, has "bodyOfAttachedMessagePlain"
   "../../../data/multipart-message-2.eml",  // plaintext, base64, non-ASCII, has "bodyOfAttachedMessagePläin"
   "../../../data/multipart-message-3.eml",  // plaintext+HTML, non-ASCII in plaintext, has "bodyOfAttachedMessagePläin"
   "../../../data/multipart-message-4.eml",  // plaintext+HTML, non-ASCII in HTML, has "bodyOfAttachedMessägeHTML"
+
+  // Message using ISO-2022-JP and CTE: quoted-printable.
+  "../../../data/iso-2022-jp-qp.eml",  // plaintext, has 日本 (Japan), we shouldn't find =1B$BF|K.
+
+  // Message using ISO-2022-JP and 7bit, but containing something that looks like quoted-printable.
+  // (bug 314637).
+  "../../../data/iso-2022-jp-not-qp.eml",  // plaintext, has 現況 which contains =67.
 ]
 var Tests =
 [
   /* Translate Base64 messages */
   // "World!" is contained in three messages, but in bug132340 it's not in a text
   // part and should not be found.
   { value: "World!", op: Contains, count: 2 },
   /* Don't match the base64 text */
@@ -142,16 +149,21 @@ var Tests =
 
   // Test that we don't find anything in HTML tags.
   { value: "ShouldNotFindThis", op: Contains, count: 0 },
   { value: "ShouldntFindThisEither", op: Contains, count: 0 },
   { value: "ShouldntFindHref", op: Contains, count: 0 },
   { value: "ShouldNotFindAcrossLines", op: Contains, count: 0 },
   { value: "ShouldFindThisAgain", op: Contains, count: 2 },
   { value: "ShouldFind AcrossLines", op: Contains, count: 2 },
+
+  // Test for ISO-2022-JP and CTE: quoted-printable, also 7bit looking like quoted-printable.
+  { value: "日本", op: Contains, count: 1 },
+  { value: "=1B$BF|K", op: Contains, count: 0 },
+  { value: "現況", op: Contains, count: 1 },
 ];
 
 function fixFile(file) {
   var fstream = Cc["@mozilla.org/network/file-input-stream;1"]
                   .createInstance(Ci.nsIFileInputStream);
   fstream.init(file, -1, -1, Ci.nsIFileInputStream.CLOSE_ON_EOF);
   var sstream = Cc["@mozilla.org/scriptableinputstream;1"]
                   .createInstance(Ci.nsIScriptableInputStream);
new file mode 100644
--- /dev/null
+++ b/mailnews/test/data/iso-2022-jp-not-qp.eml
@@ -0,0 +1,14 @@
+To: test@example.com
+From: test@example.com
+Subject: ISO-2022-JP and 7bit containing =67 and hence looking like quoted-printable
+Message-ID: <10a2aa17-e92f-417c-864e-575d4e371702@example.com>
+Date: Tue, 3 Apr 2018 19:09:16 +0900
+User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101
+ Thunderbird/52.6.0
+MIME-Version: 1.0
+Content-Type: text/plain; charset=ISO-2022-JP; format=flowed
+Content-Language: ja-JP
+Content-Transfer-Encoding: 7bit
+
+$B8=67(B
+
new file mode 100644
--- /dev/null
+++ b/mailnews/test/data/iso-2022-jp-qp.eml
@@ -0,0 +1,14 @@
+To: test@example.com
+From: test@example.com
+Subject: ISO-2022-JP and quoted-printable
+Message-ID: <10a2aa17-e92f-417c-864e-575d4e371702@example.com>
+Date: Tue, 3 Apr 2018 19:09:16 +0900
+User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101
+ Thunderbird/52.6.0
+MIME-Version: 1.0
+Content-Type: text/plain; charset=ISO-2022-JP; format=flowed
+Content-Language: ja-JP
+Content-Transfer-Encoding: quoted-printable
+
+=1B$BF|K\8l$NK\J8$,=1B(BQuoted Printable=1B$B$K$J$C$F$$$k$b$N=1B(B
+