Bug 1556478 - Fix Japanese detection and re-enable tests. r=hsivonen
author Jorg K <jorgk@jorgk.com>
Tue, 04 Jun 2019 16:19:06 +0200
changeset 35762 dda6d487c69a5f85cb2d34bfae4825060ca43d31
parent 35761 098d414457e2e508ff5099da6f4b53a46ba5c00f
child 35763 def9f7f3af210dc5ec4474802cded6c249ee4166
push id 392
push user clokep@gmail.com
push date Mon, 02 Sep 2019 20:17:19 +0000
reviewers hsivonen
bugs 1556478
Bug 1556478 - Fix Japanese detection and re-enable tests. r=hsivonen
mailnews/base/util/nsMsgUtils.cpp
mailnews/compose/test/unit/test_detectAttachmentCharset.js
mailnews/import/test/unit/xpcshell.ini
mailnews/mime/src/comi18n.cpp
--- a/mailnews/base/util/nsMsgUtils.cpp
+++ b/mailnews/base/util/nsMsgUtils.cpp
@@ -78,16 +78,18 @@
 #include "locale.h"
 #include "nsStringStream.h"
 #include "nsIInputStreamPump.h"
 #include "nsIInputStream.h"
 #include "nsIChannel.h"
 #include "nsIURIMutator.h"
 #include "mozilla/Unused.h"
 #include "mozilla/Preferences.h"
+#include "mozilla/Encoding.h"
+#include "mozilla/JapaneseDetector.h"
 
 /* for logging to Error Console */
 #include "nsIScriptError.h"
 #include "nsIConsoleService.h"
 
 // Log an error string to the error console
 // (adapted from nsContentUtils::LogSimpleConsoleError).
 // Flag can indicate error, warning or info.
@@ -1830,55 +1832,66 @@ NS_MSG_BASE nsresult MsgDetectCharsetFro
   if (!aCharset.IsEmpty()) return NS_OK;
 
   // Position back to the beginning.
   nsCOMPtr<nsISeekableStream> seekStream = do_QueryInterface(inputStream);
   if (seekStream) seekStream->Seek(nsISeekableStream::NS_SEEK_SET, 0);
 
   // Use detector.
   nsCOMPtr<nsICharsetDetector> detector;
+  mozilla::UniquePtr<mozilla::JapaneseDetector> japaneseDetector;
   nsAutoCString detectorName;
   Preferences::GetLocalizedCString("intl.charset.detector", detectorName);
   if (!detectorName.IsEmpty()) {
     // We recognize one of the two magic strings for Russian and Ukranian.
     if (detectorName.EqualsLiteral("ruprob")) {
       detector = new nsRUProbDetector();
     } else if (detectorName.EqualsLiteral("ukprob")) {
       detector = new nsUKProbDetector();
+    } else if (detectorName.EqualsLiteral("ja_parallel_state_machine")) {
+      japaneseDetector = mozilla::JapaneseDetector::Create(true);
     }
   }
 
   if (detector) {
     RefPtr<CharsetDetectionObserver> observer = new CharsetDetectionObserver();
 
     rv = detector->Init(observer);
     NS_ENSURE_SUCCESS(rv, rv);
 
     char buffer[1024];
     uint32_t numRead = 0;
     bool dontFeed = false;
     while (NS_SUCCEEDED(inputStream->Read(buffer, sizeof(buffer), &numRead))) {
-      // XXX: We need to break early here to work around a problem in Shift-JIS
-      // detection. If we call `DoIt()` with any empty buffer, Shift-JIS is not
-      // detected, however ISO-2022-JP is detected.
-      if (numRead == 0) break;
       detector->DoIt(buffer, numRead, &dontFeed);
       NS_ENSURE_SUCCESS(rv, rv);
-      if (dontFeed)  // XXX: We should really break here with:
-                     // if (dontFeed || numRead == 0).
-        break;
+      if (dontFeed || numRead == 0) break;
     }
     rv = detector->Done();
     NS_ENSURE_SUCCESS(rv, rv);
 
     observer->GetDetectedCharset(aCharset);
+  } else if (japaneseDetector) {
+    char buffer[1024];
+    uint32_t numRead = 0;
+    while (NS_SUCCEEDED(inputStream->Read(buffer, sizeof(buffer), &numRead))) {
+      mozilla::Span<const uint8_t> src =
+          mozilla::AsBytes(mozilla::MakeSpan(buffer, numRead));
+      auto encoding = japaneseDetector->Feed(src, (numRead == 0));
+      if (encoding) {
+        encoding->Name(aCharset);
+        break;
+      }
+      if (numRead == 0) {
+        break;
+      }
+    }
+    if (aCharset.EqualsLiteral("ISO-2022-JP")) return NS_OK;
   }
 
-  if (aCharset.EqualsLiteral("ISO-2022-JP")) return NS_OK;
-
   // Rewind file again.
   seekStream->Seek(nsISeekableStream::NS_SEEK_SET, 0);
 
   // Check UTF-8.
   if (IsStreamUTF8(inputStream)) {
     aCharset.AssignLiteral("UTF-8");
     return NS_OK;
   }
--- a/mailnews/compose/test/unit/test_detectAttachmentCharset.js
+++ b/mailnews/compose/test/unit/test_detectAttachmentCharset.js
@@ -60,18 +60,18 @@ async function testWindows1252() {
   await createMessage(do_get_file("data/test-windows-1252.txt"));
   checkAttachmentCharset(null);  // windows-1252 is not directly detected.
 }
 
 var tests = [
   testUTF8,
   testUTF16BE,
   testUTF16LE,
-  // testShiftJIS,
-  // testISO2022JP,
+  testShiftJIS,
+  testISO2022JP,
   testKOI8R,
   testWindows1252,
 ];
 
 function run_test() {
   // Ensure we have at least one mail account
   localAccountUtils.loadLocalMailAccount();
   Services.prefs.setIntPref("mail.strictly_mime.parm_folding", 0);
--- a/mailnews/import/test/unit/xpcshell.ini
+++ b/mailnews/import/test/unit/xpcshell.ini
@@ -12,13 +12,12 @@ run-if = os == 'win'
 run-if = os == 'win'
 [test_csv_import.js]
 [test_csv_import_quote.js]
 [test_ldif_import.js]
 run-if = os == 'win'
 [test_outlook_settings.js]
 run-if = os == 'win'
 [test_shiftjis_csv.js]
-skip-if = true  # See bug 1556478.
 [test_utf16_csv.js]
 [test_vcard_import.js]
 [test_winmail.js]
 run-if = os == 'win'
--- a/mailnews/mime/src/comi18n.cpp
+++ b/mailnews/mime/src/comi18n.cpp
@@ -8,16 +8,18 @@
 #include "nsICharsetDetectionObserver.h"
 #include "nsCyrillicDetector.h"
 #include "nsMsgUtils.h"
 #include "nsServiceManagerUtils.h"
 #include "nsComponentManagerUtils.h"
 #include "nsMsgMimeCID.h"
 #include "nsIMimeConverter.h"
 #include "mozilla/Preferences.h"
+#include "mozilla/Encoding.h"
+#include "mozilla/JapaneseDetector.h"
 
 using namespace mozilla;
 
 ////////////////////////////////////////////////////////////////////////////////
 // BEGIN PUBLIC INTERFACE
 extern "C" {
 
 void MIME_DecodeMimeHeader(const char *header, const char *default_charset,
@@ -52,27 +54,30 @@ class CharsetDetectionObserver : public 
  private:
   virtual ~CharsetDetectionObserver() {}
   nsCString mCharset;
   nsDetectionConfident mConf;
 };
 
 nsresult MIME_detect_charset(const char *aBuf, int32_t aLength,
                              nsACString &aCharset) {
-  nsresult rv = NS_ERROR_UNEXPECTED;
+  nsresult rv;
   nsCOMPtr<nsICharsetDetector> detector;
+  mozilla::UniquePtr<mozilla::JapaneseDetector> japaneseDetector;
   nsAutoCString detectorName;
   Preferences::GetLocalizedCString("intl.charset.detector", detectorName);
 
   if (!detectorName.IsEmpty()) {
     // We recognize one of the two magic strings for Russian and Ukranian.
     if (detectorName.EqualsLiteral("ruprob")) {
       detector = new nsRUProbDetector();
     } else if (detectorName.EqualsLiteral("ukprob")) {
       detector = new nsUKProbDetector();
+    } else if (detectorName.EqualsLiteral("ja_parallel_state_machine")) {
+      japaneseDetector = mozilla::JapaneseDetector::Create(true);
     }
   }
 
   if (detector) {
     nsAutoCString buffer;
 
     RefPtr<CharsetDetectionObserver> observer = new CharsetDetectionObserver();
 
@@ -84,19 +89,35 @@ nsresult MIME_detect_charset(const char 
     rv = detector->DoIt(aBuf, aLength, &dontFeed);
     if (NS_SUCCEEDED(rv)) {
       rv = detector->Done();
       NS_ENSURE_SUCCESS(rv, rv);
       oConfident = observer->GetDetectionConfident();
       if (oConfident == eBestAnswer || oConfident == eSureAnswer) {
         observer->GetDetectedCharset(aCharset);
         return NS_OK;
-      } else {
-        // No luck after all.
-        rv = NS_ERROR_UNEXPECTED;
       }
     }
+  } else if (japaneseDetector) {
+    mozilla::Span<const uint8_t> src =
+        mozilla::AsBytes(mozilla::MakeSpan(aBuf, aLength));
+    auto encoding = japaneseDetector->Feed(src, true);
+    if (encoding) {
+      encoding->Name(aCharset);
+      // If ISO-2022-JP return, since being a 7bit charset, it would be
+      // detected as UTF-8.
+      if (aCharset.EqualsLiteral("ISO-2022-JP")) return NS_OK;
+    }
   }
-  return rv;
+
+  if (IsUTF8(mozilla::MakeSpan(aBuf, aLength))) {
+    aCharset.AssignLiteral("UTF-8");
+    return NS_OK;
+  }
+
+  // No UTF-8 detected, use previous detection result, if any.
+  if (!aCharset.IsEmpty()) return NS_OK;
+
+  return NS_ERROR_UNEXPECTED;
 }
 
 } /* end of extern "C" */
 // END PUBLIC INTERFACE