--- a/content/base/public/nsContentUtils.h
+++ b/content/base/public/nsContentUtils.h
@@ -518,17 +518,17 @@ public:
* UTF-16BE, UTF-32LE, UTF-32BE.
*
* @param aBuffer the buffer to check
* @param aLength the length of the buffer
* @param aCharset empty if not found
* @return boolean indicating whether a BOM was detected.
*/
static PRBool CheckForBOM(const unsigned char* aBuffer, PRUint32 aLength,
- nsACString& aCharset);
+ nsACString& aCharset, PRBool *bigEndian = nsnull);
/**
* Determine whether aContent is in some way associated with aForm. If the
* form is a container the only elements that are considered to be associated
* with a form are the elements that are contained within the form. If the
* form is a leaf element then all elements will be accepted into this list,
* since this can happen due to content fixup when a form spans table rows or
--- a/content/base/src/nsContentUtils.cpp
+++ b/content/base/src/nsContentUtils.cpp
@@ -3166,47 +3166,55 @@ nsContentUtils::ConvertStringFromCharset
nsMemory::Free(ustr);
return rv;
}
/* static */
PRBool
nsContentUtils::CheckForBOM(const unsigned char* aBuffer, PRUint32 aLength,
- nsACString& aCharset)
+ nsACString& aCharset, PRBool *bigEndian)
{
PRBool found = PR_TRUE;
aCharset.Truncate();
if (aLength >= 3 &&
aBuffer[0] == 0xEF &&
aBuffer[1] == 0xBB &&
aBuffer[2] == 0xBF) {
aCharset = "UTF-8";
}
else if (aLength >= 4 &&
aBuffer[0] == 0x00 &&
aBuffer[1] == 0x00 &&
aBuffer[2] == 0xFE &&
aBuffer[3] == 0xFF) {
- aCharset = "UTF-32BE";
+ aCharset = "UTF-32";
+ if (bigEndian)
+ *bigEndian = PR_TRUE;
}
else if (aLength >= 4 &&
aBuffer[0] == 0xFF &&
aBuffer[1] == 0xFE &&
aBuffer[2] == 0x00 &&
aBuffer[3] == 0x00) {
- aCharset = "UTF-32LE";
+ aCharset = "UTF-32";
+ if (bigEndian)
+ *bigEndian = PR_FALSE;
}
else if (aLength >= 2 &&
aBuffer[0] == 0xFE && aBuffer[1] == 0xFF) {
- aCharset = "UTF-16BE";
+ aCharset = "UTF-16";
+ if (bigEndian)
+ *bigEndian = PR_TRUE;
}
else if (aLength >= 2 &&
aBuffer[0] == 0xFF && aBuffer[1] == 0xFE) {
- aCharset = "UTF-16LE";
+ aCharset = "UTF-16";
+ if (bigEndian)
+ *bigEndian = PR_FALSE;
} else {
found = PR_FALSE;
}
return found;
}
/* static */
--- a/content/base/src/nsScriptLoader.cpp
+++ b/content/base/src/nsScriptLoader.cpp
@@ -761,24 +761,24 @@ DetectByteOrderMark(const unsigned char*
// Win2K UTF-8 BOM
oCharset.Assign("UTF-8");
}
break;
case 0xFE:
if (0xFF == aBytes[1]) {
// FE FF
// UTF-16, big-endian
- oCharset.Assign("UTF-16BE");
+ oCharset.Assign("UTF-16");
}
break;
case 0xFF:
if (0xFE == aBytes[1]) {
// FF FE
// UTF-16, little-endian
- oCharset.Assign("UTF-16LE");
+ oCharset.Assign("UTF-16");
}
break;
}
return !oCharset.IsEmpty();
}
/* static */ nsresult
nsScriptLoader::ConvertToUTF16(nsIChannel* aChannel, const PRUint8* aData,
--- a/dom/locales/en-US/chrome/charsetTitles.properties
+++ b/dom/locales/en-US/chrome/charsetTitles.properties
@@ -75,20 +75,21 @@ x-gbk.title = Chinese Simplified (GBK)
iso-2022-cn.title = Chinese Simplified (ISO-2022-CN)
euc-kr.title = Korean (EUC-KR)
x-johab.title = Korean (JOHAB)
x-windows-949.title = Korean (UHC)
iso-2022-kr.title = Korean (ISO-2022-KR)
utf-7.title = Unicode (UTF-7)
utf-8.title = Unicode (UTF-8)
utf-16.title = Unicode (UTF-16)
-utf-16le.title = Unicode (UTF-16 Little Endian)
-utf-16be.title = Unicode (UTF-16 Big Endian)
-utf-32le.title = Unicode (UTF-32 Little Endian)
-utf-32be.title = Unicode (UTF-32 Big Endian)
+utf-16le.title = Unicode (UTF-16LE)
+utf-16be.title = Unicode (UTF-16BE)
+utf-32.title = Unicode (UTF-32)
+utf-32le.title = Unicode (UTF-32LE)
+utf-32be.title = Unicode (UTF-32BE)
iso-8859-5.title = Cyrillic (ISO-8859-5)
iso-ir-111.title = Cyrillic (ISO-IR-111)
windows-1251.title = Cyrillic (Windows-1251)
x-mac-cyrillic.title = Cyrillic (MacCyrillic)
x-mac-ukrainian.title = Cyrillic/Ukrainian (MacUkrainian)
koi8-r.title = Cyrillic (KOI8-R)
koi8-u.title = Cyrillic/Ukrainian (KOI8-U)
iso-8859-7.title = Greek (ISO-8859-7)
--- a/extensions/universalchardet/src/base/nsUniversalDetector.cpp
+++ b/extensions/universalchardet/src/base/nsUniversalDetector.cpp
@@ -120,33 +120,33 @@ nsresult nsUniversalDetector::HandleData
mDetectedCharset = "UTF-8";
break;
case '\xFE':
if (('\xFF' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
// FE FF 00 00 UCS-4, unusual octet order BOM (3412)
mDetectedCharset = "X-ISO-10646-UCS-4-3412";
else if ('\xFF' == aBuf[1])
// FE FF UTF-16, big endian BOM
- mDetectedCharset = "UTF-16BE";
+ mDetectedCharset = "UTF-16";
break;
case '\x00':
if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3]))
// 00 00 FE FF UTF-32, big-endian BOM
- mDetectedCharset = "UTF-32BE";
+ mDetectedCharset = "UTF-32";
else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3]))
// 00 00 FF FE UCS-4, unusual octet order BOM (2143)
mDetectedCharset = "X-ISO-10646-UCS-4-2143";
break;
case '\xFF':
if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
// FF FE 00 00 UTF-32, little-endian BOM
- mDetectedCharset = "UTF-32LE";
+ mDetectedCharset = "UTF-32";
else if ('\xFE' == aBuf[1])
// FF FE UTF-16, little endian BOM
- mDetectedCharset = "UTF-16LE";
+ mDetectedCharset = "UTF-16";
break;
} // switch
if (mDetectedCharset)
{
mDone = PR_TRUE;
return NS_OK;
}
--- a/intl/chardet/src/nsMetaCharsetObserver.cpp
+++ b/intl/chardet/src/nsMetaCharsetObserver.cpp
@@ -291,16 +291,17 @@ NS_IMETHODIMP nsMetaCharsetObserver::Not
nsCAutoString preferred;
res2 = mAlias->GetPreferred(newCharset, preferred);
if(NS_SUCCEEDED(res2))
{
// following charset should have been detected by parser
if (!preferred.EqualsLiteral("UTF-16") &&
!preferred.EqualsLiteral("UTF-16BE") &&
!preferred.EqualsLiteral("UTF-16LE") &&
+ !preferred.EqualsLiteral("UTF-32") &&
!preferred.EqualsLiteral("UTF-32BE") &&
!preferred.EqualsLiteral("UTF-32LE")) {
// Propagate the error message so that the parser can
// shutdown correctly. - Ref. Bug 96440
res = NotifyWebShell(aWebShell,
aChannel,
preferred.get(),
kCharsetFromMetaTag);
@@ -370,16 +371,17 @@ NS_IMETHODIMP nsMetaCharsetObserver::Get
// compare against the current charset,
// also some charsets which should have been found in
// the BOM detection.
nsString* currentCharset = values->StringAt(numOfAttributes-3);
if (!preferred.Equals(NS_LossyConvertUTF16toASCII(*currentCharset)) &&
!preferred.EqualsLiteral("UTF-16") &&
!preferred.EqualsLiteral("UTF-16BE") &&
!preferred.EqualsLiteral("UTF-16LE") &&
+ !preferred.EqualsLiteral("UTF-32") &&
!preferred.EqualsLiteral("UTF-32BE") &&
!preferred.EqualsLiteral("UTF-32LE"))
AppendASCIItoUTF16(preferred, aCharset);
}
}
}
return res;
--- a/intl/uconv/src/charsetalias.properties
+++ b/intl/uconv/src/charsetalias.properties
@@ -80,16 +80,17 @@ iso-8859-15=ISO-8859-15
iso-8859-16=ISO-8859-16
iso-ir-111=ISO-IR-111
iso-2022-cn=ISO-2022-CN
iso-2022-cn-ext=ISO-2022-CN
iso-2022-kr=ISO-2022-KR
iso-2022-jp=ISO-2022-JP
utf-32be=UTF-32BE
utf-32le=UTF-32LE
+utf-32=UTF-32
utf-16be=UTF-16BE
utf-16le=UTF-16LE
utf-16=UTF-16
windows-1250=windows-1250
windows-1251=windows-1251
windows-1252=windows-1252
windows-1253=windows-1253
windows-1254=windows-1254
--- a/intl/uconv/src/nsUConvModule.cpp
+++ b/intl/uconv/src/nsUConvModule.cpp
@@ -339,16 +339,17 @@ NS_UCONV_REG_UNREG("armscii-8", NS_ARMSC
NS_UCONV_REG_UNREG("x-viet-tcvn5712", NS_TCVN5712TOUNICODE_CID, NS_UNICODETOTCVN5712_CID)
NS_UCONV_REG_UNREG("VISCII", NS_VISCIITOUNICODE_CID, NS_UNICODETOVISCII_CID)
NS_UCONV_REG_UNREG("x-viet-vps", NS_VPSTOUNICODE_CID, NS_UNICODETOVPS_CID)
NS_UCONV_REG_UNREG("UTF-7", NS_UTF7TOUNICODE_CID, NS_UNICODETOUTF7_CID)
NS_UCONV_REG_UNREG("x-imap4-modified-utf7", NS_MUTF7TOUNICODE_CID, NS_UNICODETOMUTF7_CID)
NS_UCONV_REG_UNREG("UTF-16", NS_UTF16TOUNICODE_CID, NS_UNICODETOUTF16_CID)
NS_UCONV_REG_UNREG("UTF-16BE", NS_UTF16BETOUNICODE_CID, NS_UNICODETOUTF16BE_CID)
NS_UCONV_REG_UNREG("UTF-16LE", NS_UTF16LETOUNICODE_CID, NS_UNICODETOUTF16LE_CID)
+NS_UCONV_REG_UNREG("UTF-32", NS_UTF32TOUNICODE_CID, NS_UNICODETOUTF32_CID)
NS_UCONV_REG_UNREG("UTF-32BE", NS_UTF32BETOUNICODE_CID, NS_UNICODETOUTF32BE_CID)
NS_UCONV_REG_UNREG("UTF-32LE", NS_UTF32LETOUNICODE_CID, NS_UNICODETOUTF32LE_CID)
NS_UCONV_REG_UNREG("T.61-8bit", NS_T61TOUNICODE_CID, NS_UNICODETOT61_CID)
NS_UCONV_REG_UNREG("x-user-defined", NS_USERDEFINEDTOUNICODE_CID, NS_UNICODETOUSERDEFINED_CID)
NS_UCONV_REG_UNREG("x-mac-arabic" , NS_MACARABICTOUNICODE_CID, NS_UNICODETOMACARABIC_CID)
NS_UCONV_REG_UNREG("x-mac-devanagari" , NS_MACDEVANAGARITOUNICODE_CID, NS_UNICODETOMACDEVANAGARI_CID)
NS_UCONV_REG_UNREG("x-mac-farsi" , NS_MACFARSITOUNICODE_CID, NS_UNICODETOMACFARSI_CID)
NS_UCONV_REG_UNREG("x-mac-gurmukhi" , NS_MACGURMUKHITOUNICODE_CID, NS_UNICODETOMACGURMUKHI_CID)
@@ -413,25 +414,27 @@ NS_GENERIC_FACTORY_CONSTRUCTOR(nsUnicode
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUTF8ToUnicode)
// ucvlatin
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUTF7ToUnicode)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsMUTF7ToUnicode)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUTF16ToUnicode)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUTF16BEToUnicode)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUTF16LEToUnicode)
+NS_GENERIC_FACTORY_CONSTRUCTOR(nsUTF32ToUnicode)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUTF32BEToUnicode)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUTF32LEToUnicode)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUnicodeToUTF7)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUnicodeToMUTF7)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUnicodeToUTF16BE)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUnicodeToUTF16LE)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUnicodeToUTF16)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUnicodeToUTF32BE)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUnicodeToUTF32LE)
+NS_GENERIC_FACTORY_CONSTRUCTOR(nsUnicodeToUTF32)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUnicodeToTSCII)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUnicodeToTamilTTF)
// ucvibm
// ucvja
NS_GENERIC_FACTORY_CONSTRUCTOR(nsShiftJISToUnicode)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsEUCJPToUnicodeV2)
@@ -963,16 +966,21 @@ static const nsModuleComponentInfo compo
nsUTF16BEToUnicodeConstructor ,
},
{
DECODER_NAME_BASE "UTF-16LE" , NS_UTF16LETOUNICODE_CID,
NS_UNICODEDECODER_CONTRACTID_BASE "UTF-16LE",
nsUTF16LEToUnicodeConstructor ,
},
{
+ DECODER_NAME_BASE "UTF-32" , NS_UTF32TOUNICODE_CID,
+ NS_UNICODEDECODER_CONTRACTID_BASE "UTF-32",
+ nsUTF32ToUnicodeConstructor ,
+ },
+ {
DECODER_NAME_BASE "UTF-32BE" , NS_UTF32BETOUNICODE_CID,
NS_UNICODEDECODER_CONTRACTID_BASE "UTF-32BE",
nsUTF32BEToUnicodeConstructor ,
},
{
DECODER_NAME_BASE "UTF-32LE" , NS_UTF32LETOUNICODE_CID,
NS_UNICODEDECODER_CONTRACTID_BASE "UTF-32LE",
nsUTF32LEToUnicodeConstructor ,
@@ -1277,17 +1285,22 @@ static const nsModuleComponentInfo compo
NS_UNICODEENCODER_CONTRACTID_BASE "UTF-32BE",
nsUnicodeToUTF32BEConstructor,
},
{
ENCODER_NAME_BASE "UTF-32LE" , NS_UNICODETOUTF32LE_CID,
NS_UNICODEENCODER_CONTRACTID_BASE "UTF-32LE",
nsUnicodeToUTF32LEConstructor,
},
- {
+ {
+ ENCODER_NAME_BASE "UTF-32" , NS_UNICODETOUTF32_CID,
+ NS_UNICODEENCODER_CONTRACTID_BASE "UTF-32",
+ nsUnicodeToUTF32Constructor,
+ },
+ {
ENCODER_NAME_BASE "T.61-8bit" , NS_UNICODETOT61_CID,
NS_UNICODEENCODER_CONTRACTID_BASE "T.61-8bit",
nsUnicodeToT61Constructor,
},
{
ENCODER_NAME_BASE "x-user-defined" , NS_UNICODETOUSERDEFINED_CID,
NS_UNICODEENCODER_CONTRACTID_BASE "x-user-defined",
nsUnicodeToUserDefinedConstructor,
new file mode 100644
--- /dev/null
+++ b/intl/uconv/tests/unit/test_bug335531.js
@@ -0,0 +1,228 @@
+/* Test case for bug 335531
+ *
+ * Uses nsIConverterInputStream to decode UTF-32 text with all combinations
+ * of UTF-32BE and UTF-32LE with and without BOM.
+ *
+ * Sample text is: "Все счастливые семьи похожи друг на друга, каждая несчастливая семья несчастлива по-своему."
+ *
+ * The enclosing quotation marks are included in the sample text to test that
+ * UTF-32LE is recognized even when there is no BOM and the UTF-32LE decoder is
+ * not explicitly called. The decoder's no-BOM heuristic requires the first
+ * character of the text to be in the range U+0001..U+FFFF.
+ */
+
+const beBOM="%00%00%FE%FF";
+const leBOM="%FF%FE%00%00";
+const outBOM="\uFEFF";
+const sampleUTF32BE="%00%00%00%22%00%00%04%12%00%00%04%41%00%00%04%35%00%00%00%20%00%00%04%41%00%00%04%47%00%00%04%30%00%00%04%41%00%00%04%42%00%00%04%3B%00%00%04%38%00%00%04%32%00%00%04%4B%00%00%04%35%00%00%00%20%00%00%04%41%00%00%04%35%00%00%04%3C%00%00%04%4C%00%00%04%38%00%00%00%20%00%00%04%3F%00%00%04%3E%00%00%04%45%00%00%04%3E%00%00%04%36%00%00%04%38%00%00%00%20%00%00%04%34%00%00%04%40%00%00%04%43%00%00%04%33%00%00%00%20%00%00%04%3D%00%00%04%30%00%00%00%20%00%00%04%34%00%00%04%40%00%00%04%43%00%00%04%33%00%00%04%30%00%00%00%2C%00%00%00%20%00%00%04%3A%00%00%04%30%00%00%04%36%00%00%04%34%00%00%04%30%00%00%04%4F%00%00%00%20%00%00%04%3D%00%00%04%35%00%00%04%41%00%00%04%47%00%00%04%30%00%00%04%41%00%00%04%42%00%00%04%3B%00%00%04%38%00%00%04%32%00%00%04%30%00%00%04%4F%00%00%00%20%00%00%04%41%00%00%04%35%00%00%04%3C%00%00%04%4C%00%00%04%4F%00%00%00%20%00%00%04%3D%00%00%04%35%00%00%04%41%00%00%04%47%00%00%04%30%00%00%04%41%00%00%04%42%00%00%04%3B%00%00%04%38%00%00%04%32%00%00%04%30%00%00%00%20%00%00%04%3F%00%00%04%3E%00%00%00%2D%00%00%04%41%00%00%04%32%00%00%04%3E%00%00%04%35%00%00%04%3C%00%00%04%43%00%00%00%2E%00%00%00%22";
+const sampleUTF32LE="%22%00%00%00%12%04%00%00%41%04%00%00%35%04%00%00%20%00%00%00%41%04%00%00%47%04%00%00%30%04%00%00%41%04%00%00%42%04%00%00%3B%04%00%00%38%04%00%00%32%04%00%00%4B%04%00%00%35%04%00%00%20%00%00%00%41%04%00%00%35%04%00%00%3C%04%00%00%4C%04%00%00%38%04%00%00%20%00%00%00%3F%04%00%00%3E%04%00%00%45%04%00%00%3E%04%00%00%36%04%00%00%38%04%00%00%20%00%00%00%34%04%00%00%40%04%00%00%43%04%00%00%33%04%00%00%20%00%00%00%3D%04%00%00%30%04%00%00%20%00%00%00%34%04%00%00%40%04%00%00%43%04%00%00%33%04%00%00%30%04%00%00%2C%00%00%00%20%00%00%00%3A%04%00%00%30%04%00%00%36%04%00%00%34%04%00%00%30%04%00%00%4F%04%00%00%20%00%00%00%3D%04%00%00%35%04%00%00%41%04%00%00%47%04%00%00%30%04%00%00%41%04%00%00%42%04%00%00%3B%04%00%00%38%04%00%00%32%04%00%00%30%04%00%00%4F%04%00%00%20%00%00%00%41%04%00%00%35%04%00%00%3C%04%00%00%4C%04%00%00%4F%04%00%00%20%00%00%00%3D%04%00%00%35%04%00%00%41%04%00%00%47%04%00%00%30%04%00%00%41%04%00%00%42%04%00%00%3B%04%00%00%38%04%00%00%32%04%00%00%30%04%00%00%20%00%00%00%3F%04%00%00%3E%04%00%00%2D%00%00%00%41%04%00%00%32%04%00%00%3E%04%00%00%35%04%00%00%3C%04%00%00%43%04%00%00%2E%00%00%00%22%00%00%00";
+const expectedNoBOM = "\"\u0412\u0441\u0435 \u0441\u0447\u0430\u0441\u0442\u043B\u0438\u0432\u044B\u0435 \u0441\u0435\u043C\u044C\u0438 \u043F\u043E\u0445\u043E\u0436\u0438 \u0434\u0440\u0443\u0433 \u043D\u0430 \u0434\u0440\u0443\u0433\u0430, \u043A\u0430\u0436\u0434\u0430\u044F \u043D\u0435\u0441\u0447\u0430\u0441\u0442\u043B\u0438\u0432\u0430\u044F \u0441\u0435\u043C\u044C\u044F \u043D\u0435\u0441\u0447\u0430\u0441\u0442\u043B\u0438\u0432\u0430 \u043F\u043E-\u0441\u0432\u043E\u0435\u043C\u0443.\"";
+
+function makeText(withBOM, charset)
+{
+ var theText = eval("sample" + charset);
+ if (withBOM) {
+ if (charset == "UTF32BE") {
+ theText = beBOM + theText;
+ } else {
+ theText = leBOM + theText;
+ }
+ }
+ return theText;
+}
+
+function testCase(withBOM, charset, charsetDec, decoder, bufferLength)
+{
+ var dataURI = "data:text/plain;charset=" + charsetDec + "," +
+ makeText(withBOM, charset);
+
+ var IOService = Components.Constructor("@mozilla.org/network/io-service;1",
+ "nsIIOService");
+ var ConverterInputStream =
+ Components.Constructor("@mozilla.org/intl/converter-input-stream;1",
+ "nsIConverterInputStream",
+ "init");
+
+ var ios = new IOService();
+ var channel = ios.newChannel(dataURI, "", null);
+ var testInputStream = channel.open();
+ var testConverter = new ConverterInputStream(testInputStream,
+ decoder,
+ bufferLength,
+ 0xFFFD);
+
+ if (!(testConverter instanceof
+ Components.interfaces.nsIUnicharLineInputStream))
+ throw "not line input stream";
+
+ var outStr = "";
+ var more;
+ do {
+ // read the line and check for eof
+ var line = {};
+ more = testConverter.readLine(line);
+ outStr += line.value;
+ } while (more);
+
+ var expected = expectedNoBOM;
+ if (withBOM) {
+ // BE / LE decoder wouldn't strip the BOM
+ if (decoder == "UTF-32BE" || decoder == "UTF-32LE") {
+ expected = outBOM + expectedNoBOM;
+ }
+ }
+
+ do_check_eq(outStr, expected);
+}
+
+// Tests conversion of one to three byte(s) from UTF-32 to Unicode
+
+const expectedString = "\ufffd";
+
+const charset = "UTF-32";
+
+function testCase2(inString) {
+ var ScriptableUnicodeConverter =
+ Components.Constructor("@mozilla.org/intl/scriptableunicodeconverter",
+ "nsIScriptableUnicodeConverter");
+
+ var converter = new ScriptableUnicodeConverter();
+ converter.charset = charset;
+ var outString;
+ try {
+ outString = converter.ConvertToUnicode(inString) + converter.Finish();
+ } catch(e) {
+ outString = "\ufffd";
+ }
+ do_check_eq(escape(outString), escape(expectedString));
+}
+
+/*
+ * Uses nsIConverterInputStream to decode UTF-32 text with surrogate characters
+ *
+ * Sample text is: "g" in Mathematical Bold Symbols (U+1D420)
+ *
+ * The test uses buffers of 4 different lengths to test end of buffer in mid-
+ * UTF32 character
+ */
+
+// Single supplementary character
+// expected: surrogate pair
+const test0="%00%00%00%2D%00%00%00%2D%00%01%D4%20%00%00%00%2D%00%00%00%2D";
+const expected0 = "--\uD835\uDC20--";
+// High surrogate followed by low surrogate (invalid in UTF-32)
+// expected: two replacement chars
+const test1="%00%00%00%2D%00%00%00%2D%00%00%D8%35%00%00%DC%20%00%00%00%2D%00%00%00%2D";
+const expected1 = "--\uFFFD\uFFFD--";
+// Lone high surrogate
+// expected: one replacement char
+const test2="%00%00%00%2D%00%00%00%2D%00%00%D8%35%00%00%00%2D%00%00%00%2D";
+const expected2 = "--\uFFFD--";
+// Lone low surrogate
+// expected: one replacement char
+const test3="%00%00%00%2D%00%00%00%2D%00%00%DC%20%00%00%00%2D%00%00%00%2D";
+const expected3 = "--\uFFFD--";
+// Two high surrogates
+// expected: two replacement chars
+const test4="%00%00%00%2D%00%00%00%2D%00%00%D8%35%00%00%D8%35%00%00%00%2D%00%00%00%2D";
+const expected4 = "--\uFFFD\uFFFD--";
+// Two low surrogates
+// expected: two replacement chars
+const test5="%00%00%00%2D%00%00%00%2D%00%00%DC%20%00%00%DC%20%00%00%00%2D%00%00%00%2D";
+const expected5 = "--\uFFFD\uFFFD--";
+// Low surrogate followed by high surrogate
+// expected: two replacement chars
+const test6="%00%00%00%2D%00%00%00%2D%00%00%DC%20%00%00%D8%35%00%00%00%2D%00%00%00%2D";
+const expected6 = "--\uFFFD\uFFFD--";
+// Lone high surrogate followed by supplementary character
+// expected: replacement char followed by surrogate pair
+const test7="%00%00%00%2D%00%00%00%2D%00%00%D8%35%00%01%D4%20%00%00%00%2D%00%00%00%2D";
+const expected7 = "--\uFFFD\uD835\uDC20--";
+// Lone low surrogate followed by supplementary character
+// expected: replacement char followed by surrogate pair
+const test8="%00%00%00%2D%00%00%00%2D%00%00%DC%20%00%01%D4%20%00%00%00%2D%00%00%00%2D";
+const expected8 = "--\uFFFD\uD835\uDC20--";
+// Supplementary character followed by lone high surrogate
+// expected: surrogate pair followed by replacement char
+const test9="%00%00%00%2D%00%00%00%2D%00%01%D4%20%00%00%D8%35%00%00%00%2D%00%00%00%2D";
+const expected9 = "--\uD835\uDC20\uFFFD--";
+// Supplementary character followed by lone low surrogate
+// expected: surrogate pair followed by replacement char
+const test10="%00%00%00%2D%00%00%00%2D%00%01%D4%20%00%00%DC%20%00%00%00%2D%00%00%00%2D";
+const expected10 = "--\uD835\uDC20\uFFFD--";
+// Lone high surrogate at the end of the input
+// expected: one replacement char (invalid in UTF-32)
+const test11="%00%00%00%2D%00%00%00%2D%00%00%00%2D%00%00%00%2D%00%00%D8%35";
+const expected11 = "----\uFFFD";
+// Incomplete code unit (one trailing byte) at the end of the input
+// expected: nothing
+const test12="%00%00%00%2D%00%00%00%2D%00%00%00%2D%00%00%00%2D%D8";
+const expected12 = "----";
+
+function testCase3(testNumber, bufferLength)
+{
+ var dataURI = "data:text/plain;charset=UTF32BE," + eval("test" + testNumber);
+
+ var IOService = Components.Constructor("@mozilla.org/network/io-service;1",
+ "nsIIOService");
+ var ConverterInputStream =
+ Components.Constructor("@mozilla.org/intl/converter-input-stream;1",
+ "nsIConverterInputStream",
+ "init");
+
+ var ios = new IOService();
+ var channel = ios.newChannel(dataURI, "", null);
+ var testInputStream = channel.open();
+ var testConverter = new ConverterInputStream(testInputStream,
+ "UTF-32BE",
+ bufferLength,
+ 0xFFFD);
+
+ if (!(testConverter instanceof
+ Components.interfaces.nsIUnicharLineInputStream))
+ throw "not line input stream";
+
+ var outStr = "";
+ var more;
+ do {
+ // read the line and check for eof
+ var line = {};
+ more = testConverter.readLine(line);
+ outStr += line.value;
+ } while (more);
+
+ // escape the strings before comparing for better readability
+ do_check_eq(escape(outStr), escape(eval("expected" + testNumber)));
+}
+
+function run_test()
+{
+ /* BOM charset charset decoder buffer
+ declaration length */
+ testCase(true, "UTF32LE", "UTF-32", "UTF-32", 64);
+ testCase(true, "UTF32BE", "UTF-32", "UTF-32", 64);
+ testCase(true, "UTF32LE", "UTF-32", "UTF-32LE", 64);
+ testCase(true, "UTF32BE", "UTF-32", "UTF-32BE", 64);
+ testCase(false, "UTF32LE", "UTF-32", "UTF-32", 64);
+ testCase(false, "UTF32BE", "UTF-32", "UTF-32", 64);
+ testCase(false, "UTF32LE", "UTF-32", "UTF-32LE", 64);
+ testCase(false, "UTF32BE", "UTF-32", "UTF-32BE", 64);
+ testCase(true, "UTF32LE", "UTF-32", "UTF-32", 65);
+ testCase(true, "UTF32BE", "UTF-32", "UTF-32", 65);
+ testCase(true, "UTF32LE", "UTF-32", "UTF-32LE", 65);
+ testCase(true, "UTF32BE", "UTF-32", "UTF-32BE", 65);
+ testCase(false, "UTF32LE", "UTF-32", "UTF-32", 65);
+ testCase(false, "UTF32BE", "UTF-32", "UTF-32", 65);
+ testCase(false, "UTF32LE", "UTF-32", "UTF-32LE", 65);
+ testCase(false, "UTF32BE", "UTF-32", "UTF-32BE", 65);
+
+ testCase2("A");
+ testCase2("AB");
+ testCase2("ABC");
+
+ for (var test = 0; test <= 12; ++ test) {
+ for (var bufferLength = 4; bufferLength < 8; ++ bufferLength) {
+ testCase3(test, bufferLength);
+ }
+ }
+}
--- a/intl/uconv/ucvlatin/nsUCvLatinCID.h
+++ b/intl/uconv/ucvlatin/nsUCvLatinCID.h
@@ -562,16 +562,21 @@
#define NS_UNICODETOUTF32LE_CID \
{ 0xba6151b6, 0x1dfa, 0x11d3, {0xb3, 0xbf, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70}}
// Class ID for our UTF16ToUnicode charset converter
// {d673255d-1184-400a-b0b5-ee9d1295bd85}
#define NS_UTF16TOUNICODE_CID \
{ 0xd673255d, 0x1184, 0x400a, {0xb0, 0xb5, 0xee,0x9d, 0x12, 0x95, 0xbd, 0x85}}
+// Class ID for our UTF32ToUnicode charset converter
+// {30DCD313-73E1-447d-8339-37744952154E}
+#define NS_UTF32TOUNICODE_CID \
+ { 0x30dcd313, 0x73e1, 0x447d, {0x83, 0x39, 0x37, 0x74, 0x49, 0x52, 0x15, 0x4e}}
+
// Class ID for our UTF16LEToUnicode charset converter
// {BA6151B7-1DFA-11d3-B3BF-00805F8A6670}
#define NS_UTF16LETOUNICODE_CID \
{ 0xba6151b7, 0x1dfa, 0x11d3, {0xb3, 0xbf, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70}}
// Class ID for our UTF32LEToUnicode charset converter
// {BA6151B8-1DFA-11d3-B3BF-00805F8A6670}
#define NS_UTF32LETOUNICODE_CID \
--- a/intl/uconv/ucvlatin/nsUTF32ToUnicode.cpp
+++ b/intl/uconv/ucvlatin/nsUTF32ToUnicode.cpp
@@ -162,72 +162,148 @@ static nsresult ConvertCommon(const char
return rv;
}
//----------------------------------------------------------------------
// Class nsUTF32ToUnicode [implementation]
-nsUTF32ToUnicode::nsUTF32ToUnicode() : nsBasicDecoderSupport()
+nsUTF32ToUnicodeBase::nsUTF32ToUnicodeBase() : nsBasicDecoderSupport()
{
Reset();
}
//----------------------------------------------------------------------
// Subclassing of nsDecoderSupport class [implementation]
-NS_IMETHODIMP nsUTF32ToUnicode::GetMaxLength(const char * aSrc,
- PRInt32 aSrcLength,
- PRInt32 * aDestLength)
+NS_IMETHODIMP nsUTF32ToUnicodeBase::GetMaxLength(const char * aSrc,
+ PRInt32 aSrcLength,
+ PRInt32 * aDestLength)
{
// Non-BMP characters take two PRUnichars(a pair of surrogate codepoints)
// so that we have to divide by 2 instead of 4 for the worst case.
*aDestLength = aSrcLength / 2;
return NS_OK;
}
//----------------------------------------------------------------------
// Subclassing of nsBasicDecoderSupport class [implementation]
-NS_IMETHODIMP nsUTF32ToUnicode::Reset()
+NS_IMETHODIMP nsUTF32ToUnicodeBase::Reset()
{
// the number of additional bytes to read to complete UTF-32 4byte seq.
mState = 0;
memset(mBufferInc, 0, 4);
return NS_OK;
}
//----------------------------------------------------------------------
// Class nsUTF32BEToUnicode [implementation]
//----------------------------------------------------------------------
-// Subclassing of nsUTF32ToUnicode class [implementation]
+// Subclassing of nsUTF32ToUnicodeBase class [implementation]
NS_IMETHODIMP nsUTF32BEToUnicode::Convert(const char * aSrc,
PRInt32 * aSrcLength,
PRUnichar * aDest,
PRInt32 * aDestLength)
{
return ConvertCommon(aSrc, aSrcLength, aDest, aDestLength, &mState,
mBufferInc, PR_FALSE);
}
//----------------------------------------------------------------------
// Class nsUTF32LEToUnicode [implementation]
//----------------------------------------------------------------------
-// Subclassing of nsUTF32ToUnicode class [implementation]
+// Subclassing of nsUTF32ToUnicodeBase class [implementation]
NS_IMETHODIMP nsUTF32LEToUnicode::Convert(const char * aSrc,
PRInt32 * aSrcLength,
PRUnichar * aDest,
PRInt32 * aDestLength)
{
return ConvertCommon(aSrc, aSrcLength, aDest, aDestLength, &mState,
mBufferInc, PR_TRUE);
}
+//----------------------------------------------------------------------
+// Class nsUTF32ToUnicode [implementation]
+
+//----------------------------------------------------------------------
+// Subclassing of nsUTF32ToUnicodeBase class [implementation]
+
+NS_IMETHODIMP nsUTF32ToUnicode::Reset()
+{
+ nsresult rv = nsUTF32ToUnicodeBase::Reset();
+ mState = 4;
+ mEndian = kUnknown;
+ mFoundBOM = PR_FALSE;
+ return rv;
+}
+
+NS_IMETHODIMP nsUTF32ToUnicode::Convert(const char * aSrc,
+ PRInt32 * aSrcLength,
+ PRUnichar * aDest,
+ PRInt32 * aDestLength)
+{
+ PRBool foundBOM = PR_FALSE;
+ if (4 == mState) // Called for the first time.
+ {
+ if (*aSrcLength < 4)
+ return NS_ERROR_ILLEGAL_INPUT;
+
+ // check if BOM (0xFEFF) is at the beginning, remove it if found, and
+ // set mEndian accordingly.
+ if (0xFF == PRUint8(aSrc[0]) && 0xFE == PRUint8(aSrc[1]) &&
+ 0 == PRUint8(aSrc[2]) && 0 == PRUint8(aSrc[3])) {
+ aSrc += 4;
+ *aSrcLength -= 4;
+ mState = 0;
+ mEndian = kLittleEndian;
+ mFoundBOM = foundBOM = PR_TRUE;
+ }
+ else if (0 == PRUint8(aSrc[0]) && 0 == PRUint8(aSrc[1]) &&
+ 0xFE == PRUint8(aSrc[2]) && 0xFF == PRUint8(aSrc[3])) {
+ aSrc += 4;
+ *aSrcLength -= 4;
+ mState = 0;
+ mEndian = kBigEndian;
+ mFoundBOM = foundBOM = PR_TRUE;
+ }
+ // BOM is not found, but we can use a simple heuristic to determine
+ // the endianness. Assume the first character is [U+0001, U+FFFF].
+ // Not always valid, but it's very likely to hold for html/xml/css.
+#if 0 // BE case will be handled below
+ else if (!aSrc[0] && !aSrc[1] && (aSrc[2] || aSrc[3])) { // 0x00 0x00 0xhh 0xhh (hh != 00)
+ mState = 0;
+ mEndian = kBigEndian;
+ }
+#endif
+ else if ((aSrc[0] || aSrc[1]) && !aSrc[2] && !aSrc[3]) { // 0xhh 0xhh 0x00 0x00 (hh != 00)
+ mState = 0;
+ mEndian = kLittleEndian;
+ }
+ else { // Neither BOM nor 'plausible' byte patterns at the beginning.
+ // Just assume it's BE (following Unicode standard)
+ // and let the garbage show up in the browser. (security concern?)
+ // (bug 246194)
+ mState = 0;
+ mEndian = kBigEndian;
+ }
+ }
+
+ nsresult rv = ConvertCommon(aSrc, aSrcLength, aDest, aDestLength, &mState,
+ mBufferInc, mEndian == kLittleEndian);
+ if (foundBOM)
+ *aSrcLength += 4; // need to consume BOM
+
+ // If BOM is not found and we're to return NS_OK, signal that BOM
+ // is not found. Otherwise, return |rv| from |ConvertCommon|
+ return (rv == NS_OK && !mFoundBOM) ? NS_OK_UDEC_NOBOMFOUND : rv;
+}
+
// XXX : What to do with 'unflushed' mBufferInc?? : Finish()
--- a/intl/uconv/ucvlatin/nsUTF32ToUnicode.h
+++ b/intl/uconv/ucvlatin/nsUTF32ToUnicode.h
@@ -37,39 +37,37 @@
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#ifndef nsUTF32ToUnicode_h___
#define nsUTF32ToUnicode_h___
//----------------------------------------------------------------------
-// Class nsUTF32ToUnicode [declaration]
+// Class nsUTF32ToUnicodeBase [declaration]
/**
- * A character set converter from UTF32 to Unicode.
- * The base class for UTF32BE/UTF32LE to Unicode converters.
+ * A character set converter from UTF-32 family to Unicode.
+ * The base class for UTF-32BE/UTF-32LE/UTF-32 to Unicode converters.
* @created 08/Dec/2002
* @author Jungshik Shin
*/
-class nsUTF32ToUnicode : public nsBasicDecoderSupport
+class nsUTF32ToUnicodeBase : public nsBasicDecoderSupport
{
-public:
-
- /**
- * Class constructor.
- */
- nsUTF32ToUnicode();
-
protected:
+ /**
+ * Class constructor. Accessible only to derived classes.
+ */
+ nsUTF32ToUnicodeBase();
+
// the number of additional bytes to read to complete an incomplete UTF-32 4byte seq.
- PRUint16 mState;
+ PRUint16 mState;
// buffer for an incomplete UTF-32 sequence.
PRUint8 mBufferInc[4];
//--------------------------------------------------------------------
// Subclassing of nsBasicDecoderSupport class [declaration]
NS_IMETHOD GetMaxLength(const char * aSrc, PRInt32 aSrcLength,
PRInt32 * aDestLength);
@@ -77,23 +75,23 @@ protected:
NS_IMETHOD Reset();
};
//----------------------------------------------------------------------
// Class nsUTF32BEToUnicode [declaration]
/**
- * A character set converter from UTF32BE to Unicode.
- * A subclass of UTF32ToUnicode.
+ * A character set converter from UTF-32BE to Unicode.
+ * A subclass of nsUTF32ToUnicodeBase.
* @created 08/Dec/2002
* @author Jungshik Shin
*/
-class nsUTF32BEToUnicode : public nsUTF32ToUnicode
+class nsUTF32BEToUnicode : public nsUTF32ToUnicodeBase
{
public:
//--------------------------------------------------------------------
// Subclassing of nsBasicDecoderSupport class [declaration]
NS_IMETHOD Convert(const char * aSrc, PRInt32 * aSrcLength,
@@ -101,29 +99,66 @@ public:
};
//----------------------------------------------------------------------
// Class nsUTF32LEToUnicode [declaration]
/**
- * A character set converter from UTF32LE to Unicode.
- * A subclass of UTF32ToUnicode.
+ * A character set converter from UTF-32LE to Unicode.
+ * A subclass of UTF32ToUnicodeBase.
* @created 08/Dec/2002
* @author Jungshik Shin
*/
-class nsUTF32LEToUnicode : public nsUTF32ToUnicode
+class nsUTF32LEToUnicode : public nsUTF32ToUnicodeBase
{
public:
//--------------------------------------------------------------------
// Subclassing of nsBasicDecoderSupport class [declaration]
NS_IMETHOD Convert(const char * aSrc, PRInt32 * aSrcLength,
PRUnichar * aDest, PRInt32 * aDestLength);
};
+//----------------------------------------------------------------------
+// Class nsUTF32ToUnicode [declaration]
+
+/**
+ * A character set converter from UTF-32 to Unicode.
+ * A subclass of nsUTF32ToUnicodeBase.
+ * @created 08/Dec/2002
+ * @author Jungshik Shin
+ */
+
+class nsUTF32ToUnicode : public nsUTF32ToUnicodeBase
+{
+public:
+
+ /**
+ * Class constructor.
+ */
+ nsUTF32ToUnicode() { Reset(); }
+
+ //--------------------------------------------------------------------
+ // Subclassing of nsBasicDecoderSupport class [declaration]
+
+ NS_IMETHOD Convert(const char * aSrc, PRInt32 * aSrcLength,
+ PRUnichar * aDest, PRInt32 * aDestLength);
+
+ //--------------------------------------------------------------------
+ // Subclassing of nsUTF32ToUnicodeBase class [declaration]
+
+ NS_IMETHOD Reset();
+
+private:
+
+ enum Endian {kUnknown, kBigEndian, kLittleEndian};
+ Endian mEndian;
+ PRBool mFoundBOM;
+};
+
#endif /* nsUTF32ToUnicode_h___ */
--- a/intl/uconv/ucvlatin/nsUnicodeToUTF32.cpp
+++ b/intl/uconv/ucvlatin/nsUnicodeToUTF32.cpp
@@ -75,24 +75,37 @@
//----------------------------------------------------------------------
// Static functions common to nsUnicodeToUTF32LE and nsUnicodeToUTF32BE
static nsresult ConvertCommon(const PRUnichar * aSrc,
PRInt32 * aSrcLength,
char * aDest,
PRInt32 * aDestLength,
PRUnichar * aHighSurrogate,
+ PRUnichar * aBOM,
PRBool aIsLE)
{
const PRUnichar * src = aSrc;
const PRUnichar * srcEnd = aSrc + *aSrcLength;
char * dest = aDest;
const char * destEnd = aDest + *aDestLength;
PRUint32 ucs4;
+ // Handle BOM if necessary
+ if (0 != *aBOM)
+ {
+ if (*aDestLength < 4) {
+ *aSrcLength = *aDestLength = 0;
+ return NS_OK_UENC_MOREOUTPUT;
+ }
+
+ *(PRUint32*)dest = *aBOM;
+ *aBOM = 0;
+ dest += 4;
+ }
// left-over high surroage code point from the prev. run.
if (*aHighSurrogate)
{
if (! *aSrcLength)
{
*aDestLength = 0;
return NS_OK_UENC_MOREINPUT;
@@ -189,31 +202,31 @@ static nsresult FinishCommon(char * aDes
return NS_OK;
}
//----------------------------------------------------------------------
// Class nsUnicodeToUTF32 [implementation]
-NS_IMPL_ISUPPORTS1(nsUnicodeToUTF32, nsIUnicodeEncoder)
+NS_IMPL_ISUPPORTS1(nsUnicodeToUTF32Base, nsIUnicodeEncoder)
//----------------------------------------------------------------------
// Subclassing of nsIUnicodeEncoder class [implementation]
-NS_IMETHODIMP nsUnicodeToUTF32::GetMaxLength(const PRUnichar * aSrc,
- PRInt32 aSrcLength,
- PRInt32 * aDestLength)
+NS_IMETHODIMP nsUnicodeToUTF32Base::GetMaxLength(const PRUnichar * aSrc,
+ PRInt32 aSrcLength,
+ PRInt32 * aDestLength)
{
*aDestLength = aSrcLength * 4;
return NS_OK;
}
-NS_IMETHODIMP nsUnicodeToUTF32::FillInfo(PRUint32 *aInfo)
+NS_IMETHODIMP nsUnicodeToUTF32Base::FillInfo(PRUint32 *aInfo)
{
memset(aInfo, 0xFF, (0x10000L >> 3));
return NS_OK;
}
//----------------------------------------------------------------------
// Class nsUnicodeToUTF32BE [implementation]
@@ -223,17 +236,17 @@ NS_IMETHODIMP nsUnicodeToUTF32::FillInfo
NS_IMETHODIMP nsUnicodeToUTF32BE::Convert(const PRUnichar * aSrc,
PRInt32 * aSrcLength,
char * aDest,
PRInt32 * aDestLength)
{
return ConvertCommon(aSrc, aSrcLength, aDest, aDestLength,
- &mHighSurrogate, PR_FALSE);
+ &mHighSurrogate, &mBOM, PR_FALSE);
}
NS_IMETHODIMP nsUnicodeToUTF32BE::Finish(char * aDest,
PRInt32 * aDestLength)
{
return FinishCommon(aDest, aDestLength, &mHighSurrogate, PR_FALSE);
}
@@ -246,17 +259,17 @@ NS_IMETHODIMP nsUnicodeToUTF32BE::Finish
NS_IMETHODIMP nsUnicodeToUTF32LE::Convert(const PRUnichar * aSrc,
PRInt32 * aSrcLength,
char * aDest,
PRInt32 * aDestLength)
{
return ConvertCommon(aSrc, aSrcLength, aDest, aDestLength,
- &mHighSurrogate, PR_TRUE);
+ &mHighSurrogate, &mBOM, PR_TRUE);
}
NS_IMETHODIMP nsUnicodeToUTF32LE::Finish(char * aDest,
PRInt32 * aDestLength)
{
return FinishCommon(aDest, aDestLength, &mHighSurrogate, PR_TRUE);
}
--- a/intl/uconv/ucvlatin/nsUnicodeToUTF32.h
+++ b/intl/uconv/ucvlatin/nsUnicodeToUTF32.h
@@ -40,63 +40,64 @@
#ifndef nsUnicodeToUTF32_h___
#define nsUnicodeToUTF32_h___
//----------------------------------------------------------------------
// Class nsUnicodeToUTF32 [declaration]
/**
- * A character set converter from UTF32 to Unicode.
- * The base class for UTF32BE/UTF32LE to Unicode converters.
+ * A character set converter from Unicode to the UTF-32 family.
+ * The base class for Unicode to UTF-32/UTF-32BE/UTF-32LE converters.
* @created 08/Dec/2002
* @author Jungshik Shin
*/
-class nsUnicodeToUTF32 : public nsIUnicodeEncoder
+class nsUnicodeToUTF32Base : public nsIUnicodeEncoder
{
NS_DECL_ISUPPORTS
-public:
+protected:
/**
- * Class constructor.
+ * Class constructor. Accessible only by child classes.
*/
- nsUnicodeToUTF32() {mHighSurrogate = 0;}
- virtual ~nsUnicodeToUTF32() {}
+ nsUnicodeToUTF32Base() {mBOM = 0; mHighSurrogate = 0;}
+ virtual ~nsUnicodeToUTF32Base() {}
-protected:
PRUnichar mHighSurrogate;
NS_IMETHOD GetMaxLength(const PRUnichar * aSrc, PRInt32 aSrcLength,
PRInt32 * aDestLength);
//--------------------------------------------------------------------
// Subclassing of nsIUnicodeEncoder class [declaration]
- NS_IMETHOD Reset() {mHighSurrogate = 0; return NS_OK;}
+ NS_IMETHOD Reset() {mBOM = 0; mHighSurrogate = 0; return NS_OK;}
NS_IMETHOD FillInfo(PRUint32* aInfo);
NS_IMETHOD SetOutputErrorBehavior(PRInt32 aBehavior,
nsIUnicharEncoder * aEncoder,
PRUnichar aChar)
{return NS_OK;}
+protected:
+ PRUnichar mBOM;
};
//----------------------------------------------------------------------
// Class nsUnicodeToUTF32BE [declaration]
/**
- * A character set converter from Unicode to UTF32BE.
- * A subclass of UnicodeToUTF32.
+ * A character set converter from Unicode to UTF-32BE.
+ * A subclass of UnicodeToUTF32Base.
* @created 08/Dec/2002
* @author Jungshik Shin
*/
-class nsUnicodeToUTF32BE : public nsUnicodeToUTF32
+class nsUnicodeToUTF32BE : public nsUnicodeToUTF32Base
{
public:
//--------------------------------------------------------------------
// Subclassing of nsIUnicodeEncoder class [declaration]
NS_IMETHOD Convert(const PRUnichar * aSrc, PRInt32 * aSrcLength,
char * aDest, PRInt32 * aDestLength);
@@ -104,28 +105,54 @@ public:
};
//----------------------------------------------------------------------
// Class nsUnicodeToUTF32LE [declaration]
/**
- * A character set converter from Unicode to UTF32LE.
- * A subclass of UnicodeToUTF32.
+ * A character set converter from Unicode to UTF-32LE.
+ * A subclass of UnicodeToUTF32Base.
* @created 08/Dec/2002
* @author Jungshik Shin
*/
-class nsUnicodeToUTF32LE : public nsUnicodeToUTF32
+class nsUnicodeToUTF32LE : public nsUnicodeToUTF32Base
{
public:
//--------------------------------------------------------------------
// Subclassing of nsIUnicodeEncoder class [declaration]
NS_IMETHOD Convert(const PRUnichar * aSrc, PRInt32 * aSrcLength,
char * aDest, PRInt32 * aDestLength);
NS_IMETHOD Finish(char * aDest, PRInt32 * aDestLength);
};
+//----------------------------------------------------------------------
+// Class nsUnicodeToUTF32 [declaration]
+
+/**
+ * A character set converter from Unicode to UTF-32.
+ * A subclass of UnicodeToUTF32Base.
+ * @created 08/Dec/2002
+ * @author Jungshik Shin
+ */
+#ifdef IS_LITTLE_ENDIAN
+class nsUnicodeToUTF32 : public nsUnicodeToUTF32LE
+#elif defined(IS_BIG_ENDIAN)
+class nsUnicodeToUTF32 : public nsUnicodeToUTF32BE
+#else
+#error "Unknown endianness"
+#endif
+{
+public:
+ nsUnicodeToUTF32() {mBOM = 0xFEFF; mHighSurrogate = 0;};
+
+ //--------------------------------------------------------------------
+ // Subclassing of nsUnicodeToUTF32Base class [declaration]
+ NS_IMETHOD Reset() {mBOM = 0xFEFF; mHighSurrogate = 0; return NS_OK;};
+
+};
+
#endif /* nsUnicodeToUTF32_h___ */
--- a/layout/base/tests/test_bug399284.html
+++ b/layout/base/tests/test_bug399284.html
@@ -29,17 +29,17 @@ SimpleTest.waitForExplicitFinish();
while (decoderList.hasMore()) {
var decoder = decoderList.getNext();
// encode the content for non-ASCII compatible encodings
if (decoder == "UTF-16" || decoder == "UTF-16BE")
data = encodeUTF16BE(testContent);
else if (decoder == "UTF-16LE")
data = encodeUTF16LE(testContent);
- else if (decoder == "UTF-32BE")
+ else if (decoder == "UTF-32" || decoder == "UTF-32BE")
data = encodeUTF32BE(testContent);
else if (decoder == "UTF-32LE")
data = encodeUTF32LE(testContent);
else
data = encodeURI(testContent);
var dataURI = "data:text/html;charset=" + decoder + "," + data;
var testFrame = document.createElement("iframe");
--- a/layout/style/nsCSSLoader.cpp
+++ b/layout/style/nsCSSLoader.cpp
@@ -451,16 +451,17 @@ static nsresult GetCharsetFromData(const
PRUint32 aDataLength,
nsACString& aCharset)
{
aCharset.Truncate();
if (aDataLength <= sizeof(kCharsetSym) - 1)
return NS_ERROR_NOT_AVAILABLE;
PRUint32 step = 1;
PRUint32 pos = 0;
+ PRBool bigEndian = PR_FALSE;
// Determine the encoding type. If we have a BOM, set aCharset to the
// charset listed for that BOM in http://www.w3.org/TR/REC-xml#sec-guessing;
// that way even if we don't have a valid @charset rule we can use the BOM to
// get a reasonable charset. If we do have an @charset rule, the string from
// that will override this fallback setting of aCharset.
if (*aStyleSheetData == 0x40 && *(aStyleSheetData+1) == 0x63 /* '@c' */ ) {
// 1-byte ASCII-based encoding (ISO-8859-*, UTF-8, etc), no BOM
step = 1;
@@ -484,36 +485,28 @@ static nsresult GetCharsetFromData(const
aStyleSheetData[3] == 0x00) {
// 4-byte encoding BOM in 3412 order
NS_WARNING("Our unicode decoders aren't likely to deal with this one");
step = 4;
pos = 5;
aCharset = "UTF-32";
}
else if (nsContentUtils::CheckForBOM(aStyleSheetData,
- aDataLength, aCharset)) {
+ aDataLength, aCharset, &bigEndian)) {
if (aCharset.Equals("UTF-8")) {
step = 1;
pos = 3;
}
- else if (aCharset.Equals("UTF-32BE")) {
+ else if (aCharset.Equals("UTF-32")) {
step = 4;
- pos = 7;
- }
- else if (aCharset.Equals("UTF-32LE")) {
- step = 4;
- pos = 4;
+ pos = bigEndian ? 7 : 4;
}
- else if (aCharset.Equals("UTF-16BE")) {
+ else if (aCharset.Equals("UTF-16")) {
step = 2;
- pos = 3;
- }
- else if (aCharset.Equals("UTF-16LE")) {
- step = 2;
- pos = 2;
+ pos = bigEndian ? 3 : 2;
}
}
else if (aStyleSheetData[0] == 0x00 &&
aStyleSheetData[1] == 0x00 &&
aStyleSheetData[2] == 0x00 &&
aStyleSheetData[3] == 0x40) {
// big-endian 4-byte encoding, no BOM
step = 4;
--- a/netwerk/streamconv/converters/nsUnknownDecoder.cpp
+++ b/netwerk/streamconv/converters/nsUnknownDecoder.cpp
@@ -563,21 +563,20 @@ PRBool nsUnknownDecoder::LastDitchSniff(
// application/octet-stream
// First, check for a BOM. If we see one, assume this is text/plain
// in whatever encoding. If there is a BOM _and_ text we will
// always have at least 4 bytes in the buffer (since the 2-byte BOMs
// are for 2-byte encodings and the UTF-8 BOM is 3 bytes).
if (mBufferLen >= 4) {
const unsigned char* buf = (const unsigned char*)mBuffer;
- if ((buf[0] == 0xFE && buf[1] == 0xFF) || // UTF-16BE
- (buf[0] == 0xFF && buf[1] == 0xFE) || // UTF-16LE
+ if ((buf[0] == 0xFE && buf[1] == 0xFF) || // UTF-16, Big Endian
+ (buf[0] == 0xFF && buf[1] == 0xFE) || // UTF-16 or UCS-4, Little Endian
(buf[0] == 0xEF && buf[1] == 0xBB && buf[2] == 0xBF) || // UTF-8
- (buf[0] == 0 && buf[1] == 0 && buf[2] == 0xFE && buf[3] == 0xFF) || // UCS-4BE
- (buf[0] == 0 && buf[1] == 0 && buf[2] == 0xFF && buf[3] == 0xFE)) { // UCS-4
+ (buf[0] == 0 && buf[1] == 0 && buf[2] == 0xFE && buf[3] == 0xFF)) { // UCS-4, Big Endian
mContentType = TEXT_PLAIN;
return PR_TRUE;
}
}
// Now see whether the buffer has any non-text chars. If not, then let's
// just call it text/plain...
--- a/parser/htmlparser/src/nsParser.cpp
+++ b/parser/htmlparser/src/nsParser.cpp
@@ -2397,18 +2397,20 @@ nsParser::OnStartRequest(nsIRequest *req
OnStartRequest(request, ctx);
}
}
return rv;
}
+#define UTF16_BOM "UTF-16"
#define UTF16_BE "UTF-16BE"
#define UTF16_LE "UTF-16LE"
+#define UCS4_BOM "UTF-32"
#define UCS4_BE "UTF-32BE"
#define UCS4_LE "UTF-32LE"
#define UCS4_2143 "X-ISO-10646-UCS-4-2143"
#define UCS4_3412 "X-ISO-10646-UCS-4-3412"
#define UTF8 "UTF-8"
static inline PRBool IsSecondMarker(unsigned char aChar)
{
@@ -2436,17 +2438,17 @@ DetectByteOrderMark(const unsigned char*
// UCS2 BOM FEFF = UTF8 EF BB BF
switch(aBytes[0])
{
case 0x00:
if(0x00==aBytes[1]) {
// 00 00
if((0xFE==aBytes[2]) && (0xFF==aBytes[3])) {
// 00 00 FE FF UCS-4, big-endian machine (1234 order)
- oCharset.Assign(UCS4_BE);
+ oCharset.Assign(UCS4_BOM);
} else if((0x00==aBytes[2]) && (0x3C==aBytes[3])) {
// 00 00 00 3C UCS-4, big-endian machine (1234 order)
oCharset.Assign(UCS4_BE);
} else if((0xFF==aBytes[2]) && (0xFE==aBytes[3])) {
// 00 00 FF FE UCS-4, unusual octet order (2143)
oCharset.Assign(UCS4_2143);
} else if((0x3C==aBytes[2]) && (0x00==aBytes[3])) {
// 00 00 3C 00 UCS-4, unusual octet order (2143)
@@ -2567,30 +2569,30 @@ DetectByteOrderMark(const unsigned char*
break;
case 0xFE:
if(0xFF==aBytes[1]) {
if(0x00==aBytes[2] && 0x00==aBytes[3]) {
// FE FF 00 00 UCS-4, unusual octet order (3412)
oCharset.Assign(UCS4_3412);
} else {
// FE FF UTF-16, big-endian
- oCharset.Assign(UTF16_BE);
+ oCharset.Assign(UTF16_BOM);
}
oCharsetSource= kCharsetFromByteOrderMark;
}
break;
case 0xFF:
if(0xFE==aBytes[1]) {
if(0x00==aBytes[2] && 0x00==aBytes[3])
// FF FE 00 00 UTF-32, little-endian
- oCharset.Assign(UCS4_LE);
+ oCharset.Assign(UCS4_BOM);
else
// FF FE
// UTF-16, little-endian
- oCharset.Assign(UTF16_LE);
+ oCharset.Assign(UTF16_BOM);
oCharsetSource= kCharsetFromByteOrderMark;
}
break;
// case 0x4C: if((0x6F==aBytes[1]) && ((0xA7==aBytes[2] && (0x94==aBytes[3])) {
// We do not care EBCIDIC here....
// }
// break;
} // switch
@@ -2775,16 +2777,17 @@ ParserWriteFunc(nsIInputStream* in,
result = alias->GetPreferred(guess, preferred);
// Only continue if it's a recognized charset and not
// one of a designated set that we ignore.
if (NS_SUCCEEDED(result) &&
((kCharsetFromByteOrderMark == guessSource) ||
(!preferred.EqualsLiteral("UTF-16") &&
!preferred.EqualsLiteral("UTF-16BE") &&
!preferred.EqualsLiteral("UTF-16LE") &&
+ !preferred.EqualsLiteral("UTF-32") &&
!preferred.EqualsLiteral("UTF-32BE") &&
!preferred.EqualsLiteral("UTF-32LE")))) {
guess = preferred;
pws->mParser->SetDocumentCharset(guess, guessSource);
pws->mParser->SetSinkCharset(preferred);
nsCOMPtr<nsICachingChannel> channel(do_QueryInterface(pws->mRequest));
if (channel) {
nsCOMPtr<nsISupports> cacheToken;
--- a/toolkit/locales/en-US/chrome/global/intl.properties
+++ b/toolkit/locales/en-US/chrome/global/intl.properties
@@ -24,16 +24,16 @@ font.language.group=x-western
intl.accept_languages=en-us, en
intl.charsetmenu.browser.static=ISO-8859-1, UTF-8
intl.charsetmenu.browser.more1=ISO-8859-1, ISO-8859-15, IBM850, x-mac-roman, windows-1252, ISO-8859-14, ISO-8859-7, x-mac-greek, windows-1253, x-mac-icelandic, ISO-8859-10, ISO-8859-3
intl.charsetmenu.browser.more2=ISO-8859-4, ISO-8859-13, windows-1257, IBM852, ISO-8859-2, x-mac-ce, windows-1250, x-mac-croatian, IBM855, ISO-8859-5, ISO-IR-111, KOI8-R, x-mac-cyrillic, windows-1251, IBM866, KOI8-U, x-mac-ukrainian, ISO-8859-16, x-mac-romanian
intl.charsetmenu.browser.more3=GB2312, x-gbk, gb18030, HZ-GB-2312, ISO-2022-CN, Big5, Big5-HKSCS, x-euc-tw, EUC-JP, ISO-2022-JP, Shift_JIS, EUC-KR, x-windows-949, x-johab, ISO-2022-KR
intl.charsetmenu.browser.more4=armscii-8, GEOSTD8, TIS-620, ISO-8859-11, windows-874, IBM857, ISO-8859-9, x-mac-turkish, windows-1254, x-viet-tcvn5712, VISCII, x-viet-vps, windows-1258, x-mac-devanagari, x-mac-gujarati, x-mac-gurmukhi
intl.charsetmenu.browser.more5=ISO-8859-6, windows-1256, IBM864, x-mac-arabic, x-mac-farsi, ISO-8859-8-I, windows-1255, ISO-8859-8, IBM862, x-mac-hebrew
# Localization Note: Never change the following entry.
-intl.charsetmenu.browser.unicode=UTF-8, UTF-16LE, UTF-16BE, UTF-32LE, UTF-32BE
+intl.charsetmenu.browser.unicode=UTF-8, UTF-16LE, UTF-16BE, UTF-32, UTF-32LE, UTF-32BE
intl.charset.default=ISO-8859-1
intl.charset.detector=
intl.charsetmenu.mailedit=ISO-8859-1, ISO-8859-15, ISO-8859-6, armscii-8, geostd8, ISO-8859-13, ISO-8859-14, ISO-8859-2, GB2312, GB18030, Big5, KOI8-R, windows-1251, KOI8-U, ISO-8859-7, ISO-8859-8-I, windows-1255, ISO-2022-JP, EUC-KR, ISO-8859-10, ISO-8859-3, TIS-620, ISO-8859-9, UTF-8, VISCII
# valid intl.menuitems.appendedacceskeys are: true or false, <empty string> (missing or empty preference equals false)
intl.menuitems.alwaysappendaccesskeys=
# valid intl.menuitems.insertseparatorbeforeaccesskeys are: true or false, <empty string> (missing or empty preference equals false)
intl.menuitems.insertseparatorbeforeaccesskeys=true