Bug 820767 - Recognize plausible legacy Java-style encoding names and comment the alias file. r=jorgk DONTBUILD BETA_BASE_20181210
author Henri Sivonen <hsivonen@hsivonen.fi>
Sun, 09 Dec 2018 01:47:00 +0100
changeset 33871 0b7555f983a525273e921e3b1553f366f58ddb84
parent 33870 4ec85e9cf4a02424ac12f97e4665fd5bb3fa6fbb
child 33872 60b6390f25cc1265b22091760b9ec8e4277cb9fb
push id 388
push user clokep@gmail.com
push date Mon, 28 Jan 2019 20:54:56 +0000
reviewers jorgk
bugs 820767
Bug 820767 - Recognize plausible legacy Java-style encoding names and comment the alias file. r=jorgk DONTBUILD
* ms-prefixed labels for code pages in common with DOS and Windows (excl 866)
* cp-prefixed labels for code pages in common with DOS and Windows (group existing)
* No-hyphen label for ISO-2022-JP
* Underscore labels for Unix CJK encodings
* Remove some aliases for encodings that aren't supported
* Map ISO-8859-1 aliases to windows-1252
* Correct the case of gbk to GBK
* Group UTF-7 labels together
* Document all entries (even old ones)
mailnews/intl/charsetalias.properties
--- a/mailnews/intl/charsetalias.properties
+++ b/mailnews/intl/charsetalias.properties
@@ -1,99 +1,166 @@
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 # Rule of this file:
 # 1. key should always be in lower case ascii so we can do case insensitive
 #    comparison in the code faster.
-# 2. value should be the one used in unicode converter
-# 3. If the charset is not used for document charset, but font charset
-#    (e.g. XLFD charset- such as JIS x0201, JIS x0208), don't put here
+# 2. value should be the _name_ used in the WHATWG Encoding Standard
+#    https://encoding.spec.whatwg.org/ (or "UTF-7" for UTF-7).
 #
-# This file contains mainly aliases. Actual labels for encodings are in
-# labelsencodings.properties. Besides aliases it contains labels for charsets
-# that are not part of the HTML5 world, but still are supported for e-mail.
+# This file contains email-specific labels. Web-relevant labels for
+# encodings are in the Encoding Standard / encoding_rs.
 
+# Added for Solaris nl_langinfo. Unlikely relevant to email.
+# https://bugzilla.mozilla.org/show_bug.cgi?id=77300#c9
 646=windows-1252
-iso-8859-1=ISO-8859-1
-utf-16=UTF-16
-utf-7=UTF-7
 
 # Netscape private ...
+# This should probably no longer be in this file.
 x-imap4-modified-utf7=x-imap4-modified-utf7
+
+# Unsupported for outgoing mail but still used for testing in
+# composition/test-charset-edit.js and referenced in charsetData.properties.
 x-mac-ce=x-mac-ce
 x-mac-turkish=x-mac-turkish
 x-mac-greek=x-mac-greek
 x-mac-icelandic=x-mac-icelandic
 x-mac-croatian=x-mac-croatian
 x-mac-romanian=x-mac-romanian
 x-mac-hebrew=x-mac-hebrew
 x-mac-arabic=x-mac-arabic
 x-mac-farsi=x-mac-farsi
 x-mac-devanagari=x-mac-devanagari
 x-mac-gujarati=x-mac-gujarati
 x-mac-gurmukhi=x-mac-gurmukhi
-iso-10646-ucs-2=UTF-16BE
-x-iso-10646-ucs-2-be=UTF-16BE
-x-iso-10646-ucs-2-le=UTF-16LE
-
-# Aliases for ISO-8859-1
-latin1=ISO-8859-1
-iso_8859-1=ISO-8859-1
-iso8859-1=ISO-8859-1
-iso_8859-1:1987=ISO-8859-1
-iso-ir-100=ISO-8859-1
-l1=ISO-8859-1
-cp819=ISO-8859-1
-csisolatin1=ISO-8859-1
 
 # Aliases for ISO-8859-8-I
+# From the original IBM bidi patch.
 iso-8859-8i=ISO-8859-8-I
 
-# Aliases for Shift_JIS
+# ISO 8859 series with underscore for JavaMail
+# compat.
+# https://bugzilla.mozilla.org/show_bug.cgi?id=820767
+iso8859_1=windows-1252
+iso8859_2=ISO-8859-2
+iso8859_3=ISO-8859-3
+iso8859_4=ISO-8859-4
+iso8859_5=ISO-8859-5
+iso8859_6=ISO-8859-6
+iso8859_7=ISO-8859-7
+# Unclear if 8 with underscore was visual or not
+iso8859_9=windows-1254
+# No evidence of 10 occurring with underscore
+# 11 is tis620
+# 12 does not exist
+iso8859_13=ISO-8859-13
+# No evidence of 14 occurring with underscore
+iso8859_15=ISO-8859-15
+# No evidence of 16 occurring with underscore
+
+# Code pages shared by DOS and Windows with ms prefix.
+# Evidence of this pattern in the wild:
+# https://bugzilla.mozilla.org/show_bug.cgi?id=1120813
+# Plausible cause: JavaMail
+# The ms prefix as used by Sun is not relevant to windows-125x series
+ms874=windows-874
+# ms932 was added to the Encoding Standard as a one-off Thunderbird request
+# MS936 shows up at https://www.iana.org/assignments/character-sets/character-sets.xhtml
+ms936=GBK
+ms949=EUC-KR
+ms950=Big5
+ms950_hkscs=Big5
+
+# Underscore versions of Unix CJK encodings.
+# No evidence of these in the wild, but these could plausibly
+# occur for the same reason as the above two groups.
+euc_cn=GBK
+euc_kr=EUC-KR
+euc_jp=EUC-JP
+big5_hkscs=Big5
+
+# Code pages shared by DOS and Windows with cp prefix
+# cp125x series are in the Encoding Standard
+# Evidence of the pattern in the wild:
+# https://bugzilla.mozilla.org/show_bug.cgi?id=1511950
+# https://bugzilla.mozilla.org/show_bug.cgi?id=542823
+# https://bugzilla.mozilla.org/show_bug.cgi?id=1217161
+cp874=windows-874
 cp932=Shift_JIS
+# CP936 shows up at https://www.iana.org/assignments/character-sets/character-sets.xhtml
+cp936=GBK
+cp949=EUC-KR
+cp950=Big5
 
 # Aliases for ISO-2022-JP
 # The following are really not aliases ISO-2022-JP, but sharing the same decoder
+# Kept mainly for compat with old Apple Mail.
 iso-2022-jp-2=ISO-2022-JP
 csiso2022jp2=ISO-2022-JP
+# A Google search suggests the variant without hyphens has been used with
+# JavaMail.
+iso2022jp=ISO-2022-JP
 
 # Aliases for Big5
-# x-x-big5 is not really a alias for Big5, add it only for MS FrontPage
-# Sun Solaris
-
+# Added in a patch that was generally meant to support emails sent by
+# dtmail on Sun Solaris
+# https://bugzilla.mozilla.org/show_bug.cgi?id=146287
 zh_tw-big5=Big5
 
 # Aliases for EUC-KR
+# Added for Solaris nl_langinfo. Unlikely relevant to email.
+# https://bugzilla.mozilla.org/show_bug.cgi?id=82075
 5601=EUC-KR
 
-# Aliases for windows-874 
+# Aliases for windows-874
+# Added originally for nl_langinfo reasons but could plausibly be sent
+# by JavaMail.
+# https://bugzilla.mozilla.org/show_bug.cgi?id=101295
 tis620=windows-874
 
 # Aliases for IBM866
+# This alias may have been made up by accident and may
+# not be relevant to real-world email.
+# https://bugzilla.mozilla.org/show_bug.cgi?id=77588
 cp-866=IBM866
 
 # Aliases for UTF-7
+utf-7=UTF-7
+# The below 4 aliases were not in Thunderbird 60, and there were
+# no complaints.
+# This alias appears to have been generated by the email part
+# of the Netscape 4.0 suite per http://jkorpela.fi/chars.html
 x-unicode-2-0-utf-7=UTF-7
+# This appears to be just a made-up non-x version of the above
+# (checked in without bug number).
 unicode-2-0-utf-7=UTF-7
+# The two aliases below show up at
+# https://www.iana.org/assignments/character-sets/character-sets.xhtml
 unicode-1-1-utf-7=UTF-7
 csunicode11utf7=UTF-7
 
-# Aliases for ISO-10646-UCS-2
+# The below aliases were not in Thunderbird 60, and there were
+# no complaints.
+# These aliases show up at
+# https://www.iana.org/assignments/character-sets/character-sets.xhtml
 csunicode=UTF-16BE
 csunicode11=UTF-16BE
 iso-10646-ucs-basic=UTF-16BE
 csunicodeascii=UTF-16BE
 iso-10646-unicode-latin1=UTF-16BE
 csunicodelatin1=UTF-16BE
 iso-10646=UTF-16BE
 iso-10646-j-1=UTF-16BE
+iso-10646-ucs-2=UTF-16BE
+# Netscape aliases checked in without bug number.
+# Possibly meant to be Netscape-private.
+x-iso-10646-ucs-2-be=UTF-16BE
+x-iso-10646-ucs-2-le=UTF-16LE
 
-# Following names appears in unix nl_langinfo(CODESET)
-# They can be compiled as platform specific if necessary
-# DON'T put things here if it does not look generic enough (like hp15CN)
-iso88591=ISO-8859-1
-iso885912=ISO-8859-12
-windows-936=gbk
+# Shows up at https://www.iana.org/assignments/character-sets/character-sets.xhtml
+# https://bugzilla.mozilla.org/show_bug.cgi?id=651113
+windows-936=GBK
+
+# Added for Solaris nl_langinfo(). Unlikely to be relevant to email.
+# https://bugzilla.mozilla.org/show_bug.cgi?id=82075
 ansi-1251=windows-1251
-
-cp936=gbk