Bug 1275284 - Update twitter-text library to 1.14. r=clokep a=jorgk
authoraleth <aleth@instantbird.org>
Sun, 16 Oct 2016 22:12:06 +0200
changeset 27513 88269bbc6b03fc3e7defef871713527f5cb96922
parent 27512 75e7b63716981dfdb764f36dc83fd2fbb28ff3ee
child 27514 1f5a404e07ebbf650601db9f71c0d4c4b5254ffd
child 27516 da07a8fc38cfbfb1b5b4a9ed04469b2f50c4d8e9
push id1850
push userclokep@gmail.com
push dateWed, 08 Mar 2017 19:29:12 +0000
treeherdercomm-esr52@028df196b2d9 [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersclokep, jorgk
bugs1275284
Bug 1275284 - Update twitter-text library to 1.14. r=clokep a=jorgk
chat/protocols/twitter/twitter-text.jsm
--- a/chat/protocols/twitter/twitter-text.jsm
+++ b/chat/protocols/twitter/twitter-text.jsm
@@ -14,17 +14,17 @@
  */
 
 this.EXPORTED_SYMBOLS = ["twttr"];
 
 var window = {};
 
 // The code below is imported from Twitter's JavaScript utility for parsing
 // tweets. The original version of this file can be found at
-// https://github.com/twitter/twitter-text-js/blob/master/twitter-text.js
+// https://github.com/twitter/twitter-text/blob/master/js/twitter-text.js
 
 (function() {
   if (typeof twttr === "undefined" || twttr === null) {
     var twttr = {};
   }
 
   twttr.txt = {};
   twttr.txt.regexen = {};
@@ -76,170 +76,47 @@ var window = {};
   function stringSupplant(str, values) {
     return str.replace(/#\{(\w+)\}/g, function(match, name) {
       return values[name] || "";
     });
   }
 
   twttr.txt.stringSupplant = stringSupplant;
 
-  function addCharsToCharClass(charClass, start, end) {
-    var s = String.fromCharCode(start);
-    if (end !== start) {
-      s += "-" + String.fromCharCode(end);
-    }
-    charClass.push(s);
-    return charClass;
-  }
-
-  twttr.txt.addCharsToCharClass = addCharsToCharClass;
-
-  // Space is more than %20, U+3000 for example is the full-width space used with Kanji. Provide a short-hand
-  // to access both the list of characters and a pattern suitible for use with String#split
-  // Taken from: ActiveSupport::Multibyte::Handlers::UTF8Handler::UNICODE_WHITESPACE
-  var fromCode = String.fromCharCode;
-  var UNICODE_SPACES = [
-    fromCode(0x0020), // White_Space # Zs       SPACE
-    fromCode(0x0085), // White_Space # Cc       <control-0085>
-    fromCode(0x00A0), // White_Space # Zs       NO-BREAK SPACE
-    fromCode(0x1680), // White_Space # Zs       OGHAM SPACE MARK
-    fromCode(0x180E), // White_Space # Zs       MONGOLIAN VOWEL SEPARATOR
-    fromCode(0x2028), // White_Space # Zl       LINE SEPARATOR
-    fromCode(0x2029), // White_Space # Zp       PARAGRAPH SEPARATOR
-    fromCode(0x202F), // White_Space # Zs       NARROW NO-BREAK SPACE
-    fromCode(0x205F), // White_Space # Zs       MEDIUM MATHEMATICAL SPACE
-    fromCode(0x3000)  // White_Space # Zs       IDEOGRAPHIC SPACE
-  ];
-  addCharsToCharClass(UNICODE_SPACES, 0x009, 0x00D); // White_Space # Cc   [5] <control-0009>..<control-000D>
-  addCharsToCharClass(UNICODE_SPACES, 0x2000, 0x200A); // White_Space # Zs  [11] EN QUAD..HAIR SPACE
-
-  var INVALID_CHARS = [
-    fromCode(0xFFFE),
-    fromCode(0xFEFF), // BOM
-    fromCode(0xFFFF) // Special
-  ];
-  addCharsToCharClass(INVALID_CHARS, 0x202A, 0x202E); // Directional change
-
-  twttr.txt.regexen.spaces_group = regexSupplant(UNICODE_SPACES.join(""));
-  twttr.txt.regexen.spaces = regexSupplant("[" + UNICODE_SPACES.join("") + "]");
-  twttr.txt.regexen.invalid_chars_group = regexSupplant(INVALID_CHARS.join(""));
+  twttr.txt.regexen.spaces_group = /\x09-\x0D\x20\x85\xA0\u1680\u180E\u2000-\u200A\u2028\u2029\u202F\u205F\u3000/;
+  twttr.txt.regexen.spaces = regexSupplant(/[#{spaces_group}]/);
+  twttr.txt.regexen.invalid_chars_group = /\uFFFE\uFEFF\uFFFF\u202A-\u202E/;
+  twttr.txt.regexen.invalid_chars = regexSupplant(/[#{invalid_chars_group}]/);
   twttr.txt.regexen.punct = /\!'#%&'\(\)*\+,\\\-\.\/:;<=>\?@\[\]\^_{|}~\$/;
   twttr.txt.regexen.rtl_chars = /[\u0600-\u06FF]|[\u0750-\u077F]|[\u0590-\u05FF]|[\uFE70-\uFEFF]/mg;
   twttr.txt.regexen.non_bmp_code_pairs = /[\uD800-\uDBFF][\uDC00-\uDFFF]/mg;
 
-  var nonLatinHashtagChars = [];
-  // Cyrillic
-  addCharsToCharClass(nonLatinHashtagChars, 0x0400, 0x04ff); // Cyrillic
-  addCharsToCharClass(nonLatinHashtagChars, 0x0500, 0x0527); // Cyrillic Supplement
-  addCharsToCharClass(nonLatinHashtagChars, 0x2de0, 0x2dff); // Cyrillic Extended A
-  addCharsToCharClass(nonLatinHashtagChars, 0xa640, 0xa69f); // Cyrillic Extended B
-  // Hebrew
-  addCharsToCharClass(nonLatinHashtagChars, 0x0591, 0x05bf); // Hebrew
-  addCharsToCharClass(nonLatinHashtagChars, 0x05c1, 0x05c2);
-  addCharsToCharClass(nonLatinHashtagChars, 0x05c4, 0x05c5);
-  addCharsToCharClass(nonLatinHashtagChars, 0x05c7, 0x05c7);
-  addCharsToCharClass(nonLatinHashtagChars, 0x05d0, 0x05ea);
-  addCharsToCharClass(nonLatinHashtagChars, 0x05f0, 0x05f4);
-  addCharsToCharClass(nonLatinHashtagChars, 0xfb12, 0xfb28); // Hebrew Presentation Forms
-  addCharsToCharClass(nonLatinHashtagChars, 0xfb2a, 0xfb36);
-  addCharsToCharClass(nonLatinHashtagChars, 0xfb38, 0xfb3c);
-  addCharsToCharClass(nonLatinHashtagChars, 0xfb3e, 0xfb3e);
-  addCharsToCharClass(nonLatinHashtagChars, 0xfb40, 0xfb41);
-  addCharsToCharClass(nonLatinHashtagChars, 0xfb43, 0xfb44);
-  addCharsToCharClass(nonLatinHashtagChars, 0xfb46, 0xfb4f);
-  // Arabic
-  addCharsToCharClass(nonLatinHashtagChars, 0x0610, 0x061a); // Arabic
-  addCharsToCharClass(nonLatinHashtagChars, 0x0620, 0x065f);
-  addCharsToCharClass(nonLatinHashtagChars, 0x066e, 0x06d3);
-  addCharsToCharClass(nonLatinHashtagChars, 0x06d5, 0x06dc);
-  addCharsToCharClass(nonLatinHashtagChars, 0x06de, 0x06e8);
-  addCharsToCharClass(nonLatinHashtagChars, 0x06ea, 0x06ef);
-  addCharsToCharClass(nonLatinHashtagChars, 0x06fa, 0x06fc);
-  addCharsToCharClass(nonLatinHashtagChars, 0x06ff, 0x06ff);
-  addCharsToCharClass(nonLatinHashtagChars, 0x0750, 0x077f); // Arabic Supplement
-  addCharsToCharClass(nonLatinHashtagChars, 0x08a0, 0x08a0); // Arabic Extended A
-  addCharsToCharClass(nonLatinHashtagChars, 0x08a2, 0x08ac);
-  addCharsToCharClass(nonLatinHashtagChars, 0x08e4, 0x08fe);
-  addCharsToCharClass(nonLatinHashtagChars, 0xfb50, 0xfbb1); // Arabic Pres. Forms A
-  addCharsToCharClass(nonLatinHashtagChars, 0xfbd3, 0xfd3d);
-  addCharsToCharClass(nonLatinHashtagChars, 0xfd50, 0xfd8f);
-  addCharsToCharClass(nonLatinHashtagChars, 0xfd92, 0xfdc7);
-  addCharsToCharClass(nonLatinHashtagChars, 0xfdf0, 0xfdfb);
-  addCharsToCharClass(nonLatinHashtagChars, 0xfe70, 0xfe74); // Arabic Pres. Forms B
-  addCharsToCharClass(nonLatinHashtagChars, 0xfe76, 0xfefc);
-  addCharsToCharClass(nonLatinHashtagChars, 0x200c, 0x200c); // Zero-Width Non-Joiner
-  // Thai
-  addCharsToCharClass(nonLatinHashtagChars, 0x0e01, 0x0e3a);
-  addCharsToCharClass(nonLatinHashtagChars, 0x0e40, 0x0e4e);
-  // Hangul (Korean)
-  addCharsToCharClass(nonLatinHashtagChars, 0x1100, 0x11ff); // Hangul Jamo
-  addCharsToCharClass(nonLatinHashtagChars, 0x3130, 0x3185); // Hangul Compatibility Jamo
-  addCharsToCharClass(nonLatinHashtagChars, 0xA960, 0xA97F); // Hangul Jamo Extended-A
-  addCharsToCharClass(nonLatinHashtagChars, 0xAC00, 0xD7AF); // Hangul Syllables
-  addCharsToCharClass(nonLatinHashtagChars, 0xD7B0, 0xD7FF); // Hangul Jamo Extended-B
-  addCharsToCharClass(nonLatinHashtagChars, 0xFFA1, 0xFFDC); // half-width Hangul
-  // Japanese and Chinese
-  addCharsToCharClass(nonLatinHashtagChars, 0x30A1, 0x30FA); // Katakana (full-width)
-  addCharsToCharClass(nonLatinHashtagChars, 0x30FC, 0x30FE); // Katakana Chouon and iteration marks (full-width)
-  addCharsToCharClass(nonLatinHashtagChars, 0xFF66, 0xFF9F); // Katakana (half-width)
-  addCharsToCharClass(nonLatinHashtagChars, 0xFF70, 0xFF70); // Katakana Chouon (half-width)
-  addCharsToCharClass(nonLatinHashtagChars, 0xFF10, 0xFF19); // \
-  addCharsToCharClass(nonLatinHashtagChars, 0xFF21, 0xFF3A); //  - Latin (full-width)
-  addCharsToCharClass(nonLatinHashtagChars, 0xFF41, 0xFF5A); // /
-  addCharsToCharClass(nonLatinHashtagChars, 0x3041, 0x3096); // Hiragana
-  addCharsToCharClass(nonLatinHashtagChars, 0x3099, 0x309E); // Hiragana voicing and iteration mark
-  addCharsToCharClass(nonLatinHashtagChars, 0x3400, 0x4DBF); // Kanji (CJK Extension A)
-  addCharsToCharClass(nonLatinHashtagChars, 0x4E00, 0x9FFF); // Kanji (Unified)
-  // -- Disabled as it breaks the Regex.
-  //addCharsToCharClass(nonLatinHashtagChars, 0x20000, 0x2A6DF); // Kanji (CJK Extension B)
-  addCharsToCharClass(nonLatinHashtagChars, 0x2A700, 0x2B73F); // Kanji (CJK Extension C)
-  addCharsToCharClass(nonLatinHashtagChars, 0x2B740, 0x2B81F); // Kanji (CJK Extension D)
-  addCharsToCharClass(nonLatinHashtagChars, 0x2F800, 0x2FA1F); // Kanji (CJK supplement)
-  addCharsToCharClass(nonLatinHashtagChars, 0x3003, 0x3003); // Kanji iteration mark
-  addCharsToCharClass(nonLatinHashtagChars, 0x3005, 0x3005); // Kanji iteration mark
-  addCharsToCharClass(nonLatinHashtagChars, 0x303B, 0x303B); // Han iteration mark
+  twttr.txt.regexen.latinAccentChars = /\xC0-\xD6\xD8-\xF6\xF8-\xFF\u0100-\u024F\u0253\u0254\u0256\u0257\u0259\u025B\u0263\u0268\u026F\u0272\u0289\u028B\u02BB\u0300-\u036F\u1E00-\u1EFF/;
+
+  // Generated from unicode_regex/unicode_regex_groups.scala, same as objective c's \p{L}\p{M}
+  twttr.txt.regexen.bmpLetterAndMarks = /A-Za-z\xaa\xb5\xba\xc0-\xd6\xd8-\xf6\xf8-\u02c1\u02c6-\u02d1\u02e0-\u02e4\u02ec\u02ee\u0300-\u0374\u0376\u0377\u037a-\u037d\u037f\u0386\u0388-\u038a\u038c\u038e-\u03a1\u03a3-\u03f5\u03f7-\u0481\u0483-\u052f\u0531-\u0556\u0559\u0561-\u0587\u0591-\u05bd\u05bf\u05c1\u05c2\u05c4\u05c5\u05c7\u05d0-\u05ea\u05f0-\u05f2\u0610-\u061a\u0620-\u065f\u066e-\u06d3\u06d5-\u06dc\u06df-\u06e8\u06ea-\u06ef\u06fa-\u06fc\u06ff\u0710-\u074a\u074d-\u07b1\u07ca-\u07f5\u07fa\u0800-\u082d\u0840-\u085b\u08a0-\u08b2\u08e4-\u0963\u0971-\u0983\u0985-\u098c\u098f\u0990\u0993-\u09a8\u09aa-\u09b0\u09b2\u09b6-\u09b9\u09bc-\u09c4\u09c7\u09c8\u09cb-\u09ce\u09d7\u09dc\u09dd\u09df-\u09e3\u09f0\u09f1\u0a01-\u0a03\u0a05-\u0a0a\u0a0f\u0a10\u0a13-\u0a28\u0a2a-\u0a30\u0a32\u0a33\u0a35\u0a36\u0a38\u0a39\u0a3c\u0a3e-\u0a42\u0a47\u0a48\u0a4b-\u0a4d\u0a51\u0a59-\u0a5c\u0a5e\u0a70-\u0a75\u0a81-\u0a83\u0a85-\u0a8d\u0a8f-\u0a91\u0a93-\u0aa8\u0aaa-\u0ab0\u0ab2\u0ab3\u0ab5-\u0ab9\u0abc-\u0ac5\u0ac7-\u0ac9\u0acb-\u0acd\u0ad0\u0ae0-\u0ae3\u0b01-\u0b03\u0b05-\u0b0c\u0b0f\u0b10\u0b13-\u0b28\u0b2a-\u0b30\u0b32\u0b33\u0b35-\u0b39\u0b3c-\u0b44\u0b47\u0b48\u0b4b-\u0b4d\u0b56\u0b57\u0b5c\u0b5d\u0b5f-\u0b63\u0b71\u0b82\u0b83\u0b85-\u0b8a\u0b8e-\u0b90\u0b92-\u0b95\u0b99\u0b9a\u0b9c\u0b9e\u0b9f\u0ba3\u0ba4\u0ba8-\u0baa\u0bae-\u0bb9\u0bbe-\u0bc2\u0bc6-\u0bc8\u0bca-\u0bcd\u0bd0\u0bd7\u0c00-\u0c03\u0c05-\u0c0c\u0c0e-\u0c10\u0c12-\u0c28\u0c2a-\u0c39\u0c3d-\u0c44\u0c46-\u0c48\u0c4a-\u0c4d\u0c55\u0c56\u0c58\u0c59\u0c60-\u0c63\u0c81-\u0c83\u0c85-\u0c8c\u0c8e-\u0c90\u0c92-\u0ca8\u0caa-\u0cb3\u0cb5-\u0cb9\u0cbc-\u0cc4\u0cc6-\u0cc8\u0cca-\u0ccd\u0cd5\u0cd6\u0cde\u0ce0-\u0ce3\u0cf1\u0cf2\u0d01-\u0d03\u0d05-\u0d0c\u0d0e-\u0d10\u0d12-\u0d3a\u0d3d-\u0d44\u0d46-\u0d48\u0d4a-\u0d4e\u0d57\u0d60-\u0d63\u0d7a-\u0d7f\u0d82\u0d83\u0d85-\u0d96\u0d9a-\u0db1\u0db3-\u0dbb\u0dbd\u0dc0-\u0dc6\u0dca\u0dcf-\u0dd4\u0dd6\u0dd8-\u0ddf\u0df2\u0df3\u0e01-\u0e3a\u0e40-\u0e4e\u0e81\u0e82\u0e84\u0e87\u0e88\u0e8a\u0e8d\u0e94-\u0e97\u0e99-\u0e9f\u0ea1-\u0ea3\u0ea5\u0ea7\u0eaa\u0eab\u0ead-\u0eb9\u0ebb-\u0ebd\u0ec0-\u0ec4\u0ec6\u0ec8-\u0ecd\u0edc-\u0edf\u0f00\u0f18\u0f19\u0f35\u0f37\u0f39\u0f3e-\u0f47\u0f49-\u0f6c\u0f71-\u0f84\u0f86-\u0f97\u0f99-\u0fbc\u0fc6\u1000-\u103f\u1050-\u108f\u109a-\u109d\u10a0-\u10c5\u10c7\u10cd\u10d0-\u10fa\u10fc-\u1248\u124a-\u124d\u1250-\u1256\u1258\u125a-\u125d\u1260-\u1288\u128a-\u128d\u1290-\u12b0\u12b2-\u12b5\u12b8-\u12be\u12c0\u12c2-\u12c5\u12c8-\u12d6\u12d8-\u1310\u1312-\u1315\u1318-\u135a\u135d-\u135f\u1380-\u138f\u13a0-\u13f4\u1401-\u166c\u166f-\u167f\u1681-\u169a\u16a0-\u16ea\u16f1-\u16f8\u1700-\u170c\u170e-\u1714\u1720-\u1734\u1740-\u1753\u1760-\u176c\u176e-\u1770\u1772\u1773\u1780-\u17d3\u17d7\u17dc\u17dd\u180b-\u180d\u1820-\u1877\u1880-\u18aa\u18b0-\u18f5\u1900-\u191e\u1920-\u192b\u1930-\u193b\u1950-\u196d\u1970-\u1974\u1980-\u19ab\u19b0-\u19c9\u1a00-\u1a1b\u1a20-\u1a5e\u1a60-\u1a7c\u1a7f\u1aa7\u1ab0-\u1abe\u1b00-\u1b4b\u1b6b-\u1b73\u1b80-\u1baf\u1bba-\u1bf3\u1c00-\u1c37\u1c4d-\u1c4f\u1c5a-\u1c7d\u1cd0-\u1cd2\u1cd4-\u1cf6\u1cf8\u1cf9\u1d00-\u1df5\u1dfc-\u1f15\u1f18-\u1f1d\u1f20-\u1f45\u1f48-\u1f4d\u1f50-\u1f57\u1f59\u1f5b\u1f5d\u1f5f-\u1f7d\u1f80-\u1fb4\u1fb6-\u1fbc\u1fbe\u1fc2-\u1fc4\u1fc6-\u1fcc\u1fd0-\u1fd3\u1fd6-\u1fdb\u1fe0-\u1fec\u1ff2-\u1ff4\u1ff6-\u1ffc\u2071\u207f\u2090-\u209c\u20d0-\u20f0\u2102\u2107\u210a-\u2113\u2115\u2119-\u211d\u2124\u2126\u2128\u212a-\u212d\u212f-\u2139\u213c-\u213f\u2145-\u2149\u214e\u2183\u2184\u2c00-\u2c2e\u2c30-\u2c5e\u2c60-\u2ce4\u2ceb-\u2cf3\u2d00-\u2d25\u2d27\u2d2d\u2d30-\u2d67\u2d6f\u2d7f-\u2d96\u2da0-\u2da6\u2da8-\u2dae\u2db0-\u2db6\u2db8-\u2dbe\u2dc0-\u2dc6\u2dc8-\u2dce\u2dd0-\u2dd6\u2dd8-\u2dde\u2de0-\u2dff\u2e2f\u3005\u3006\u302a-\u302f\u3031-\u3035\u303b\u303c\u3041-\u3096\u3099\u309a\u309d-\u309f\u30a1-\u30fa\u30fc-\u30ff\u3105-\u312d\u3131-\u318e\u31a0-\u31ba\u31f0-\u31ff\u3400-\u4db5\u4e00-\u9fcc\ua000-\ua48c\ua4d0-\ua4fd\ua500-\ua60c\ua610-\ua61f\ua62a\ua62b\ua640-\ua672\ua674-\ua67d\ua67f-\ua69d\ua69f-\ua6e5\ua6f0\ua6f1\ua717-\ua71f\ua722-\ua788\ua78b-\ua78e\ua790-\ua7ad\ua7b0\ua7b1\ua7f7-\ua827\ua840-\ua873\ua880-\ua8c4\ua8e0-\ua8f7\ua8fb\ua90a-\ua92d\ua930-\ua953\ua960-\ua97c\ua980-\ua9c0\ua9cf\ua9e0-\ua9ef\ua9fa-\ua9fe\uaa00-\uaa36\uaa40-\uaa4d\uaa60-\uaa76\uaa7a-\uaac2\uaadb-\uaadd\uaae0-\uaaef\uaaf2-\uaaf6\uab01-\uab06\uab09-\uab0e\uab11-\uab16\uab20-\uab26\uab28-\uab2e\uab30-\uab5a\uab5c-\uab5f\uab64\uab65\uabc0-\uabea\uabec\uabed\uac00-\ud7a3\ud7b0-\ud7c6\ud7cb-\ud7fb\uf870-\uf87f\uf882\uf884-\uf89f\uf8b8\uf8c1-\uf8d6\uf900-\ufa6d\ufa70-\ufad9\ufb00-\ufb06\ufb13-\ufb17\ufb1d-\ufb28\ufb2a-\ufb36\ufb38-\ufb3c\ufb3e\ufb40\ufb41\ufb43\ufb44\ufb46-\ufbb1\ufbd3-\ufd3d\ufd50-\ufd8f\ufd92-\ufdc7\ufdf0-\ufdfb\ufe00-\ufe0f\ufe20-\ufe2d\ufe70-\ufe74\ufe76-\ufefc\uff21-\uff3a\uff41-\uff5a\uff66-\uffbe\uffc2-\uffc7\uffca-\uffcf\uffd2-\uffd7\uffda-\uffdc/;
+  twttr.txt.regexen.astralLetterAndMarks = /\ud800[\udc00-\udc0b\udc0d-\udc26\udc28-\udc3a\udc3c\udc3d\udc3f-\udc4d\udc50-\udc5d\udc80-\udcfa\uddfd\ude80-\ude9c\udea0-\uded0\udee0\udf00-\udf1f\udf30-\udf40\udf42-\udf49\udf50-\udf7a\udf80-\udf9d\udfa0-\udfc3\udfc8-\udfcf]|\ud801[\udc00-\udc9d\udd00-\udd27\udd30-\udd63\ude00-\udf36\udf40-\udf55\udf60-\udf67]|\ud802[\udc00-\udc05\udc08\udc0a-\udc35\udc37\udc38\udc3c\udc3f-\udc55\udc60-\udc76\udc80-\udc9e\udd00-\udd15\udd20-\udd39\udd80-\uddb7\uddbe\uddbf\ude00-\ude03\ude05\ude06\ude0c-\ude13\ude15-\ude17\ude19-\ude33\ude38-\ude3a\ude3f\ude60-\ude7c\ude80-\ude9c\udec0-\udec7\udec9-\udee6\udf00-\udf35\udf40-\udf55\udf60-\udf72\udf80-\udf91]|\ud803[\udc00-\udc48]|\ud804[\udc00-\udc46\udc7f-\udcba\udcd0-\udce8\udd00-\udd34\udd50-\udd73\udd76\udd80-\uddc4\uddda\ude00-\ude11\ude13-\ude37\udeb0-\udeea\udf01-\udf03\udf05-\udf0c\udf0f\udf10\udf13-\udf28\udf2a-\udf30\udf32\udf33\udf35-\udf39\udf3c-\udf44\udf47\udf48\udf4b-\udf4d\udf57\udf5d-\udf63\udf66-\udf6c\udf70-\udf74]|\ud805[\udc80-\udcc5\udcc7\udd80-\uddb5\uddb8-\uddc0\ude00-\ude40\ude44\ude80-\udeb7]|\ud806[\udca0-\udcdf\udcff\udec0-\udef8]|\ud808[\udc00-\udf98]|\ud80c[\udc00-\udfff]|\ud80d[\udc00-\udc2e]|\ud81a[\udc00-\ude38\ude40-\ude5e\uded0-\udeed\udef0-\udef4\udf00-\udf36\udf40-\udf43\udf63-\udf77\udf7d-\udf8f]|\ud81b[\udf00-\udf44\udf50-\udf7e\udf8f-\udf9f]|\ud82c[\udc00\udc01]|\ud82f[\udc00-\udc6a\udc70-\udc7c\udc80-\udc88\udc90-\udc99\udc9d\udc9e]|\ud834[\udd65-\udd69\udd6d-\udd72\udd7b-\udd82\udd85-\udd8b\uddaa-\uddad\ude42-\ude44]|\ud835[\udc00-\udc54\udc56-\udc9c\udc9e\udc9f\udca2\udca5\udca6\udca9-\udcac\udcae-\udcb9\udcbb\udcbd-\udcc3\udcc5-\udd05\udd07-\udd0a\udd0d-\udd14\udd16-\udd1c\udd1e-\udd39\udd3b-\udd3e\udd40-\udd44\udd46\udd4a-\udd50\udd52-\udea5\udea8-\udec0\udec2-\udeda\udedc-\udefa\udefc-\udf14\udf16-\udf34\udf36-\udf4e\udf50-\udf6e\udf70-\udf88\udf8a-\udfa8\udfaa-\udfc2\udfc4-\udfcb]|\ud83a[\udc00-\udcc4\udcd0-\udcd6]|\ud83b[\ude00-\ude03\ude05-\ude1f\ude21\ude22\ude24\ude27\ude29-\ude32\ude34-\ude37\ude39\ude3b\ude42\ude47\ude49\ude4b\ude4d-\ude4f\ude51\ude52\ude54\ude57\ude59\ude5b\ude5d\ude5f\ude61\ude62\ude64\ude67-\ude6a\ude6c-\ude72\ude74-\ude77\ude79-\ude7c\ude7e\ude80-\ude89\ude8b-\ude9b\udea1-\udea3\udea5-\udea9\udeab-\udebb]|\ud840[\udc00-\udfff]|\ud841[\udc00-\udfff]|\ud842[\udc00-\udfff]|\ud843[\udc00-\udfff]|\ud844[\udc00-\udfff]|\ud845[\udc00-\udfff]|\ud846[\udc00-\udfff]|\ud847[\udc00-\udfff]|\ud848[\udc00-\udfff]|\ud849[\udc00-\udfff]|\ud84a[\udc00-\udfff]|\ud84b[\udc00-\udfff]|\ud84c[\udc00-\udfff]|\ud84d[\udc00-\udfff]|\ud84e[\udc00-\udfff]|\ud84f[\udc00-\udfff]|\ud850[\udc00-\udfff]|\ud851[\udc00-\udfff]|\ud852[\udc00-\udfff]|\ud853[\udc00-\udfff]|\ud854[\udc00-\udfff]|\ud855[\udc00-\udfff]|\ud856[\udc00-\udfff]|\ud857[\udc00-\udfff]|\ud858[\udc00-\udfff]|\ud859[\udc00-\udfff]|\ud85a[\udc00-\udfff]|\ud85b[\udc00-\udfff]|\ud85c[\udc00-\udfff]|\ud85d[\udc00-\udfff]|\ud85e[\udc00-\udfff]|\ud85f[\udc00-\udfff]|\ud860[\udc00-\udfff]|\ud861[\udc00-\udfff]|\ud862[\udc00-\udfff]|\ud863[\udc00-\udfff]|\ud864[\udc00-\udfff]|\ud865[\udc00-\udfff]|\ud866[\udc00-\udfff]|\ud867[\udc00-\udfff]|\ud868[\udc00-\udfff]|\ud869[\udc00-\uded6\udf00-\udfff]|\ud86a[\udc00-\udfff]|\ud86b[\udc00-\udfff]|\ud86c[\udc00-\udfff]|\ud86d[\udc00-\udf34\udf40-\udfff]|\ud86e[\udc00-\udc1d]|\ud87e[\udc00-\ude1d]|\udb40[\udd00-\uddef]/;
+
+  // Generated from unicode_regex/unicode_regex_groups.scala, same as objective c's \p{Nd}
+  twttr.txt.regexen.bmpNumerals = /0-9\u0660-\u0669\u06f0-\u06f9\u07c0-\u07c9\u0966-\u096f\u09e6-\u09ef\u0a66-\u0a6f\u0ae6-\u0aef\u0b66-\u0b6f\u0be6-\u0bef\u0c66-\u0c6f\u0ce6-\u0cef\u0d66-\u0d6f\u0de6-\u0def\u0e50-\u0e59\u0ed0-\u0ed9\u0f20-\u0f29\u1040-\u1049\u1090-\u1099\u17e0-\u17e9\u1810-\u1819\u1946-\u194f\u19d0-\u19d9\u1a80-\u1a89\u1a90-\u1a99\u1b50-\u1b59\u1bb0-\u1bb9\u1c40-\u1c49\u1c50-\u1c59\ua620-\ua629\ua8d0-\ua8d9\ua900-\ua909\ua9d0-\ua9d9\ua9f0-\ua9f9\uaa50-\uaa59\uabf0-\uabf9\uff10-\uff19/;
+  twttr.txt.regexen.astralNumerals = /\ud801[\udca0-\udca9]|\ud804[\udc66-\udc6f\udcf0-\udcf9\udd36-\udd3f\uddd0-\uddd9\udef0-\udef9]|\ud805[\udcd0-\udcd9\ude50-\ude59\udec0-\udec9]|\ud806[\udce0-\udce9]|\ud81a[\ude60-\ude69\udf50-\udf59]|\ud835[\udfce-\udfff]/;
 
-  twttr.txt.regexen.nonLatinHashtagChars = regexSupplant(nonLatinHashtagChars.join(""));
+  twttr.txt.regexen.hashtagSpecialChars = /_\u200c\u200d\ua67e\u05be\u05f3\u05f4\uff5e\u301c\u309b\u309c\u30a0\u30fb\u3003\u0f0b\u0f0c\xb7/;
 
-  var latinAccentChars = [];
-  // Latin accented characters (subtracted 0xD7 from the range, it's a confusable multiplication sign. Looks like "x")
-  addCharsToCharClass(latinAccentChars, 0x00c0, 0x00d6);
-  addCharsToCharClass(latinAccentChars, 0x00d8, 0x00f6);
-  addCharsToCharClass(latinAccentChars, 0x00f8, 0x00ff);
-  // Latin Extended A and B
-  addCharsToCharClass(latinAccentChars, 0x0100, 0x024f);
-  // assorted IPA Extensions
-  addCharsToCharClass(latinAccentChars, 0x0253, 0x0254);
-  addCharsToCharClass(latinAccentChars, 0x0256, 0x0257);
-  addCharsToCharClass(latinAccentChars, 0x0259, 0x0259);
-  addCharsToCharClass(latinAccentChars, 0x025b, 0x025b);
-  addCharsToCharClass(latinAccentChars, 0x0263, 0x0263);
-  addCharsToCharClass(latinAccentChars, 0x0268, 0x0268);
-  addCharsToCharClass(latinAccentChars, 0x026f, 0x026f);
-  addCharsToCharClass(latinAccentChars, 0x0272, 0x0272);
-  addCharsToCharClass(latinAccentChars, 0x0289, 0x0289);
-  addCharsToCharClass(latinAccentChars, 0x028b, 0x028b);
-  // Okina for Hawaiian (it *is* a letter character)
-  addCharsToCharClass(latinAccentChars, 0x02bb, 0x02bb);
-  // Combining diacritics
-  addCharsToCharClass(latinAccentChars, 0x0300, 0x036f);
-  // Latin Extended Additional
-  addCharsToCharClass(latinAccentChars, 0x1e00, 0x1eff);
-  twttr.txt.regexen.latinAccentChars = regexSupplant(latinAccentChars.join(""));
-
-  // A hashtag must contain characters, numbers and underscores, but not all numbers.
+  // A hashtag must contain at least one unicode letter or mark, as well as numbers, underscores, and select special characters.
   twttr.txt.regexen.hashSigns = /[##]/;
-  twttr.txt.regexen.hashtagAlpha = regexSupplant(/[a-z_#{latinAccentChars}#{nonLatinHashtagChars}]/i);
-  twttr.txt.regexen.hashtagAlphaNumeric = regexSupplant(/[a-z0-9_#{latinAccentChars}#{nonLatinHashtagChars}]/i);
+  twttr.txt.regexen.hashtagAlpha = regexSupplant(/(?:[#{bmpLetterAndMarks}]|(?=#{non_bmp_code_pairs})(?:#{astralLetterAndMarks}))/);
+  twttr.txt.regexen.hashtagAlphaNumeric = regexSupplant(/(?:[#{bmpLetterAndMarks}#{bmpNumerals}#{hashtagSpecialChars}]|(?=#{non_bmp_code_pairs})(?:#{astralLetterAndMarks}|#{astralNumerals}))/);
   twttr.txt.regexen.endHashtagMatch = regexSupplant(/^(?:#{hashSigns}|:\/\/)/);
-  twttr.txt.regexen.hashtagBoundary = regexSupplant(/(?:^|$|[^&a-z0-9_#{latinAccentChars}#{nonLatinHashtagChars}])/);
-  twttr.txt.regexen.validHashtag = regexSupplant(/(#{hashtagBoundary})(#{hashSigns})(#{hashtagAlphaNumeric}*#{hashtagAlpha}#{hashtagAlphaNumeric}*)/gi);
+  twttr.txt.regexen.codePoint = /(?:[^\uD800-\uDFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF])/;
+  twttr.txt.regexen.hashtagBoundary = regexSupplant(/(?:^|$|(?!#{hashtagAlphaNumeric}|&)#{codePoint})/);
+  twttr.txt.regexen.validHashtag = regexSupplant(/(#{hashtagBoundary})(#{hashSigns})(?!\uFE0F|\u20E3)(#{hashtagAlphaNumeric}*#{hashtagAlpha}#{hashtagAlphaNumeric}*)/gi);
 
   // Mention related regex collection
-  twttr.txt.regexen.validMentionPrecedingChars = /(?:^|[^a-zA-Z0-9_!#$%&*@@]|RT:?)/;
+  twttr.txt.regexen.validMentionPrecedingChars = /(?:^|[^a-zA-Z0-9_!#$%&*@@]|(?:^|[^a-zA-Z0-9_+~.-])(?:rt|RT|rT|Rt):?)/;
   twttr.txt.regexen.atSigns = /[@@]/;
   twttr.txt.regexen.validMentionOrList = regexSupplant(
     '(#{validMentionPrecedingChars})' +  // $1: Preceding character
     '(#{atSigns})' +                     // $2: At mark
     '([a-zA-Z0-9_]{1,20})' +             // $3: Screen name
     '(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?'  // $4: List (optional)
   , 'g');
   twttr.txt.regexen.validReply = regexSupplant(/^(?:#{spaces})*#{atSigns}([a-zA-Z0-9_]{1,20})/);
@@ -247,40 +124,141 @@ var window = {};
 
   // URL related regex collection
   twttr.txt.regexen.validUrlPrecedingChars = regexSupplant(/(?:[^A-Za-z0-9@@$###{invalid_chars_group}]|^)/);
   twttr.txt.regexen.invalidUrlWithoutProtocolPrecedingChars = /[-_.\/]$/;
   twttr.txt.regexen.invalidDomainChars = stringSupplant("#{punct}#{spaces_group}#{invalid_chars_group}", twttr.txt.regexen);
   twttr.txt.regexen.validDomainChars = regexSupplant(/[^#{invalidDomainChars}]/);
   twttr.txt.regexen.validSubdomain = regexSupplant(/(?:(?:#{validDomainChars}(?:[_-]|#{validDomainChars})*)?#{validDomainChars}\.)/);
   twttr.txt.regexen.validDomainName = regexSupplant(/(?:(?:#{validDomainChars}(?:-|#{validDomainChars})*)?#{validDomainChars}\.)/);
-  twttr.txt.regexen.validGTLD = regexSupplant(/(?:(?:aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel|xxx)(?=[^0-9a-zA-Z]|$))/);
+  twttr.txt.regexen.validGTLD = regexSupplant(RegExp(
+'(?:(?:' +
+    '삼성|닷컴|닷넷|香格里拉|餐厅|食品|飞利浦|電訊盈科|集团|购物|谷歌|诺基亚|联通|网络|网站|网店|网址|组织机构|移动|珠宝|点看|游戏|淡马锡|机构|書籍|时尚|新闻|政府|政务|' +
+    '手表|手机|我爱你|慈善|微博|广东|工行|家電|娱乐|大拿|在线|嘉里大酒店|嘉里|商标|商店|商城|公益|公司|八卦|健康|信息|佛山|企业|中文网|中信|世界|ポイント|ファッション|' +
+    'セール|ストア|コム|グーグル|クラウド|みんな|คอม|संगठन|नेट|कॉम|همراه|موقع|موبايلي|كوم|شبكة|بيتك|بازار|العليان|' +
+    'ارامكو|ابوظبي|קום|сайт|рус|орг|онлайн|москва|ком|дети|zuerich|zone|zippo|zip|zero|zara|zappos|' +
+    'yun|youtube|you|yokohama|yoga|yodobashi|yandex|yamaxun|yahoo|yachts|xyz|xxx|xperia|xin|xihuan|' +
+    'xfinity|xerox|xbox|wtf|wtc|world|works|work|woodside|wolterskluwer|wme|wine|windows|win|' +
+    'williamhill|wiki|wien|whoswho|weir|weibo|wedding|wed|website|weber|webcam|weatherchannel|' +
+    'weather|watches|watch|warman|wanggou|wang|walter|wales|vuelos|voyage|voto|voting|vote|' +
+    'volkswagen|vodka|vlaanderen|viva|vistaprint|vista|vision|virgin|vip|vin|villas|viking|vig|video|' +
+    'viajes|vet|versicherung|vermögensberatung|vermögensberater|verisign|ventures|vegas|vana|' +
+    'vacations|ups|uol|uno|university|unicom|ubs|tvs|tushu|tunes|tui|tube|trv|trust|' +
+    'travelersinsurance|travelers|travelchannel|travel|training|trading|trade|toys|toyota|town|tours|' +
+    'total|toshiba|toray|top|tools|tokyo|today|tmall|tirol|tires|tips|tiffany|tienda|tickets|theatre|' +
+    'theater|thd|teva|tennis|temasek|telefonica|telecity|tel|technology|tech|team|tdk|tci|taxi|tax|' +
+    'tattoo|tatar|tatamotors|taobao|talk|taipei|tab|systems|symantec|sydney|swiss|swatch|suzuki|' +
+    'surgery|surf|support|supply|supplies|sucks|style|study|studio|stream|store|storage|stockholm|' +
+    'stcgroup|stc|statoil|statefarm|statebank|starhub|star|stada|srl|spreadbetting|spot|spiegel|' +
+    'space|soy|sony|song|solutions|solar|sohu|software|softbank|social|soccer|sncf|smile|skype|sky|' +
+    'skin|ski|site|singles|sina|silk|shriram|show|shouji|shopping|shop|shoes|shiksha|shia|shell|shaw|' +
+    'sharp|shangrila|sfr|sexy|sex|sew|seven|services|sener|select|seek|security|seat|scot|scor|' +
+    'science|schwarz|schule|school|scholarships|schmidt|schaeffler|scb|sca|sbs|sbi|saxo|save|sas|' +
+    'sarl|sapo|sap|sanofi|sandvikcoromant|sandvik|samsung|salon|sale|sakura|safety|safe|saarland|' +
+    'ryukyu|rwe|run|ruhr|rsvp|room|rodeo|rocks|rocher|rip|rio|ricoh|richardli|rich|rexroth|reviews|' +
+    'review|restaurant|rest|republican|report|repair|rentals|rent|ren|reit|reisen|reise|rehab|' +
+    'redumbrella|redstone|red|recipes|realty|realtor|realestate|read|racing|quest|quebec|qpon|pwc|' +
+    'pub|protection|property|properties|promo|progressive|prof|productions|prod|pro|prime|press|' +
+    'praxi|post|porn|politie|poker|pohl|pnc|plus|plumbing|playstation|play|place|pizza|pioneer|pink|' +
+    'ping|pin|pid|pictures|pictet|pics|piaget|physio|photos|photography|photo|philips|pharmacy|pet|' +
+    'pccw|passagens|party|parts|partners|pars|paris|panerai|pamperedchef|page|ovh|ott|otsuka|osaka|' +
+    'origins|orientexpress|organic|org|orange|oracle|ooo|online|onl|ong|one|omega|ollo|olayangroup|' +
+    'olayan|okinawa|office|obi|nyc|ntt|nrw|nra|nowtv|nowruz|now|norton|northwesternmutual|nokia|' +
+    'nissay|nissan|ninja|nikon|nico|nhk|ngo|nfl|nexus|nextdirect|next|news|new|neustar|network|' +
+    'netflix|netbank|net|nec|navy|natura|name|nagoya|nadex|mutuelle|mutual|museum|mtr|mtpc|mtn|' +
+    'movistar|movie|mov|motorcycles|moscow|mortgage|mormon|montblanc|money|monash|mom|moi|moe|moda|' +
+    'mobily|mobi|mma|mls|mlb|mitsubishi|mit|mini|mil|microsoft|miami|metlife|meo|menu|men|memorial|' +
+    'meme|melbourne|meet|media|med|mba|mattel|marriott|markets|marketing|market|mango|management|man|' +
+    'makeup|maison|maif|madrid|luxury|luxe|lupin|ltda|ltd|love|lotto|lotte|london|lol|locus|locker|' +
+    'loans|loan|lixil|living|live|lipsy|link|linde|lincoln|limo|limited|like|lighting|lifestyle|' +
+    'lifeinsurance|life|lidl|liaison|lgbt|lexus|lego|legal|leclerc|lease|lds|lawyer|law|latrobe|lat|' +
+    'lasalle|lanxess|landrover|land|lancaster|lamer|lamborghini|lacaixa|kyoto|kuokgroup|kred|krd|kpn|' +
+    'kpmg|kosher|komatsu|koeln|kiwi|kitchen|kindle|kinder|kim|kia|kfh|kerryproperties|kerrylogistics|' +
+    'kerryhotels|kddi|kaufen|juegos|jprs|jpmorgan|joy|jot|joburg|jobs|jnj|jmp|jll|jlc|jewelry|jetzt|' +
+    'jcp|jcb|java|jaguar|iwc|itv|itau|istanbul|ist|ismaili|iselect|irish|ipiranga|investments|' +
+    'international|int|insure|insurance|institute|ink|ing|info|infiniti|industries|immobilien|immo|' +
+    'imdb|imamat|ikano|iinet|ifm|icu|ice|icbc|ibm|hyundai|htc|hsbc|how|house|hotmail|hoteles|hosting|' +
+    'host|horse|honda|homes|homedepot|holiday|holdings|hockey|hkt|hiv|hitachi|hisamitsu|hiphop|hgtv|' +
+    'hermes|here|helsinki|help|healthcare|health|hdfcbank|haus|hangout|hamburg|guru|guitars|guide|' +
+    'guge|gucci|guardian|group|gripe|green|gratis|graphics|grainger|gov|got|gop|google|goog|goodyear|' +
+    'goo|golf|goldpoint|gold|godaddy|gmx|gmo|gmbh|gmail|globo|global|gle|glass|giving|gives|gifts|' +
+    'gift|ggee|genting|gent|gea|gdn|gbiz|garden|games|game|gallup|gallo|gallery|gal|fyi|futbol|' +
+    'furniture|fund|fujitsu|ftr|frontier|frontdoor|frogans|frl|fresenius|fox|foundation|forum|' +
+    'forsale|forex|ford|football|foodnetwork|foo|fly|flsmidth|flowers|florist|flir|flights|flickr|' +
+    'fitness|fit|fishing|fish|firmdale|firestone|fire|financial|finance|final|film|ferrero|feedback|' +
+    'fedex|fast|fashion|farmers|farm|fans|fan|family|faith|fairwinds|fail|fage|extraspace|express|' +
+    'exposed|expert|exchange|everbank|events|eus|eurovision|estate|esq|erni|ericsson|equipment|epson|' +
+    'epost|enterprises|engineering|engineer|energy|emerck|email|education|edu|edeka|eat|earth|dvag|' +
+    'durban|dupont|dunlop|dubai|dtv|drive|download|dot|doosan|domains|doha|dog|docs|dnp|discount|' +
+    'directory|direct|digital|diet|diamonds|dhl|dev|design|desi|dentist|dental|democrat|delta|' +
+    'deloitte|dell|delivery|degree|deals|dealer|deal|dds|dclk|day|datsun|dating|date|dance|dad|dabur|' +
+    'cyou|cymru|cuisinella|csc|cruises|crs|crown|cricket|creditunion|creditcard|credit|courses|' +
+    'coupons|coupon|country|corsica|coop|cool|cookingchannel|cooking|contractors|contact|consulting|' +
+    'construction|condos|comsec|computer|compare|company|community|commbank|comcast|com|cologne|' +
+    'college|coffee|codes|coach|clubmed|club|cloud|clothing|clinique|clinic|click|cleaning|claims|' +
+    'cityeats|city|citic|cisco|circle|cipriani|church|chrome|christmas|chloe|chintai|cheap|chat|' +
+    'chase|channel|chanel|cfd|cfa|cern|ceo|center|ceb|cbre|cbn|cba|catering|cat|casino|cash|casa|' +
+    'cartier|cars|careers|career|care|cards|caravan|car|capital|capetown|canon|cancerresearch|camp|' +
+    'camera|cam|call|cal|cafe|cab|bzh|buzz|buy|business|builders|build|bugatti|budapest|brussels|' +
+    'brother|broker|broadway|bridgestone|bradesco|boutique|bot|bostik|bosch|boots|book|boo|bond|bom|' +
+    'boehringer|boats|bnpparibas|bnl|bmw|bms|blue|bloomberg|blog|blanco|blackfriday|black|biz|bio|' +
+    'bingo|bing|bike|bid|bible|bharti|bet|best|berlin|bentley|beer|beats|bcn|bcg|bbva|bbc|bayern|' +
+    'bauhaus|bargains|barefoot|barclays|barclaycard|barcelona|bar|bank|band|baidu|baby|azure|axa|aws|' +
+    'avianca|autos|auto|author|audio|audible|audi|auction|attorney|associates|asia|arte|art|arpa|' +
+    'army|archi|aramco|aquarelle|apple|app|apartments|anz|anquan|android|analytics|amsterdam|amica|' +
+    'alstom|alsace|ally|allfinanz|alipay|alibaba|akdn|airtel|airforce|airbus|aig|agency|agakhan|afl|' +
+    'aetna|aero|aeg|adult|ads|adac|actor|active|aco|accountants|accountant|accenture|academy|' +
+    'abudhabi|abogado|able|abbvie|abbott|abb|aarp|aaa|onion' +
+')(?=[^0-9a-zA-Z@]|$))'));
   twttr.txt.regexen.validCCTLD = regexSupplant(RegExp(
-        "(?:(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|" +
-        "ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|" +
-        "ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|" +
-        "ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|" +
-        "na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|" +
-        "sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|" +
-        "ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)(?=[^0-9a-zA-Z]|$))"));
-  twttr.txt.regexen.validPunycode = regexSupplant(/(?:xn--[0-9a-z]+)/);
+'(?:(?:' +
+    '한국|香港|澳門|新加坡|台灣|台湾|中國|中国|გე|ไทย|ලංකා|ഭാരതം|ಭಾರತ|భారత్|சிங்கப்பூர்|இலங்கை|இந்தியா|ଭାରତ|ભારત|ਭਾਰਤ|' +
+    'ভাৰত|ভারত|বাংলা|भारत|پاکستان|مليسيا|مصر|قطر|فلسطين|عمان|عراق|سورية|سودان|تونس|بھارت|ایران|' +
+    'امارات|المغرب|السعودية|الجزائر|الاردن|հայ|қаз|укр|срб|рф|мон|мкд|ею|бел|бг|ελ|zw|zm|za|yt|ye|ws|' +
+    'wf|vu|vn|vi|vg|ve|vc|va|uz|uy|us|um|uk|ug|ua|tz|tw|tv|tt|tr|tp|to|tn|tm|tl|tk|tj|th|tg|tf|td|tc|' +
+    'sz|sy|sx|sv|su|st|ss|sr|so|sn|sm|sl|sk|sj|si|sh|sg|se|sd|sc|sb|sa|rw|ru|rs|ro|re|qa|py|pw|pt|ps|' +
+    'pr|pn|pm|pl|pk|ph|pg|pf|pe|pa|om|nz|nu|nr|np|no|nl|ni|ng|nf|ne|nc|na|mz|my|mx|mw|mv|mu|mt|ms|mr|' +
+    'mq|mp|mo|mn|mm|ml|mk|mh|mg|mf|me|md|mc|ma|ly|lv|lu|lt|ls|lr|lk|li|lc|lb|la|kz|ky|kw|kr|kp|kn|km|' +
+    'ki|kh|kg|ke|jp|jo|jm|je|it|is|ir|iq|io|in|im|il|ie|id|hu|ht|hr|hn|hm|hk|gy|gw|gu|gt|gs|gr|gq|gp|' +
+    'gn|gm|gl|gi|gh|gg|gf|ge|gd|gb|ga|fr|fo|fm|fk|fj|fi|eu|et|es|er|eh|eg|ee|ec|dz|do|dm|dk|dj|de|cz|' +
+    'cy|cx|cw|cv|cu|cr|co|cn|cm|cl|ck|ci|ch|cg|cf|cd|cc|ca|bz|by|bw|bv|bt|bs|br|bq|bo|bn|bm|bl|bj|bi|' +
+    'bh|bg|bf|be|bd|bb|ba|az|ax|aw|au|at|as|ar|aq|ao|an|am|al|ai|ag|af|ae|ad|ac' +
+')(?=[^0-9a-zA-Z@]|$))'));
+  twttr.txt.regexen.validPunycode = /(?:xn--[0-9a-z]+)/;
+  twttr.txt.regexen.validSpecialCCTLD = /(?:(?:co|tv)(?=[^0-9a-zA-Z@]|$))/;
   twttr.txt.regexen.validDomain = regexSupplant(/(?:#{validSubdomain}*#{validDomainName}(?:#{validGTLD}|#{validCCTLD}|#{validPunycode}))/);
   twttr.txt.regexen.validAsciiDomain = regexSupplant(/(?:(?:[\-a-z0-9#{latinAccentChars}]+)\.)+(?:#{validGTLD}|#{validCCTLD}|#{validPunycode})/gi);
-  twttr.txt.regexen.invalidShortDomain = regexSupplant(/^#{validDomainName}#{validCCTLD}$/);
-
-  twttr.txt.regexen.validPortNumber = regexSupplant(/[0-9]+/);
-
-  twttr.txt.regexen.validGeneralUrlPathChars = regexSupplant(/[a-z0-9!\*';:=\+,\.\$\/%#\[\]\-_~@|&#{latinAccentChars}]/i);
-  // Allow URL paths to contain balanced parens
+  twttr.txt.regexen.invalidShortDomain = regexSupplant(/^#{validDomainName}#{validCCTLD}$/i);
+  twttr.txt.regexen.validSpecialShortDomain = regexSupplant(/^#{validDomainName}#{validSpecialCCTLD}$/i);
+  twttr.txt.regexen.validPortNumber = /[0-9]+/;
+  twttr.txt.regexen.cyrillicLettersAndMarks = /\u0400-\u04FF/;
+  twttr.txt.regexen.validGeneralUrlPathChars = regexSupplant(/[a-z#{cyrillicLettersAndMarks}0-9!\*';:=\+,\.\$\/%#\[\]\-_~@\|&#{latinAccentChars}]/i);
+  // Allow URL paths to contain up to two nested levels of balanced parens
   //  1. Used in Wikipedia URLs like /Primer_(film)
   //  2. Used in IIS sessions like /S(dfd346)/
-  twttr.txt.regexen.validUrlBalancedParens = regexSupplant(/\(#{validGeneralUrlPathChars}+\)/i);
+  //  3. Used in Rdio URLs like /track/We_Up_(Album_Version_(Edited))/
+  twttr.txt.regexen.validUrlBalancedParens = regexSupplant(
+    '\\('                                   +
+      '(?:'                                 +
+        '#{validGeneralUrlPathChars}+'      +
+        '|'                                 +
+        // allow one nested level of balanced parentheses
+        '(?:'                               +
+          '#{validGeneralUrlPathChars}*'    +
+          '\\('                             +
+            '#{validGeneralUrlPathChars}+'  +
+          '\\)'                             +
+          '#{validGeneralUrlPathChars}*'    +
+        ')'                                 +
+      ')'                                   +
+    '\\)'
+  , 'i');
   // Valid end-of-path chracters (so /foo. does not gobble the period).
   // 1. Allow =&# for empty URL parameters and other URL-join artifacts
-  twttr.txt.regexen.validUrlPathEndingChars = regexSupplant(/[\+\-a-z0-9=_#\/#{latinAccentChars}]|(?:#{validUrlBalancedParens})/i);
+  twttr.txt.regexen.validUrlPathEndingChars = regexSupplant(/[\+\-a-z#{cyrillicLettersAndMarks}0-9=_#\/#{latinAccentChars}]|(?:#{validUrlBalancedParens})/i);
   // Allow @ in a url, but only in the middle. Catch things like http://example.com/@user/
   twttr.txt.regexen.validUrlPath = regexSupplant('(?:' +
     '(?:' +
       '#{validGeneralUrlPathChars}*' +
         '(?:#{validUrlBalancedParens}#{validGeneralUrlPathChars}*)*' +
         '#{validUrlPathEndingChars}'+
       ')|(?:@#{validGeneralUrlPathChars}+\/)'+
     ')', 'i');
@@ -304,17 +282,17 @@ var window = {};
   twttr.txt.regexen.urlHasProtocol = /^https?:\/\//i;
   twttr.txt.regexen.urlHasHttps = /^https:\/\//i;
 
   // cashtag related regex
   twttr.txt.regexen.cashtag = /[a-z]{1,6}(?:[._][a-z]{1,2})?/i;
   twttr.txt.regexen.validCashtag = regexSupplant('(^|#{spaces})(\\$)(#{cashtag})(?=$|\\s|[#{punct}])', 'gi');
 
   // These URL validation pattern strings are based on the ABNF from RFC 3986
-  twttr.txt.regexen.validateUrlUnreserved = /[a-z0-9\-._~]/i;
+  twttr.txt.regexen.validateUrlUnreserved = /[a-z\u0400-\u04FF0-9\-._~]/i;
   twttr.txt.regexen.validateUrlPctEncoded = /(?:%[0-9a-f]{2})/i;
   twttr.txt.regexen.validateUrlSubDelims = /[!$&'()*+,;=]/i;
   twttr.txt.regexen.validateUrlPchar = regexSupplant('(?:' +
     '#{validateUrlUnreserved}|' +
     '#{validateUrlPctEncoded}|' +
     '#{validateUrlSubDelims}|' +
     '[:|@]' +
   ')', 'i');
@@ -473,17 +451,17 @@ var window = {};
 
   twttr.txt.linkToHashtag = function(entity, text, options) {
     var hash = text.substring(entity.indices[0], entity.indices[0] + 1);
     var hashtag = twttr.txt.htmlEscape(entity.hashtag);
     var attrs = clone(options.htmlAttrs || {});
     attrs.href = options.hashtagUrlBase + hashtag;
     attrs.title = "#" + hashtag;
     attrs["class"] = options.hashtagClass;
-    if (hashtag[0].match(twttr.txt.regexen.rtl_chars)){
+    if (hashtag.charAt(0).match(twttr.txt.regexen.rtl_chars)){
       attrs["class"] += " rtl";
     }
     if (options.targetBlank) {
       attrs.target = '_blank';
     }
 
     return twttr.txt.linkToTextWithSymbol(entity, hash, hashtag, attrs, options);
   };
@@ -678,32 +656,44 @@ var window = {};
       }
       beginIndex = entity.indices[1];
     }
     result += nonEntity(text.substring(beginIndex, text.length));
     return result;
   };
 
   twttr.txt.autoLinkWithJSON = function(text, json, options) {
+    // map JSON entity to twitter-text entity
+    if (json.user_mentions) {
+      for (var i = 0; i < json.user_mentions.length; i++) {
+        // this is a @mention
+        json.user_mentions[i].screenName = json.user_mentions[i].screen_name;
+      }
+    }
+
+    if (json.hashtags) {
+      for (var i = 0; i < json.hashtags.length; i++) {
+        // this is a #hashtag
+        json.hashtags[i].hashtag = json.hashtags[i].text;
+      }
+    }
+
+    if (json.symbols) {
+      for (var i = 0; i < json.symbols.length; i++) {
+        // this is a $CASH tag
+        json.symbols[i].cashtag = json.symbols[i].text;
+      }
+    }
+
     // concatenate all entities
     var entities = [];
     for (var key in json) {
       entities = entities.concat(json[key]);
     }
-    // map JSON entity to twitter-text entity
-    for (var i = 0; i < entities.length; i++) {
-      entity = entities[i];
-      if (entity.screen_name) {
-        // this is @mention
-        entity.screenName = entity.screen_name;
-      } else if (entity.text) {
-        // this is #hashtag
-        entity.hashtag = entity.text;
-      }
-    }
+
     // modify indices to UTF-16
     twttr.txt.modifyIndicesFromUnicodeToUTF16(text, entities);
 
     return twttr.txt.autoLinkEntities(text, entities, options);
   };
 
   twttr.txt.extractHtmlAttrsFromOptions = function(options) {
     var htmlAttrs = {};
@@ -856,17 +846,16 @@ var window = {};
 
     return urlsOnly;
   };
 
   twttr.txt.extractUrlsWithIndices = function(text, options) {
     if (!options) {
       options = {extractUrlsWithoutProtocol: true};
     }
-
     if (!text || (options.extractUrlsWithoutProtocol ? !text.match(/\./) : !text.match(/:/))) {
       return [];
     }
 
     var urls = [];
 
     while (twttr.txt.regexen.extractUrl.exec(text)) {
       var before = RegExp.$2, url = RegExp.$3, protocol = RegExp.$4, domain = RegExp.$5, path = RegExp.$7;
@@ -876,41 +865,38 @@ var window = {};
       // if protocol is missing and domain contains non-ASCII characters,
       // extract ASCII-only domains.
       if (!protocol) {
         if (!options.extractUrlsWithoutProtocol
             || before.match(twttr.txt.regexen.invalidUrlWithoutProtocolPrecedingChars)) {
           continue;
         }
         var lastUrl = null,
-            lastUrlInvalidMatch = false,
             asciiEndPosition = 0;
         domain.replace(twttr.txt.regexen.validAsciiDomain, function(asciiDomain) {
           var asciiStartPosition = domain.indexOf(asciiDomain, asciiEndPosition);
           asciiEndPosition = asciiStartPosition + asciiDomain.length;
           lastUrl = {
             url: asciiDomain,
             indices: [startPosition + asciiStartPosition, startPosition + asciiEndPosition]
           };
-          lastUrlInvalidMatch = asciiDomain.match(twttr.txt.regexen.invalidShortDomain);
-          if (!lastUrlInvalidMatch) {
+          if (path
+              || asciiDomain.match(twttr.txt.regexen.validSpecialShortDomain)
+              || !asciiDomain.match(twttr.txt.regexen.invalidShortDomain)) {
             urls.push(lastUrl);
           }
         });
 
         // no ASCII-only domain found. Skip the entire URL.
         if (lastUrl == null) {
           continue;
         }
 
         // lastUrl only contains domain. Need to add path and query if they exist.
         if (path) {
-          if (lastUrlInvalidMatch) {
-            urls.push(lastUrl);
-          }
           lastUrl.url = url.replace(domain, lastUrl.url);
           lastUrl.indices[1] = endPosition;
         }
       } else {
         // In the case of t.co URLs, don't allow additional path characters.
         if (url.match(twttr.txt.regexen.validTcoUrl)) {
           url = RegExp.lastMatch;
           endPosition = startPosition + url.length;
@@ -1170,54 +1156,37 @@ var window = {};
       }
     }
 
     return result;
   };
 
   var MAX_LENGTH = 140;
 
-  // Characters not allowed in Tweets
-  var INVALID_CHARACTERS = [
-    // BOM
-    fromCode(0xFFFE),
-    fromCode(0xFEFF),
-
-    // Special
-    fromCode(0xFFFF),
-
-    // Directional Change
-    fromCode(0x202A),
-    fromCode(0x202B),
-    fromCode(0x202C),
-    fromCode(0x202D),
-    fromCode(0x202E)
-  ];
-
   // Returns the length of Tweet text with consideration to t.co URL replacement
   // and chars outside the basic multilingual plane that use 2 UTF16 code points
   twttr.txt.getTweetLength = function(text, options) {
     if (!options) {
       options = {
           // These come from https://api.twitter.com/1/help/configuration.json
           // described by https://dev.twitter.com/docs/api/1/get/help/configuration
-          short_url_length: 22,
+          short_url_length: 23,
           short_url_length_https: 23
       };
     }
     var textLength = twttr.txt.getUnicodeTextLength(text),
         urlsWithIndices = twttr.txt.extractUrlsWithIndices(text);
     twttr.txt.modifyIndicesFromUTF16ToUnicode(text, urlsWithIndices);
 
     for (var i = 0; i < urlsWithIndices.length; i++) {
-    	// Subtract the length of the original URL
+      // Subtract the length of the original URL
       textLength += urlsWithIndices[i].indices[0] - urlsWithIndices[i].indices[1];
 
       // Add 23 characters for URL starting with https://
-      // Otherwise add 22 characters
+      // http:// URLs still use https://t.co so they are 23 characters as well
       if (urlsWithIndices[i].url.toLowerCase().match(twttr.txt.regexen.urlHasHttps)) {
          textLength += options.short_url_length_https;
       } else {
         textLength += options.short_url_length;
       }
     }
 
     return textLength;
@@ -1237,25 +1206,27 @@ var window = {};
       return "empty";
     }
 
     // Determine max length independent of URL length
     if (twttr.txt.getTweetLength(text) > MAX_LENGTH) {
       return "too_long";
     }
 
-    for (var i = 0; i < INVALID_CHARACTERS.length; i++) {
-      if (text.indexOf(INVALID_CHARACTERS[i]) >= 0) {
-        return "invalid_characters";
-      }
+    if (twttr.txt.hasInvalidCharacters(text)) {
+      return "invalid_characters";
     }
 
     return false;
   };
 
+  twttr.txt.hasInvalidCharacters = function(text) {
+    return twttr.txt.regexen.invalid_chars.test(text);
+  };
+
   twttr.txt.isValidTweetText = function(text) {
     return !twttr.txt.isInvalidTweet(text);
   };
 
   twttr.txt.isValidUsername = function(username) {
     if (!username) {
       return false;
     }
@@ -1334,16 +1305,20 @@ var window = {};
     // RegExp["$&"] is the text of the last match
     return (!string || (string.match(regex) && RegExp["$&"] === string));
   }
 
   if (typeof module != 'undefined' && module.exports) {
     module.exports = twttr.txt;
   }
 
+  if (typeof define == 'function' && define.amd) {
+    define([], twttr.txt);
+  }
+
   if (typeof window != 'undefined') {
     if (window.twttr) {
       for (var prop in twttr) {
         window.twttr[prop] = twttr[prop];
       }
     } else {
       window.twttr = twttr;
     }