Bug 525537 - Gloda search tokenizer should perform case-folding and accent-folding to be case-insensitive and accent-insensitive, also handle half-width katakana. review changes patch v1 on top of m_kato's v4 patch. r=m_kato
author Andrew Sutherland <asutherland@asutherland.org>
Wed, 17 Mar 2010 19:31:23 -0700
changeset 5203 5d97bb8bf0ccf4246db53656f4d9fa3ea841552a
parent 5202 d67ca271fb2dc5a0aabf5df728d110760e2cce61
child 5204 7b60a657c20cd1055afe1b0c3308b1e224cab034
reviewers: m_kato
bugs: 525537
mailnews/db/gloda/modules/datastore.js
mailnews/db/gloda/test/unit/test_intl.js
mailnews/extensions/fts3/data/README
mailnews/extensions/fts3/data/generate_table.py
mailnews/extensions/fts3/src/Normalize.c
mailnews/extensions/fts3/src/fts3_porter.c
--- a/mailnews/db/gloda/modules/datastore.js
+++ b/mailnews/db/gloda/modules/datastore.js
@@ -564,17 +564,17 @@ var GlodaDatastore = {
   kConstraintIn: 1,
   kConstraintRanges: 2,
   kConstraintEquals: 3,
   kConstraintStringLike: 4,
   kConstraintFulltext: 5,
 
   /* ******************* SCHEMA ******************* */
 
-  _schemaVersion: 19,
+  _schemaVersion: 20,
   _schema: {
     tables: {
 
       // ----- Messages
       folderLocations: {
         columns: [
           ["id", "INTEGER PRIMARY KEY"],
           ["folderURI", "TEXT NOT NULL"],
@@ -1105,30 +1105,35 @@ var GlodaDatastore = {
     // - all kinds of correctness changes (blow away)
     // version 17
     // - more correctness fixes. (blow away)
     // version 18
     // - significant empty set support (blow away)
     // version 19
     // - there was a typo that was resulting in deleted getting set to the
     //  numeric value of the javascript undefined value.  (migrate-able)
-
-    // 1) Blow away if it's old enough that we can't patch things up below.
-    // 2) Blow away if it's from the future.
-    if (aCurVersion < 18 || aCurVersion > aNewVersion) {
+    // version 20
+    // - tokenizer changes to provide for case/accent-folding. (blow away)
+
+    // If it's not the current version, we must blow it away.
+    if (aCurVersion != aNewVersion) {
       aDBConnection.close();
       aDBFile.remove(false);
       this._log.warn("Global database has been purged due to schema change.");
       return this._createDB(aDBService, aDBFile);
     }
 
+    // Let's leave this code around as a template for the next time we can
+    // migrate stuff.
+    /*
     this._log.warn("Global database performing schema update.");
     // version 19
     aDBConnection.executeSimpleSQL("UPDATE messages set deleted = 1 WHERE " +
                                    "deleted < 0 or deleted > 1");
+    */
 
     aDBConnection.schemaVersion = aNewVersion;
     return aDBConnection;
   },
 
   _outstandingAsyncStatements: [],
 
   _createAsyncStatement: function gloda_ds_createAsyncStatement(aSQLString,
--- a/mailnews/db/gloda/test/unit/test_intl.js
+++ b/mailnews/db/gloda/test/unit/test_intl.js
@@ -47,17 +47,17 @@
 load("resources/glodaTestHelper.js");
 
 /* ===== Tests ===== */
 
 /**
  * To make the encoding pairs:
  * - For the subject bit:
  *   import email
- *   h = email.header.Header(charset=CHARSET)
+ *   h = email.Header.Header(charset=CHARSET)
  *   h.append(STRING)
  *   h.encode()
  * - For the body bit
  *   s.encode(CHARSET)
  */
 var intlPhrases = [
   // -- CJK case
   {
@@ -137,16 +137,47 @@ var intlPhrases = [
     encodings: {
       "utf-8": ["=?UTF-8?B?0J3QvtCy0L7QtQ==?=",
                 "\xd0\x9d\xd0\xbe\xd0\xb2\xd0\xbe\xd0\xb5"]
     },
     searchPhrases: [
       {body: "\u043d\u043e\u0432\u043e\u0435", match: true},
     ]
   },
+  // case-folding happens after decomposition
+  {
+    name: "Awesome where A has a bar over it",
+    actual: "\u0100wesome",
+    encodings: {
+      "utf-8": ["=?utf-8?q?=C4=80wesome?=",
+                "\xc4\x80wesome"]
+    },
+    searchPhrases: [
+      {body: "\u0100wesome", match: true}, // upper A-bar
+      {body: "\u0101wesome", match: true}, // lower a-bar
+      {body: "Awesome", match: true}, // upper A
+      {body: "awesome", match: true}, // lower a
+    ]
+  },
+  // deep decomposition happens and after that, case folding
+  {
+    name: "Upper case upsilon with diaeresis and hook goes to small upsilon",
+    actual: "\u03d4esterday",
+    encodings: {
+      "utf-8": ["=?utf-8?q?=CF=94esterday?=",
+                "\xcf\x94esterday"]
+    },
+    searchPhrases: [
+      {body: "\u03d4esterday", match: true}, // Y_: 03d4 => 03d2 (decomposed)
+      {body: "\u03d3esterday", match: true}, // Y_' 03d3 => 03d2 (decomposed)
+      {body: "\u03d2esterday", match: true}, // Y_  03d2 => 03a5 (decomposed)
+      {body: "\u03a5esterday", match: true}, // Y   03a5 => 03c5 (lowercase)
+      {body: "\u03c5esterday", match: true}, // y   03c5 (final state)
+    ]
+  },
   // full-width alphabet
   // Even if search phrases are ASCII, it has to hit.
   {
     name: "Full-width Thunderbird",
     actual: "\uff34\uff48\uff55\uff4e\uff44\uff45\uff52\uff42\uff49\uff52\uff44",
     encodings: {
       "utf-8": ["=?UTF-8?B?77y0772I772V772O772E772F772S772C772J772S772E?=",
                 "\xef\xbc\xb4\xef\xbd\x88\xef\xbd\x95\xef\xbd\x8e\xef\xbd\x84\xef\xbd\x85\xef\xbd\x92\xef\xbd\x82\xef\xbd\x89\xef\xbd\x92\xef\xbd\x84"]
new file mode 100644
--- /dev/null
+++ b/mailnews/extensions/fts3/data/README
@@ -0,0 +1,5 @@
+The data files in this directory come from the ICU project:
+http://bugs.icu-project.org/trac/browser/icu/trunk/source/data/unidata/norm2
+
+They are intended to be consumed by the ICU project's gennorm2 script.  We have
+our own script that processes them.
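For orientation, here is a minimal sketch of how such data lines get picked apart, using the same regular-expression shape that generate_table.py adopts in the patch below.  The sample lines are copied from the patch's own comments; the fragment is illustrative only and is not part of the build.

# Illustrative only: mirrors the regular expression used by generate_table.py.
import re

p = re.compile(r'([0-9A-F]{4,5})(?:\.\.([0-9A-F]{4,5}))?[=>]([0-9A-F]{4,5})?')
NUKE_CHAR = 0x20  # characters the mapping obliterates become a space

for line in ['0041>0061', '2000..200A>0020', 'FFF0..FFF8>', '0100=0041 0304']:
    m = p.match(line)
    low = int(m.group(1), 16)
    high = int(m.group(2), 16) if m.group(2) else low   # single char vs. range
    val = int(m.group(3), 16) if m.group(3) else NUKE_CHAR
    print('%04X..%04X -> %04X' % (low, high, val))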
--- a/mailnews/extensions/fts3/data/generate_table.py
+++ b/mailnews/extensions/fts3/data/generate_table.py
@@ -7,24 +7,25 @@
 # the License. You may obtain a copy of the License at
 # http://www.mozilla.org/MPL/
 #
 # Software distributed under the License is distributed on an "AS IS" basis,
 # WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 # for the specific language governing rights and limitations under the
 # License.
 #
-# The Original Code is the Calendar code
+# The Original Code is Mozilla Thunderbird.
 #
 # The Initial Developer of the Original Code is Mozilla Japan.
 # Portions created by the Initial Developer are Copyright (C) 2010
 # the Initial Developer. All Rights Reserved.
 #
 # Contributor(s):
 #   Makoto Kato <m_kato@ga2.so-net.ne.jp>
+#   Andrew Sutherland <asutherland@asutherland.org>
 #
 # Alternatively, the contents of this file may be used under the terms of
 # either the GNU General Public License Version 2 or later (the "GPL"), or
 # the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 # in which case the provisions of the GPL or the LGPL are applicable instead
 # of those above. If you wish to allow use of your version of this file only
 # under the terms of either the GPL or the LGPL, and not to allow others to
 # use your version of this file under the terms of the MPL, indicate your
@@ -63,16 +64,17 @@ print '''/* -*- Mode: C++; tab-width: 4;
  * The Original Code is mozilla.org code.
  *
  * The Initial Developer of the Original Code is Mozilla Japan.
  * Portions created by the Initial Developer are Copyright (C) 2010
  * the Initial Developer. All Rights Reserved.
  *
  * Contributor(s):
  *   Makoto Kato <m_kato@ga2.so-net.ne.jp>
+ *   Andrew Sutherland <asutherland@asutherland.org>
  *
  * Alternatively, the contents of this file may be used under the terms of
  * either of the GNU General Public License Version 2 or later (the "GPL"),
  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  * in which case the provisions of the GPL or the LGPL are applicable instead
  * of those above. If you wish to allow use of your version of this file only
  * under the terms of either the GPL or the LGPL, and not to allow others to
  * use your version of this file under the terms of the MPL, indicate your
@@ -81,63 +83,133 @@ print '''/* -*- Mode: C++; tab-width: 4;
  * the provisions above, a recipient may use your version of this file under
  * the terms of any one of the MPL, the GPL or the LGPL.
  *
  * ***** END LICENSE BLOCK ***** */
 
 /* THIS FILE IS GENERATED BY generate_table.py.  DON'T EDIT THIS */
 '''
 
-p = re.compile('([0-9A-F]{4,5})[=\>]([0-9A-F]{4,5})')
+p = re.compile('([0-9A-F]{4,5})(?:\.\.([0-9A-F]{4,5}))?[=\>]([0-9A-F]{4,5})?')
+G_FROM = 1
+G_TO = 2
+G_FIRSTVAL = 3
+
+# Array whose value at index i is the unicode value unicode character i should
+# map to.
 array = []
+# Contents of gNormalizeTable.  We insert zero entries for sub-pages where we
+# have no mappings.  We insert references to the generated sub-tables where
+# we do have mappings.
 globalTable = []
 globalTable.append("0")
+# The (exclusive) upper bound of the conversion table, unicode character-wise.
+# This is 0x10000 because our generated table is only 16-bit.  This also limits
+# the values we can map to; we perform an identity mapping for target values
+# that are >= maxmapping.
 maxmapping = 0x10000
 sizePerTable = 64
 
-# loading case insensitive table
+# Map characters that the mapping tells us to obliterate to the NUKE_CHAR
+# (such lines look like "FFF0..FFF8>")
+# We do this because otherwise we would emit these characters as
+# part of a token, which we definitely don't want.
+NUKE_CHAR = 0x20
+
+# --- load case folding table
+# entries in the file look like:
+#  0041>0061
+#  02D8>0020 0306
+#  2000..200A>0020
+#
+# The 0041 (uppercase A) tells us it lowercases to 0061 (lowercase a).
+# The 02D8 is a "spacing clone[s] of diacritic" breve which gets decomposed into
+#  a space character and a breve.  This entry/type of entry also shows up in
+#  'nfkc.txt'.
+# The 2000..200A covers a range of space characters and maps them down to the
+#  'normal' space character.
 
 file = open('nfkc_cf.txt')
-line = file.readline()
-m = p.match(line)
+
+m = None
+line = "\n"
 i = 0x0
 while i < maxmapping and line:
-	if not m:
-		line = file.readline()
-		m = p.match(line)
+        if not m:
+                line = file.readline()
+                m = p.match(line)
+                if not m:
+                        continue
+                low = int(m.group(G_FROM), 16)
+                # if G_TO is present, use it, otherwise fallback to low
+                high = m.group(G_TO) and int(m.group(G_TO), 16) or low
+                # if G_FIRSTVAL is present use it, otherwise use NUKE_CHAR
+                val = (m.group(G_FIRSTVAL) and int(m.group(G_FIRSTVAL), 16)
+                                           or NUKE_CHAR)
 		continue
 
-	if i == int(m.group(1), 16):
-		if int(m.group(2), 16) >= maxmapping:
+
+        if i >= low and i <= high:
+		if val >= maxmapping:
 			array.append(i)
 		else:
-			array.append(int(m.group(2), 16))
-		m = None
+			array.append(val)
+                if i == high:
+                        m = None
 	else:
 		array.append(i)
 	i = i + 1
 file.close()
 
-# loading normalized table
-
+# --- load normalization / decomposition table
+# It is important that this file gets processed second because the other table
+# will tell us about mappings from uppercase U with diaeresis to lowercase u
+# with diaeresis.  We obviously don't want that clobbering our value.  (Although
+# this would work out if we propagated backwards rather than forwards...)
+#
+# - entries in this file that we care about look like:
+#  00A0>0020
+#  0100=0041 0304
+#
+# They are found in the "Canonical and compatibility decomposition mappings"
+# section.
+#
+# The 00A0 is mapping NBSP to the normal space character.
+# The 0100 (a capital A with a bar over top of it) is equivalent to 0041
+#  (capital A) plus a 0304 (combining overline).  We do not care about the
+#  combining marks, which is why our regular expression does not capture them.
+#
+#
+# - entries that we do not care about look like:
+#  0300..0314:230
+#
+# These map marks to their canonical combining class which appears to be a way
+# of specifying the precedence / order in which marks should be combined.  The
+# key thing is we don't care about them.
 file = open('nfkc.txt')
 line = file.readline()
 m = p.match(line)
 while line:
 	if not m:
 		line = file.readline()
 		m = p.match(line)
 		continue
 
-	if m and (int(m.group(1), 16) < maxmapping) and (int(m.group(2), 16) < maxmapping):
-		array[int(m.group(1), 16)] = int(m.group(2), 16)
+        low = int(m.group(G_FROM), 16)
+        # if G_TO is present, use it, otherwise fallback to low
+        high = m.group(G_TO) and int(m.group(G_TO), 16) or low
+        # if G_FIRSTVAL is present use it, otherwise fall back to NUKE_CHAR
+        val = m.group(G_FIRSTVAL) and int(m.group(G_FIRSTVAL), 16) or NUKE_CHAR
+        for i in range(low, high+1):
+                if i < maxmapping and val < maxmapping:
+                        array[i] = val
 	m = None
 file.close()
 
-# generate a noamlzied table to support case insensitive and accent
+# --- generate a normalized table to support case and accent folding
 
 i = 0
 needTerm = False;
 while i < maxmapping:
 	if not i % sizePerTable:
 		# table is empty?
 		j = i
 		while j < i + sizePerTable:
@@ -152,22 +224,26 @@ while i < maxmapping:
 			continue
 
 		if needTerm:
 			print "};\n"
 		globalTable.append("gNormalizeTable%04x" % i)
 		print "static const unsigned short gNormalizeTable%04x[] = {\n\t" % i,
 		print "/* U+%04x */\n\t" % i,
 		needTerm = True
-	try:
-		c = array[array[i]]
-	except:
-		c = array[i]
+        # Decomposition does not case-fold, so we want to compensate by
+        # performing a lookup here, repeating until the value stops changing,
+        # because decomposition chains can be several levels deep.
+        # Example: 01d5, a capital U with a diaeresis and a bar (yes, really):
+        # 01d5 -> 00dc -> 0055 (U) -> 0075 (u)
+        c = array[i]
+        while c != array[c]:
+                c = array[c]
         if c >= 0x41 and c <= 0x5a:
-            c = c + 0x20
+                raise Exception('got an uppercase character somehow: %x => %x'
+                                % (i, c))
 	print "0x%04x," % c,
 	i = i + 1
 	if not i % 8:
 		print "\n\t",
 
 print "};\n\nstatic const unsigned short* gNormalizeTable[] = {",
 i = 0
 while i < (maxmapping / sizePerTable):
@@ -176,13 +252,13 @@ while i < (maxmapping / sizePerTable):
 	print globalTable[i] + ",", 
 	i += 1
 
 print '''
 };
 
 unsigned int normalize_character(const unsigned int c)
 {
-  if (c > 0x10000 || !gNormalizeTable[c >> 6])
+  if (c > ''' + ('0x%x' % (maxmapping,)) + ''' || !gNormalizeTable[c >> 6])
     return c;
   return gNormalizeTable[c >> 6][c & 0x3f];
 }
 '''
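For orientation, a small Python model of the two-level lookup that the emitted normalize_character() performs: the top-level array is indexed by c >> 6 and holds either 0 (identity page) or a 64-entry sub-table indexed by c & 0x3f.  The page contents here are invented purely for illustration; the real pages are the ones generate_table.py prints.

SIZE_PER_TABLE = 64
MAX_MAPPING = 0x10000

# hypothetical page covering U+0040..U+007F: fold ASCII A-Z down to a-z
page_0040 = [c + 0x20 if 0x41 <= c <= 0x5a else c for c in range(0x40, 0x80)]
top_level = [None] * (MAX_MAPPING // SIZE_PER_TABLE)
top_level[0x40 >> 6] = page_0040

def normalize_character(c):
    if c >= MAX_MAPPING or not top_level[c >> 6]:
        return c
    return top_level[c >> 6][c & 0x3f]

assert normalize_character(ord('A')) == ord('a')
assert normalize_character(0x3042) == 0x3042  # no page for this range: identity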
--- a/mailnews/extensions/fts3/src/Normalize.c
+++ b/mailnews/extensions/fts3/src/Normalize.c
@@ -15,16 +15,17 @@
  * The Original Code is mozilla.org code.
  *
  * The Initial Developer of the Original Code is Mozilla Japan.
  * Portions created by the Initial Developer are Copyright (C) 2010
  * the Initial Developer. All Rights Reserved.
  *
  * Contributor(s):
  *   Makoto Kato <m_kato@ga2.so-net.ne.jp>
+ *   Andrew Sutherland <asutherland@asutherland.org>
  *
  * Alternatively, the contents of this file may be used under the terms of
  * either of the GNU General Public License Version 2 or later (the "GPL"),
  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  * in which case the provisions of the GPL or the LGPL are applicable instead
  * of those above. If you wish to allow use of your version of this file only
  * under the terms of either the GPL or the LGPL, and not to allow others to
  * use your version of this file under the terms of the MPL, indicate your
@@ -51,17 +52,17 @@ static const unsigned short gNormalizeTa
 
 static const unsigned short gNormalizeTable0080[] = {
 	/* U+0080 */
 	0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 
 	0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 
 	0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 
 	0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, 
 	0x0020, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 
-	0x0020, 0x00a9, 0x0061, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x0020, 
+	0x0020, 0x00a9, 0x0061, 0x00ab, 0x00ac, 0x0020, 0x00ae, 0x0020, 
 	0x00b0, 0x00b1, 0x0032, 0x0033, 0x0020, 0x03bc, 0x00b6, 0x00b7, 
 	0x0020, 0x0031, 0x006f, 0x00bb, 0x0031, 0x0031, 0x0033, 0x00bf, 
 	};
 
 static const unsigned short gNormalizeTable00c0[] = {
 	/* U+00c0 */
 	0x0061, 0x0061, 0x0061, 0x0061, 0x0061, 0x0061, 0x00e6, 0x0063, 
 	0x0065, 0x0065, 0x0065, 0x0065, 0x0069, 0x0069, 0x0069, 0x0069, 
@@ -167,17 +168,17 @@ static const unsigned short gNormalizeTa
 	0x02e8, 0x02e9, 0x02ea, 0x02eb, 0x02ec, 0x02ed, 0x02ee, 0x02ef, 
 	0x02f0, 0x02f1, 0x02f2, 0x02f3, 0x02f4, 0x02f5, 0x02f6, 0x02f7, 
 	0x02f8, 0x02f9, 0x02fa, 0x02fb, 0x02fc, 0x02fd, 0x02fe, 0x02ff, 
 	};
 
 static const unsigned short gNormalizeTable0340[] = {
 	/* U+0340 */
 	0x0300, 0x0301, 0x0342, 0x0313, 0x0308, 0x03b9, 0x0346, 0x0347, 
-	0x0348, 0x0349, 0x034a, 0x034b, 0x034c, 0x034d, 0x034e, 0x034f, 
+	0x0348, 0x0349, 0x034a, 0x034b, 0x034c, 0x034d, 0x034e, 0x0020, 
 	0x0350, 0x0351, 0x0352, 0x0353, 0x0354, 0x0355, 0x0356, 0x0357, 
 	0x0358, 0x0359, 0x035a, 0x035b, 0x035c, 0x035d, 0x035e, 0x035f, 
 	0x0360, 0x0361, 0x0362, 0x0363, 0x0364, 0x0365, 0x0366, 0x0367, 
 	0x0368, 0x0369, 0x036a, 0x036b, 0x036c, 0x036d, 0x036e, 0x036f, 
 	0x0371, 0x0371, 0x0373, 0x0373, 0x02b9, 0x0375, 0x0377, 0x0377, 
 	0x0378, 0x0379, 0x0020, 0x037b, 0x037c, 0x037d, 0x003b, 0x037f, 
 	};
 
@@ -192,17 +193,17 @@ static const unsigned short gNormalizeTa
 	0x03c5, 0x03b1, 0x03b2, 0x03b3, 0x03b4, 0x03b5, 0x03b6, 0x03b7, 
 	0x03b8, 0x03b9, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03be, 0x03bf, 
 	};
 
 static const unsigned short gNormalizeTable03c0[] = {
 	/* U+03c0 */
 	0x03c0, 0x03c1, 0x03c3, 0x03c3, 0x03c4, 0x03c5, 0x03c6, 0x03c7, 
 	0x03c8, 0x03c9, 0x03b9, 0x03c5, 0x03bf, 0x03c5, 0x03c9, 0x03d7, 
-	0x03b2, 0x03b8, 0x03c5, 0x03a5, 0x03a5, 0x03c6, 0x03c0, 0x03d7, 
+	0x03b2, 0x03b8, 0x03c5, 0x03c5, 0x03c5, 0x03c6, 0x03c0, 0x03d7, 
 	0x03d9, 0x03d9, 0x03db, 0x03db, 0x03dd, 0x03dd, 0x03df, 0x03df, 
 	0x03e1, 0x03e1, 0x03e3, 0x03e3, 0x03e5, 0x03e5, 0x03e7, 0x03e7, 
 	0x03e9, 0x03e9, 0x03eb, 0x03eb, 0x03ed, 0x03ed, 0x03ef, 0x03ef, 
 	0x03ba, 0x03c1, 0x03c3, 0x03f3, 0x03b8, 0x03b5, 0x03f6, 0x03f8, 
 	0x03f8, 0x03c3, 0x03fb, 0x03fb, 0x03fc, 0x037b, 0x037c, 0x037d, 
 	};
 
 static const unsigned short gNormalizeTable0400[] = {
@@ -572,16 +573,52 @@ static const unsigned short gNormalizeTa
 	0x10d0, 0x10d1, 0x10d2, 0x10d3, 0x10d4, 0x10d5, 0x10d6, 0x10d7, 
 	0x10d8, 0x10d9, 0x10da, 0x10db, 0x10dc, 0x10dd, 0x10de, 0x10df, 
 	0x10e0, 0x10e1, 0x10e2, 0x10e3, 0x10e4, 0x10e5, 0x10e6, 0x10e7, 
 	0x10e8, 0x10e9, 0x10ea, 0x10eb, 0x10ec, 0x10ed, 0x10ee, 0x10ef, 
 	0x10f0, 0x10f1, 0x10f2, 0x10f3, 0x10f4, 0x10f5, 0x10f6, 0x10f7, 
 	0x10f8, 0x10f9, 0x10fa, 0x10fb, 0x10dc, 0x10fd, 0x10fe, 0x10ff, 
 	};
 
+static const unsigned short gNormalizeTable1140[] = {
+	/* U+1140 */
+	0x1140, 0x1141, 0x1142, 0x1143, 0x1144, 0x1145, 0x1146, 0x1147, 
+	0x1148, 0x1149, 0x114a, 0x114b, 0x114c, 0x114d, 0x114e, 0x114f, 
+	0x1150, 0x1151, 0x1152, 0x1153, 0x1154, 0x1155, 0x1156, 0x1157, 
+	0x1158, 0x1159, 0x115a, 0x115b, 0x115c, 0x115d, 0x115e, 0x0020, 
+	0x0020, 0x1161, 0x1162, 0x1163, 0x1164, 0x1165, 0x1166, 0x1167, 
+	0x1168, 0x1169, 0x116a, 0x116b, 0x116c, 0x116d, 0x116e, 0x116f, 
+	0x1170, 0x1171, 0x1172, 0x1173, 0x1174, 0x1175, 0x1176, 0x1177, 
+	0x1178, 0x1179, 0x117a, 0x117b, 0x117c, 0x117d, 0x117e, 0x117f, 
+	};
+
+static const unsigned short gNormalizeTable1780[] = {
+	/* U+1780 */
+	0x1780, 0x1781, 0x1782, 0x1783, 0x1784, 0x1785, 0x1786, 0x1787, 
+	0x1788, 0x1789, 0x178a, 0x178b, 0x178c, 0x178d, 0x178e, 0x178f, 
+	0x1790, 0x1791, 0x1792, 0x1793, 0x1794, 0x1795, 0x1796, 0x1797, 
+	0x1798, 0x1799, 0x179a, 0x179b, 0x179c, 0x179d, 0x179e, 0x179f, 
+	0x17a0, 0x17a1, 0x17a2, 0x17a3, 0x17a4, 0x17a5, 0x17a6, 0x17a7, 
+	0x17a8, 0x17a9, 0x17aa, 0x17ab, 0x17ac, 0x17ad, 0x17ae, 0x17af, 
+	0x17b0, 0x17b1, 0x17b2, 0x17b3, 0x0020, 0x0020, 0x17b6, 0x17b7, 
+	0x17b8, 0x17b9, 0x17ba, 0x17bb, 0x17bc, 0x17bd, 0x17be, 0x17bf, 
+	};
+
+static const unsigned short gNormalizeTable1800[] = {
+	/* U+1800 */
+	0x1800, 0x1801, 0x1802, 0x1803, 0x1804, 0x1805, 0x1806, 0x1807, 
+	0x1808, 0x1809, 0x180a, 0x0020, 0x0020, 0x0020, 0x180e, 0x180f, 
+	0x1810, 0x1811, 0x1812, 0x1813, 0x1814, 0x1815, 0x1816, 0x1817, 
+	0x1818, 0x1819, 0x181a, 0x181b, 0x181c, 0x181d, 0x181e, 0x181f, 
+	0x1820, 0x1821, 0x1822, 0x1823, 0x1824, 0x1825, 0x1826, 0x1827, 
+	0x1828, 0x1829, 0x182a, 0x182b, 0x182c, 0x182d, 0x182e, 0x182f, 
+	0x1830, 0x1831, 0x1832, 0x1833, 0x1834, 0x1835, 0x1836, 0x1837, 
+	0x1838, 0x1839, 0x183a, 0x183b, 0x183c, 0x183d, 0x183e, 0x183f, 
+	};
+
 static const unsigned short gNormalizeTable1b00[] = {
 	/* U+1b00 */
 	0x1b00, 0x1b01, 0x1b02, 0x1b03, 0x1b04, 0x1b05, 0x1b05, 0x1b07, 
 	0x1b07, 0x1b09, 0x1b09, 0x1b0b, 0x1b0b, 0x1b0d, 0x1b0d, 0x1b0f, 
 	0x1b10, 0x1b11, 0x1b11, 0x1b13, 0x1b14, 0x1b15, 0x1b16, 0x1b17, 
 	0x1b18, 0x1b19, 0x1b1a, 0x1b1b, 0x1b1c, 0x1b1d, 0x1b1e, 0x1b1f, 
 	0x1b20, 0x1b21, 0x1b22, 0x1b23, 0x1b24, 0x1b25, 0x1b26, 0x1b27, 
 	0x1b28, 0x1b29, 0x1b2a, 0x1b2b, 0x1b2c, 0x1b2d, 0x1b2e, 0x1b2f, 
@@ -683,81 +720,81 @@ static const unsigned short gNormalizeTa
 	0x0075, 0x0075, 0x0075, 0x0075, 0x0075, 0x0075, 0x0075, 0x0075, 
 	0x0075, 0x0075, 0x0079, 0x0079, 0x0079, 0x0079, 0x0079, 0x0079, 
 	0x0079, 0x0079, 0x1efb, 0x1efb, 0x1efd, 0x1efd, 0x1eff, 0x1eff, 
 	};
 
 static const unsigned short gNormalizeTable1f00[] = {
 	/* U+1f00 */
 	0x03b1, 0x03b1, 0x03b1, 0x03b1, 0x03b1, 0x03b1, 0x03b1, 0x03b1, 
-	0x03b1, 0x03b1, 0x0391, 0x0391, 0x0391, 0x0391, 0x0391, 0x0391, 
+	0x03b1, 0x03b1, 0x03b1, 0x03b1, 0x03b1, 0x03b1, 0x03b1, 0x03b1, 
 	0x03b5, 0x03b5, 0x03b5, 0x03b5, 0x03b5, 0x03b5, 0x1f16, 0x1f17, 
-	0x03b5, 0x03b5, 0x0395, 0x0395, 0x0395, 0x0395, 0x1f1e, 0x1f1f, 
+	0x03b5, 0x03b5, 0x03b5, 0x03b5, 0x03b5, 0x03b5, 0x1f1e, 0x1f1f, 
 	0x03b7, 0x03b7, 0x03b7, 0x03b7, 0x03b7, 0x03b7, 0x03b7, 0x03b7, 
-	0x03b7, 0x03b7, 0x0397, 0x0397, 0x0397, 0x0397, 0x0397, 0x0397, 
+	0x03b7, 0x03b7, 0x03b7, 0x03b7, 0x03b7, 0x03b7, 0x03b7, 0x03b7, 
 	0x03b9, 0x03b9, 0x03b9, 0x03b9, 0x03b9, 0x03b9, 0x03b9, 0x03b9, 
-	0x03b9, 0x03b9, 0x0399, 0x0399, 0x0399, 0x0399, 0x0399, 0x0399, 
+	0x03b9, 0x03b9, 0x03b9, 0x03b9, 0x03b9, 0x03b9, 0x03b9, 0x03b9, 
 	};
 
 static const unsigned short gNormalizeTable1f40[] = {
 	/* U+1f40 */
 	0x03bf, 0x03bf, 0x03bf, 0x03bf, 0x03bf, 0x03bf, 0x1f46, 0x1f47, 
-	0x03bf, 0x03bf, 0x039f, 0x039f, 0x039f, 0x039f, 0x1f4e, 0x1f4f, 
+	0x03bf, 0x03bf, 0x03bf, 0x03bf, 0x03bf, 0x03bf, 0x1f4e, 0x1f4f, 
 	0x03c5, 0x03c5, 0x03c5, 0x03c5, 0x03c5, 0x03c5, 0x03c5, 0x03c5, 
-	0x1f58, 0x03c5, 0x1f5a, 0x03a5, 0x1f5c, 0x03a5, 0x1f5e, 0x03a5, 
+	0x1f58, 0x03c5, 0x1f5a, 0x03c5, 0x1f5c, 0x03c5, 0x1f5e, 0x03c5, 
 	0x03c9, 0x03c9, 0x03c9, 0x03c9, 0x03c9, 0x03c9, 0x03c9, 0x03c9, 
-	0x03c9, 0x03c9, 0x03a9, 0x03a9, 0x03a9, 0x03a9, 0x03a9, 0x03a9, 
+	0x03c9, 0x03c9, 0x03c9, 0x03c9, 0x03c9, 0x03c9, 0x03c9, 0x03c9, 
 	0x03b1, 0x03b1, 0x03b5, 0x03b5, 0x03b7, 0x03b7, 0x03b9, 0x03b9, 
 	0x03bf, 0x03bf, 0x03c5, 0x03c5, 0x03c9, 0x03c9, 0x1f7e, 0x1f7f, 
 	};
 
 static const unsigned short gNormalizeTable1f80[] = {
 	/* U+1f80 */
-	0x03b1, 0x03b1, 0x1f00, 0x1f01, 0x1f00, 0x1f01, 0x1f00, 0x1f01, 
-	0x0391, 0x0391, 0x1f08, 0x1f09, 0x1f08, 0x1f09, 0x1f08, 0x1f09, 
-	0x03b7, 0x03b7, 0x1f20, 0x1f21, 0x1f20, 0x1f21, 0x1f20, 0x1f21, 
-	0x0397, 0x0397, 0x1f28, 0x1f29, 0x1f28, 0x1f29, 0x1f28, 0x1f29, 
-	0x03c9, 0x03c9, 0x1f60, 0x1f61, 0x1f60, 0x1f61, 0x1f60, 0x1f61, 
-	0x03a9, 0x03a9, 0x1f68, 0x1f69, 0x1f68, 0x1f69, 0x1f68, 0x1f69, 
+	0x03b1, 0x03b1, 0x03b1, 0x03b1, 0x03b1, 0x03b1, 0x03b1, 0x03b1, 
+	0x03b1, 0x03b1, 0x03b1, 0x03b1, 0x03b1, 0x03b1, 0x03b1, 0x03b1, 
+	0x03b7, 0x03b7, 0x03b7, 0x03b7, 0x03b7, 0x03b7, 0x03b7, 0x03b7, 
+	0x03b7, 0x03b7, 0x03b7, 0x03b7, 0x03b7, 0x03b7, 0x03b7, 0x03b7, 
+	0x03c9, 0x03c9, 0x03c9, 0x03c9, 0x03c9, 0x03c9, 0x03c9, 0x03c9, 
+	0x03c9, 0x03c9, 0x03c9, 0x03c9, 0x03c9, 0x03c9, 0x03c9, 0x03c9, 
 	0x03b1, 0x03b1, 0x03b1, 0x03b1, 0x03b1, 0x1fb5, 0x03b1, 0x03b1, 
-	0x03b1, 0x03b1, 0x03b1, 0x0391, 0x03b1, 0x0020, 0x03b9, 0x0020, 
+	0x03b1, 0x03b1, 0x03b1, 0x03b1, 0x03b1, 0x0020, 0x03b9, 0x0020, 
 	};
 
 static const unsigned short gNormalizeTable1fc0[] = {
 	/* U+1fc0 */
 	0x0020, 0x0020, 0x03b7, 0x03b7, 0x03b7, 0x1fc5, 0x03b7, 0x03b7, 
-	0x03b5, 0x0395, 0x03b7, 0x0397, 0x03b7, 0x0020, 0x0020, 0x0020, 
-	0x03b9, 0x03b9, 0x03b9, 0x03ca, 0x1fd4, 0x1fd5, 0x03b9, 0x03b9, 
-	0x03b9, 0x03b9, 0x03b9, 0x0399, 0x1fdc, 0x0020, 0x0020, 0x0020, 
-	0x03c5, 0x03c5, 0x03c5, 0x03cb, 0x03c1, 0x03c1, 0x03c5, 0x03c5, 
-	0x03c5, 0x03c5, 0x03c5, 0x03a5, 0x03c1, 0x0020, 0x00a8, 0x0060, 
+	0x03b5, 0x03b5, 0x03b7, 0x03b7, 0x03b7, 0x0020, 0x0020, 0x0020, 
+	0x03b9, 0x03b9, 0x03b9, 0x03b9, 0x1fd4, 0x1fd5, 0x03b9, 0x03b9, 
+	0x03b9, 0x03b9, 0x03b9, 0x03b9, 0x1fdc, 0x0020, 0x0020, 0x0020, 
+	0x03c5, 0x03c5, 0x03c5, 0x03c5, 0x03c1, 0x03c1, 0x03c5, 0x03c5, 
+	0x03c5, 0x03c5, 0x03c5, 0x03c5, 0x03c1, 0x0020, 0x0020, 0x0060, 
 	0x1ff0, 0x1ff1, 0x03c9, 0x03c9, 0x03c9, 0x1ff5, 0x03c9, 0x03c9, 
-	0x03bf, 0x039f, 0x03c9, 0x03a9, 0x03c9, 0x0020, 0x0020, 0x1fff, 
+	0x03bf, 0x03bf, 0x03c9, 0x03c9, 0x03c9, 0x0020, 0x0020, 0x1fff, 
 	};
 
 static const unsigned short gNormalizeTable2000[] = {
 	/* U+2000 */
 	0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 
-	0x0020, 0x0020, 0x0020, 0x200b, 0x200c, 0x200d, 0x200e, 0x200f, 
+	0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 
 	0x2010, 0x2010, 0x2012, 0x2013, 0x2014, 0x2015, 0x2016, 0x0020, 
 	0x2018, 0x2019, 0x201a, 0x201b, 0x201c, 0x201d, 0x201e, 0x201f, 
 	0x2020, 0x2021, 0x2022, 0x2023, 0x002e, 0x002e, 0x002e, 0x2027, 
-	0x2028, 0x2029, 0x202a, 0x202b, 0x202c, 0x202d, 0x202e, 0x0020, 
+	0x2028, 0x2029, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 
 	0x2030, 0x2031, 0x2032, 0x2032, 0x2032, 0x2035, 0x2035, 0x2035, 
 	0x2038, 0x2039, 0x203a, 0x203b, 0x0021, 0x203d, 0x0020, 0x203f, 
 	};
 
 static const unsigned short gNormalizeTable2040[] = {
 	/* U+2040 */
 	0x2040, 0x2041, 0x2042, 0x2043, 0x2044, 0x2045, 0x2046, 0x003f, 
 	0x003f, 0x0021, 0x204a, 0x204b, 0x204c, 0x204d, 0x204e, 0x204f, 
 	0x2050, 0x2051, 0x2052, 0x2053, 0x2054, 0x2055, 0x2056, 0x2032, 
 	0x2058, 0x2059, 0x205a, 0x205b, 0x205c, 0x205d, 0x205e, 0x0020, 
-	0x2060, 0x2061, 0x2062, 0x2063, 0x2064, 0x2065, 0x2066, 0x2067, 
-	0x2068, 0x2069, 0x206a, 0x206b, 0x206c, 0x206d, 0x206e, 0x206f, 
+	0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 
+	0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 
 	0x0030, 0x0069, 0x2072, 0x2073, 0x0034, 0x0035, 0x0036, 0x0037, 
 	0x0038, 0x0039, 0x002b, 0x2212, 0x003d, 0x0028, 0x0029, 0x006e, 
 	};
 
 static const unsigned short gNormalizeTable2080[] = {
 	/* U+2080 */
 	0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 
 	0x0038, 0x0039, 0x002b, 0x2212, 0x003d, 0x0028, 0x0029, 0x208f, 
@@ -1142,17 +1179,17 @@ static const unsigned short gNormalizeTa
 	};
 
 static const unsigned short gNormalizeTable3140[] = {
 	/* U+3140 */
 	0x111a, 0x1106, 0x1107, 0x1108, 0x1121, 0x1109, 0x110a, 0x110b, 
 	0x110c, 0x110d, 0x110e, 0x110f, 0x1110, 0x1111, 0x1112, 0x1161, 
 	0x1162, 0x1163, 0x1164, 0x1165, 0x1166, 0x1167, 0x1168, 0x1169, 
 	0x116a, 0x116b, 0x116c, 0x116d, 0x116e, 0x116f, 0x1170, 0x1171, 
-	0x1172, 0x1173, 0x1174, 0x1175, 0x1160, 0x1114, 0x1115, 0x11c7, 
+	0x1172, 0x1173, 0x1174, 0x1175, 0x0020, 0x1114, 0x1115, 0x11c7, 
 	0x11c8, 0x11cc, 0x11ce, 0x11d3, 0x11d7, 0x11d9, 0x111c, 0x11dd, 
 	0x11df, 0x111d, 0x111e, 0x1120, 0x1122, 0x1123, 0x1127, 0x1129, 
 	0x112b, 0x112c, 0x112d, 0x112e, 0x112f, 0x1132, 0x1136, 0x1140, 
 	};
 
 static const unsigned short gNormalizeTable3180[] = {
 	/* U+3180 */
 	0x1147, 0x114c, 0x11f1, 0x11f2, 0x1157, 0x1158, 0x1159, 0x1184, 
@@ -1558,18 +1595,18 @@ static const unsigned short gNormalizeTa
 	0xfde0, 0xfde1, 0xfde2, 0xfde3, 0xfde4, 0xfde5, 0xfde6, 0xfde7, 
 	0xfde8, 0xfde9, 0xfdea, 0xfdeb, 0xfdec, 0xfded, 0xfdee, 0xfdef, 
 	0x0635, 0x0642, 0x0627, 0x0627, 0x0645, 0x0635, 0x0631, 0x0639, 
 	0x0648, 0x0635, 0x0635, 0x062c, 0x0631, 0xfdfd, 0xfdfe, 0xfdff, 
 	};
 
 static const unsigned short gNormalizeTablefe00[] = {
 	/* U+fe00 */
-	0xfe00, 0xfe01, 0xfe02, 0xfe03, 0xfe04, 0xfe05, 0xfe06, 0xfe07, 
-	0xfe08, 0xfe09, 0xfe0a, 0xfe0b, 0xfe0c, 0xfe0d, 0xfe0e, 0xfe0f, 
+	0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 
+	0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 
 	0x002c, 0x3001, 0x3002, 0x003a, 0x003b, 0x0021, 0x003f, 0x3016, 
 	0x3017, 0x002e, 0xfe1a, 0xfe1b, 0xfe1c, 0xfe1d, 0xfe1e, 0xfe1f, 
 	0xfe20, 0xfe21, 0xfe22, 0xfe23, 0xfe24, 0xfe25, 0xfe26, 0xfe27, 
 	0xfe28, 0xfe29, 0xfe2a, 0xfe2b, 0xfe2c, 0xfe2d, 0xfe2e, 0xfe2f, 
 	0x002e, 0x2014, 0x2013, 0x005f, 0x005f, 0x0028, 0x0029, 0x007b, 
 	0x007d, 0x3014, 0x3015, 0x3010, 0x3011, 0x300a, 0x300b, 0x3008, 
 	};
 
@@ -1601,17 +1638,17 @@ static const unsigned short gNormalizeTa
 	/* U+fec0 */
 	0x0636, 0x0637, 0x0637, 0x0637, 0x0637, 0x0638, 0x0638, 0x0638, 
 	0x0638, 0x0639, 0x0639, 0x0639, 0x0639, 0x063a, 0x063a, 0x063a, 
 	0x063a, 0x0641, 0x0641, 0x0641, 0x0641, 0x0642, 0x0642, 0x0642, 
 	0x0642, 0x0643, 0x0643, 0x0643, 0x0643, 0x0644, 0x0644, 0x0644, 
 	0x0644, 0x0645, 0x0645, 0x0645, 0x0645, 0x0646, 0x0646, 0x0646, 
 	0x0646, 0x0647, 0x0647, 0x0647, 0x0647, 0x0648, 0x0648, 0x0649, 
 	0x0649, 0x064a, 0x064a, 0x064a, 0x064a, 0x0644, 0x0644, 0x0644, 
-	0x0644, 0x0644, 0x0644, 0x0644, 0x0644, 0xfefd, 0xfefe, 0xfeff, 
+	0x0644, 0x0644, 0x0644, 0x0644, 0x0644, 0xfefd, 0xfefe, 0x0020, 
 	};
 
 static const unsigned short gNormalizeTableff00[] = {
 	/* U+ff00 */
 	0xff00, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 
 	0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, 
 	0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 
 	0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 
@@ -1634,32 +1671,32 @@ static const unsigned short gNormalizeTa
 	};
 
 static const unsigned short gNormalizeTableff80[] = {
 	/* U+ff80 */
 	0x30bf, 0x30c1, 0x30c4, 0x30c6, 0x30c8, 0x30ca, 0x30cb, 0x30cc, 
 	0x30cd, 0x30ce, 0x30cf, 0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de, 
 	0x30df, 0x30e0, 0x30e1, 0x30e2, 0x30e4, 0x30e6, 0x30e8, 0x30e9, 
 	0x30ea, 0x30eb, 0x30ec, 0x30ed, 0x30ef, 0x30f3, 0x3099, 0x309a, 
-	0x1160, 0x1100, 0x1101, 0x11aa, 0x1102, 0x11ac, 0x11ad, 0x1103, 
+	0x0020, 0x1100, 0x1101, 0x11aa, 0x1102, 0x11ac, 0x11ad, 0x1103, 
 	0x1104, 0x1105, 0x11b0, 0x11b1, 0x11b2, 0x11b3, 0x11b4, 0x11b5, 
 	0x111a, 0x1106, 0x1107, 0x1108, 0x1121, 0x1109, 0x110a, 0x110b, 
 	0x110c, 0x110d, 0x110e, 0x110f, 0x1110, 0x1111, 0x1112, 0xffbf, 
 	};
 
 static const unsigned short gNormalizeTableffc0[] = {
 	/* U+ffc0 */
 	0xffc0, 0xffc1, 0x1161, 0x1162, 0x1163, 0x1164, 0x1165, 0x1166, 
 	0xffc8, 0xffc9, 0x1167, 0x1168, 0x1169, 0x116a, 0x116b, 0x116c, 
 	0xffd0, 0xffd1, 0x116d, 0x116e, 0x116f, 0x1170, 0x1171, 0x1172, 
 	0xffd8, 0xffd9, 0x1173, 0x1174, 0x1175, 0xffdd, 0xffde, 0xffdf, 
 	0x00a2, 0x00a3, 0x00ac, 0x0020, 0x00a6, 0x00a5, 0x20a9, 0xffe7, 
 	0x2502, 0x2190, 0x2191, 0x2192, 0x2193, 0x25a0, 0x25cb, 0xffef, 
-	0xfff0, 0xfff1, 0xfff2, 0xfff3, 0xfff4, 0xfff5, 0xfff6, 0xfff7, 
-	0xfff8, 0xfff9, 0xfffa, 0xfffb, 0xfffc, 0xfffd, 0xfffe, 0xffff, 
+	0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 
+	0x0020, 0xfff9, 0xfffa, 0xfffb, 0xfffc, 0xfffd, 0xfffe, 0xffff, 
 	};
 
 static const unsigned short* gNormalizeTable[] = { 
 	0, gNormalizeTable0040, gNormalizeTable0080, gNormalizeTable00c0, 
 	gNormalizeTable0100, gNormalizeTable0140, gNormalizeTable0180, gNormalizeTable01c0, 
 	gNormalizeTable0200, gNormalizeTable0240, gNormalizeTable0280, gNormalizeTable02c0, 
 	0, gNormalizeTable0340, gNormalizeTable0380, gNormalizeTable03c0, 
 	gNormalizeTable0400, gNormalizeTable0440, gNormalizeTable0480, gNormalizeTable04c0, 
@@ -1670,24 +1707,24 @@ static const unsigned short* gNormalizeT
 	gNormalizeTable0900, gNormalizeTable0940, 0, gNormalizeTable09c0, 
 	gNormalizeTable0a00, gNormalizeTable0a40, 0, 0, 
 	0, gNormalizeTable0b40, gNormalizeTable0b80, gNormalizeTable0bc0, 
 	0, gNormalizeTable0c40, 0, gNormalizeTable0cc0, 
 	0, gNormalizeTable0d40, 0, gNormalizeTable0dc0, 
 	gNormalizeTable0e00, 0, gNormalizeTable0e80, gNormalizeTable0ec0, 
 	gNormalizeTable0f00, gNormalizeTable0f40, gNormalizeTable0f80, 0, 
 	gNormalizeTable1000, 0, gNormalizeTable1080, gNormalizeTable10c0, 
+	0, gNormalizeTable1140, 0, 0, 
 	0, 0, 0, 0, 
 	0, 0, 0, 0, 
 	0, 0, 0, 0, 
 	0, 0, 0, 0, 
 	0, 0, 0, 0, 
-	0, 0, 0, 0, 
-	0, 0, 0, 0, 
-	0, 0, 0, 0, 
+	0, 0, gNormalizeTable1780, 0, 
+	gNormalizeTable1800, 0, 0, 0, 
 	0, 0, 0, 0, 
 	0, 0, 0, 0, 
 	gNormalizeTable1b00, gNormalizeTable1b40, 0, 0, 
 	0, 0, 0, 0, 
 	gNormalizeTable1d00, gNormalizeTable1d40, gNormalizeTable1d80, 0, 
 	gNormalizeTable1e00, gNormalizeTable1e40, gNormalizeTable1e80, gNormalizeTable1ec0, 
 	gNormalizeTable1f00, gNormalizeTable1f40, gNormalizeTable1f80, gNormalizeTable1fc0, 
 	gNormalizeTable2000, gNormalizeTable2040, gNormalizeTable2080, 0, 
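As a rough cross-check of what the tables above encode, Python's unicodedata can approximate the same folding; this stands in for the generated C tables and is not the tokenizer's actual code path.

import unicodedata

def fold(s):
    # decompose (compatibility), drop combining marks, then lowercase
    decomposed = unicodedata.normalize('NFKD', s)
    stripped = u''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    return stripped.lower()

# accent/case folding chains exercised by the new gloda tests:
assert fold(u'\u0100wesome') == u'awesome'                 # A-macron -> a
assert fold(u'\u03d4esterday') == fold(u'\u03c5esterday')  # 03d4 -> ... -> 03c5
# half-width katakana maps to its full-width form, as in gNormalizeTableff80:
assert unicodedata.normalize('NFKC', u'\uff80') == u'\u30bf'  # ﾀ -> タ
# the half-width voiced sound mark becomes U+3099, which the tokenizer's
# isVoicedSoundMark() then discards.
assert unicodedata.normalize('NFKD', u'\uff9e') == u'\u3099'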
--- a/mailnews/extensions/fts3/src/fts3_porter.c
+++ b/mailnews/extensions/fts3/src/fts3_porter.c
@@ -78,16 +78,25 @@ static const unsigned char sqlite3Utf8Tr
   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
   0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
   0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
 };
 
 typedef unsigned char u8;
 
+/**
+ * SQLite helper macro from sqlite3.c (really utf.c) to encode a unicode
+ * character into utf8.
+ *
+ * @param zOut A pointer to the current write position that is updated by
+ *     the routine.  At entry it should point to one-past the last valid
+ *     encoded byte.  The same holds true at exit.
+ * @param c The character to encode; this should be an unsigned int.
+ */
 #define WRITE_UTF8(zOut, c) {                          \
   if( c<0x0080 ){                                      \
     *zOut++ = (u8)(c&0xff);                            \
   }                                                    \
   else if( c<0x0800 ){                                 \
     *zOut++ = 0xC0 + (u8)((c>>6) & 0x1F);              \
     *zOut++ = 0x80 + (u8)(c & 0x3F);                   \
   }                                                    \
@@ -99,16 +108,31 @@ typedef unsigned char u8;
     *zOut++ = 0xf0 + (u8)((c>>18) & 0x07);             \
     *zOut++ = 0x80 + (u8)((c>>12) & 0x3F);             \
     *zOut++ = 0x80 + (u8)((c>>6) & 0x3F);              \
     *zOut++ = 0x80 + (u8)(c & 0x3F);                   \
   }                                                    \
 }
 
 /**
+ * Fudge factor to avoid buffer overwrites when WRITE_UTF8 is involved.
+ *
+ * Our normalization table includes entries that may result in a larger
+ *  utf-8 encoding.  Namely, 023a maps to 2c65.  This is a growth from 2 bytes
+ *  as utf-8 encoded to 3 bytes.  This is currently the only transition possible
+ *  because 1-byte encodings are known to stay 1-byte and our normalization
+ *  table is 16-bit and so can't generate a 4-byte encoded output.
+ *
+ * For simplicity, we just multiple by 2 which covers the current case and
+ *  potential growth for 2-byte to 4-byte growth.  We can afford to do this
+ *  because we're not talking about a lot of memory here as a rule.
+ */
+#define MAX_UTF8_GROWTH_FACTOR 2
+
+/**
  * Helper from sqlite3.c to read a single UTF8 character.
  *
  * The clever bit with multi-byte reading is that you keep going until you find
  *  a byte whose top bits are not '10'.  A single-byte UTF8 character will have
  *  '00' or '01', and a multi-byte UTF8 character must start with '11'.
  *
  * In the event of illegal UTF-8 this macro may read an arbitrary number of
  *  characters but will never read past zTerm.  The resulting character value
@@ -116,27 +140,28 @@ typedef unsigned char u8;
  *  illegal character (0xfffd) for UTF-16 surrogates.
  *
  * @param zIn A pointer to the current position that is updated by the routine,
  *     pointing at the start of the next character when the routine returns.
  * @param zTerm A pointer one past the end of the buffer.
  * @param c The 'unsigned int' to hold the resulting character value.  Do not
  *      use a short or a char.
  */
-#define READ_UTF8(zIn, zTerm, c)                           \
+#define READ_UTF8(zIn, zTerm, c) {                         \
   c = *(zIn++);                                            \
   if( c>=0xc0 ){                                           \
     c = sqlite3Utf8Trans1[c-0xc0];                         \
     while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){            \
       c = (c<<6) + (0x3f & *(zIn++));                      \
     }                                                      \
     if( c<0x80                                             \
         || (c&0xFFFFF800)==0xD800                          \
         || (c&0xFFFFFFFE)==0xFFFE ){  c = 0xFFFD; }        \
-  }
+  }                                                        \
+}
 
 /* end of compatible block to complie codes */
 
 /*
 ** Class derived from sqlite3_tokenizer
 */
 typedef struct porter_tokenizer {
   sqlite3_tokenizer base;      /* Base class */
@@ -146,17 +171,17 @@ typedef struct porter_tokenizer {
 ** Class derived from sqlit3_tokenizer_cursor
 */
 typedef struct porter_tokenizer_cursor {
   sqlite3_tokenizer_cursor base;
   const char *zInput;          /* input we are tokenizing */
   int nInput;                  /* size of the input */
   int iOffset;                 /* current position in zInput */
   int iToken;                  /* index of next token to be returned */
-  char *zToken;                /* storage for current token */
+  unsigned char *zToken;       /* storage for current token */
   int nAllocated;              /* space allocated to zToken buffer */
   /**
    * Store the offset of the second character in the bi-gram pair that we just
    *  emitted so that we can consider it being the first character in a bi-gram
    *  pair.
    * The value 0 indicates that there is no previous such character.  This is
    *  an acceptable sentinel value because the 0th offset can never be the
    *  offset of the second in a bi-gram pair.
@@ -402,65 +427,112 @@ static int stem(
   if( xCond && !xCond(z) ) return 1;
   while( *zTo ){
     *(--z) = *(zTo++);
   }
   *pz = z;
   return 1;
 }
 
+/**
+ * The voiced sound mark only occurs in Japanese.  It is like an accent and
+ * combines with the previous character.  For example, "サ" (Katakana) plus
+ * "゛" (voiced sound mark) is "ザ".  The full-width character mapping has
+ * combined characters like "ザ", but the half-width Katakana mapping does not.
+ */
 static int isVoicedSoundMark(const unsigned int c)
 {
   if (c == 0xff9e || c == 0xff9f || c == 0x3099 || c == 0x309a)
     return 1;
   return 0;
 }
 
-/*
-** This is the fallback stemmer used when the porter stemmer is
-** inappropriate.  The input word is copied into the output with
-** US-ASCII case folding.  If the input word is too long (more
-** than 20 bytes if it contains no digits or more than 6 bytes if
-** it contains digits) then word is truncated to 20 or 6 bytes
-** by taking 10 or 3 bytes from the beginning and end.
-**
-** Note:
-** This is UTF-8 safe-ish.  If truncation occurs bytes can get mashed together
-** in ways that produce illegal UTF-8.  The resulting mashed-up byte strings
-** will not be directly exposed to the user and will not do anything dangerous,
-** but could result in confusing behavior if queries using wildcard support
-** ("*") are involved.  This is sufficiently harmless that we're not going to
-** worry about it for now.
-*/
-static void copy_stemmer(const unsigned char *zIn, const int nIn, unsigned char *zOut, int *pnOut){
-  int i, mx;
-  int hasDigit = 0;
-  const unsigned char *zInTerm = zIn + nIn;
+/**
+ * How many unicode characters to take from the front and back of a term in
+ * |copy_stemmer|.
+ */
+#define COPY_STEMMER_COPY_HALF_LEN 10
+
+/**
+ * Normalizing but non-stemming term copying.
+ *
+ * The original function would take 10 bytes from the front and 10 bytes from
+ * the back if there were no digits in the string and it was more than 20
+ * bytes long.  If there were digits involved that would decrease to 3 bytes
+ * from the front and 3 from the back.  This would potentially corrupt utf-8
+ * encoded characters, which is fine from the perspective of the FTS3 logic.
+ *
+ * In our revised form we now operate on a unicode character basis rather than
+ * a byte basis.  Additionally we use the same length limit even if there are
+ * digits involved because it's not clear digit token-space reduction is saving
+ * us from anything and could be hurting.  Specifically, if no one is ever
+ * going to search on things with digits, then we should just remove them.
+ * Right now, the space reduction is going to increase false positives when
+ * people do search on them and increase the number of collisions sufficiently
+ * to make it really expensive.  The caveat is there will be some increase in
+ * index size which could be meaningful if people are receiving lots of emails
+ * full of distinct numbers.
+ *
+ * In order to do the copy-from-the-front and copy-from-the-back trick, once
+ * we reach N characters in, we set zFrontEnd to the current value of zOut
+ * (which represents the termination of the first part of the result string)
+ * and set zBackStart to the value of zOutStart.  We then advance zBackStart
+ * along a character at a time as we write more characters.  Once we have
+ * traversed the entire string, if zBackStart > zFrontEnd, then we know
+ * the string should be shrunk using the characters in the two ranges.
+ *
+ * (It would be faster to scan from the back with specialized logic but that
+ * particular logic seems easy to screw up and we don't have unit tests in here
+ * to the extent required.)
+ *
+ * @param zIn Input string to normalize and potentially shrink.
+ * @param nBytesIn The number of bytes in zIn, distinct from the number of
+ *     unicode characters encoded in zIn.
+ * @param zOut The string to write our output into.  This must have at least
+ *     nBytesIn * MAX_UTF8_GROWTH_FACTOR bytes available in order to
+ *     compensate for normalization that results in a larger utf-8 encoding.
+ * @param pnBytesOut Integer to write the number of bytes in zOut into.
+ */
+static void copy_stemmer(const unsigned char *zIn, const int nBytesIn,
+                         unsigned char *zOut, int *pnBytesOut){
+  const unsigned char *zInTerm = zIn + nBytesIn;
   unsigned char *zOutStart = zOut;
   unsigned int c;
+  unsigned int charCount = 0;
+  unsigned char *zFrontEnd = NULL, *zBackStart = NULL;
+  unsigned int trashC;
 
   /* copy normalized character */
   while (zIn < zInTerm) {
     READ_UTF8(zIn, zInTerm, c);
     c = normalize_character(c);
+
     /* ignore voiced/semi-voiced sound mark */
     if (!isVoicedSoundMark(c)) {
-      if( c>='0' && c<='9' ) hasDigit = 1;
+      /* advance zBackStart by one (non-voiced-sound-mark) character. */
+      if (zBackStart)
+        READ_UTF8(zBackStart, zOut, trashC);
+
       WRITE_UTF8(zOut, c);
+      charCount++;
+      if (charCount == COPY_STEMMER_COPY_HALF_LEN) {
+        zFrontEnd = zOut;
+        zBackStart = zOutStart;
+      }
     }
   }
-  mx = hasDigit ? 3 : 10;
-  if ((zOut - zOutStart) > mx * 2) {
-    zOut = zOutStart + mx;
-    for (i = nIn - mx; i < nIn; i++) {
-      *zOut++ = zOutStart[i];
-    }
+
+  /* if we need to shrink the string, transplant the back bytes */
+  if (zBackStart > zFrontEnd) { /* this handles when both are null too */
+    size_t backBytes = zOut - zBackStart;
+    memmove(zFrontEnd, zBackStart, backBytes);
+    zOut = zFrontEnd + backBytes;
   }
   *zOut = 0;
-  *pnOut = zOut - zOutStart;
+  *pnBytesOut = zOut - zOutStart;
 }
 
 
 /*
 ** Stem the input word zIn[0..nIn-1].  Store the output in zOut.
 ** zOut is at least big enough to hold nIn bytes.  Write the actual
 ** size of the output word (exclusive of the '\0' terminator) into *pnOut.
 **
@@ -477,17 +549,17 @@ static void copy_stemmer(const unsigned 
 ** If the input word contains not digits but does characters not 
 ** in [a-zA-Z] then no stemming is attempted and this routine just 
 ** copies the input into the input into the output with US-ASCII
 ** case folding.
 **
 ** Stemming never increases the length of the word.  So there is
 ** no chance of overflowing the zOut buffer.
 */
-static void porter_stemmer(const unsigned char *zIn, unsigned int nIn, char *zOut, int *pnOut){
+static void porter_stemmer(const unsigned char *zIn, unsigned int nIn, unsigned char *zOut, int *pnOut){
   unsigned int i, j, c;
   char zReverse[28];
   char *z, *z2;
   const unsigned char *zTerm = zIn + nIn;
   unsigned char *zTmp = zIn;
 
   if( nIn<3 || nIn>=sizeof(zReverse)-7 ){
     /* The word is too big or too small for the porter stemmer.
@@ -858,20 +930,20 @@ static int isDelim(
  *    characters.  We run the porter stemmer algorithm against the word.
  *    Because we have no way to know what is and is not an English word
  *    (the only language for which the porter stemmer was designed), this
  *    could theoretically map multiple words that are not variations of the
  *    same word down to the same root, resulting in potentially unexpected
  *    result inclusions in the search results.  We accept this result because
  *    there's not a lot we can do about it and false positives are much
  *    better than false negatives.
- * - A copied token; ASCII case-folded but not stemmed.  We call the porter
+ * - A copied token; case/accent-folded but not stemmed.  We call the porter
  *    stemmer for all non-CJK cases and it diverts to the copy stemmer if it
- *    sees any non-ASCII characters or the string is too long.  The copy
- *    stemmer will truncate the string if it is deemed too long.
+ *    sees any non-ASCII characters (after folding) or if the string is too
+ *    long.  The copy stemmer will shrink the string if it is deemed too long.
  * - A bi-gram token; two CJK-ish characters.  For query reasons we generate a
  *    series of overlapping bi-grams.  (We can't require the user to start their
  *    search based on the arbitrary context of the indexed documents.)
  *
  * It may be useful to think of this function as operating at the points between
  *  characters.  While we are considering the 'current' character (the one after
  *  the 'point'), we are also interested in the 'previous' character (the one
  *  preceding the point).
@@ -1009,18 +1081,18 @@ static int porterNext(
         // wildcard case:
         (numChars == 1 && iStartOffset == 0 &&
          (c->iOffset >= 3) &&
          (c->iOffset == c->nInput - 1) &&
          (z[c->iOffset] == '*'))) {
       /* figure out the number of bytes to copy/stem */
       int n = c->iOffset - iStartOffset;
       /* make sure there is enough buffer space */
-      if (n > c->nAllocated) {
-        c->nAllocated = n + 20;
+      if (n * MAX_UTF8_GROWTH_FACTOR > c->nAllocated) {
+        c->nAllocated = n * MAX_UTF8_GROWTH_FACTOR + 20;
         c->zToken = sqlite3_realloc(c->zToken, c->nAllocated);
         if (c->zToken == NULL)
           return SQLITE_NOMEM;
       }
 
       if (state == BIGRAM_USE) {
         /* This is by bigram. So it is unnecessary to convert word */
         copy_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);
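A hedged Python model of what the revised copy_stemmer() does after this patch: normalize each character, drop voiced/semi-voiced sound marks, and keep only the first and last COPY_STEMMER_COPY_HALF_LEN unicode characters when the result is longer than twice that.  copy_stem() and its normalize_char argument are invented names for this sketch; the real code works in place on utf-8 bytes.

COPY_STEMMER_COPY_HALF_LEN = 10
VOICED_SOUND_MARKS = frozenset([0xff9e, 0xff9f, 0x3099, 0x309a])

def copy_stem(token, normalize_char=lambda ch: ch):
    # normalize, then drop the voiced/semi-voiced sound marks entirely
    chars = [normalize_char(ch) for ch in token]
    chars = [ch for ch in chars if ord(ch) not in VOICED_SOUND_MARKS]
    # shrink on a character (not byte) basis: first 10 + last 10
    if len(chars) > 2 * COPY_STEMMER_COPY_HALF_LEN:
        chars = (chars[:COPY_STEMMER_COPY_HALF_LEN] +
                 chars[-COPY_STEMMER_COPY_HALF_LEN:])
    return u''.join(chars)

assert copy_stem(u'abcdefghijklmnopqrstuvwxyz') == u'abcdefghijqrstuvwxyz'
# MAX_UTF8_GROWTH_FACTOR's motivating example: U+023A folds to U+2C65, growing
# from 2 bytes to 3 bytes when re-encoded as utf-8.
assert len(u'\u023a'.encode('utf-8')) == 2
assert len(u'\u2c65'.encode('utf-8')) == 3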