Bug 336959. Use Pango to break inside Thai/Lao runs. Patch by Theppitak Karoonboonyanan, r+sr=roc
authorroc+@cs.cmu.edu
Wed, 18 Jul 2007 20:26:51 -0700
changeset 3658 fd5ac8b40ee15f03852a8b2381710f6824085614
parent 3657 db528dca8a1e04d84fa8774c25881bc7b317c7f2
child 3659 a6b86ac5125996226807e45b2e134bd34bfb9e97
push idunknown
push userunknown
push dateunknown
bugs336959
milestone1.9a7pre
Bug 336959. Use Pango to break inside Thai/Lao runs. Patch by Theppitak Karoonboonyanan, r+sr=roc
content/base/public/nsLineBreaker.h
intl/build/Makefile.in
intl/lwbrk/public/nsILineBreaker.h
intl/lwbrk/src/Makefile.in
intl/lwbrk/src/jisx4501class.h
intl/lwbrk/src/nsComplexBreaker.h
intl/lwbrk/src/nsJISx4501LineBreaker.cpp
intl/lwbrk/src/nsJISx4501LineBreaker.h
intl/lwbrk/src/nsPangoBreaker.cpp
intl/lwbrk/src/nsRuleBreaker.cpp
intl/lwbrk/tools/anzx4501.html
intl/lwbrk/tools/anzx4501.pl
intl/lwbrk/tools/jisx4501class.txt
intl/lwbrk/tools/jisx4501simp.txt
--- a/content/base/public/nsLineBreaker.h
+++ b/content/base/public/nsLineBreaker.h
@@ -91,21 +91,22 @@ public:
     return !((0x0030 <= u && u <= 0x0039) ||
              (0x0041 <= u && u <= 0x005A) ||
              (0x0061 <= u && u <= 0x007A));
   }
 
   static inline PRBool IsComplexChar(PRUnichar u)
   {
     return IsComplexASCIIChar(u) ||
-           (0x1100 <= u && u <= 0x11ff) ||
-           (0x2000 <= u && u <= 0x21ff) ||
-           (0x2e80 <= u && u <= 0xd7ff) ||
-           (0xf900 <= u && u <= 0xfaff) ||
-           (0xff00 <= u && u <= 0xffef);
+           (0x0e01 <= u && u <= 0x0edf) || // Thai & Lao
+           (0x1100 <= u && u <= 0x11ff) || // Hangul Jamo
+           (0x2000 <= u && u <= 0x21ff) || // Punctuation and Symbols
+           (0x2e80 <= u && u <= 0xd7ff) || // several CJK blocks
+           (0xf900 <= u && u <= 0xfaff) || // CJK Compatibility Ideographs
+           (0xff00 <= u && u <= 0xffef);   // Halfwidth and Fullwidth Forms
   }
 
   // Normally, break opportunities exist at the end of each run of whitespace
   // (see IsSpace above). Break opportunities can also exist inside runs of
   // non-whitespace, as determined by nsILineBreaker. We pass a whitespace-
   // delimited word to nsILineBreaker if it contains at least one character
   // matching IsComplexChar.
   // We provide flags to control on a per-chunk basis where breaks are allowed.
--- a/intl/build/Makefile.in
+++ b/intl/build/Makefile.in
@@ -101,8 +101,18 @@ EXTRA_DSO_LDOPTS = \
 ifneq (,$(filter mac cocoa,$(MOZ_WIDGET_TOOLKIT)))
 EXTRA_DSO_LDOPTS += \
         $(TK_LIBS) \
         $(NULL)
 endif
 
 include $(topsrcdir)/config/rules.mk
 
+ifdef MOZ_ENABLE_PANGO
+CXXFLAGS	+= \
+		$(MOZ_PANGO_CFLAGS) \
+		$(NULL)
+
+EXTRA_DSO_LDOPTS += \
+		$(MOZ_PANGO_LIBS) \
+		$(NULL)
+endif
+
--- a/intl/lwbrk/public/nsILineBreaker.h
+++ b/intl/lwbrk/public/nsILineBreaker.h
@@ -38,29 +38,25 @@
 #define nsILineBreaker_h__
 
 #include "nsISupports.h"
 
 #include "nscore.h"
 
 #define NS_LINEBREAKER_NEED_MORE_TEXT -1
 
-// {C9C5938E-70EF-4db2-ADEE-E7B2CCFBBEE6}
+// {5ae68851-d9a3-49fd-9388-58586dad8044}
 #define NS_ILINEBREAKER_IID \
-{ 0xc9c5938e, 0x70ef, 0x4db2, \
-    { 0xad, 0xee, 0xe7, 0xb2, 0xcc, 0xfb, 0xbe, 0xe6 } }
+{ 0x5ae68851, 0xd9a3, 0x49fd, \
+    { 0x93, 0x88, 0x58, 0x58, 0x6d, 0xad, 0x80, 0x44 } }
 
 class nsILineBreaker : public nsISupports
 {
 public:
   NS_DECLARE_STATIC_IID_ACCESSOR(NS_ILINEBREAKER_IID)
-  virtual PRBool BreakInBetween( const PRUnichar* aText1 , PRUint32 aTextLen1,
-                                 const PRUnichar* aText2 , 
-                                 PRUint32 aTextLen2) = 0;
-
   virtual PRInt32 Next( const PRUnichar* aText, PRUint32 aLen, 
                         PRUint32 aPos) = 0;
 
   virtual PRInt32 Prev( const PRUnichar* aText, PRUint32 aLen, 
                         PRUint32 aPos) = 0;
 
   // Call this on a word with whitespace at either end. We will apply JISx4501
   // rules to find breaks inside the word. aBreakBefore is set to the break-
--- a/intl/lwbrk/src/Makefile.in
+++ b/intl/lwbrk/src/Makefile.in
@@ -47,17 +47,38 @@ LIBRARY_NAME	= lwbrk_s
 FORCE_STATIC_LIB = 1
 LIBXUL_LIBRARY  = 1
 
 REQUIRES	= xpcom \
 		  string \
 		  unicharutil \
 		  $(NULL)
 
-CSRCS		= rulebrk.c
-
 CPPSRCS		= \
 		nsJISx4501LineBreaker.cpp \
 		nsSampleWordBreaker.cpp \
 		nsSemanticUnitScanner.cpp \
 		$(NULL)
 
+ifdef MOZ_ENABLE_PANGO
+CPPSRCS		+= \
+		nsPangoBreaker.cpp \
+		$(NULL)
+else
+CPPSRCS		+= \
+		nsRuleBreaker.cpp \
+		$(NULL)
+
+CSRCS		= rulebrk.c
+endif
+
 include $(topsrcdir)/config/rules.mk
+
+ifdef MOZ_ENABLE_PANGO
+CXXFLAGS	+= \
+		$(MOZ_PANGO_CFLAGS) \
+		$(NULL)
+
+EXTRA_DSO_LDOPTS += \
+		$(MOZ_PANGO_LIBS) \
+		$(NULL)
+endif
+
--- a/intl/lwbrk/src/jisx4501class.h
+++ b/intl/lwbrk/src/jisx4501class.h
@@ -173,8 +173,43 @@ 0x55555555, // U+30C8 - U+30CF
 0x55555555, // U+30D0 - U+30D7
 0x55555555, // U+30D8 - U+30DF
 0x15151555, // U+30E0 - U+30E7
 0x51555555, // U+30E8 - U+30EF
 0x51155555, // U+30F0 - U+30F7
 0x51111555, // U+30F8 - U+30FF
 };
 
+static const PRUint32 gLBClass0E[32] = {
+0x99999999, // U+0E00 - U+0E07
+0x99999999, // U+0E08 - U+0E0F
+0x99999999, // U+0E10 - U+0E17
+0x99999999, // U+0E18 - U+0E1F
+0x99999999, // U+0E20 - U+0E27
+0x19999999, // U+0E28 - U+0E2F
+0x99999999, // U+0E30 - U+0E37
+0x09999999, // U+0E38 - U+0E3F
+0x91999999, // U+0E40 - U+0E47
+0x89999999, // U+0E48 - U+0E4F
+0x66666666, // U+0E50 - U+0E57
+0x99991166, // U+0E58 - U+0E5F
+0x99999999, // U+0E60 - U+0E67
+0x99999999, // U+0E68 - U+0E6F
+0x99999999, // U+0E70 - U+0E77
+0x99999999, // U+0E78 - U+0E7F
+0x99999999, // U+0E80 - U+0E87
+0x99999999, // U+0E88 - U+0E8F
+0x99999999, // U+0E90 - U+0E97
+0x99999999, // U+0E98 - U+0E9F
+0x99999999, // U+0EA0 - U+0EA7
+0x19999999, // U+0EA8 - U+0EAF
+0x99999999, // U+0EB0 - U+0EB7
+0x99999999, // U+0EB8 - U+0EBF
+0x91999999, // U+0EC0 - U+0EC7
+0x99999999, // U+0EC8 - U+0ECF
+0x66666666, // U+0ED0 - U+0ED7
+0x99999966, // U+0ED8 - U+0EDF
+0x99999999, // U+0EE0 - U+0EE7
+0x99999999, // U+0EE8 - U+0EEF
+0x99999999, // U+0EF0 - U+0EF7
+0x99999999, // U+0EF8 - U+0EFF
+};
+
new file mode 100644
--- /dev/null
+++ b/intl/lwbrk/src/nsComplexBreaker.h
@@ -0,0 +1,52 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* ***** BEGIN LICENSE BLOCK *****
+ * Version: MPL 1.1/GPL 2.0/LGPL 2.1
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * The Original Code is mozilla.org code.
+ *
+ * The Initial Developer of the Original Code is
+ * Theppitak Karoonboonyanan <thep@linux.thai.net>.
+ * Portions created by the Initial Developer are Copyright (C) 2007
+ * the Initial Developer. All Rights Reserved.
+ *
+ * Contributor(s):
+ * - Theppitak Karoonboonyanan <thep@linux.thai.net>
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * either of the GNU General Public License Version 2 or later (the "GPL"),
+ * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+ * in which case the provisions of the GPL or the LGPL are applicable instead
+ * of those above. If you wish to allow use of your version of this file only
+ * under the terms of either the GPL or the LGPL, and not to allow others to
+ * use your version of this file under the terms of the MPL, indicate your
+ * decision by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL or the LGPL. If you do not delete
+ * the provisions above, a recipient may use your version of this file under
+ * the terms of any one of the MPL, the GPL or the LGPL.
+ *
+ * ***** END LICENSE BLOCK ***** */
+#ifndef nsComplexBreaker_h__
+#define nsComplexBreaker_h__
+
+#include "nsString.h"
+
+/**
+ * Find line break opportunities in aText[] of aLength characters,
+ * filling boolean values indicating line break opportunities for
+ * corresponding characters in aBreakBefore[] on return.
+ */
+void
+NS_GetComplexLineBreaks(const PRUnichar* aText, PRUint32 aLength,
+                        PRPackedBool* aBreakBefore);
+
+#endif  /* nsComplexBreaker_h__ */
--- a/intl/lwbrk/src/nsJISx4501LineBreaker.cpp
+++ b/intl/lwbrk/src/nsJISx4501LineBreaker.cpp
@@ -37,19 +37,18 @@
 
 
 
 #include "nsJISx4501LineBreaker.h"
 
 #include "pratom.h"
 #include "nsLWBRKDll.h"
 #include "jisx4501class.h"
-#define TH_UNICODE
-#include "th_char.h"
-#include "rulebrk.h"
+#include "nsComplexBreaker.h"
+#include "nsTArray.h"
 #include "nsUnicharUtils.h"
 
 /* 
 
    Simplification of Pair Table in JIS X 4051
 
    1. The Origion Table - in 4.1.3
 
@@ -136,91 +135,92 @@
         9        X                                   
       [b]        X                                  
        15        X        X     X     X    
        16        X                 X  X    
        18        X              X  X  X    
 
 
 
-   4. We add THAI characters and make it breakable w/ all ther class
+   4. We add COMPLEX characters and make them breakable w/ all other classes
+      except after class 1 and before class [a]
 
    Class of
    Leading    Class of Trailing Char Class
    Char        
 
-              1 [a] 7  8  9 [b]15 16 18 THAI
+              1 [a] 7  8  9 [b]15 16 18 COMPLEX
                                      
-        1     X  X  X  X  X  X  X  X  X
+        1     X  X  X  X  X  X  X  X  X  X
       [a]        X                             
         7        X  X                      
         8        X              X    
         9        X                                   
       [b]        X                                  
        15        X        X     X     X    
        16        X                 X  X    
        18        X              X  X  X    
-     THAI                                T
+  COMPLEX        X                       T
       
      T : need special handling
 
    5. Now we use one bit to encode weather it is breakable, and use 2 bytes
       for one row, then the bit table will look like:
 
                  18    <-   1
             
-       1  0000 0001 1111 1111  = 0x01FF
+       1  0000 0011 1111 1111  = 0x03FF
       [a] 0000 0000 0000 0010  = 0x0002
        7  0000 0000 0000 0110  = 0x0006
        8  0000 0000 0100 0010  = 0x0042
        9  0000 0000 0000 0010  = 0x0002
       [b] 0000 0000 0000 0010  = 0x0002
       15  0000 0001 0101 0010  = 0x0152
       16  0000 0001 1000 0010  = 0x0182
       18  0000 0001 1100 0010  = 0x01C2
-    THAI  0000 0000 0000 0000  = 0x0000
+ COMPLEX  0000 0010 0000 0010  = 0x0202
 
    5. Now we map the class to number
       
       0: 1 
       1: [a]- 2, 3, 4, 5, 6
       2: 7
       3: 8
       4: 9
       5: [b]- 10, 11, 12, 17
       6: 15
       7: 16
       8: 18
-      9: THAI
+      9: COMPLEX
 
 */
 
 #define MAX_CLASSES 10
 
 static const PRUint16 gPair[MAX_CLASSES] = {
-  0x01FF, 
+  0x03FF, 
   0x0002, 
   0x0006, 
   0x0042, 
   0x0002, 
   0x0002, 
   0x0152, 
   0x0182, 
   0x01C2,
-  0x0000
+  0x0202
 };
 
 
 static inline int
 GETCLASSFROMTABLE(const PRUint32* t, PRUint16 l)
 {
   return ((((t)[(l>>3)]) >> ((l & 0x0007)<<2)) & 0x000f);
 }
 
-#define CLASS_THAI 9
+#define CLASS_COMPLEX 9
 
 
 
 static inline int
 IS_HALFWIDTH_IN_JISx4051_CLASS3(PRUnichar u)
 {
   return ((0xff66 <= (u)) && ((u) <= 0xff70));
 }
@@ -230,35 +230,41 @@ IS_CJK_CHAR(PRUnichar u)
 {
   return ((0x1100 <= (u) && (u) <= 0x11ff) ||
           (0x2e80 <= (u) && (u) <= 0xd7ff) ||
           (0xf900 <= (u) && (u) <= 0xfaff) ||
           (0xff00 <= (u) && (u) <= 0xffef) );
 }
 
 static inline int
+IS_COMPLEX(PRUnichar u)
+{
+  return (0x0e01 <= (u) && (u) <= 0x0edf); // Thai & Lao (cf. IsComplexChar)
+}
+
+static inline int
 IS_SPACE(PRUnichar u)
 {
   return ((u) == 0x0020 || (u) == 0x0009 || (u) == 0x000a || (u) == 0x000d || (u)==0x200b);
 }
 
 static PRInt8 GetClass(PRUnichar u)
 {
    PRUint16 h = u & 0xFF00;
    PRUint16 l = u & 0x00ff;
    PRInt8 c;
    
    // Handle 3 range table first
    if( 0x0000 == h)
    {
      c = GETCLASSFROMTABLE(gLBClass00, l);
    } 
-   else if(th_isthai(u))
+   else if( 0x0E00 == h)
    {
-     c = CLASS_THAI;
+     c = GETCLASSFROMTABLE(gLBClass0E, l);
    }
    else if( 0x2000 == h)
    {
      c = GETCLASSFROMTABLE(gLBClass20, l);
    } 
    else if( 0x2100 == h)
    {
      c = GETCLASSFROMTABLE(gLBClass21, l);
@@ -407,185 +413,74 @@ static PRInt8 ContextualAnalysis(
      // somehow people use this as ' in "it's" sometimes...
      if(U_SPACE != next)
        return CHARACTER_CLASS;
    }
    return GetClass(cur);
 }
 
 
-PRBool nsJISx4051LineBreaker::BreakInBetween(
-  const PRUnichar* aText1 , PRUint32 aTextLen1,
-  const PRUnichar* aText2 , PRUint32 aTextLen2)
+PRInt32 nsJISx4051LineBreaker::WordMove(
+  const PRUnichar* aText, PRUint32 aLen, PRUint32 aPos, PRInt8 aDirection)
 {
-  if(!aText1 || !aText2 || (0 == aTextLen1) || (0==aTextLen2) ||
-     NS_IS_HIGH_SURROGATE(aText1[aTextLen1-1]) && 
-     NS_IS_LOW_SURROGATE(aText2[0]) )  //Do not separate a surrogate pair
-  {
-     return PR_FALSE;
-  }
-
-  //search for CJK characters until a space is found. 
-  //if CJK char is found before space, use 4051, otherwise western
-  PRInt32 cur;
+  PRBool  textNeedsJISx4051 = PR_FALSE;
+  PRInt32 begin, end;
 
-  for (cur= aTextLen1-1; cur>=0; cur--)
-  {
-    if (IS_SPACE(aText1[cur]))
-      break;
-    if (IS_CJK_CHAR(aText1[cur]))
-      goto ROUTE_CJK_BETWEEN;
+  for (begin = aPos; begin > 0 && !IS_SPACE(aText[begin - 1]); --begin) {
+    if (IS_CJK_CHAR(aText[begin]) || IS_COMPLEX(aText[begin])) {
+      textNeedsJISx4051 = PR_TRUE;
+    }
   }
-
-  for (cur= 0; cur < (PRInt32)aTextLen2; cur++)
-  {
-    if (IS_SPACE(aText2[cur]))
-      break;
-    if (IS_CJK_CHAR(aText2[cur]))
-      goto ROUTE_CJK_BETWEEN;
+  for (end = aPos + 1; end < PRInt32(aLen) && !IS_SPACE(aText[end]); ++end) {
+    if (IS_CJK_CHAR(aText[end]) || IS_COMPLEX(aText[end])) {
+      textNeedsJISx4051 = PR_TRUE;
+    }
   }
 
-  //now apply western rule.
-  return IS_SPACE(aText1[aTextLen1-1]) || IS_SPACE(aText2[0]);
-
-ROUTE_CJK_BETWEEN:
-
-  PRInt8 c1, c2;
-  if(NEED_CONTEXTUAL_ANALYSIS(aText1[aTextLen1-1]))
-    c1 = ContextualAnalysis((aTextLen1>1)?aText1[aTextLen1-2]:U_NULL,
-                                  aText1[aTextLen1-1],
-                                  aText2[0]);
-  else 
-    c1 = GetClass(aText1[aTextLen1-1]);
+  PRInt32 ret;
+  nsAutoTArray<PRPackedBool, 2000> breakState;
+  if (!textNeedsJISx4051 || !breakState.AppendElements(end - begin)) {
+    // No complex text character, do not try to do complex line break.
+    // (This is required for serializers. See Bug #344816.)
+    // Also fall back to this when out of memory.
+    if (aDirection < 0) {
+      ret = (begin == PRInt32(aPos)) ? begin - 1 : begin;
+    } else {
+      ret = end;
+    }
+  } else {
+    GetJISx4051Breaks(aText + begin, end - begin, breakState.Elements());
 
-  if(NEED_CONTEXTUAL_ANALYSIS(aText2[0]))
-    c2 = ContextualAnalysis(aText1[aTextLen1-1],
-                            aText2[0],
-                            (aTextLen2>1)?aText2[1]:U_NULL);
-  else 
-    c2 = GetClass(aText2[0]);
+    ret = aPos;
+    do {
+      ret += aDirection;
+    } while (begin < ret && ret < end && !breakState[ret - begin]);
+  }
 
-  /* Handle cases for THAI */
-  if((CLASS_THAI == c1) && (CLASS_THAI == c2))
-  {
-     return (0 == TrbWordBreakPos(aText1, aTextLen1, aText2, aTextLen2));
-  }
-  else 
-  {
-     return GetPair(c1,c2);
-  }
+  return ret;
 }
 
-
 PRInt32 nsJISx4051LineBreaker::Next(
   const PRUnichar* aText, PRUint32 aLen, PRUint32 aPos) 
 {
   NS_ASSERTION(aText, "aText shouldn't be null");
   NS_ASSERTION(aLen > aPos, "Illegal value (length > position)");
 
-  //forward check for CJK characters until a space is found. 
-  //if CJK char is found before space, use 4051, otherwise western
-  PRUint32 cur;
-  for (cur = aPos; cur < aLen; ++cur)
-  {
-    if (IS_SPACE(aText[cur]))
-      return cur;
-    if (IS_CJK_CHAR(aText[cur]))
-      goto ROUTE_CJK_NEXT;
-  }
-  return NS_LINEBREAKER_NEED_MORE_TEXT; // Need more text
-
-ROUTE_CJK_NEXT:
-  PRInt8 c1, c2;
-  cur = aPos;
-  if(NEED_CONTEXTUAL_ANALYSIS(aText[cur]))
-  {
-    c1 = ContextualAnalysis((cur>0)?aText[cur-1]:U_NULL,
-                            aText[cur],
-                            (cur<(aLen-1)) ?aText[cur+1]:U_NULL);
-  } else  {
-    c1 = GetClass(aText[cur]);
-  }
-  
-  if(CLASS_THAI == c1) 
-     return PRUint32(TrbFollowing(aText, aLen, aPos));
-
-  for(cur++; cur <aLen; cur++)
-  {
-     if(NEED_CONTEXTUAL_ANALYSIS(aText[cur]))
-     {
-       c2 = ContextualAnalysis((cur>0)?aText[cur-1]:U_NULL,
-                               aText[cur],
-                               (cur<(aLen-1)) ?aText[cur+1]:U_NULL);
-     } else {
-       c2 = GetClass(aText[cur]);
-     }
-
-     if(GetPair(c1, c2)) {
-       return cur;
-     }
-     c1 = c2;
-  }
-  return NS_LINEBREAKER_NEED_MORE_TEXT; // Need more text
+  PRInt32 nextPos = WordMove(aText, aLen, aPos, 1);
+  return nextPos < PRInt32(aLen) ? nextPos : NS_LINEBREAKER_NEED_MORE_TEXT;
 }
 
 PRInt32 nsJISx4051LineBreaker::Prev( 
   const PRUnichar* aText, PRUint32 aLen, PRUint32 aPos) 
 {
   NS_ASSERTION(aText, "aText shouldn't be null");
-
-  //backward check for CJK characters until a space is found. 
-  //if CJK char is found before space, use 4051, otherwise western
-  PRUint32 cur;
-  for (cur = aPos - 1; cur > 0; --cur)
-  {
-    if (IS_SPACE(aText[cur]))
-    {
-      if (cur != aPos - 1) // XXXldb Why?
-        ++cur;
-      return cur;
-    }
-    if (IS_CJK_CHAR(aText[cur]))
-      goto ROUTE_CJK_PREV;
-  }
-
-  return NS_LINEBREAKER_NEED_MORE_TEXT; // Need more text
+  NS_ASSERTION(aLen >= aPos, "Illegal value (length >= position)");
 
-ROUTE_CJK_PREV:
-  cur = aPos;
-  PRInt8 c1, c2;
-  if(NEED_CONTEXTUAL_ANALYSIS(aText[cur-1]))
-  {
-    c2 = ContextualAnalysis(((cur-1)>0)?aText[cur-2]:U_NULL,
-                            aText[cur-1],
-                            (cur<aLen) ?aText[cur]:U_NULL);
-  } else  {
-    c2 = GetClass(aText[cur-1]);
-  }
-  // To Do: 
-  //
-  // Should handle CLASS_THAI here
-  //
-  for(cur--; cur > 0; cur--)
-  {
-     if(NEED_CONTEXTUAL_ANALYSIS(aText[cur-1]))
-     {
-       c1 = ContextualAnalysis(((cur-1)>0)?aText[cur-2]:U_NULL,
-                               aText[cur-1],
-                               (cur<aLen) ?aText[cur]:U_NULL);
-     } else {
-       c1 = GetClass(aText[cur-1]);
-     }
-
-     if(GetPair(c1, c2)) {
-       return cur;
-     }
-     c2 = c1;
-  }
-  return NS_LINEBREAKER_NEED_MORE_TEXT; // Need more text
+  PRInt32 prevPos = WordMove(aText, aLen, aPos, -1);
+  return prevPos > 0 ? prevPos : NS_LINEBREAKER_NEED_MORE_TEXT;
 }
 
 void
 nsJISx4051LineBreaker::GetJISx4051Breaks(const PRUnichar* aChars, PRUint32 aLength,
                                          PRPackedBool* aBreakBefore)
 {
   PRUint32 cur;
   PRInt8 lastClass = -1;
@@ -599,26 +494,39 @@ nsJISx4051LineBreaker::GetJISx4051Breaks
                               ch,
                               cur + 1 < aLength ? aChars[cur + 1] : U_NULL);
     } else {
       cl = GetClass(ch);
     }
 
     PRBool allowBreak;
     if (cur > 0) {
-      if (CLASS_THAI == lastClass && CLASS_THAI == cl) {
-        allowBreak = 0 == TrbWordBreakPos(aChars, cur, aChars + cur, aLength - cur);
-      } else {
-        allowBreak = GetPair(lastClass, cl);
-      }
+      NS_ASSERTION(CLASS_COMPLEX != lastClass || CLASS_COMPLEX != cl,
+                   "Loop should have prevented adjacent complex chars here");
+      allowBreak = GetPair(lastClass, cl);
     } else {
       allowBreak = PR_FALSE;
     }
     aBreakBefore[cur] = allowBreak;
     lastClass = cl;
+    if (CLASS_COMPLEX == cl) {
+      PRUint32 end = cur + 1;
+
+      while (end < aLength && CLASS_COMPLEX == GetClass(aChars[end])) {
+        ++end;
+      }
+
+      NS_GetComplexLineBreaks(aChars + cur, end - cur, aBreakBefore + cur);
+
+      // restore breakability at chunk begin, which was always set to false
+      // by the complex line breaker
+      aBreakBefore[cur] = allowBreak;
+
+      cur = end - 1;
+    }
   }
 }
 
 void
 nsJISx4051LineBreaker::GetJISx4051Breaks(const PRUint8* aChars, PRUint32 aLength,
                                          PRPackedBool* aBreakBefore)
 {
   PRUint32 cur;
--- a/intl/lwbrk/src/nsJISx4501LineBreaker.h
+++ b/intl/lwbrk/src/nsJISx4501LineBreaker.h
@@ -43,22 +43,23 @@
 class nsJISx4051LineBreaker : public nsILineBreaker
 {
   NS_DECL_ISUPPORTS
 
 public:
   nsJISx4051LineBreaker();
   virtual ~nsJISx4051LineBreaker();
 
-  PRBool BreakInBetween( const PRUnichar* aText1 , PRUint32 aTextLen1,
-                         const PRUnichar* aText2 , PRUint32 aTextLen2);
-
   PRInt32 Next( const PRUnichar* aText, PRUint32 aLen, PRUint32 aPos);
 
   PRInt32 Prev( const PRUnichar* aText, PRUint32 aLen, PRUint32 aPos);
 
   virtual void GetJISx4051Breaks(const PRUnichar* aText, PRUint32 aLength,
                                  PRPackedBool* aBreakBefore);
   virtual void GetJISx4051Breaks(const PRUint8* aText, PRUint32 aLength,
                                  PRPackedBool* aBreakBefore);
+
+private:
+  PRInt32 WordMove(const PRUnichar* aText, PRUint32 aLen, PRUint32 aPos,
+                   PRInt8 aDirection);
 };
 
 #endif  /* nsJISx4501LineBreaker_h__ */
new file mode 100644
--- /dev/null
+++ b/intl/lwbrk/src/nsPangoBreaker.cpp
@@ -0,0 +1,116 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* ***** BEGIN LICENSE BLOCK *****
+ * Version: MPL 1.1/GPL 2.0/LGPL 2.1
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * The Original Code is mozilla.org code.
+ *
+ * The Initial Developer of the Original Code is
+ * Theppitak Karoonboonyanan <thep@linux.thai.net>.
+ * Portions created by the Initial Developer are Copyright (C) 2007
+ * the Initial Developer. All Rights Reserved.
+ *
+ * Contributor(s):
+ * - Theppitak Karoonboonyanan <thep@linux.thai.net>
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * either of the GNU General Public License Version 2 or later (the "GPL"),
+ * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+ * in which case the provisions of the GPL or the LGPL are applicable instead
+ * of those above. If you wish to allow use of your version of this file only
+ * under the terms of either the GPL or the LGPL, and not to allow others to
+ * use your version of this file under the terms of the MPL, indicate your
+ * decision by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL or the LGPL. If you do not delete
+ * the provisions above, a recipient may use your version of this file under
+ * the terms of any one of the MPL, the GPL or the LGPL.
+ *
+ * ***** END LICENSE BLOCK ***** */
+
+#include "nsComplexBreaker.h"
+
+#include <pango/pango-break.h>
+#include "nsUTF8Utils.h"
+#include "nsString.h"
+#include "nsTArray.h"
+
+/**
+ * Pango implementation of NS_GetComplexLineBreaks().
+ *
+ * aText is converted to UTF-8 and handed to pango_get_log_attrs(); the
+ * is_line_break flag Pango reports for each character is then stored in
+ * aBreakBefore[] at the index of that character's first UTF-16 code unit.
+ *
+ * @param aText        UTF-16 text to analyse; must not be null.
+ * @param aLength      length of aText in UTF-16 code units.
+ * @param aBreakBefore output, one entry per code unit of aText. On
+ *                     allocation failure every entry is set to PR_FALSE.
+ */
+void
+NS_GetComplexLineBreaks(const PRUnichar* aText, PRUint32 aLength,
+                        PRPackedBool* aBreakBefore)
+{
+  NS_ASSERTION(aText, "aText shouldn't be null");
+
+  // Pango wants one attr per character position, i.e. one more than the
+  // number of characters; aLength is an upper bound on the character count.
+  nsAutoTArray<PangoLogAttr, 2000> attrBuffer;
+  if (!attrBuffer.AppendElements(aLength + 1))
+  {
+    // out of memory, behave as if there were no complex line breaker
+    for (PRUint32 i = 0; i < aLength; ++i) {
+      aBreakBefore[i] = PR_FALSE;
+    }
+    // Do not fall through: attrBuffer has no room for Pango's output.
+    return;
+  }
+
+  NS_ConvertUTF16toUTF8 aUTF8(aText, aLength);
+
+  const gchar* p = aUTF8.Data();
+  const gchar* end = p + aUTF8.Length();
+  PRUint32     u16Offset = 0;
+
+  static PangoLanguage* language = pango_language_from_string("en");
+
+  while (p < end)
+  {
+    PangoLogAttr* attr = attrBuffer.Elements();
+    pango_get_log_attrs(p, end - p, -1, language, attr, attrBuffer.Length());
+
+    // Walk the UTF-8 text and aText in lock step, one character at a time.
+    while (p < end)
+    {
+      aBreakBefore[u16Offset] = attr->is_line_break;
+      // A well-formed surrogate pair is one character to Pango but two
+      // entries in aBreakBefore[]; forbid a break between its halves.
+      if (NS_IS_HIGH_SURROGATE(aText[u16Offset]) &&
+          u16Offset + 1 < aLength &&
+          NS_IS_LOW_SURROGATE(aText[u16Offset + 1]))
+        aBreakBefore[++u16Offset] = PR_FALSE; // Skip low surrogate
+      ++u16Offset;
+
+      PRUint32 ch = UTF8CharEnumerator::NextChar(&p, end);
+      ++attr;
+
+      if (ch == 0) {
+        // pango_break (pango 1.16.2) only analyses text before the
+        // first NUL (but sets one extra attr). Workaround loop to call
+        // pango_break again to analyse after the NUL is done somewhere else
+        // (gfx/thebes/src/gfxPangoFonts.cpp: SetupClusterBoundaries()).
+        // So, we do the same here for pango_get_log_attrs.
+        break;
+      }
+    }
+  }
+}
+
new file mode 100644
--- /dev/null
+++ b/intl/lwbrk/src/nsRuleBreaker.cpp
@@ -0,0 +1,64 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* ***** BEGIN LICENSE BLOCK *****
+ * Version: MPL 1.1/GPL 2.0/LGPL 2.1
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * The Original Code is mozilla.org code.
+ *
+ * The Initial Developer of the Original Code is
+ * Theppitak Karoonboonyanan <thep@linux.thai.net>.
+ * Portions created by the Initial Developer are Copyright (C) 2007
+ * the Initial Developer. All Rights Reserved.
+ *
+ * Contributor(s):
+ * - Theppitak Karoonboonyanan <thep@linux.thai.net>
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * either of the GNU General Public License Version 2 or later (the "GPL"),
+ * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+ * in which case the provisions of the GPL or the LGPL are applicable instead
+ * of those above. If you wish to allow use of your version of this file only
+ * under the terms of either the GPL or the LGPL, and not to allow others to
+ * use your version of this file under the terms of the MPL, indicate your
+ * decision by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL or the LGPL. If you do not delete
+ * the provisions above, a recipient may use your version of this file under
+ * the terms of any one of the MPL, the GPL or the LGPL.
+ *
+ * ***** END LICENSE BLOCK ***** */
+
+#include "nsComplexBreaker.h"
+
+#define TH_UNICODE
+#include "rulebrk.h"
+
+/**
+ * Rule-based implementation of NS_GetComplexLineBreaks(), built on
+ * TrbWordBreakPos() from rulebrk.h.
+ *
+ * For every index i in [0, aLength) the text is split into aText[0..i) and
+ * aText[i..aLength); aBreakBefore[i] becomes true exactly when
+ * TrbWordBreakPos() returns 0 for that split.
+ */
+void
+NS_GetComplexLineBreaks(const PRUnichar* aText, PRUint32 aLength,
+                        PRPackedBool* aBreakBefore)
+{
+  NS_ASSERTION(aText, "aText shouldn't be null");
+
+  for (PRUint32 split = 0; split < aLength; ++split) {
+    const PRUnichar* tail = aText + split;
+    const PRUint32 tailLength = aLength - split;
+    aBreakBefore[split] = (TrbWordBreakPos(aText, split, tail, tailLength) == 0);
+  }
+}
+
--- a/intl/lwbrk/tools/anzx4501.html
+++ b/intl/lwbrk/tools/anzx4501.html
@@ -49,19 +49,19 @@ Analysis of JIS X 4051 to Unicode Genera
 <TD BGCOLOR=yellow>Zs</TD>
 </TR>
 <TR><TH>00_1<TH>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD>14</TD>
-<TD>2</TD>
+<TD>3</TD>
 <TD></TD>
-<TD BGCOLOR=white>16</TD>
+<TD BGCOLOR=white>17</TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
@@ -74,54 +74,54 @@ Analysis of JIS X 4051 to Unicode Genera
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD>1</TD>
 <TD>13</TD>
-<TD>1</TD>
+<TD>2</TD>
 <TD></TD>
 <TD>1</TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 </TR>
 <TR><TH>01_[a]<TH>
 <TD></TD>
-<TD>27</TD>
+<TD>31</TD>
 <TD>2</TD>
 <TD></TD>
-<TD>30</TD>
+<TD>32</TD>
 <TD>6</TD>
 <TD></TD>
-<TD BGCOLOR=white>65</TD>
+<TD BGCOLOR=white>71</TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
-<TD>5</TD>
-<TD>22</TD>
+<TD>7</TD>
+<TD>24</TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD>2</TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD>2</TD>
 <TD>14</TD>
 <TD></TD>
 <TD></TD>
-<TD>14</TD>
+<TD>16</TD>
 <TD></TD>
 <TD></TD>
 <TD>2</TD>
 <TD>3</TD>
 <TD>1</TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
@@ -281,34 +281,34 @@ Analysis of JIS X 4051 to Unicode Genera
 <TD></TD>
 <TD></TD>
 <TD>13</TD>
 </TR>
 <TR><TH>06_15<TH>
 <TD></TD>
 <TD></TD>
 <TD></TD>
-<TD>10</TD>
+<TD>30</TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
-<TD BGCOLOR=white>10</TD>
+<TD BGCOLOR=white>30</TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
-<TD>10</TD>
+<TD>30</TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
@@ -357,54 +357,54 @@ Analysis of JIS X 4051 to Unicode Genera
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 </TR>
 <TR><TH>08_18<TH>
 <TD>10</TD>
-<TD>660</TD>
+<TD>659</TD>
 <TD>4</TD>
 <TD>130</TD>
-<TD>55</TD>
-<TD>940</TD>
+<TD>56</TD>
+<TD>941</TD>
 <TD>2</TD>
-<TD BGCOLOR=white>1801</TD>
+<TD BGCOLOR=white>1802</TD>
 <TD></TD>
 <TD>10</TD>
 <TD></TD>
 <TD></TD>
-<TD>367</TD>
+<TD>368</TD>
 <TD>1</TD>
-<TD>5</TD>
+<TD>4</TD>
 <TD></TD>
-<TD>287</TD>
+<TD>286</TD>
 <TD></TD>
 <TD></TD>
 <TD>4</TD>
 <TD></TD>
 <TD>3</TD>
 <TD>127</TD>
 <TD>3</TD>
 <TD>5</TD>
 <TD>3</TD>
 <TD>4</TD>
 <TD>6</TD>
-<TD>29</TD>
+<TD>30</TD>
 <TD>5</TD>
 <TD>12</TD>
 <TD>10</TD>
 <TD>273</TD>
-<TD>645</TD>
+<TD>646</TD>
 <TD>1</TD>
 <TD>1</TD>
 <TD></TD>
 </TR>
-<TR><TH>09_nbsp<TH>
+<TR><TH>09_COMPLEX<TH>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD BGCOLOR=white>0</TD>
@@ -484,17 +484,17 @@ Analysis of JIS X 4051 to Unicode Genera
 <TD BGCOLOR=red>01_[a]</TD>
 <TD BGCOLOR=red>02_7</TD>
 <TD BGCOLOR=red>03_8</TD>
 <TD BGCOLOR=red>04_9</TD>
 <TD BGCOLOR=red>05_[b]</TD>
 <TD BGCOLOR=red>06_15</TD>
 <TD BGCOLOR=red>07_16</TD>
 <TD BGCOLOR=red>08_18</TD>
-<TD BGCOLOR=red>09_nbsp</TD>
+<TD BGCOLOR=red>09_COMPLEX</TD>
 <TD BGCOLOR=red>X</TD>
 </TR>
 <TR><TH>00<TH>
 <TD>6</TD>
 <TD>14</TD>
 <TD></TD>
 <TD>2</TD>
 <TD>1</TD>
@@ -552,16 +552,29 @@ Analysis of JIS X 4051 to Unicode Genera
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD>226</TD>
 <TD></TD>
 <TD></TD>
 </TR>
+<TR><TH>0E<TH>
+<TD>1</TD>
+<TD>6</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>20</TD>
+<TD></TD>
+<TD>1</TD>
+<TD></TD>
+<TD></TD>
+</TR>
 <TR><TH>20<TH>
 <TD></TD>
 <TD>5</TD>
 <TD>1</TD>
 <TD></TD>
 <TD>4</TD>
 <TD>13</TD>
 <TD></TD>
--- a/intl/lwbrk/tools/anzx4501.pl
+++ b/intl/lwbrk/tools/anzx4501.pl
@@ -395,16 +395,17 @@ printf "[%s || %s]\n", $r, $def;
       printf HEADER ", // U+%04X - U+%04X\n", $k + $i ,( $k + $i + 7);
    }
    print HEADER "};\n\n";
 }
 printarray("00", "8");
 printarray("20", "8");
 printarray("21", "8");
 printarray("30", "5");
+printarray("0E", "9");
 
 #print %rangecount;
 
 ######################################################################
 #
 # Close files
 #
 ######################################################################
--- a/intl/lwbrk/tools/jisx4501class.txt
+++ b/intl/lwbrk/tools/jisx4501class.txt
@@ -185,8 +185,17 @@ 2729;274B;18
 274D;;18
 274F;2752;18
 2756;;18
 2758;275E;18
 2761;2767;18
 2776;2794;18
 2798;27AF;18
 27B1;27BE;18
+0E3F;;1
+0E2F;;4
+0E46;;4
+0E5A;0E5B;4
+0E50;0E59;15
+0E4F;;18
+0EAF;;4
+0EC6;;4
+0ED0;0ED9;15
--- a/intl/lwbrk/tools/jisx4501simp.txt
+++ b/intl/lwbrk/tools/jisx4501simp.txt
@@ -13,9 +13,9 @@ 12;05_[b]
 13;X
 14;X
 15;06_15
 16;07_16
 17;05_[b]
 18;08_18
 19;X
 20;X
-21;09_nbsp
+21;09_COMPLEX