Bug 450088 Line breaking regression (in Chinese and other languages) p=Jonathan Kew, r=masayuki, sr=roc
author Masayuki Nakano <masayuki@d-toybox.com>
Tue, 09 Dec 2008 15:41:42 +0900
changeset 22539 556a93a5ffa60b493e03401acf1bbc9dca7077e6
parent 22538 957a4fed14af1edfccedb05131ac3385f0d84881
child 22541 ab68014ec016201b71283ef61bfaae982f4cf642
push id unknown
push user unknown
push date unknown
reviewers masayuki, roc
bugs 450088
milestone 1.9.2a1pre
Bug 450088 Line breaking regression (in Chinese and other languages) p=Jonathan Kew, r=masayuki, sr=roc
intl/lwbrk/src/jisx4501class.h
intl/lwbrk/src/nsJISx4501LineBreaker.cpp
intl/lwbrk/tools/anzx4501.html
intl/lwbrk/tools/jisx4501class.txt
layout/reftests/line-breaking/quotationmarks-cjk-1-ref.html
layout/reftests/line-breaking/quotationmarks-cjk-1.html
layout/reftests/line-breaking/reftest.list
--- a/intl/lwbrk/src/jisx4501class.h
+++ b/intl/lwbrk/src/jisx4501class.h
@@ -55,34 +55,34 @@ 0x77777777, // U+0060 - U+0067
 0x77777777, // U+0068 - U+006F
 0x77777777, // U+0070 - U+0077
 0x7AAA9777, // U+0078 - U+007F
 0x77777777, // U+0080 - U+0087
 0x77777777, // U+0088 - U+008F
 0x77777777, // U+0090 - U+0097
 0x77777777, // U+0098 - U+009F
 0xAA9A9AAB, // U+00A0 - U+00A7
-0x77A9A77A, // U+00A8 - U+00AF
+0x77A9777A, // U+00A8 - U+00AF
 0xAAAAAAAA, // U+00B0 - U+00B7
 0xAAAAAAAA, // U+00B8 - U+00BF
 0x77777777, // U+00C0 - U+00C7
 0x77777777, // U+00C8 - U+00CF
 0x77777777, // U+00D0 - U+00D7
 0x77777777, // U+00D8 - U+00DF
 0x77777777, // U+00E0 - U+00E7
 0x77777777, // U+00E8 - U+00EF
 0xA7777777, // U+00F0 - U+00F7
 0x77777777, // U+00F8 - U+00FF
 };
 
 static const PRUint32 gLBClass20[32] = {
 0xB5555555, // U+2000 - U+2007
 0x77775555, // U+2008 - U+200F
 0x777277B7, // U+2010 - U+2017
-0x77777777, // U+2018 - U+201F
+0x77A777A7, // U+2018 - U+201F
 0xAAAA7777, // U+2020 - U+2027
 0xB7777777, // U+2028 - U+202F
 0x77744444, // U+2030 - U+2037
 0x7A115107, // U+2038 - U+203F
 0x11017777, // U+2040 - U+2047
 0x77777711, // U+2048 - U+204F
 0x77777777, // U+2050 - U+2057
 0x77777777, // U+2058 - U+205F
--- a/intl/lwbrk/src/nsJISx4501LineBreaker.cpp
+++ b/intl/lwbrk/src/nsJISx4501LineBreaker.cpp
@@ -302,32 +302,32 @@ static const PRUint16 gPair[MAX_CLASSES]
        7  0000 1110 1100 0110  = 0x0EC6
        8  0000 1110 1100 0010  = 0x0EC2
        9  0000 1110 1100 0010  = 0x0EC2
       [b] 0000 1100 0000 0010  = 0x0C02
       15  0000 1111 1101 1111  = 0x0FDF
       18  0000 1111 1101 1111  = 0x0FDF
  COMPLEX  0000 1111 1100 0010  = 0x0FC2
       [c] 0000 1111 1111 1111  = 0x0FFF
-      [d] 0000 1111 1101 1111  = 0x0EDF
+      [d] 0000 1111 1101 1111  = 0x0FDF
       [e] 0000 1111 1111 1111  = 0x0FFF
 */
 
 static const PRUint16 gPairConservative[MAX_CLASSES] = {
   0x0FFF,
   0x0EC2,
   0x0EC6,
   0x0EC2,
   0x0EC2,
   0x0C02,
   0x0FDF,
   0x0FDF,
   0x0FC2,
   0x0FFF,
-  0x0EDF,
+  0x0FDF,
   0x0FFF
 };
 
 
 /*
 
    9. Now we map the class to number
 
@@ -379,23 +379,29 @@ static const PRUint16 gPairConservative[
 #define U_SLASH     PRUnichar('/')
 #define U_SPACE     PRUnichar(' ')
 #define U_HYPHEN    PRUnichar('-')
 #define U_EQUAL     PRUnichar('=')
 #define U_PERCENT   PRUnichar('%')
 #define U_AMPERSAND PRUnichar('&')
 #define U_SEMICOLON PRUnichar(';')
 #define U_BACKSLASH PRUnichar('\\')
+#define U_OPEN_SINGLE_QUOTE PRUnichar(0x2018)
+#define U_OPEN_DOUBLE_QUOTE PRUnichar(0x201C)
+#define U_OPEN_GUILLEMET    PRUnichar(0x00AB)
 
 #define NEED_CONTEXTUAL_ANALYSIS(c) (IS_HYPHEN(c) || \
                                      (c) == U_SLASH || \
                                      (c) == U_PERCENT || \
                                      (c) == U_AMPERSAND || \
                                      (c) == U_SEMICOLON || \
-                                     (c) == U_BACKSLASH)
+                                     (c) == U_BACKSLASH || \
+                                     (c) == U_OPEN_SINGLE_QUOTE || \
+                                     (c) == U_OPEN_DOUBLE_QUOTE || \
+                                     (c) == U_OPEN_GUILLEMET)
 
 #define IS_ASCII_DIGIT(u) (0x0030 <= (u) && (u) <= 0x0039)
 
 static inline int
 GETCLASSFROMTABLE(const PRUint32* t, PRUint16 l)
 {
   return ((((t)[(l>>3)]) >> ((l & 0x0007)<<2)) & 0x000f);
 }
@@ -714,16 +720,24 @@ ContextualAnalysis(PRUnichar prev, PRUni
           aState.GetCharAt(aState.Index() + 3) == U_PERCENT)
         return CLASS_OPEN;
     }
   } else if (cur == U_AMPERSAND || cur == U_SEMICOLON) {
     // If this may be a separator of params of URL, we should break after.
     if (!aState.UseConservativeBreaking(1) &&
         aState.HasCharacterAlready(U_EQUAL))
       return CLASS_CLOSE;
+  } else if (cur == U_OPEN_SINGLE_QUOTE ||
+             cur == U_OPEN_DOUBLE_QUOTE ||
+             cur == U_OPEN_GUILLEMET) {
+    // for CJK usage, we treat these as openers to allow a break before them,
+    // but otherwise treat them as normal characters because quote mark usage
+    // in various Western languages varies too much; see bug #450088 discussion.
+    if (!aState.UseConservativeBreaking() && IS_CJK_CHAR(next))
+      return CLASS_OPEN;
   } else {
     NS_ERROR("Forgot to handle the current character!");
   }
   return GetClass(cur);
 }
 
 
 PRInt32
--- a/intl/lwbrk/tools/anzx4501.html
+++ b/intl/lwbrk/tools/anzx4501.html
@@ -321,20 +321,20 @@ Analysis of JIS X 4051 to Unicode Genera
 <TD></TD>
 <TD></TD>
 </TR>
 <TR><TH>07_18<TH>
 <TD>19</TD>
 <TD>157</TD>
 <TD></TD>
 <TD>33</TD>
-<TD>57</TD>
+<TD>56</TD>
 <TD>125</TD>
 <TD>3</TD>
-<TD BGCOLOR=white>394</TD>
+<TD BGCOLOR=white>393</TD>
 <TD></TD>
 <TD>19</TD>
 <TD></TD>
 <TD></TD>
 <TD>67</TD>
 <TD>5</TD>
 <TD>4</TD>
 <TD></TD>
@@ -343,18 +343,18 @@ Analysis of JIS X 4051 to Unicode Genera
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD>3</TD>
 <TD>30</TD>
 <TD>4</TD>
 <TD>5</TD>
 <TD>2</TD>
-<TD>2</TD>
-<TD>4</TD>
+<TD></TD>
+<TD>5</TD>
 <TD>36</TD>
 <TD>4</TD>
 <TD></TD>
 <TD>3</TD>
 <TD>23</TD>
 <TD>99</TD>
 <TD>1</TD>
 <TD>1</TD>
@@ -438,20 +438,20 @@ Analysis of JIS X 4051 to Unicode Genera
 <TD></TD>
 <TD></TD>
 </TR>
 <TR><TH>0A_[d]<TH>
 <TD>1</TD>
 <TD>2</TD>
 <TD></TD>
 <TD>6</TD>
-<TD>25</TD>
+<TD>26</TD>
 <TD>16</TD>
 <TD></TD>
-<TD BGCOLOR=white>50</TD>
+<TD BGCOLOR=white>51</TD>
 <TD></TD>
 <TD>1</TD>
 <TD></TD>
 <TD></TD>
 <TD>2</TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
@@ -460,18 +460,18 @@ Analysis of JIS X 4051 to Unicode Genera
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD>6</TD>
 <TD></TD>
 <TD></TD>
 <TD>3</TD>
-<TD>1</TD>
-<TD>1</TD>
+<TD>3</TD>
+<TD></TD>
 <TD>20</TD>
 <TD></TD>
 <TD>2</TD>
 <TD>3</TD>
 <TD>7</TD>
 <TD>4</TD>
 <TD></TD>
 <TD></TD>
@@ -575,20 +575,20 @@ Analysis of JIS X 4051 to Unicode Genera
 <TR><TH>00<TH>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD>33</TD>
 <TD>10</TD>
-<TD>126</TD>
+<TD>127</TD>
 <TD></TD>
 <TD>7</TD>
-<TD>45</TD>
+<TD>44</TD>
 <TD>2</TD>
 <TD></TD>
 </TR>
 <TR><TH>0E<TH>
 <TD>1</TD>
 <TD>6</TD>
 <TD></TD>
 <TD></TD>
@@ -605,20 +605,20 @@ Analysis of JIS X 4051 to Unicode Genera
 <TR><TH>20<TH>
 <TD>2</TD>
 <TD>8</TD>
 <TD>1</TD>
 <TD></TD>
 <TD>5</TD>
 <TD>12</TD>
 <TD></TD>
-<TD>104</TD>
+<TD>102</TD>
 <TD></TD>
 <TD></TD>
-<TD>5</TD>
+<TD>7</TD>
 <TD>3</TD>
 <TD></TD>
 </TR>
 <TR><TH>21<TH>
 <TD></TD>
 <TD></TD>
 <TD></TD>
 <TD>1</TD>
--- a/intl/lwbrk/tools/jisx4501class.txt
+++ b/intl/lwbrk/tools/jisx4501class.txt
@@ -19,16 +19,17 @@ 0060;;18
 0061;007A;18
 007B;;22
 007B;007E;23
 00A0;;24
 00A3;;22
 00A5;;22
 00A9;;18
 00AA;;18
+00AB;;18
 00AC;;22
 00AE;;18
 00AF;;18
 00A1;00BF;23
 00B0;;18
 00F7;;23
 00C0;00FF;18
 0E3F;;1
@@ -44,16 +45,18 @@ 2007;;24
 2000;200B;17
 200C;200F;18
 2010;;18
 2011;;24
 2012;2013;18
 2014;;7
 2015;;18
 2016;2017;18
+2019;;23
+201D;;23
 2018;201F;18
 2020;2023;18
 2024;2026;23
 2027;;23
 2028;202E;18
 202F;;24
 2030;2034;9
 2035;2038;18
new file mode 100644
--- /dev/null
+++ b/layout/reftests/line-breaking/quotationmarks-cjk-1-ref.html
@@ -0,0 +1,18 @@
+<html>
+<head>
+<style type="text/css"> p { margin: 5px 1em; width: 0; white-space: nowrap; } </style>
+</head>
+<body>
+
+<p>&#x5B57;<br>&#x2018;&#x5B57;&#x2019;<br>&#x5B57;<br>&#x201C;&#x5B57;&#x201D;<br>&#x5B57;</p>
+<p>&#x5B57;&#xFF0C;<br>&#x2018;&#x5B57;&#xFF0C;&#x2019;<br>&#x5B57;&#xFF0C;<br>&#x201C;&#x5B57;&#xFF0C;&#x201D;<br>&#x5B57;</p>
+<p>&#x5B57;&#xFF0E;<br>&#x2018;&#x5B57;&#xFF0E;&#x2019;<br>&#x5B57;&#xFF0E;<br>&#x201C;&#x5B57;&#xFF0E;&#x201D;<br>&#x5B57;</p>
+<p>&#x5B57;&#xFF1A;<br>&#x2018;&#x5B57;&#xFF1A;&#x2019;<br>&#x5B57;&#xFF1A;<br>&#x201C;&#x5B57;&#xFF1A;&#x201D;<br>&#x5B57;</p>
+<p>&#x5B57;&#xFF1B;<br>&#x2018;&#x5B57;&#xFF1B;&#x2019;<br>&#x5B57;&#xFF1B;<br>&#x201C;&#x5B57;&#xFF1B;&#x201D;<br>&#x5B57;</p>
+<p>&#x5B57;&#xFF01;<br>&#x2018;&#x5B57;&#xFF01;&#x2019;<br>&#x5B57;&#xFF01;<br>&#x201C;&#x5B57;&#xFF01;&#x201D;<br>&#x5B57;</p>
+<p>&#x5B57;&#xFF1F;<br>&#x2018;&#x5B57;&#xFF1F;&#x2019;<br>&#x5B57;&#xFF1F;<br>&#x201C;&#x5B57;&#xFF1F;&#x201D;<br>&#x5B57;</p>
+<p>&#x5B57;<br>&#x2018;&#xFF08;&#x5B57;&#xFF09;&#x2019;<br>&#x5B57;<br>&#x201C;&#xFF08;&#x5B57;&#xFF09;&#x201D;<br>&#x5B57;</p>
+<p>&#x5B57;<br>&#xFF08;&#x2018;&#x5B57;&#x2019;&#xFF09;<br>&#x5B57;<br>&#xFF08;&#x201C;&#x5B57;&#x201D;&#xFF09;<br>&#x5B57;</p>
+
+</body>
+</html>
new file mode 100644
--- /dev/null
+++ b/layout/reftests/line-breaking/quotationmarks-cjk-1.html
@@ -0,0 +1,19 @@
+<html>
+<head>
+<meta content-type="text/html" charset="utf8">
+<style type="text/css"> p { margin: 5px 1em; width: 0; } </style>
+</head>
+<body>
+
+<p>&#x5B57;&#x2018;&#x5B57;&#x2019;&#x5B57;&#x201C;&#x5B57;&#x201D;&#x5B57;</p>
+<p>&#x5B57;&#xFF0C;&#x2018;&#x5B57;&#xFF0C;&#x2019;&#x5B57;&#xFF0C;&#x201C;&#x5B57;&#xFF0C;&#x201D;&#x5B57;</p>
+<p>&#x5B57;&#xFF0E;&#x2018;&#x5B57;&#xFF0E;&#x2019;&#x5B57;&#xFF0E;&#x201C;&#x5B57;&#xFF0E;&#x201D;&#x5B57;</p>
+<p>&#x5B57;&#xFF1A;&#x2018;&#x5B57;&#xFF1A;&#x2019;&#x5B57;&#xFF1A;&#x201C;&#x5B57;&#xFF1A;&#x201D;&#x5B57;</p>
+<p>&#x5B57;&#xFF1B;&#x2018;&#x5B57;&#xFF1B;&#x2019;&#x5B57;&#xFF1B;&#x201C;&#x5B57;&#xFF1B;&#x201D;&#x5B57;</p>
+<p>&#x5B57;&#xFF01;&#x2018;&#x5B57;&#xFF01;&#x2019;&#x5B57;&#xFF01;&#x201C;&#x5B57;&#xFF01;&#x201D;&#x5B57;</p>
+<p>&#x5B57;&#xFF1F;&#x2018;&#x5B57;&#xFF1F;&#x2019;&#x5B57;&#xFF1F;&#x201C;&#x5B57;&#xFF1F;&#x201D;&#x5B57;</p>
+<p>&#x5B57;&#x2018;&#xFF08;&#x5B57;&#xFF09;&#x2019;&#x5B57;&#x201C;&#xFF08;&#x5B57;&#xFF09;&#x201D;&#x5B57;</p>
+<p>&#x5B57;&#xFF08;&#x2018;&#x5B57;&#x2019;&#xFF09;&#x5B57;&#xFF08;&#x201C;&#x5B57;&#x201D;&#xFF09;&#x5B57;</p>
+
+</body>
+</html>
--- a/layout/reftests/line-breaking/reftest.list
+++ b/layout/reftests/line-breaking/reftest.list
@@ -7,14 +7,15 @@
 == ja-2.html ja-2-ref.html
 == ja-3.html ja-3-ref.html
 == leaders-1.html leaders-1-ref.html
 == markup-src-1.html markup-src-1-ref.html
 == non-breakable-1.html non-breakable-1-ref.html
 == numerics-1.html numerics-1-ref.html
 == parentheses-1.html parentheses-1-ref.html
 == quotationmarks-1.html quotationmarks-1-ref.html
+== quotationmarks-cjk-1.html quotationmarks-cjk-1-ref.html
 == smileys-1.html smileys-1-ref.html
 == smileys-2.html smileys-2-ref.html
 == url-1.html url-1-ref.html
 == url-2.html url-2-ref.html
 == url-3.html url-3-ref.html
 == winpath-1.html winpath-1-ref.html