Bug 809020 - Use a shorter "conservative breaking" range at word edges when dealing with letters rather than punctuation etc., and adjust existing tests accordingly. r=masayuki
author Jonathan Kew <jkew@mozilla.com>
Wed, 01 Mar 2017 22:47:56 +0000
changeset 374563 e980c683af8cb856dbb7409373d2cef7972fb4ca
parent 374562 283f43e8790b268c6f5316b007fddb6658b6dfba
child 374564 84e809d7bf0b6da8cdbb02475cfbb2b97377c836
push id 10863
push user jlorenzo@mozilla.com
push date Mon, 06 Mar 2017 23:02:23 +0000
treeherder mozilla-aurora@0931190cd725 [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers masayuki
bugs 809020
milestone 54.0a1
Bug 809020 - Use a shorter "conservative breaking" range at word edges when dealing with letters rather than punctuation etc., and adjust existing tests accordingly. r=masayuki
intl/lwbrk/nsJISx4051LineBreaker.cpp
layout/reftests/line-breaking/datetime-1-ref.html
layout/reftests/line-breaking/hyphens-1-ref.html
layout/reftests/line-breaking/hyphens-1.html
layout/reftests/line-breaking/reftest.list
layout/reftests/line-breaking/url-3-ref.html
--- a/intl/lwbrk/nsJISx4051LineBreaker.cpp
+++ b/intl/lwbrk/nsJISx4051LineBreaker.cpp
@@ -7,16 +7,18 @@
 
 #include "nsJISx4051LineBreaker.h"
 
 #include "jisx4051class.h"
 #include "nsComplexBreaker.h"
 #include "nsTArray.h"
 #include "nsUnicodeProperties.h"
 
+using namespace mozilla::unicode;
+
 /* 
 
    Simplification of Pair Table in JIS X 4051
 
    1. The Origion Table - in 4.1.3
 
    In JIS x 4051. The pair table is defined as below
 
@@ -623,39 +625,60 @@ public:
 
 // A word of western language should not be broken. But even if the word has
 // only ASCII characters, non-natural context words should be broken, e.g.,
 // URL and file path. For protecting the natural words, we should use
 // conservative breaking rules at following conditions:
 //   1. at near the start of word
 //   2. at near the end of word
 //   3. at near the latest broken point
-// CONSERVATIVE_BREAK_RANGE define the 'near' in characters.
-#define CONSERVATIVE_BREAK_RANGE 6
+// CONSERVATIVE_RANGE_{LETTER,OTHER} define the 'near' in characters,
+// which varies depending on whether we are looking at a letter or a non-letter
+// character: for non-letters, we use an extended "conservative" range.
+
+#define CONSERVATIVE_RANGE_LETTER 2
+#define CONSERVATIVE_RANGE_OTHER  6
 
   bool UseConservativeBreaking(uint32_t aOffset = 0) {
     if (mHasCJKChar)
       return false;
     uint32_t index = mIndex + aOffset;
-    bool result = (index < CONSERVATIVE_BREAK_RANGE ||
-                     mLength - index < CONSERVATIVE_BREAK_RANGE ||
-                     index - mLastBreakIndex < CONSERVATIVE_BREAK_RANGE);
+
+    // If the character at index is a letter (rather than various punctuation
+    // characters, etc) then we want a shorter "conservative" range
+    uint32_t conservativeRangeStart, conservativeRangeEnd;
+    if (index < mLength &&
+        GetGenCategory(GetCharAt(index)) == nsIUGenCategory::kLetter) {
+      // Primarily for hyphenated word prefixes/suffixes; we add 1 to Start
+      // to get more balanced behavior (if we break off a 2-letter prefix,
+      // that means the break will actually be three letters from start of
+      // word, to include the hyphen; whereas a 2-letter suffix will be
+      // broken only two letters from end of word).
+      conservativeRangeEnd = CONSERVATIVE_RANGE_LETTER;
+      conservativeRangeStart = CONSERVATIVE_RANGE_LETTER + 1;
+    } else {
+      conservativeRangeEnd = conservativeRangeStart = CONSERVATIVE_RANGE_OTHER;
+    }
+
+    bool result = (index < conservativeRangeStart ||
+                     mLength - index < conservativeRangeEnd ||
+                     index - mLastBreakIndex < conservativeRangeStart);
     if (result || !mHasNonbreakableSpace)
       return result;
 
     // This text has no-breakable space, we need to check whether the index
     // is near it.
 
-    // Note that index is always larger than CONSERVATIVE_BREAK_RANGE here.
-    for (uint32_t i = index; index - CONSERVATIVE_BREAK_RANGE < i; --i) {
+    // Note that index is always at least conservativeRangeStart here.
+    for (uint32_t i = index; index - conservativeRangeStart < i; --i) {
       if (IS_NONBREAKABLE_SPACE(GetCharAt(i - 1)))
         return true;
     }
-    // Note that index is always less than mLength - CONSERVATIVE_BREAK_RANGE.
-    for (uint32_t i = index + 1; i < index + CONSERVATIVE_BREAK_RANGE; ++i) {
+    // Note that index is always at most mLength - conservativeRangeEnd.
+    for (uint32_t i = index + 1; i < index + conservativeRangeEnd; ++i) {
       if (IS_NONBREAKABLE_SPACE(GetCharAt(i)))
         return true;
     }
     return false;
   }
 
   bool HasPreviousEqualsSign() const {
     return mHasPreviousEqualsSign;
--- a/layout/reftests/line-breaking/datetime-1-ref.html
+++ b/layout/reftests/line-breaking/datetime-1-ref.html
@@ -1,19 +1,19 @@
 <html>
 <head>
 <style type="text/css"> p { margin: 5px 1em; width: 0; white-space: nowrap; } </style>
 </head>
 <body>
 
 <p>2007-01-01</p>
-<p>2007-Jan-01</p>
+<p>2007-<br>Jan-01</p>
 <p>Jan-01-2007</p>
 <p>2007-01-01&nbsp;00:00:00</p>
-<p>2007-Jan-01&nbsp;00:00:00</p>
+<p>2007-<br>Jan-01&nbsp;00:00:00</p>
 <p>Jan-01-2007&nbsp;00:00:00</p>
 
 <p>2007/01/01</p>
 <p>2007/Jan/01</p>
 <p>Jan/01/2007</p>
 <p>2007/01/01&nbsp;00:00:00</p>
 <p>2007/Jan/01&nbsp;00:00:00</p>
 <p>Jan/01/2007&nbsp;00:00:00</p>
--- a/layout/reftests/line-breaking/hyphens-1-ref.html
+++ b/layout/reftests/line-breaking/hyphens-1-ref.html
@@ -1,41 +1,41 @@
 <html>
 <head>
 <style type="text/css"> p { margin: 5px 1em; width: 0; white-space: nowrap; } </style>
 </head>
 <body>
 
-<p>abcde-<br>abcdef</p>
-<p>abcd-abcdef</p>
-<p>abcde-abcde</p>
+<p>ab-<br>ab</p>
+<p>a-ab</p>
+<p>ab-a</p>
 <p>abcdef--<br>abcdef</p>
 <p>------abcdef<p>
 
 <!-- U+058A is ARMENIAN HYPHEN -->
-<p>abcde&#x058A;<br>abcdef</p>
-<p>abcd&#x058A;abcdef</p>
-<p>abcde&#x058A;abcde</p>
-<p>abcde&#x058A;&#x058A;<br>abcdef</p>
-<p>&#x058A;&#x058A;&#x058A;&#x058A;&#x058A;&#x058A;abcdef</p>
+<p>ab&#x058A;<br>ab</p>
+<p>a&#x058A;ab</p>
+<p>ab&#x058A;a</p>
+<p>abcdef&#x058A;&#x058A;<br>abcdef</p>
+<p>&#x058A;&#x058A;&#x058A;&#x058A;&#x058A;&#x058A;abcdef<p>
 
 <!-- U+2010 is HYPHEN -->
-<p>abcde&#x2010;<br>abcdef</p>
-<p>abcd&#x2010;abcdef</p>
-<p>abcde&#x2010;abcde</p>
-<p>abcde&#x2010;&#x2010;<br>abcdef</p>
-<p>&#x2010;&#x2010;&#x2010;&#x2010;&#x2010;&#x2010;abcdef</p>
+<p>ab&#x2010;<br>ab</p>
+<p>a&#x2010;ab</p>
+<p>ab&#x2010;a</p>
+<p>abcdef&#x2010;&#x2010;<br>abcdef</p>
+<p>&#x2010;&#x2010;&#x2010;&#x2010;&#x2010;&#x2010;abcdef<p>
 
 <!-- U+2012 is FIGURE DASH -->
-<p>abcde&#x2012;<br>abcdef</p>
-<p>abcd&#x2012;abcdef</p>
-<p>abcde&#x2012;abcde</p>
-<p>abcde&#x2012;&#x2012;<br>abcdef</p>
-<p>&#x2012;&#x2012;&#x2012;&#x2012;&#x2012;&#x2012;abcdef</p>
+<p>ab&#x2012;<br>ab</p>
+<p>a&#x2012;ab</p>
+<p>ab&#x2012;a</p>
+<p>abcdef&#x2012;&#x2012;<br>abcdef</p>
+<p>&#x2012;&#x2012;&#x2012;&#x2012;&#x2012;&#x2012;abcdef<p>
 
-<p>abcde&ndash;<br>abcdef</p>
-<p>abcd&ndash;abcdef</p>
-<p>abcde&ndash;abcde</p>
-<p>abcde&ndash;&ndash;<br>abcdef</p>
-<p>&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;abcdef</p>
+<p>ab&ndash;<br>ab</p>
+<p>a&ndash;ab</p>
+<p>ab&ndash;a</p>
+<p>abcdef&ndash;&ndash;<br>abcdef</p>
+<p>&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;abcdef<p>
 
 </body>
 </html>
--- a/layout/reftests/line-breaking/hyphens-1.html
+++ b/layout/reftests/line-breaking/hyphens-1.html
@@ -1,41 +1,41 @@
 <html>
 <head>
 <style type="text/css"> p { margin: 5px 1em; width: 0; } </style>
 </head>
 <body>
 
-<p>abcde-abcdef</p>
-<p>abcd-abcdef</p>
-<p>abcde-abcde</p>
+<p>ab-ab</p>
+<p>a-ab</p>
+<p>ab-a</p>
 <p>abcdef--abcdef</p>
 <p>------abcdef<p>
 
 <!-- U+058A is ARMENIAN HYPHEN -->
-<p>abcde&#x058A;abcdef</p>
-<p>abcd&#x058A;abcdef</p>
-<p>abcde&#x058A;abcde</p>
-<p>abcde&#x058A;&#x058A;abcdef</p>
-<p>&#x058A;&#x058A;&#x058A;&#x058A;&#x058A;&#x058A;abcdef</p>
+<p>ab&#x058A;ab</p>
+<p>a&#x058A;ab</p>
+<p>ab&#x058A;a</p>
+<p>abcdef&#x058A;&#x058A;abcdef</p>
+<p>&#x058A;&#x058A;&#x058A;&#x058A;&#x058A;&#x058A;abcdef<p>
 
 <!-- U+2010 is HYPHEN -->
-<p>abcde&#x2010;abcdef</p>
-<p>abcd&#x2010;abcdef</p>
-<p>abcde&#x2010;abcde</p>
-<p>abcde&#x2010;&#x2010;abcdef</p>
-<p>&#x2010;&#x2010;&#x2010;&#x2010;&#x2010;&#x2010;abcdef</p>
+<p>ab&#x2010;ab</p>
+<p>a&#x2010;ab</p>
+<p>ab&#x2010;a</p>
+<p>abcdef&#x2010;&#x2010;abcdef</p>
+<p>&#x2010;&#x2010;&#x2010;&#x2010;&#x2010;&#x2010;abcdef<p>
 
 <!-- U+2012 is FIGURE DASH -->
-<p>abcde&#x2012;abcdef</p>
-<p>abcd&#x2012;abcdef</p>
-<p>abcde&#x2012;abcde</p>
-<p>abcde&#x2012;&#x2012;abcdef</p>
-<p>&#x2012;&#x2012;&#x2012;&#x2012;&#x2012;&#x2012;abcdef</p>
+<p>ab&#x2012;ab</p>
+<p>a&#x2012;ab</p>
+<p>ab&#x2012;a</p>
+<p>abcdef&#x2012;&#x2012;abcdef</p>
+<p>&#x2012;&#x2012;&#x2012;&#x2012;&#x2012;&#x2012;abcdef<p>
 
-<p>abcde&ndash;abcdef</p>
-<p>abcd&ndash;abcdef</p>
-<p>abcde&ndash;abcde</p>
-<p>abcde&ndash;&ndash;abcdef</p>
-<p>&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;abcdef</p>
+<p>ab&ndash;ab</p>
+<p>a&ndash;ab</p>
+<p>ab&ndash;a</p>
+<p>abcdef&ndash;&ndash;abcdef</p>
+<p>&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;abcdef<p>
 
 </body>
 </html>
--- a/layout/reftests/line-breaking/reftest.list
+++ b/layout/reftests/line-breaking/reftest.list
@@ -1,11 +1,11 @@
 == between-whitespaces.html between-whitespaces-ref.html
 == chemical-1.html chemical-1-ref.html
-fails == conservative-range-1.html conservative-range-1-ref.html
+== conservative-range-1.html conservative-range-1-ref.html
 == conservative-range-2.html conservative-range-2-ref.html
 == currency-1.html currency-1-ref.html
 == currency-2.html currency-2-ref.html
 == datetime-1.html datetime-1-ref.html
 == emoji-1.html emoji-1-ref.html
 == emoji-2.html emoji-2-ref.html
 == hyphens-1.html hyphens-1-ref.html
 == hyphens-2.html hyphens-2-ref.html
--- a/layout/reftests/line-breaking/url-3-ref.html
+++ b/layout/reftests/line-breaking/url-3-ref.html
@@ -1,14 +1,14 @@
 <html>
 <head>
 <style type="text/css"> p { margin: 5px 1em; width: 0; white-space: nowrap; } </style>
 </head>
 <body>
 
-<p>index.cgi?abcdef=<br>%E6%97<br>%A5%E6<br>%9C%AC<br>%E8%AA<br>%9E&amp;abcdef=<br>%E6%97<br>%A5%E6<br>%9C%AC<br>%E8%AA%9E</p>
-<p>index.cgi?abcdef=<br>%E6%97<br>%A5%E6<br>%9C%AC<br>%E8%AA<br>%9E;abcdef=<br>%E6%97<br>%A5%E6<br>%9C%AC<br>%E8%AA%9E</p>
+<p>index.cgi?abcdef=<br>%E6%97<br>%A5%E6<br>%9C%AC<br>%E8%AA<br>%9E&amp;<br>abcdef=<br>%E6%97<br>%A5%E6<br>%9C%AC<br>%E8%AA%9E</p>
+<p>index.cgi?abcdef=<br>%E6%97<br>%A5%E6<br>%9C%AC<br>%E8%AA<br>%9E;<br>abcdef=<br>%E6%97<br>%A5%E6<br>%9C%AC<br>%E8%AA%9E</p>
 <p>%E6%97<br>%A5%E6<br>%9C%AC<br>%E8%AA%9E</p>
 <p>%E6%97<br>%A5%E6<br>%9C%AC<br>%E8%AA%9Eab</p>
 <p>%E6%97<br>%A5%E6<br>%9C%AC<br>%E8%AA<br>%9Eabc</p>
 
 </body>
 </html>