Bug 1342686 - Fix up the handling of surrogates in nsJISx4051LineBreaker. r=emk
authorJonathan Kew <jkew@mozilla.com>
Wed, 01 Mar 2017 22:47:56 +0000
changeset 394515 f7b0f3b271228e2f33e0f6ce641060862012f07d
parent 394514 84e809d7bf0b6da8cdbb02475cfbb2b97377c836
child 394516 42ddfa8072f309be87065547b6525c515fd14e30
push id1468
push userasasaki@mozilla.com
push dateMon, 05 Jun 2017 19:31:07 +0000
treeherdermozilla-release@0641fc6ee9d1 [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersemk
bugs1342686
milestone54.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1342686 - Fix up the handling of surrogates in nsJISx4051LineBreaker. r=emk
intl/lwbrk/nsJISx4051LineBreaker.cpp
layout/reftests/line-breaking/reftest.list
--- a/intl/lwbrk/nsJISx4051LineBreaker.cpp
+++ b/intl/lwbrk/nsJISx4051LineBreaker.cpp
@@ -376,22 +376,23 @@ GETCLASSFROMTABLE(const uint32_t* t, uin
 
 static inline int
 IS_HALFWIDTH_IN_JISx4051_CLASS3(char16_t u)
 {
   return ((0xff66 <= (u)) && ((u) <= 0xff70));
 }
 
 static inline int
-IS_CJK_CHAR(char16_t u)
+IS_CJK_CHAR(char32_t u)
 {
   return ((0x1100 <= (u) && (u) <= 0x11ff) ||
           (0x2e80 <= (u) && (u) <= 0xd7ff) ||
           (0xf900 <= (u) && (u) <= 0xfaff) ||
-          (0xff00 <= (u) && (u) <= 0xffef) );
+          (0xff00 <= (u) && (u) <= 0xffef) ||
+          (0x20000 <= (u) && (u) <= 0x2fffd));
 }
 
 static inline bool
 IS_NONBREAKABLE_SPACE(char16_t u)
 {
   return u == 0x00A0 || u == 0x2007; // NO-BREAK SPACE, FIGURE SPACE
 }
 
@@ -590,36 +591,54 @@ nsJISx4051LineBreaker::nsJISx4051LineBre
 nsJISx4051LineBreaker::~nsJISx4051LineBreaker()
 {
 }
 
 NS_IMPL_ISUPPORTS(nsJISx4051LineBreaker, nsILineBreaker)
 
 class ContextState {
 public:
-  ContextState(const char16_t* aText, uint32_t aLength) {
-    mUniText = aText;
-    mText = nullptr;
-    mLength = aLength;
+  ContextState(const char16_t* aText, uint32_t aLength)
+    : mUniText(aText)
+    , mText(nullptr)
+    , mLength(aLength)
+  {
+    Init();
+  }
+
+  ContextState(const uint8_t* aText, uint32_t aLength)
+    : mUniText(nullptr)
+    , mText(aText)
+    , mLength(aLength)
+  {
     Init();
   }
 
-  ContextState(const uint8_t* aText, uint32_t aLength) {
-    mUniText = nullptr;
-    mText = aText;
-    mLength = aLength;
-    Init();
+  uint32_t Length() const { return mLength; }
+  uint32_t Index() const { return mIndex; }
+
+  // This gets a single code unit of the text, without checking for surrogates
+  // (in the case of a 16-bit text buffer). That's OK if we're only checking for
+  // specific characters that are known to be BMP values.
+  char16_t GetCodeUnitAt(uint32_t aIndex) const {
+    MOZ_ASSERT(aIndex < mLength, "Out of range!");
+    return mUniText ? mUniText[aIndex] : char16_t(mText[aIndex]);
   }
 
-  uint32_t Length() { return mLength; }
-  uint32_t Index() { return mIndex; }
-
-  char16_t GetCharAt(uint32_t aIndex) {
-    NS_ASSERTION(aIndex < mLength, "Out of range!");
-    return mUniText ? mUniText[aIndex] : char16_t(mText[aIndex]);
+  // This gets a 32-bit Unicode character (codepoint), handling surrogate pairs
+  // as necessary. It must ONLY be called for 16-bit text, not 8-bit.
+  char32_t GetUnicodeCharAt(uint32_t aIndex) const {
+    MOZ_ASSERT(mUniText, "Only for 16-bit text!");
+    MOZ_ASSERT(aIndex < mLength, "Out of range!");
+    char32_t c = mUniText[aIndex];
+    if (NS_IS_HIGH_SURROGATE(c) && aIndex + 1 < mLength &&
+        NS_IS_LOW_SURROGATE(mUniText[aIndex + 1])) {
+      c = SURROGATE_TO_UCS4(c, mUniText[aIndex + 1]);
+    }
+    return c;
   }
 
   void AdvanceIndex() {
     ++mIndex;
   }
 
   void NotifyBreakBefore() { mLastBreakIndex = mIndex; }
 
@@ -632,26 +651,28 @@ public:
 //   3. at near the latest broken point
 // CONSERVATIVE_RANGE_{LETTER,OTHER} define the 'near' in characters,
 // which varies depending whether we are looking at a letter or a non-letter
 // character: for non-letters, we use an extended "conservative" range.
 
 #define CONSERVATIVE_RANGE_LETTER 2
 #define CONSERVATIVE_RANGE_OTHER  6
 
-  bool UseConservativeBreaking(uint32_t aOffset = 0) {
+  bool UseConservativeBreaking(uint32_t aOffset = 0) const {
     if (mHasCJKChar)
       return false;
     uint32_t index = mIndex + aOffset;
 
     // If the character at index is a letter (rather than various punctuation
     // characters, etc) then we want a shorter "conservative" range
     uint32_t conservativeRangeStart, conservativeRangeEnd;
     if (index < mLength &&
-        GetGenCategory(GetCharAt(index)) == nsIUGenCategory::kLetter) {
+        nsIUGenCategory::kLetter ==
+          (mText ? GetGenCategory(mText[index])
+                 : GetGenCategory(GetUnicodeCharAt(index)))) {
       // Primarily for hyphenated word prefixes/suffixes; we add 1 to Start
       // to get more balanced behavior (if we break off a 2-letter prefix,
       // that means the break will actually be three letters from start of
       // word, to include the hyphen; whereas a 2-letter suffix will be
       // broken only two letters from end of word).
       conservativeRangeEnd = CONSERVATIVE_RANGE_LETTER;
       conservativeRangeStart = CONSERVATIVE_RANGE_LETTER + 1;
     } else {
@@ -664,22 +685,22 @@ public:
     if (result || !mHasNonbreakableSpace)
       return result;
 
     // This text has no-breakable space, we need to check whether the index
     // is near it.
 
     // Note that index is always larger than conservativeRange here.
     for (uint32_t i = index; index - conservativeRangeStart < i; --i) {
-      if (IS_NONBREAKABLE_SPACE(GetCharAt(i - 1)))
+      if (IS_NONBREAKABLE_SPACE(GetCodeUnitAt(i - 1)))
         return true;
     }
     // Note that index is always less than mLength - conservativeRange.
     for (uint32_t i = index + 1; i < index + conservativeRangeEnd; ++i) {
-      if (IS_NONBREAKABLE_SPACE(GetCharAt(i)))
+      if (IS_NONBREAKABLE_SPACE(GetCodeUnitAt(i)))
         return true;
     }
     return false;
   }
 
   bool HasPreviousEqualsSign() const {
     return mHasPreviousEqualsSign;
   }
@@ -708,49 +729,70 @@ public:
     mPreviousNonHyphenCharacter = ch;
   }
 
 private:
   void Init() {
     mIndex = 0;
     mLastBreakIndex = 0;
     mPreviousNonHyphenCharacter = U_NULL;
-    mHasCJKChar = 0;
-    mHasNonbreakableSpace = 0;
+    mHasCJKChar = false;
+    mHasNonbreakableSpace = false;
     mHasPreviousEqualsSign = false;
     mHasPreviousSlash = false;
     mHasPreviousBackslash = false;
 
-    for (uint32_t i = 0; i < mLength; ++i) {
-      char16_t u = GetCharAt(i);
-      if (!mHasNonbreakableSpace && IS_NONBREAKABLE_SPACE(u))
-        mHasNonbreakableSpace = 1;
-      else if (mUniText && !mHasCJKChar && IS_CJK_CHAR(u))
-        mHasCJKChar = 1;
+    if (mText) {
+      // 8-bit text: we only need to check for &nbsp;
+      for (uint32_t i = 0; i < mLength; ++i) {
+        if (IS_NONBREAKABLE_SPACE(mText[i])) {
+          mHasNonbreakableSpace = true;
+          break;
+        }
+      }
+    } else {
+      // 16-bit text: handle surrogates and check for CJK as well as &nbsp;
+      for (uint32_t i = 0; i < mLength; ++i) {
+        char32_t u = GetUnicodeCharAt(i);
+        if (!mHasNonbreakableSpace && IS_NONBREAKABLE_SPACE(u)) {
+          mHasNonbreakableSpace = true;
+          if (mHasCJKChar) {
+            break;
+          }
+        } else if (!mHasCJKChar && IS_CJK_CHAR(u)) {
+          mHasCJKChar = 1;
+          if (mHasNonbreakableSpace) {
+            break;
+          }
+        }
+        if (u > 0xFFFFu) {
+          ++i; // step over trailing low surrogate
+        }
+      }
     }
   }
 
-  const char16_t* mUniText;
-  const uint8_t* mText;
+  const char16_t* const mUniText;
+  const uint8_t* const mText;
 
   uint32_t mIndex;
-  uint32_t mLength;         // length of text
+  const uint32_t mLength;         // length of text
   uint32_t mLastBreakIndex;
-  uint32_t mPreviousNonHyphenCharacter; // The last character we have seen
+  char32_t mPreviousNonHyphenCharacter; // The last character we have seen
                                          // which is not U_HYPHEN
   bool mHasCJKChar; // if the text has CJK character, this is true.
   bool mHasNonbreakableSpace; // if the text has no-breakable space,
                                      // this is true.
   bool mHasPreviousEqualsSign; // True if we have seen a U_EQUAL
   bool mHasPreviousSlash;      // True if we have seen a U_SLASH
   bool mHasPreviousBackslash;  // True if we have seen a U_BACKSLASH
 };
 
 static int8_t
-ContextualAnalysis(char16_t prev, char16_t cur, char16_t next,
+ContextualAnalysis(char32_t prev, char32_t cur, char32_t next,
                    ContextState &aState)
 {
   // Don't return CLASS_OPEN/CLASS_CLOSE if aState.UseJISX4051 is FALSE.
 
   if (IS_HYPHEN(cur)) {
     // If next character is hyphen, we don't need to break between them.
     if (IS_HYPHEN(next))
       return CLASS_CHARACTER;
@@ -758,17 +800,17 @@ ContextualAnalysis(char16_t prev, char16
     // So, we should not break here.
     bool prevIsNum = IS_ASCII_DIGIT(prev);
     bool nextIsNum = IS_ASCII_DIGIT(next);
     if (prevIsNum && nextIsNum)
       return CLASS_NUMERIC;
     // If one side is numeric and the other is a character, or if both sides are
     // characters, the hyphen should be breakable.
     if (!aState.UseConservativeBreaking(1)) {
-      char16_t prevOfHyphen = aState.GetPreviousNonHyphenCharacter();
+      char32_t prevOfHyphen = aState.GetPreviousNonHyphenCharacter();
       if (prevOfHyphen && next) {
         int8_t prevClass = GetClass(prevOfHyphen);
         int8_t nextClass = GetClass(next);
         bool prevIsNumOrCharOrClose =
           prevIsNum ||
           (prevClass == CLASS_CHARACTER &&
             !NEED_CONTEXTUAL_ANALYSIS(prevOfHyphen)) ||
           prevClass == CLASS_CLOSE ||
@@ -805,20 +847,20 @@ ContextualAnalysis(char16_t prev, char16
       }
 
       if (shouldReturn)
         return CLASS_OPEN;
     } else if (cur == U_PERCENT) {
       // If this is a part of the param of URL, we should break before.
       if (!aState.UseConservativeBreaking()) {
         if (aState.Index() >= 3 &&
-            aState.GetCharAt(aState.Index() - 3) == U_PERCENT)
+            aState.GetCodeUnitAt(aState.Index() - 3) == U_PERCENT)
           return CLASS_OPEN;
         if (aState.Index() + 3 < aState.Length() &&
-            aState.GetCharAt(aState.Index() + 3) == U_PERCENT)
+            aState.GetCodeUnitAt(aState.Index() + 3) == U_PERCENT)
           return CLASS_OPEN;
       }
     } else if (cur == U_AMPERSAND || cur == U_SEMICOLON) {
       // If this may be a separator of params of URL, we should break after.
       if (!aState.UseConservativeBreaking(1) &&
           aState.HasPreviousEqualsSign())
         return CLASS_CLOSE;
     } else if (cur == U_OPEN_SINGLE_QUOTE ||
@@ -907,29 +949,39 @@ nsJISx4051LineBreaker::GetJISx4051Breaks
                                          uint8_t aWordBreak,
                                          uint8_t* aBreakBefore)
 {
   uint32_t cur;
   int8_t lastClass = CLASS_NONE;
   ContextState state(aChars, aLength);
 
   for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) {
-    uint32_t ch = aChars[cur];
-    if (NS_IS_HIGH_SURROGATE(ch)) {
-      if (cur + 1 < aLength && NS_IS_LOW_SURROGATE(aChars[cur + 1])) {
-        ch = SURROGATE_TO_UCS4(ch, aChars[cur + 1]);
-      }
-    }
+    char32_t ch = state.GetUnicodeCharAt(cur);
+    uint32_t chLen = ch > 0xFFFFu ? 2 : 1;
     int8_t cl;
 
     if (NEED_CONTEXTUAL_ANALYSIS(ch)) {
-      cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL,
-                              ch,
-                              cur + 1 < aLength ? aChars[cur + 1] : U_NULL,
-                              state);
+      char32_t prev, next;
+      if (cur > 0) {
+        // not using state.GetUnicodeCharAt() here because we're looking back
+        // rather than forward for possible surrogates
+        prev = aChars[cur - 1];
+        if (NS_IS_LOW_SURROGATE(prev) && cur > 1 &&
+            NS_IS_HIGH_SURROGATE(aChars[cur - 2])) {
+          prev = SURROGATE_TO_UCS4(aChars[cur - 2], prev);
+        }
+      } else {
+        prev = 0;
+      }
+      if (cur + chLen < aLength) {
+        next = state.GetUnicodeCharAt(cur + chLen);
+      } else {
+        next = 0;
+      }
+      cl = ContextualAnalysis(prev, ch, next, state);
     } else {
       if (ch == U_EQUAL)
         state.NotifySeenEqualsSign();
       state.NotifyNonHyphenCharacter(ch);
       cl = GetClass(ch);
     }
 
     bool allowBreak = false;
@@ -943,20 +995,27 @@ nsJISx4051LineBreaker::GetJISx4051Breaks
         allowBreak = true;
       }
     }
     aBreakBefore[cur] = allowBreak;
     if (allowBreak)
       state.NotifyBreakBefore();
     lastClass = cl;
     if (CLASS_COMPLEX == cl) {
-      uint32_t end = cur + 1;
+      uint32_t end = cur + chLen;
 
-      while (end < aLength && CLASS_COMPLEX == GetClass(aChars[end])) {
+      while (end < aLength) {
+        char32_t c = state.GetUnicodeCharAt(end);
+        if (CLASS_COMPLEX != GetClass(c)) {
+          break;
+        }
         ++end;
+        if (c > 0xFFFFU) { // it was a surrogate pair
+          ++end;
+        }
       }
 
       NS_GetComplexLineBreaks(aChars + cur, end - cur, aBreakBefore + cur);
 
       // We have to consider word-break value again for complex characters
       if (aWordBreak != nsILineBreaker::kWordBreak_Normal) {
         // Respect word-break property 
         for (uint32_t i = cur; i < end; i++)
@@ -965,17 +1024,17 @@ nsJISx4051LineBreaker::GetJISx4051Breaks
 
       // restore breakability at chunk begin, which was always set to false
       // by the complex line breaker
       aBreakBefore[cur] = allowBreak;
 
       cur = end - 1;
     }
 
-    if (ch > 0xffff) {
+    if (chLen == 2) {
       // Supplementary-plane character: mark that we cannot break before the
       // trailing low surrogate, and advance past it.
       ++cur;
       aBreakBefore[cur] = false;
       state.AdvanceIndex();
     }
   }
 }
@@ -985,17 +1044,17 @@ nsJISx4051LineBreaker::GetJISx4051Breaks
                                          uint8_t aWordBreak,
                                          uint8_t* aBreakBefore)
 {
   uint32_t cur;
   int8_t lastClass = CLASS_NONE;
   ContextState state(aChars, aLength);
 
   for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) {
-    char16_t ch = aChars[cur];
+    char32_t ch = aChars[cur];
     int8_t cl;
 
     if (NEED_CONTEXTUAL_ANALYSIS(ch)) {
       cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL,
                               ch,
                               cur + 1 < aLength ? aChars[cur + 1] : U_NULL,
                               state);
     } else {
--- a/layout/reftests/line-breaking/reftest.list
+++ b/layout/reftests/line-breaking/reftest.list
@@ -29,17 +29,17 @@ random-if(cocoaWidget) == ja-3.html ja-3
 == quotationmarks-1.html quotationmarks-1-ref.html
 # The following is currently disabled on Linux because of a rendering issue with missing-glyph
 # representations on the test boxes. See bug #450088 for discussion.
 skip-if(gtkWidget) == quotationmarks-cjk-1.html quotationmarks-cjk-1-ref.html
 == smileys-1.html smileys-1-ref.html
 == smileys-2.html smileys-2-ref.html
 == space-cluster-1.html space-cluster-1-ref.html
 == space-cluster-2.html space-cluster-2-ref.html
-fails == surrogates-1.html surrogates-1-ref.html
-fails == surrogates-2.html surrogates-2-ref.html
-fails == surrogates-3.html surrogates-3-ref.html
-fails == surrogates-4.html surrogates-4-ref.html
+== surrogates-1.html surrogates-1-ref.html
+== surrogates-2.html surrogates-2-ref.html
+== surrogates-3.html surrogates-3-ref.html
+== surrogates-4.html surrogates-4-ref.html
 == url-1.html url-1-ref.html
 == url-2.html url-2-ref.html
 == url-3.html url-3-ref.html
 == winpath-1.html winpath-1-ref.html
 == zwnbsp-1.html zwnbsp-1-ref.html