Bug 548495. Eliminate sources of O(N^2) backtracking in the linebreaker. r=masayuki
authorBoris Zbarsky <bzbarsky@mit.edu>
Fri, 26 Feb 2010 21:32:31 -0500
changeset 33674 ae59a9d6b71990edee2fbfe8bcd6842b81b04216
parent 33673 9185a0d685737906932ee0734e2622855883659c
child 33675 82fa604cdf23ef0ebd1d5f0779facc322610eb28
push id1120
push userbzbarsky@mozilla.com
push dateSat, 06 Mar 2010 03:42:14 +0000
reviewersmasayuki
bugs548495
milestone1.9.2.2pre
Bug 548495. Eliminate sources of O(N^2) backtracking in the linebreaker. r=masayuki
intl/lwbrk/src/nsJISx4501LineBreaker.cpp
--- a/intl/lwbrk/src/nsJISx4501LineBreaker.cpp
+++ b/intl/lwbrk/src/nsJISx4501LineBreaker.cpp
@@ -576,20 +576,18 @@ public:
   PRUint32 Length() { return mLength; }
   PRUint32 Index() { return mIndex; }
 
   PRUnichar GetCharAt(PRUint32 aIndex) {
     NS_ASSERTION(0 <= aIndex && aIndex < mLength, "Out of range!");
     return mUniText ? mUniText[aIndex] : PRUnichar(mText[aIndex]);
   }
 
-  void AdvanceIndexTo(PRUint32 aIndex) {
-    NS_ASSERTION(mIndex <= aIndex, "the index cannot decrease.");
-    NS_ASSERTION(aIndex < mLength, "out of range");
-    mIndex = aIndex;
+  void AdvanceIndex() {
+    ++mIndex;
   }
 
   void NotifyBreakBefore() { mLastBreakIndex = mIndex; }
 
 // A word of western language should not be broken. But even if the word has
 // only ASCII characters, non-natural context words should be broken, e.g.,
 // URL and file path. For protecting the natural words, we should use
 // conservative breaking rules at following conditions:
@@ -620,62 +618,78 @@ public:
     // Note that index is always less than mLength - CONSERVATIVE_BREAK_RANGE.
     for (PRUint32 i = index + 1; i < index + CONSERVATIVE_BREAK_RANGE; ++i) {
       if (IS_NONBREAKABLE_SPACE(GetCharAt(i)))
         return PR_TRUE;
     }
     return PR_FALSE;
   }
 
-  PRBool HasCharacterAlready(PRUnichar aCh) {
-    // Be careful for the index being unsigned.
-    for (PRUint32 i = mIndex; i > 0; --i) {
-      if (GetCharAt(i - 1) == aCh)
-        return PR_TRUE;
-    }
-    return PR_FALSE;
+  PRBool HasPreviousEqualsSign() const {
+    return mHasPreviousEqualsSign;
+  }
+  void NotifySeenEqualsSign() {
+    mHasPreviousEqualsSign = PR_TRUE;
   }
 
-  PRUnichar GetPreviousNonHyphenCharacter() {
-    NS_ASSERTION(IS_HYPHEN(GetCharAt(mIndex)),
-                 "current character isn't hyphen");
-    // Be careful for the index being unsigned.
-    for (PRUint32 i = mIndex; i > 0; --i) {
-      PRUnichar ch = GetCharAt(i - 1);
-      if (!IS_HYPHEN(ch))
-        return ch;
-    }
-    return U_NULL;
+  PRBool HasPreviousSlash() const {
+    return mHasPreviousSlash;
+  }
+  void NotifySeenSlash() {
+    mHasPreviousSlash = PR_TRUE;
+  }
+
+  PRBool HasPreviousBackslash() const {
+    return mHasPreviousBackslash;
+  }
+  void NotifySeenBackslash() {
+    mHasPreviousBackslash = PR_TRUE;
+  }
+
+  PRUnichar GetPreviousNonHyphenCharacter() const {
+    return mPreviousNonHyphenCharacter;
+  }
+  void NotifyNonHyphenCharacter(PRUnichar ch) {
+    mPreviousNonHyphenCharacter = ch;
   }
 
 private:
   void Init() {
     mIndex = 0;
     mLastBreakIndex = 0;
+    mPreviousNonHyphenCharacter = U_NULL;
     mHasCJKChar = 0;
     mHasNonbreakableSpace = 0;
+    mHasPreviousEqualsSign = PR_FALSE;
+    mHasPreviousSlash = PR_FALSE;
+    mHasPreviousBackslash = PR_FALSE;
 
     for (PRUint32 i = 0; i < mLength; ++i) {
       PRUnichar u = GetCharAt(i);
       if (!mHasNonbreakableSpace && IS_NONBREAKABLE_SPACE(u))
         mHasNonbreakableSpace = 1;
       else if (mUniText && !mHasCJKChar && IS_CJK_CHAR(u))
         mHasCJKChar = 1;
     }
   }
 
   const PRUnichar* mUniText;
   const PRUint8* mText;
 
   PRUint32 mIndex;
   PRUint32 mLength;         // length of text
   PRUint32 mLastBreakIndex;
+  PRUnichar mPreviousNonHyphenCharacter; // The last character we have seen
+                                         // which is not U_HYPHEN
   PRPackedBool mHasCJKChar; // if the text has CJK character, this is true.
   PRPackedBool mHasNonbreakableSpace; // if the text has no-breakable space,
                                      // this is true.
+  PRPackedBool mHasPreviousEqualsSign; // True if we have seen a U_EQUAL
+  PRPackedBool mHasPreviousSlash;      // True if we have seen a U_SLASH
+  PRPackedBool mHasPreviousBackslash;  // True if we have seen a U_BACKSLASH
 };
 
 static PRInt8
 ContextualAnalysis(PRUnichar prev, PRUnichar cur, PRUnichar next,
                    ContextState &aState)
 {
   // Don't return CLASS_OPEN/CLASS_CLOSE if aState.UseJISX4051 is FALSE.
 
@@ -697,49 +711,62 @@ ContextualAnalysis(PRUnichar prev, PRUni
         PRBool prevIsChar = !NEED_CONTEXTUAL_ANALYSIS(prevOfHyphen) &&
                             GetClass(prevOfHyphen) == CLASS_CHARACTER;
         PRBool nextIsChar = !NEED_CONTEXTUAL_ANALYSIS(next) &&
                             GetClass(next) == CLASS_CHARACTER;
         if ((prevIsNum || prevIsChar) && (nextIsNum || nextIsChar))
           return CLASS_CLOSE;
       }
     }
-  } else if (cur == U_SLASH || cur == U_BACKSLASH) {
-    // If this is immediately after same char, we should not break here.
-    if (prev == cur)
-      return CLASS_CHARACTER;
-    // If this text has two or more (BACK)SLASHs, this may be file path or URL.
-    if (!aState.UseConservativeBreaking() &&
-        aState.HasCharacterAlready(cur))
-      return CLASS_OPEN;
-  } else if (cur == U_PERCENT) {
-    // If this is a part of the param of URL, we should break before.
-    if (!aState.UseConservativeBreaking()) {
-      if (aState.Index() >= 3 &&
-          aState.GetCharAt(aState.Index() - 3) == U_PERCENT)
-        return CLASS_OPEN;
-      if (aState.Index() + 3 < aState.Length() &&
-          aState.GetCharAt(aState.Index() + 3) == U_PERCENT)
+  } else {
+    aState.NotifyNonHyphenCharacter(cur);
+    if (cur == U_SLASH || cur == U_BACKSLASH) {
+      // If this is immediately after same char, we should not break here.
+      if (prev == cur)
+        return CLASS_CHARACTER;
+      // If this text has two or more (BACK)SLASHs, this may be file path or URL.
+      // Make sure to compute shouldReturn before we notify on this slash.
+      PRBool shouldReturn = !aState.UseConservativeBreaking() &&
+        (cur == U_SLASH ?
+         aState.HasPreviousSlash() : aState.HasPreviousBackslash());
+
+      if (cur == U_SLASH) {
+        aState.NotifySeenSlash();
+      } else {
+        aState.NotifySeenBackslash();
+      }
+
+      if (shouldReturn)
         return CLASS_OPEN;
+    } else if (cur == U_PERCENT) {
+      // If this is a part of the param of URL, we should break before.
+      if (!aState.UseConservativeBreaking()) {
+        if (aState.Index() >= 3 &&
+            aState.GetCharAt(aState.Index() - 3) == U_PERCENT)
+          return CLASS_OPEN;
+        if (aState.Index() + 3 < aState.Length() &&
+            aState.GetCharAt(aState.Index() + 3) == U_PERCENT)
+          return CLASS_OPEN;
+      }
+    } else if (cur == U_AMPERSAND || cur == U_SEMICOLON) {
+      // If this may be a separator of params of URL, we should break after.
+      if (!aState.UseConservativeBreaking(1) &&
+          aState.HasPreviousEqualsSign())
+        return CLASS_CLOSE;
+    } else if (cur == U_OPEN_SINGLE_QUOTE ||
+               cur == U_OPEN_DOUBLE_QUOTE ||
+               cur == U_OPEN_GUILLEMET) {
+      // for CJK usage, we treat these as openers to allow a break before them,
+      // but otherwise treat them as normal characters because quote mark usage
+      // in various Western languages varies too much; see bug #450088 discussion.
+      if (!aState.UseConservativeBreaking() && IS_CJK_CHAR(next))
+        return CLASS_OPEN;
+    } else {
+      NS_ERROR("Forgot to handle the current character!");
     }
-  } else if (cur == U_AMPERSAND || cur == U_SEMICOLON) {
-    // If this may be a separator of params of URL, we should break after.
-    if (!aState.UseConservativeBreaking(1) &&
-        aState.HasCharacterAlready(U_EQUAL))
-      return CLASS_CLOSE;
-  } else if (cur == U_OPEN_SINGLE_QUOTE ||
-             cur == U_OPEN_DOUBLE_QUOTE ||
-             cur == U_OPEN_GUILLEMET) {
-    // for CJK usage, we treat these as openers to allow a break before them,
-    // but otherwise treat them as normal characters because quote mark usage
-    // in various Western languages varies too much; see bug #450088 discussion.
-    if (!aState.UseConservativeBreaking() && IS_CJK_CHAR(next))
-      return CLASS_OPEN;
-  } else {
-    NS_ERROR("Forgot to handle the current character!");
   }
   return GetClass(cur);
 }
 
 
 PRInt32
 nsJISx4051LineBreaker::WordMove(const PRUnichar* aText, PRUint32 aLen,
                                 PRUint32 aPos, PRInt8 aDirection)
@@ -807,27 +834,29 @@ nsJISx4051LineBreaker::Prev(const PRUnic
 void
 nsJISx4051LineBreaker::GetJISx4051Breaks(const PRUnichar* aChars, PRUint32 aLength,
                                          PRPackedBool* aBreakBefore)
 {
   PRUint32 cur;
   PRInt8 lastClass = CLASS_NONE;
   ContextState state(aChars, aLength);
 
-  for (cur = 0; cur < aLength; ++cur) {
+  for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) {
     PRUnichar ch = aChars[cur];
     PRInt8 cl;
-    state.AdvanceIndexTo(cur);
 
     if (NEED_CONTEXTUAL_ANALYSIS(ch)) {
       cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL,
                               ch,
                               cur + 1 < aLength ? aChars[cur + 1] : U_NULL,
                               state);
     } else {
+      if (ch == U_EQUAL)
+        state.NotifySeenEqualsSign();
+      state.NotifyNonHyphenCharacter(ch);
       cl = GetClass(ch);
     }
 
     PRBool allowBreak;
     if (cur > 0) {
       NS_ASSERTION(CLASS_COMPLEX != lastClass || CLASS_COMPLEX != cl,
                    "Loop should have prevented adjacent complex chars here");
       if (state.UseConservativeBreaking())
@@ -862,27 +891,29 @@ nsJISx4051LineBreaker::GetJISx4051Breaks
 void
 nsJISx4051LineBreaker::GetJISx4051Breaks(const PRUint8* aChars, PRUint32 aLength,
                                          PRPackedBool* aBreakBefore)
 {
   PRUint32 cur;
   PRInt8 lastClass = CLASS_NONE;
   ContextState state(aChars, aLength);
 
-  for (cur = 0; cur < aLength; ++cur) {
+  for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) {
     PRUnichar ch = aChars[cur];
     PRInt8 cl;
-    state.AdvanceIndexTo(cur);
 
     if (NEED_CONTEXTUAL_ANALYSIS(ch)) {
       cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL,
                               ch,
                               cur + 1 < aLength ? aChars[cur + 1] : U_NULL,
                               state);
     } else {
+      if (ch == U_EQUAL)
+        state.NotifySeenEqualsSign();
+      state.NotifyNonHyphenCharacter(ch);
       cl = GetClass(ch);
     }
 
     PRBool allowBreak;
     if (cur > 0) {
       if (state.UseConservativeBreaking())
         allowBreak = GetPairConservative(lastClass, cl);
       else