Bug 1602526 - Part 1. Don't use implicit cast from char to uint32_t r=masayuki
author: Makoto Kato <m_kato@ga2.so-net.ne.jp>
Thu, 26 Dec 2019 03:34:49 +0000 (2019-12-26)
changeset 508373 8ae790b27fe0beb911aa0e050579f6c62eef56d1
parent 508372 7a8425c45bc2c40753be9675298a652ccc8a1242
child 508374 bd3843488a457fa721807dfe6d5f443b1c6d40f0
push id: 36952
push user: ncsoregi@mozilla.com
push date: Thu, 26 Dec 2019 09:26:13 +0000 (2019-12-26)
treeherder: mozilla-central@bd3843488a45 [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: masayuki
bugs: 1602526, 1571339
milestone: 73.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1602526 - Part 1. Don't use implicit cast from char to uint32_t r=masayuki This is a regression from bug 1571339. A non-ASCII character such as 0xf9 will be converted to 0xfff9 etc. by the implicit cast, so I should cast from char to uint8_t or add char versions of the functions. Differential Revision: https://phabricator.services.mozilla.com/D58090
extensions/spellcheck/src/mozInlineSpellWordUtil.cpp
--- a/extensions/spellcheck/src/mozInlineSpellWordUtil.cpp
+++ b/extensions/spellcheck/src/mozInlineSpellWordUtil.cpp
@@ -23,46 +23,66 @@
 #include <algorithm>
 
 using namespace mozilla;
 
 // IsIgnorableCharacter
 //
 //    These characters are ones that we should ignore in input.
 
+inline bool IsIgnorableCharacter(char ch) {
+  return (ch == static_cast<char>(0xAD));  // SOFT HYPHEN; 0x1806 does not fit in an 8-bit char
+}
+
 inline bool IsIgnorableCharacter(char16_t ch) {
   return (ch == 0xAD ||   // SOFT HYPHEN
           ch == 0x1806);  // MONGOLIAN TODO SOFT HYPHEN
 }
 
 // IsConditionalPunctuation
 //
 //    Some characters (like apostrophes) require characters on each side to be
 //    part of a word, and are otherwise punctuation.
 
+inline bool IsConditionalPunctuation(char ch) {
+  return (ch == '\'' ||       // APOSTROPHE; 0x2019 does not fit in an 8-bit char
+          ch == static_cast<char>(0xB7));  // MIDDLE DOT
+}
+
 inline bool IsConditionalPunctuation(char16_t ch) {
   return (ch == '\'' || ch == 0x2019 ||  // RIGHT SINGLE QUOTATION MARK
           ch == 0x00B7);                 // MIDDLE DOT
 }
 
 static bool IsAmbiguousDOMWordSeprator(char16_t ch) {
   // This class may be CHAR_CLASS_SEPARATOR, but it depends on context.
   return (ch == '@' || ch == ':' || ch == '.' || ch == '/' || ch == '-' ||
           IsConditionalPunctuation(ch));
 }
 
+static bool IsAmbiguousDOMWordSeprator(char ch) {
+  // Widen via uint8_t: a signed char 0xB7 must become 0x00B7, not 0xFFB7.
+  return IsAmbiguousDOMWordSeprator(static_cast<char16_t>(static_cast<uint8_t>(ch)));
+}
+
 // IsDOMWordSeparator
 //
 //    Determines if the given character should be considered as a DOM Word
 //    separator. Basically, this is whitespace, although it could also have
 //    certain punctuation that we know ALWAYS breaks words. This is important.
 //    For example, we can't have any punctuation that could appear in a URL
 //    or email address in this, because those need to always fit into a single
 //    DOM word.
 
+static bool IsDOMWordSeparator(char ch) {
+  // simple spaces or no-break space
+  return (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' ||
+          ch == static_cast<char>(0xA0));
+}
+
 static bool IsDOMWordSeparator(char16_t ch) {
   // simple spaces
   if (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r') return true;
 
   // complex spaces - check only if char isn't ASCII (uncommon)
   if (ch >= 0xA0 && (ch == 0x00A0 ||  // NO-BREAK SPACE
                      ch == 0x2002 ||  // EN SPACE
                      ch == 0x2003 ||  // EM SPACE
@@ -409,30 +429,32 @@ struct MOZ_STACK_CLASS WordSplitState {
   // input. This checks for things that do not require special word-breaking
   // rules.
   bool ShouldSkipWord(int32_t aStart, int32_t aLength) const;
 
   // Checks to see if there's a DOM word separator before aBeforeOffset within
   // it. This function does not modify aSeparatorOffset when it returns false.
   bool GetDOMWordSeparatorOffset(int32_t aOffset,
                                  int32_t* aSeparatorOffset) const;
+
+  char16_t GetUnicharAt(int32_t aIndex) const;
 };
 
 // WordSplitState::ClassifyCharacter
 template <class T>
 CharClass WordSplitState<T>::ClassifyCharacter(int32_t aIndex,
                                                bool aRecurse) const {
   NS_ASSERTION(aIndex >= 0 && aIndex <= int32_t(mDOMWordText.Length()),
                "Index out of range");
   if (aIndex == int32_t(mDOMWordText.Length())) return CHAR_CLASS_SEPARATOR;
 
   // this will classify the character, we want to treat "ignorable" characters
   // such as soft hyphens, and also ZWJ and ZWNJ as word characters.
   nsUGenCategory charCategory =
-      mozilla::unicode::GetGenCategory(mDOMWordText[aIndex]);
+      mozilla::unicode::GetGenCategory(GetUnicharAt(aIndex));
   if (charCategory == nsUGenCategory::kLetter ||
       IsIgnorableCharacter(mDOMWordText[aIndex]) ||
       mDOMWordText[aIndex] == 0x200C /* ZWNJ */ ||
       mDOMWordText[aIndex] == 0x200D /* ZWJ */)
     return CHAR_CLASS_WORD;
 
   // If conditional punctuation is surrounded immediately on both sides by word
   // characters it also counts as a word character.
@@ -583,17 +605,17 @@ bool WordSplitState<T>::IsSpecialWord() 
 
 // WordSplitState::ShouldSkipWord
 template <class T>
 bool WordSplitState<T>::ShouldSkipWord(int32_t aStart, int32_t aLength) const {
   int32_t last = aStart + aLength;
 
   // check to see if the word contains a digit
   for (int32_t i = aStart; i < last; i++) {
-    if (unicode::GetGenCategory(mDOMWordText[i]) == nsUGenCategory::kNumber) {
+    if (mozilla::unicode::GetGenCategory(GetUnicharAt(i)) == nsUGenCategory::kNumber) {
       return true;
     }
   }
 
   // not special
   return false;
 }
 
@@ -616,16 +638,28 @@ bool WordSplitState<T>::GetDOMWordSepara
       }
       *aSeparatorOffset = i;
       return true;
     }
   }
   return false;
 }
 
+template <>
+char16_t WordSplitState<nsDependentSubstring>::GetUnicharAt(
+    int32_t aIndex) const {
+  return mDOMWordText[aIndex];
+}
+
+template <>
+char16_t WordSplitState<nsDependentCSubstring>::GetUnicharAt(
+    int32_t aIndex) const {
+  return static_cast<char16_t>(static_cast<uint8_t>(mDOMWordText[aIndex]));
+}
+
 static inline bool IsBRElement(nsINode* aNode) {
   return aNode->IsHTMLElement(nsGkAtoms::br);
 }
 
 /**
  * Given a TextNode, checks to see if there's a DOM word separator before
  * aBeforeOffset within it. This function does not modify aSeparatorOffset when
  * it returns false.