mozilla-central: changeset 508373:8ae790b27fe0beb911aa0e050579f6c62eef56d1

author	Makoto Kato <m_kato@ga2.so-net.ne.jp>
	Thu, 26 Dec 2019 03:34:49 +0000 (2019-12-26)
changeset 508373	8ae790b27fe0beb911aa0e050579f6c62eef56d1
parent 508372	7a8425c45bc2c40753be9675298a652ccc8a1242
child 508374	bd3843488a457fa721807dfe6d5f443b1c6d40f0
push id	36952
push user	ncsoregi@mozilla.com
push date	Thu, 26 Dec 2019 09:26:13 +0000 (2019-12-26)
treeherder	mozilla-central@bd3843488a45 [default view] [failures only]
perfherder	[talos] [build metrics] [platform microbench] (compared to previous push)
reviewers	masayuki
bugs	1602526, 1571339
milestone	73.0a1
first release with	nightly linux32 bd3843488a45 / 73.0a1 / 20191226092613 / files nightly linux64 bd3843488a45 / 73.0a1 / 20191226092613 / files nightly mac bd3843488a45 / 73.0a1 / 20191226092613 / files nightly win32 bd3843488a45 / 73.0a1 / 20191226092613 / files nightly win64 bd3843488a45 / 73.0a1 / 20191226092613 / files
last release without	nightly linux32 6f0d0f918cbf / 73.0a1 / 20191225215646 / files nightly linux64 6f0d0f918cbf / 73.0a1 / 20191225215646 / files nightly mac 6f0d0f918cbf / 73.0a1 / 20191225215646 / files nightly win32 6f0d0f918cbf / 73.0a1 / 20191225215646 / files nightly win64 6f0d0f918cbf / 73.0a1 / 20191225215646 / files

--- a/extensions/spellcheck/src/mozInlineSpellWordUtil.cpp
+++ b/extensions/spellcheck/src/mozInlineSpellWordUtil.cpp
@@ -23,46 +23,66 @@
 #include <algorithm>
 
 using namespace mozilla;
 
 // IsIgnorableCharacter
 //
 //    These characters are ones that we should ignore in input.
 
+inline bool IsIgnorableCharacter(char ch) {
+  return (ch == static_cast<char>(0xAD));  // SOFT HYPHEN
+}
+
 inline bool IsIgnorableCharacter(char16_t ch) {
   return (ch == 0xAD ||   // SOFT HYPHEN
           ch == 0x1806);  // MONGOLIAN TODO SOFT HYPHEN
 }
 
 // IsConditionalPunctuation
 //
 //    Some characters (like apostrophes) require characters on each side to be
 //    part of a word, and are otherwise punctuation.
 
+inline bool IsConditionalPunctuation(char ch) {
+  return (ch == '\'' ||       // APOSTROPHE
+          ch == static_cast<char>(0xB7));  // MIDDLE DOT
+}
+
 inline bool IsConditionalPunctuation(char16_t ch) {
   return (ch == '\'' || ch == 0x2019 ||  // RIGHT SINGLE QUOTATION MARK
           ch == 0x00B7);                 // MIDDLE DOT
 }
 
 static bool IsAmbiguousDOMWordSeprator(char16_t ch) {
   // This class may be CHAR_CLASS_SEPARATOR, but it depends on context.
   return (ch == '@' || ch == ':' || ch == '.' || ch == '/' || ch == '-' ||
           IsConditionalPunctuation(ch));
 }
 
+static bool IsAmbiguousDOMWordSeprator(char ch) {
+  // This class may be CHAR_CLASS_SEPARATOR, but it depends on context.
+  // Widen through uint8_t so that bytes >= 0x80 (e.g. 0xB7 MIDDLE DOT) are
+  // zero-extended. A direct char -> char16_t cast sign-extends where char is
+  // signed, turning 0xB7 into 0xFFB7, which the char16_t overload would never
+  // recognize as conditional punctuation.
+  return IsAmbiguousDOMWordSeprator(
+      static_cast<char16_t>(static_cast<uint8_t>(ch)));
+}
+
 // IsDOMWordSeparator
 //
 //    Determines if the given character should be considered as a DOM Word
 //    separator. Basically, this is whitespace, although it could also have
 //    certain punctuation that we know ALWAYS breaks words. This is important.
 //    For example, we can't have any punctuation that could appear in a URL
 //    or email address in this, because those need to always fit into a single
 //    DOM word.
 
+static bool IsDOMWordSeparator(char ch) {
+  // simple spaces or no-break space
+  return (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' ||
+          ch == static_cast<char>(0xA0));
+}
+
 static bool IsDOMWordSeparator(char16_t ch) {
   // simple spaces
   if (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r') return true;
 
   // complex spaces - check only if char isn't ASCII (uncommon)
   if (ch >= 0xA0 && (ch == 0x00A0 ||  // NO-BREAK SPACE
                      ch == 0x2002 ||  // EN SPACE
                      ch == 0x2003 ||  // EM SPACE
@@ -409,30 +429,32 @@ struct MOZ_STACK_CLASS WordSplitState {
   // input. This checks for things that do not require special word-breaking
   // rules.
   bool ShouldSkipWord(int32_t aStart, int32_t aLength) const;
 
   // Checks to see if there's a DOM word separator before aBeforeOffset within
   // it. This function does not modify aSeparatorOffset when it returns false.
   bool GetDOMWordSeparatorOffset(int32_t aOffset,
                                  int32_t* aSeparatorOffset) const;
+
+  char16_t GetUnicharAt(int32_t aIndex) const;
 };
 
 // WordSplitState::ClassifyCharacter
 template <class T>
 CharClass WordSplitState<T>::ClassifyCharacter(int32_t aIndex,
                                                bool aRecurse) const {
   NS_ASSERTION(aIndex >= 0 && aIndex <= int32_t(mDOMWordText.Length()),
                "Index out of range");
   if (aIndex == int32_t(mDOMWordText.Length())) return CHAR_CLASS_SEPARATOR;
 
   // this will classify the character, we want to treat "ignorable" characters
   // such as soft hyphens, and also ZWJ and ZWNJ as word characters.
   nsUGenCategory charCategory =
-      mozilla::unicode::GetGenCategory(mDOMWordText[aIndex]);
+      mozilla::unicode::GetGenCategory(GetUnicharAt(aIndex));
   if (charCategory == nsUGenCategory::kLetter ||
       IsIgnorableCharacter(mDOMWordText[aIndex]) ||
       mDOMWordText[aIndex] == 0x200C /* ZWNJ */ ||
       mDOMWordText[aIndex] == 0x200D /* ZWJ */)
     return CHAR_CLASS_WORD;
 
   // If conditional punctuation is surrounded immediately on both sides by word
   // characters it also counts as a word character.
@@ -583,17 +605,17 @@ bool WordSplitState<T>::IsSpecialWord() 
 
 // WordSplitState::ShouldSkipWord
 template <class T>
 bool WordSplitState<T>::ShouldSkipWord(int32_t aStart, int32_t aLength) const {
   int32_t last = aStart + aLength;
 
   // check to see if the word contains a digit
   for (int32_t i = aStart; i < last; i++) {
-    if (unicode::GetGenCategory(mDOMWordText[i]) == nsUGenCategory::kNumber) {
+    if (mozilla::unicode::GetGenCategory(GetUnicharAt(i)) == nsUGenCategory::kNumber) {
       return true;
     }
   }
 
   // not special
   return false;
 }
 
@@ -616,16 +638,28 @@ bool WordSplitState<T>::GetDOMWordSepara
       }
       *aSeparatorOffset = i;
       return true;
     }
   }
   return false;
 }
 
+template <>
+char16_t WordSplitState<nsDependentSubstring>::GetUnicharAt(
+    int32_t aIndex) const {
+  return mDOMWordText[aIndex];
+}
+
+template <>
+char16_t WordSplitState<nsDependentCSubstring>::GetUnicharAt(
+    int32_t aIndex) const {
+  return static_cast<char16_t>(static_cast<uint8_t>(mDOMWordText[aIndex]));
+}
+
 static inline bool IsBRElement(nsINode* aNode) {
   return aNode->IsHTMLElement(nsGkAtoms::br);
 }
 
 /**
  * Given a TextNode, checks to see if there's a DOM word separator before
  * aBeforeOffset within it. This function does not modify aSeparatorOffset when
  * it returns false.