author | Makoto Kato <m_kato@ga2.so-net.ne.jp> |
Thu, 26 Dec 2019 03:34:49 +0000 (2019-12-26) | |
changeset 508373 | 8ae790b27fe0beb911aa0e050579f6c62eef56d1 |
parent 508372 | 7a8425c45bc2c40753be9675298a652ccc8a1242 |
child 508374 | bd3843488a457fa721807dfe6d5f443b1c6d40f0 |
push id | 36952 |
push user | ncsoregi@mozilla.com |
push date | Thu, 26 Dec 2019 09:26:13 +0000 (2019-12-26) |
treeherder | mozilla-central@bd3843488a45 [default view] [failures only] |
perfherder | [talos] [build metrics] [platform microbench] (compared to previous push) |
reviewers | masayuki |
bugs | 1602526, 1571339 |
milestone | 73.0a1 |
first release with | nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
|
last release without | nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
|
extensions/spellcheck/src/mozInlineSpellWordUtil.cpp | file | annotate | diff | comparison | revisions |
--- a/extensions/spellcheck/src/mozInlineSpellWordUtil.cpp +++ b/extensions/spellcheck/src/mozInlineSpellWordUtil.cpp @@ -23,46 +23,66 @@ #include <algorithm> using namespace mozilla; // IsIgnorableCharacter // // These characters are ones that we should ignore in input. +inline bool IsIgnorableCharacter(char ch) { + return (ch == static_cast<char>(0xAD)); // SOFT HYPHEN +} + inline bool IsIgnorableCharacter(char16_t ch) { return (ch == 0xAD || // SOFT HYPHEN ch == 0x1806); // MONGOLIAN TODO SOFT HYPHEN } // IsConditionalPunctuation // // Some characters (like apostrophes) require characters on each side to be // part of a word, and are otherwise punctuation. +inline bool IsConditionalPunctuation(char ch) { + return (ch == '\'' || // RIGHT SINGLE QUOTATION MARK + ch == static_cast<char>(0xB7)); // MIDDLE DOT +} + inline bool IsConditionalPunctuation(char16_t ch) { return (ch == '\'' || ch == 0x2019 || // RIGHT SINGLE QUOTATION MARK ch == 0x00B7); // MIDDLE DOT } static bool IsAmbiguousDOMWordSeprator(char16_t ch) { // This class may be CHAR_CLASS_SEPARATOR, but it depends on context. return (ch == '@' || ch == ':' || ch == '.' || ch == '/' || ch == '-' || IsConditionalPunctuation(ch)); } +static bool IsAmbiguousDOMWordSeprator(char ch) { + // This class may be CHAR_CLASS_SEPARATOR, but it depends on context. + return IsAmbiguousDOMWordSeprator(static_cast<char16_t>(ch)); +} + // IsDOMWordSeparator // // Determines if the given character should be considered as a DOM Word // separator. Basically, this is whitespace, although it could also have // certain punctuation that we know ALWAYS breaks words. This is important. // For example, we can't have any punctuation that could appear in a URL // or email address in this, because those need to always fit into a single // DOM word. +static bool IsDOMWordSeparator(char ch) { + // simple spaces or no-break space + return (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' || + ch == static_cast<char>(0xA0)); +} + static bool IsDOMWordSeparator(char16_t ch) { // simple spaces if (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r') return true; // complex spaces - check only if char isn't ASCII (uncommon) if (ch >= 0xA0 && (ch == 0x00A0 || // NO-BREAK SPACE ch == 0x2002 || // EN SPACE ch == 0x2003 || // EM SPACE @@ -409,30 +429,32 @@ struct MOZ_STACK_CLASS WordSplitState { // input. This checks for things that do not require special word-breaking // rules. bool ShouldSkipWord(int32_t aStart, int32_t aLength) const; // Checks to see if there's a DOM word separator before aBeforeOffset within // it. This function does not modify aSeparatorOffset when it returns false. bool GetDOMWordSeparatorOffset(int32_t aOffset, int32_t* aSeparatorOffset) const; + + char16_t GetUnicharAt(int32_t aIndex) const; }; // WordSplitState::ClassifyCharacter template <class T> CharClass WordSplitState<T>::ClassifyCharacter(int32_t aIndex, bool aRecurse) const { NS_ASSERTION(aIndex >= 0 && aIndex <= int32_t(mDOMWordText.Length()), "Index out of range"); if (aIndex == int32_t(mDOMWordText.Length())) return CHAR_CLASS_SEPARATOR; // this will classify the character, we want to treat "ignorable" characters // such as soft hyphens, and also ZWJ and ZWNJ as word characters. nsUGenCategory charCategory = - mozilla::unicode::GetGenCategory(mDOMWordText[aIndex]); + mozilla::unicode::GetGenCategory(GetUnicharAt(aIndex)); if (charCategory == nsUGenCategory::kLetter || IsIgnorableCharacter(mDOMWordText[aIndex]) || mDOMWordText[aIndex] == 0x200C /* ZWNJ */ || mDOMWordText[aIndex] == 0x200D /* ZWJ */) return CHAR_CLASS_WORD; // If conditional punctuation is surrounded immediately on both sides by word // characters it also counts as a word character. @@ -583,17 +605,17 @@ bool WordSplitState<T>::IsSpecialWord() // WordSplitState::ShouldSkipWord template <class T> bool WordSplitState<T>::ShouldSkipWord(int32_t aStart, int32_t aLength) const { int32_t last = aStart + aLength; // check to see if the word contains a digit for (int32_t i = aStart; i < last; i++) { - if (unicode::GetGenCategory(mDOMWordText[i]) == nsUGenCategory::kNumber) { + if (mozilla::unicode::GetGenCategory(GetUnicharAt(i)) == nsUGenCategory::kNumber) { return true; } } // not special return false; } @@ -616,16 +638,28 @@ bool WordSplitState<T>::GetDOMWordSepara } *aSeparatorOffset = i; return true; } } return false; } +template <> +char16_t WordSplitState<nsDependentSubstring>::GetUnicharAt( + int32_t aIndex) const { + return mDOMWordText[aIndex]; +} + +template <> +char16_t WordSplitState<nsDependentCSubstring>::GetUnicharAt( + int32_t aIndex) const { + return static_cast<char16_t>(static_cast<uint8_t>(mDOMWordText[aIndex])); +} + static inline bool IsBRElement(nsINode* aNode) { return aNode->IsHTMLElement(nsGkAtoms::br); } /** * Given a TextNode, checks to see if there's a DOM word separator before * aBeforeOffset within it. This function does not modify aSeparatorOffset when * it returns false.