bug 763703 - optimize Unicode property lookup and gfxScriptItemizer::Next. r=smontagu
author Jonathan Kew <jkew@mozilla.com>
Wed, 20 Jun 2012 20:58:18 +0100
changeset 102479 0fd12ab9899a16ab71900777a21c920d6f3c7e57
parent 102478 8e3fa5b800c589615c0f6b753ac6258e369e849a
child 102480 5c22222c03a5a187ebf691b668a75e323e86c445
push id 191
push user lsblakk@mozilla.com
push date Fri, 05 Oct 2012 17:12:53 +0000
treeherder mozilla-release@ddb22ac6c03b [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers smontagu
bugs 763703
milestone 16.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
bug 763703 - optimize Unicode property lookup and gfxScriptItemizer::Next. r=smontagu
gfx/thebes/gfxScriptItemizer.cpp
gfx/thebes/gfxScriptItemizer.h
intl/unicharutil/tools/genUnicodePropertyData.pl
intl/unicharutil/util/nsUnicodeProperties.cpp
intl/unicharutil/util/nsUnicodeProperties.h
intl/unicharutil/util/nsUnicodePropertyData.cpp
intl/unicharutil/util/nsUnicodeScriptCodes.h
--- a/gfx/thebes/gfxScriptItemizer.cpp
+++ b/gfx/thebes/gfxScriptItemizer.cpp
@@ -43,89 +43,39 @@
  * dealings in this Software without prior written authorization of the
  * copyright holder.
  *
  * All trademarks and registered trademarks mentioned herein are the property
  * of their respective owners. 
  */
 
 #include "gfxScriptItemizer.h"
-#include "gfxFontUtils.h" // for the FindHighestBit function
 #include "nsUnicodeProperties.h"
 
 #include "nsCharTraits.h"
 
-#define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
-
 #define MOD(sp) ((sp) % PAREN_STACK_DEPTH)
 #define LIMIT_INC(sp) (((sp) < PAREN_STACK_DEPTH)? (sp) + 1 : PAREN_STACK_DEPTH)
 #define INC(sp,count) (MOD((sp) + (count)))
 #define INC1(sp) (INC(sp, 1))
 #define DEC(sp,count) (MOD((sp) + PAREN_STACK_DEPTH - (count)))
 #define DEC1(sp) (DEC(sp, 1))
 #define STACK_IS_EMPTY() (pushCount <= 0)
 #define STACK_IS_NOT_EMPTY() (! STACK_IS_EMPTY())
 #define TOP() (parenStack[parenSP])
 #define SYNC_FIXUP() (fixupCount = 0)
 
-
-static const PRUint16 pairedChars[] = {
-    0x0028, 0x0029, /* ascii paired punctuation */
-    0x003c, 0x003e,
-    0x005b, 0x005d,
-    0x007b, 0x007d,
-    0x00ab, 0x00bb, /* guillemets */
-    0x2018, 0x2019, /* general punctuation */
-    0x201c, 0x201d,
-    0x2039, 0x203a,
-    0x207d, 0x207e, /* superscripts and subscripts */
-    0x208d, 0x208e,
-    0x275b, 0x275c, /* dingbat quotes and brackets */
-    0x275d, 0x275e,
-    0x2768, 0x2769,
-    0x276a, 0x276b,
-    0x276c, 0x276d,
-    0x276e, 0x276f,
-    0x2770, 0x2771,
-    0x2772, 0x2773,
-    0x2774, 0x2775,
-    /* omitted: lots of potentially-paired math symbols */
-    0x2e22, 0x2e23, /* supplemental punctuation */
-    0x2e24, 0x2e25,
-    0x2e26, 0x2e27,
-    0x2e28, 0x2e29,
-    0x3008, 0x3009, /* chinese paired punctuation */
-    0x300a, 0x300b,
-    0x300c, 0x300d,
-    0x300e, 0x300f,
-    0x3010, 0x3011,
-    0x3014, 0x3015,
-    0x3016, 0x3017,
-    0x3018, 0x3019,
-    0x301a, 0x301b,
-    0xfe59, 0xfe5a, /* small form variants */
-    0xfe5b, 0xfe5c,
-    0xfe5d, 0xfe5e,
-    0xfe64, 0xfe65,
-    0xff08, 0xff09, /* half-width and full-width forms */
-    0xff1c, 0xff1e,
-    0xff3b, 0xff3d,
-    0xff5b, 0xff5d,
-    0xff5f, 0xff60,
-    0xff62, 0xff63
-};
-
 void
-gfxScriptItemizer::push(PRInt32 pairIndex, PRInt32 scriptCode)
+gfxScriptItemizer::push(PRUint32 endPairChar, PRInt32 scriptCode)
 {
     pushCount  = LIMIT_INC(pushCount);
     fixupCount = LIMIT_INC(fixupCount);
 
     parenSP = INC1(parenSP);
-    parenStack[parenSP].pairIndex  = pairIndex;
+    parenStack[parenSP].endPairChar = endPairChar;
     parenStack[parenSP].scriptCode = scriptCode;
 }
 
 void
 gfxScriptItemizer::pop()
 {
     if (STACK_IS_EMPTY()) {
         return;
@@ -152,53 +102,33 @@ gfxScriptItemizer::fixup(PRInt32 scriptC
     PRInt32 fixupSP = DEC(parenSP, fixupCount);
 
     while (fixupCount-- > 0) {
         fixupSP = INC1(fixupSP);
         parenStack[fixupSP].scriptCode = scriptCode;
     }
 }
 
-static PRInt32
-getPairIndex(PRUint32 ch)
-{
-    PRInt32 pairedCharCount = ARRAY_SIZE(pairedChars);
-    PRInt32 pairedCharPower = mozilla::FindHighestBit(pairedCharCount);
-    PRInt32 pairedCharExtra = pairedCharCount - pairedCharPower;
-
-    PRInt32 probe = pairedCharPower;
-    PRInt32 pairIndex = 0;
-
-    if (ch >= pairedChars[pairedCharExtra]) {
-        pairIndex = pairedCharExtra;
-    }
-
-    while (probe > 1) {
-        probe >>= 1;
-
-        if (ch >= pairedChars[pairIndex + probe]) {
-            pairIndex += probe;
-        }
-    }
-
-    if (pairedChars[pairIndex] != ch) {
-        pairIndex = -1;
-    }
-
-    return pairIndex;
-}
-
-static bool
-sameScript(PRInt32 runScript, PRInt32 currCharScript)
+static inline bool
+SameScript(PRInt32 runScript, PRInt32 currCharScript)
 {
     return runScript <= MOZ_SCRIPT_INHERITED ||
            currCharScript <= MOZ_SCRIPT_INHERITED ||
            currCharScript == runScript;
 }
 
+// Return whether the char has a mirrored-pair counterpart.
+// NOTE that this depends on the implementation of nsCharProps records in
+// nsUnicodeProperties, and may need to be updated if those structures change
+static inline bool
+HasMirroredChar(PRUint32 aCh)
+{
+    return GetCharProps1(aCh).mMirrorOffsetIndex != 0;
+}
+
 gfxScriptItemizer::gfxScriptItemizer(const PRUnichar *src, PRUint32 length)
     : textPtr(src), textLength(length)
 {
     reset();
 }
 
 void
 gfxScriptItemizer::SetText(const PRUnichar *src, PRUint32 length)
@@ -219,85 +149,87 @@ gfxScriptItemizer::Next(PRUint32& aRunSt
     }
 
     SYNC_FIXUP();
     scriptCode = MOZ_SCRIPT_COMMON;
 
     for (scriptStart = scriptLimit; scriptLimit < textLength; scriptLimit += 1) {
         PRUint32 ch;
         PRInt32 sc;
-        PRInt32 pairIndex;
         PRUint32 startOfChar = scriptLimit;
 
         ch = textPtr[scriptLimit];
 
-        /*
-         * MODIFICATION for Gecko - clear the paired-character stack
-         * when we see a space character, because we cannot trust
-         * context outside the current "word" when doing textrun
-         * construction
-         */
-        if (ch == 0x20) {
-            while (STACK_IS_NOT_EMPTY()) {
-                pop();
+        /* decode UTF-16 (may be surrogate pair) */
+        if (NS_IS_HIGH_SURROGATE(ch) && scriptLimit < textLength - 1) {
+            PRUint32 low = textPtr[scriptLimit + 1];
+            if (NS_IS_LOW_SURROGATE(low)) {
+                ch = SURROGATE_TO_UCS4(ch, low);
+                scriptLimit += 1;
             }
-            sc = MOZ_SCRIPT_COMMON;
-            pairIndex = -1;
-        } else {
-            /* decode UTF-16 (may be surrogate pair) */
-            if (NS_IS_HIGH_SURROGATE(ch) && scriptLimit < textLength - 1) {
-                PRUint32 low = textPtr[scriptLimit + 1];
-                if (NS_IS_LOW_SURROGATE(low)) {
-                    ch = SURROGATE_TO_UCS4(ch, low);
-                    scriptLimit += 1;
-                }
-            }
+        }
 
-            sc = mozilla::unicode::GetScriptCode(ch);
+        // Get the nsCharProps2 record for the current character,
+        // so we can read the script and (if needed) the gen category
+        // without needing to do two multi-level lookups.
+        // NOTE that this means we're relying on an implementation detail
+        // of the nsUnicodeProperties tables, and might have to revise this
+        // if the nsCharProps records used there are modified in future.
+        const nsCharProps2& charProps = GetCharProps2(ch);
 
-            pairIndex = getPairIndex(ch);
+        // Initialize gc to UNASSIGNED; we'll only set it to the true GC
+        // if the character has script=COMMON, otherwise we don't care.
+        PRUint8 gc = HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED;
 
+        sc = charProps.mScriptCode;
+        if (sc == MOZ_SCRIPT_COMMON) {
             /*
              * Paired character handling:
              *
              * if it's an open character, push it onto the stack.
              * if it's a close character, find the matching open on the
              * stack, and use that script code. Any non-matching open
-             * characters above it on the stack will be poped.
+             * characters above it on the stack will be popped.
+             *
+             * We only do this if the script is COMMON; for chars with
+             * specific script assignments, we just use them as-is.
              */
-            if (pairIndex >= 0) {
-                if ((pairIndex & 1) == 0) {
-                    push(pairIndex, scriptCode);
-                } else {
-                    PRInt32 pi = pairIndex & ~1;
+            gc = charProps.mCategory;
+            if (gc == HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION) {
+                PRUint32 endPairChar = mozilla::unicode::GetMirroredChar(ch);
+                if (endPairChar != ch) {
+                    push(endPairChar, scriptCode);
+                }
+            } else if (gc == HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION &&
+                HasMirroredChar(ch))
+            {
+                while (STACK_IS_NOT_EMPTY() && TOP().endPairChar != ch) {
+                    pop();
+                }
 
-                    while (STACK_IS_NOT_EMPTY() && TOP().pairIndex != pi) {
-                        pop();
-                    }
-
-                    if (STACK_IS_NOT_EMPTY()) {
-                        sc = TOP().scriptCode;
-                    }
+                if (STACK_IS_NOT_EMPTY()) {
+                    sc = TOP().scriptCode;
                 }
             }
         }
 
-        if (sameScript(scriptCode, sc)) {
+        if (SameScript(scriptCode, sc)) {
             if (scriptCode <= MOZ_SCRIPT_INHERITED &&
                 sc > MOZ_SCRIPT_INHERITED)
             {
                 scriptCode = sc;
                 fixup(scriptCode);
             }
 
             /*
              * if this character is a close paired character,
              * pop the matching open character from the stack
              */
-            if (pairIndex >= 0 && (pairIndex & 1) != 0) {
+            if (gc == HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION &&
+                HasMirroredChar(ch)) {
                 pop();
             }
         } else {
             /*
              * reset scriptLimit in case it was advanced during reading a
              * multiple-code-unit character
              */
             scriptLimit = startOfChar;
--- a/gfx/thebes/gfxScriptItemizer.h
+++ b/gfx/thebes/gfxScriptItemizer.h
@@ -72,23 +72,23 @@ protected:
         scriptStart = 0;
         scriptLimit = 0;
         scriptCode  = MOZ_SCRIPT_INVALID;
         parenSP     = -1;
         pushCount   =  0;
         fixupCount  =  0;
     }
 
-    void push(PRInt32 pairIndex, PRInt32 scriptCode);
+    void push(PRUint32 endPairChar, PRInt32 scriptCode);
     void pop();
     void fixup(PRInt32 scriptCode);
 
     struct ParenStackEntry {
-        PRInt32 pairIndex;
-        PRInt32 scriptCode;
+        PRUint32 endPairChar;
+        PRInt32  scriptCode;
     };
 
     const PRUnichar *textPtr;
     PRUint32 textLength;
 
     PRUint32 scriptStart;
     PRUint32 scriptLimit;
     PRInt32  scriptCode;
--- a/intl/unicharutil/tools/genUnicodePropertyData.pl
+++ b/intl/unicharutil/tools/genUnicodePropertyData.pl
@@ -617,16 +617,17 @@ print HEADER <<__END;
 $versionInfo
 
  *
  * * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
  */
 
 #ifndef NS_UNICODE_SCRIPT_CODES
 #define NS_UNICODE_SCRIPT_CODES
+
 __END
 
 print DATA_TABLES "static const PRUint32 sScriptCodeToTag[] = {\n";
 for (my $i = 0; $i < scalar @scriptCodeToTag; ++$i) {
   printf DATA_TABLES "  HB_TAG(%s)", $scriptCodeToTag[$i];
   print DATA_TABLES $i < $#scriptCodeToTag ? ",\n" : "\n";
 }
 print DATA_TABLES "};\n\n";
@@ -635,34 +636,38 @@ our $totalData = 0;
 
 print DATA_TABLES "static const PRInt16 sMirrorOffsets[] = {\n";
 for (my $i = 0; $i < scalar @offsets; ++$i) {
     printf DATA_TABLES "  $offsets[$i]";
     print DATA_TABLES $i < $#offsets ? ",\n" : "\n";
 }
 print DATA_TABLES "};\n\n";
 
+print HEADER "#pragma pack(1)\n\n";
+
 sub sprintCharProps1
 {
   my $usv = shift;
   return sprintf("{%d,%d,%d}, ", $mirror[$usv], $hangul[$usv], $combining[$usv]);
 }
-&genTables("CharProp1", "struct nsCharProps1 {\n  unsigned char  mMirrorOffsetIndex:5;\n  unsigned char mHangulType:3;\n  unsigned char mCombiningClass:8;\n};",
+&genTables("CharProp1", "struct nsCharProps1 {\n  unsigned char mMirrorOffsetIndex:5;\n  unsigned char mHangulType:3;\n  unsigned char mCombiningClass:8;\n};",
            "nsCharProps1", 11, 5, \&sprintCharProps1, 1, 2, 1);
 
 sub sprintCharProps2
 {
   my $usv = shift;
   return sprintf("{%d,%d,%d,%d,%d,%d},",
                  $script[$usv], $eaw[$usv], $category[$usv],
                  $bidicategory[$usv], $xidmod[$usv], $numericvalue[$usv]);
 }
 &genTables("CharProp2", "struct nsCharProps2 {\n  unsigned char mScriptCode:8;\n  unsigned char mEAW:3;\n  unsigned char mCategory:5;\n  unsigned char mBidiCategory:5;\n  unsigned char mXidmod:4;\n  signed char mNumericValue:5;\n  unsigned char mHanVariant:2;\n};",
            "nsCharProps2", 11, 5, \&sprintCharProps2, 16, 4, 1);
 
+print HEADER "#pragma pack()\n\n";
+
 sub sprintHanVariants
 {
   my $baseUsv = shift;
   my $varShift = 0;
   my $val = 0;
   while ($varShift < 8) {
     $val |= $hanVariant[$baseUsv++] << $varShift;
     $varShift += 2;
--- a/intl/unicharutil/util/nsUnicodeProperties.cpp
+++ b/intl/unicharutil/util/nsUnicodeProperties.cpp
@@ -11,58 +11,61 @@
 #include "mozilla/Util.h"
 #include "nsMemory.h"
 #include "nsCharTraits.h"
 
 #define UNICODE_BMP_LIMIT 0x10000
 #define UNICODE_LIMIT     0x110000
 
 
-nsCharProps1
+const nsCharProps1&
 GetCharProps1(PRUint32 aCh)
 {
     if (aCh < UNICODE_BMP_LIMIT) {
         return sCharProp1Values[sCharProp1Pages[0][aCh >> kCharProp1CharBits]]
                                [aCh & ((1 << kCharProp1CharBits) - 1)];
     }
     if (aCh < (kCharProp1MaxPlane + 1) * 0x10000) {
         return sCharProp1Values[sCharProp1Pages[sCharProp1Planes[(aCh >> 16) - 1]]
                                                [(aCh & 0xffff) >> kCharProp1CharBits]]
                                [aCh & ((1 << kCharProp1CharBits) - 1)];
     }
 
     // Default values for unassigned
-    nsCharProps1 undefined = {0,       // Index to mirrored char offsets
-                              0,       // Hangul Syllable type
-                              0};      // Combining class
+    static const nsCharProps1 undefined = {
+        0,       // Index to mirrored char offsets
+        0,       // Hangul Syllable type
+        0        // Combining class
+    };
     return undefined;
 }
 
-nsCharProps2
+const nsCharProps2&
 GetCharProps2(PRUint32 aCh)
 {
     if (aCh < UNICODE_BMP_LIMIT) {
         return sCharProp2Values[sCharProp2Pages[0][aCh >> kCharProp2CharBits]]
                               [aCh & ((1 << kCharProp2CharBits) - 1)];
     }
     if (aCh < (kCharProp2MaxPlane + 1) * 0x10000) {
         return sCharProp2Values[sCharProp2Pages[sCharProp2Planes[(aCh >> 16) - 1]]
                                                [(aCh & 0xffff) >> kCharProp2CharBits]]
                                [aCh & ((1 << kCharProp2CharBits) - 1)];
     }
 
     NS_NOTREACHED("Getting CharProps for codepoint outside Unicode range");
     // Default values for unassigned
-    nsCharProps2 undefined = {
+    static const nsCharProps2 undefined = {
         MOZ_SCRIPT_UNKNOWN,                      // Script code
         0,                                       // East Asian Width
         HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED,  // General Category
         eCharType_LeftToRight,                   // Bidi Category
         mozilla::unicode::XIDMOD_NOT_CHARS,      // Xidmod
-        -1                                       // Numeric Value
+        -1,                                      // Numeric Value
+        mozilla::unicode::HVT_NotHan             // Han variant
     };
     return undefined;
 }
 
 namespace mozilla {
 
 namespace unicode {
 
--- a/intl/unicharutil/util/nsUnicodeProperties.h
+++ b/intl/unicharutil/util/nsUnicodeProperties.h
@@ -6,18 +6,18 @@
 #ifndef NS_UNICODEPROPERTIES_H
 #define NS_UNICODEPROPERTIES_H
 
 #include "prtypes.h"
 #include "nsBidiUtils.h"
 #include "nsIUGenCategory.h"
 #include "nsUnicodeScriptCodes.h"
 
-nsCharProps1 GetCharProps1(PRUint32 aCh);
-nsCharProps2 GetCharProps2(PRUint32 aCh);
+const nsCharProps1& GetCharProps1(PRUint32 aCh);
+const nsCharProps2& GetCharProps2(PRUint32 aCh);
 
 namespace mozilla {
 
 namespace unicode {
 
 extern nsIUGenCategory::nsUGenCategory sDetailedToGeneralCategory[];
 
 PRUint32 GetMirroredChar(PRUint32 aCh);
--- a/intl/unicharutil/util/nsUnicodePropertyData.cpp
+++ b/intl/unicharutil/util/nsUnicodePropertyData.cpp
@@ -1,16 +1,22 @@
 
-/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*-
- * This Source Code Form is subject to the terms of the Mozilla Public
+/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 /*
- * Created on Mon Apr 23 20:03:29 2012 from UCD data files with version info:
+ * Derived from the Unicode Character Database by genUnicodePropertyData.pl
+ *
+ * For Unicode terms of use, see http://www.unicode.org/terms_of_use.html
+ */
+
+/*
+ * Created on Mon Jun 11 21:04:54 2012 from UCD data files with version info:
  *
 
 # Date: 2012-01-26, 22:03:00 GMT [KW]
 #
 # Unicode Character Database
 # Copyright (c) 1991-2012 Unicode, Inc.
 # For terms of use, see http://www.unicode.org/terms_of_use.html
 #
--- a/intl/unicharutil/util/nsUnicodeScriptCodes.h
+++ b/intl/unicharutil/util/nsUnicodeScriptCodes.h
@@ -1,16 +1,22 @@
 
-/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*-
- * This Source Code Form is subject to the terms of the Mozilla Public
+/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 /*
- * Created on Mon Apr 23 20:03:29 2012 from UCD data files with version info:
+ * Derived from the Unicode Character Database by genUnicodePropertyData.pl
+ *
+ * For Unicode terms of use, see http://www.unicode.org/terms_of_use.html
+ */
+
+/*
+ * Created on Mon Jun 11 21:04:54 2012 from UCD data files with version info:
  *
 
 # Date: 2012-01-26, 22:03:00 GMT [KW]
 #
 # Unicode Character Database
 # Copyright (c) 1991-2012 Unicode, Inc.
 # For terms of use, see http://www.unicode.org/terms_of_use.html
 #
@@ -43,32 +49,37 @@ for the Unicode Character Database (UCD)
 # Date: 2011-08-08 22:10:53 GMT [JHJ]
 
  *
  * * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
  */
 
 #ifndef NS_UNICODE_SCRIPT_CODES
 #define NS_UNICODE_SCRIPT_CODES
+
+#pragma pack(1)
+
 struct nsCharProps1 {
-  unsigned char  mMirrorOffsetIndex:5;
+  unsigned char mMirrorOffsetIndex:5;
   unsigned char mHangulType:3;
   unsigned char mCombiningClass:8;
 };
 
 struct nsCharProps2 {
   unsigned char mScriptCode:8;
   unsigned char mEAW:3;
   unsigned char mCategory:5;
   unsigned char mBidiCategory:5;
   unsigned char mXidmod:4;
   signed char mNumericValue:5;
   unsigned char mHanVariant:2;
 };
 
+#pragma pack()
+
 enum {
   MOZ_SCRIPT_COMMON = 0,
   MOZ_SCRIPT_INHERITED = 1,
   MOZ_SCRIPT_ARABIC = 2,
   MOZ_SCRIPT_ARMENIAN = 3,
   MOZ_SCRIPT_BENGALI = 4,
   MOZ_SCRIPT_BOPOMOFO = 5,
   MOZ_SCRIPT_CHEROKEE = 6,