[JAEGER] Use SSE4.1 when available to load doubles faster. b=582785, r=dvander.
authorAlex Miller <amiller@mozilla.com>
Thu, 12 Aug 2010 21:43:39 -0700
changeset 53412 a16f3ba4b87ce230ea6bdb028f549ab9178257f9
parent 53411 dd5ba3af39e8e23888094fa12362d72c1272b833
child 53413 2808aa75ff1f230a70504263b8a6c331a4060b4b
push idunknown
push userunknown
push dateunknown
reviewersdvander
bugs582785
milestone2.0b4pre
[JAEGER] Use SSE4.1 when available to load doubles faster. b=582785, r=dvander.
js/src/assembler/assembler/MacroAssemblerX86Common.cpp
js/src/assembler/assembler/MacroAssemblerX86Common.h
js/src/assembler/assembler/X86Assembler.h
js/src/methodjit/BaseAssembler.h
--- a/js/src/assembler/assembler/MacroAssemblerX86Common.cpp
+++ b/js/src/assembler/assembler/MacroAssemblerX86Common.cpp
@@ -1,11 +1,51 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*-
+ * vim: set ts=4 sw=4 et tw=99:
+ *
+ * ***** BEGIN LICENSE BLOCK *****
+ * Version: MPL 1.1/GPL 2.0/LGPL 2.1
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * The Original Code is Mozilla SpiderMonkey JavaScript 1.9 code, released
+ * May 28, 2008.
+ *
+ * The Initial Developer of the Original Code is
+ *   Mozilla Corporation
+ *
+ * Contributor(s):
+ *   Alex Miller <amiller@mozilla.com>
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * either of the GNU General Public License Version 2 or later (the "GPL"),
+ * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+ * in which case the provisions of the GPL or the LGPL are applicable instead
+ * of those above. If you wish to allow use of your version of this file only
+ * under the terms of either the GPL or the LGPL, and not to allow others to
+ * use your version of this file under the terms of the MPL, indicate your
+ * decision by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL or the LGPL. If you do not delete
+ * the provisions above, a recipient may use your version of this file under
+ * the terms of any one of the MPL, the GPL or the LGPL.
+ *
+ * ***** END LICENSE BLOCK ***** */
+
 #include "assembler/wtf/Platform.h"
 
-#if WTF_CPU_X86 && !WTF_PLATFORM_MAC
+/* SSE checks only make sense on Intel platforms. */
+#if WTF_CPU_X86 || WTF_CPU_X86_64
 
 #include "MacroAssemblerX86Common.h"
 
 using namespace JSC;
+MacroAssemblerX86Common::SSECheckState MacroAssemblerX86Common::s_sseCheckState = NotCheckedSSE;
 
-MacroAssemblerX86Common::SSE2CheckState MacroAssemblerX86Common::s_sse2CheckState = NotCheckedSSE2;
+#endif /* WTF_CPU_X86 || WTF_CPU_X86_64 */
 
-#endif
--- a/js/src/assembler/assembler/MacroAssemblerX86Common.h
+++ b/js/src/assembler/assembler/MacroAssemblerX86Common.h
@@ -1069,89 +1069,126 @@ public:
         if (mask.m_value == -1)
             m_assembler.cmpl_im(0, address.offset, address.base);
         else
             m_assembler.testl_i32m(mask.m_value, address.offset, address.base);
         m_assembler.setCC_r(x86Condition(cond), dest);
         m_assembler.movzbl_rr(dest, dest);
     }
 
+    enum SSECheckState {
+        NotCheckedSSE = 0,
+        NoSSE = 1,
+        HasSSE2 = 2,
+        HasSSE4_1 = 3 // implies HasSSE2
+    };
+
+    static SSECheckState getSSEState()
+    {
+        if (s_sseCheckState == NotCheckedSSE) {
+            MacroAssemblerX86Common::setSSECheckState();
+        }
+        // Only check once.
+        ASSERT(s_sseCheckState != NotCheckedSSE);
+
+        return s_sseCheckState;
+    }
+
 protected:
     X86Assembler::Condition x86Condition(Condition cond)
     {
         return static_cast<X86Assembler::Condition>(cond);
     }
 
 private:
-    // Only MacroAssemblerX86 should be using the following method; SSE2 is always available on
-    // x86_64, and clients & subclasses of MacroAssembler should be using 'supportsFloatingPoint()'.
     friend class MacroAssemblerX86;
 
+    static SSECheckState s_sseCheckState;
+
+    static void setSSECheckState()
+    {
+        // Default the flags value to zero; if the compiler is
+        // not MSVC or GCC we will read this as SSE2 not present.
+        int flags_edx = 0;
+        int flags_ecx = 0;
+#if WTF_COMPILER_MSVC
+        _asm {
+            mov eax, 1 // cpuid function 1 gives us the standard feature set
+            cpuid;
+            mov flags_ecx, ecx;
+            mov flags_edx, edx;
+        }
+#elif WTF_COMPILER_GCC
+        asm (
+             "movl $0x1, %%eax;"
+             "pushl %%ebx;"
+             "cpuid;"
+             "popl %%ebx;"
+             "movl %%ecx, %0;"
+             "movl %%edx, %1;"
+             : "=g" (flags_ecx), "=g" (flags_edx)
+             :
+             : "%eax", "%ecx", "%edx"
+             );
+#endif
+        static const int SSE2FeatureBit = 1 << 26;
+        static const int SSE41FeatureBit = 1 << 19;
+        if (flags_ecx & SSE41FeatureBit)
+            s_sseCheckState = HasSSE4_1;
+        else if (flags_edx & SSE2FeatureBit)
+            s_sseCheckState = HasSSE2;
+        else
+            s_sseCheckState = NoSSE;
+    }
+
 #if WTF_CPU_X86
 #if WTF_PLATFORM_MAC
 
     // All X86 Macs are guaranteed to support at least SSE2,
     static bool isSSE2Present()
     {
         return true;
     }
 
 #else // PLATFORM(MAC)
 
-    enum SSE2CheckState {
-        NotCheckedSSE2,
-        HasSSE2,
-        NoSSE2
-    };
-
     static bool isSSE2Present()
     {
-        if (s_sse2CheckState == NotCheckedSSE2) {
-            // Default the flags value to zero; if the compiler is
-            // not MSVC or GCC we will read this as SSE2 not present.
-            int flags = 0;
-#if WTF_COMPILER_MSVC
-            _asm {
-                mov eax, 1 // cpuid function 1 gives us the standard feature set
-                cpuid;
-                mov flags, edx;
-            }
-#elif WTF_COMPILER_GCC
-            asm (
-                 "movl $0x1, %%eax;"
-                 "pushl %%ebx;"
-                 "cpuid;"
-                 "popl %%ebx;"
-                 "movl %%edx, %0;"
-                 : "=g" (flags)
-                 :
-                 : "%eax", "%ecx", "%edx"
-                 );
-#endif
-            static const int SSE2FeatureBit = 1 << 26;
-            s_sse2CheckState = (flags & SSE2FeatureBit) ? HasSSE2 : NoSSE2;
+        if (s_sseCheckState == NotCheckedSSE) {
+            setSSECheckState();
         }
         // Only check once.
-        ASSERT(s_sse2CheckState != NotCheckedSSE2);
+        ASSERT(s_sseCheckState != NotCheckedSSE);
 
-        return s_sse2CheckState == HasSSE2;
+        return s_sseCheckState >= HasSSE2;
     }
     
-    static SSE2CheckState s_sse2CheckState;
 
 #endif // PLATFORM(MAC)
 #elif !defined(NDEBUG) // CPU(X86)
 
     // On x86-64 we should never be checking for SSE2 in a non-debug build,
     // but non debug add this method to keep the asserts above happy.
     static bool isSSE2Present()
     {
         return true;
     }
 
 #endif
+
+    static bool isSSE41Present()
+    {
+        if (s_sseCheckState == NotCheckedSSE) {
+            setSSECheckState();
+        }
+        // Only check once.
+        ASSERT(s_sseCheckState != NotCheckedSSE);
+
+        return s_sseCheckState == HasSSE4_1;
+    }
+
 };
 
 } // namespace JSC
 
 #endif // ENABLE(ASSEMBLER)
 
 #endif // MacroAssemblerX86Common_h
--- a/js/src/assembler/assembler/X86Assembler.h
+++ b/js/src/assembler/assembler/X86Assembler.h
@@ -176,16 +176,17 @@ public:
 
 private:
     typedef enum {
         OP_ADD_EvGv                     = 0x01,
         OP_ADD_GvEv                     = 0x03,
         OP_OR_EvGv                      = 0x09,
         OP_OR_GvEv                      = 0x0B,
         OP_2BYTE_ESCAPE                 = 0x0F,
+        OP_3BYTE_ESCAPE                 = 0x3A,
         OP_AND_EvGv                     = 0x21,
         OP_AND_GvEv                     = 0x23,
         OP_SUB_EvGv                     = 0x29,
         OP_SUB_GvEv                     = 0x2B,
         PRE_PREDICT_BRANCH_NOT_TAKEN    = 0x2E,
         OP_XOR_EvGv                     = 0x31,
         OP_XOR_GvEv                     = 0x33,
         OP_CMP_EvGv                     = 0x39,
@@ -253,16 +254,20 @@ private:
         OP2_JCC_rel32       = 0x80,
         OP_SETCC            = 0x90,
         OP2_IMUL_GvEv       = 0xAF,
         OP2_MOVZX_GvEb      = 0xB6,
         OP2_MOVZX_GvEw      = 0xB7,
         OP2_PEXTRW_GdUdIb   = 0xC5
     } TwoByteOpcodeID;
 
+    typedef enum {
+        OP3_PINSRD_VsdWsd   = 0x22
+    } ThreeByteOpcodeID;
+
     TwoByteOpcodeID jccRel32(Condition cond)
     {
         return (TwoByteOpcodeID)(OP2_JCC_rel32 + cond);
     }
 
     TwoByteOpcodeID setccOpcode(Condition cond)
     {
         return (TwoByteOpcodeID)(OP_SETCC + cond);
@@ -2001,16 +2006,26 @@ public:
     {
         js::JaegerSpew(js::JSpew_Insns,
                        IPFX "sqrtsd     %s, %s\n", MAYBE_PAD,
                        nameFPReg(src), nameFPReg(dst));
         m_formatter.prefix(PRE_SSE_F2);
         m_formatter.twoByteOp(OP2_SQRTSD_VsdWsd, (RegisterID)dst, (RegisterID)src);
     }
 
+    void pinsrd_rr(RegisterID src, XMMRegisterID dst)
+    {
+        js::JaegerSpew(js::JSpew_Insns,
+                       IPFX "pinsrd     $1, %s, %s\n", MAYBE_PAD,
+                       nameIReg(src), nameFPReg(dst));
+        m_formatter.prefix(PRE_SSE_66);
+        m_formatter.threeByteOp(OP3_PINSRD_VsdWsd, (RegisterID)dst, (RegisterID)src);
+        m_formatter.immediate8(0x01); // the $1
+    }
+
     // Misc instructions:
 
     void int3()
     {
         js::JaegerSpew(js::JSpew_Insns, IPFX "int3\n", MAYBE_PAD);
         m_formatter.oneByteOp(OP_INT3);
     }
     
@@ -2356,16 +2371,26 @@ private:
         {
             m_buffer.ensureSpace(maxInstructionSize);
             m_buffer.putByteUnchecked(OP_2BYTE_ESCAPE);
             m_buffer.putByteUnchecked(opcode);
             memoryModRM(reg, address);
         }
 #endif
 
+        void threeByteOp(ThreeByteOpcodeID opcode, int reg, RegisterID rm)
+        {
+            m_buffer.ensureSpace(maxInstructionSize);
+            emitRexIfNeeded(reg, 0, rm);
+            m_buffer.putByteUnchecked(OP_2BYTE_ESCAPE);
+            m_buffer.putByteUnchecked(OP_3BYTE_ESCAPE);
+            m_buffer.putByteUnchecked(opcode);
+            registerModRM(reg, rm);
+        }
+
 #if WTF_CPU_X86_64
         // Quad-word-sized operands:
         //
         // Used to format 64-bit operantions, planting a REX.w prefix.
         // When planting d64 or f64 instructions, not requiring a REX.w prefix,
         // the normal (non-'64'-postfixed) formatters should be used.
 
         void oneByteOp64(OneByteOpcodeID opcode)
--- a/js/src/methodjit/BaseAssembler.h
+++ b/js/src/methodjit/BaseAssembler.h
@@ -32,16 +32,17 @@
  * under the terms of either the GPL or the LGPL, and not to allow others to
  * use your version of this file under the terms of the MPL, indicate your
  * decision by deleting the provisions above and replace them with the notice
  * and other provisions required by the GPL or the LGPL. If you do not delete
  * the provisions above, a recipient may use your version of this file under
  * the terms of any one of the MPL, the GPL or the LGPL.
  *
  * ***** END LICENSE BLOCK ***** */
+
 #if !defined jsjaeger_baseassembler_h__ && defined JS_METHODJIT
 #define jsjaeger_baseassembler_h__
 
 #include "jscntxt.h"
 #include "jstl.h"
 #include "assembler/assembler/MacroAssemblerCodeRef.h"
 #include "assembler/assembler/MacroAssembler.h"
 #include "assembler/assembler/RepatchBuffer.h"
@@ -176,19 +177,24 @@ static const JSC::MacroAssembler::Regist
 
 #ifdef JS_CPU_X86
     void idiv(RegisterID reg) {
         m_assembler.cdq();
         m_assembler.idivl_r(reg);
     }
 
     void fastLoadDouble(RegisterID lo, RegisterID hi, FPRegisterID fpReg) {
-        m_assembler.movd_rr(lo, fpReg);
-        m_assembler.movd_rr(hi, FPRegisters::Temp0);
-        m_assembler.unpcklps_rr(FPRegisters::Temp0, fpReg);
+        if (MacroAssemblerX86Common::getSSEState() >= HasSSE4_1) {
+            m_assembler.movd_rr(lo, fpReg);
+            m_assembler.pinsrd_rr(hi, fpReg);
+        } else {
+            m_assembler.movd_rr(lo, fpReg);
+            m_assembler.movd_rr(hi, FPRegisters::Temp0);
+            m_assembler.unpcklps_rr(FPRegisters::Temp0, fpReg);
+        }
     }
 #endif
 
     /*
      * Prepares for a stub call.
      */
     void * getCallTarget(void *fun) {
 #ifdef JS_CPU_ARM