Bug 487900 - Enable SSE2 code for x86_64 in qcms. r=jmuizelaar
author Makoto Kato <m_kato@ga2.so-net.ne.jp>
Wed, 15 Apr 2009 22:14:03 +0900
changeset 27354 9a76e03c8ee61a89d50ddc928407e5956657006f
parent 27353 58460d86223d1def37dadeedd301a839ef31d57c
child 27355 fdcb163df592c4eef6b2ba9a7ac0c5baeb4fce64
push id 6512
push user m_kato@ga2.so-net.ne.jp
push date Wed, 15 Apr 2009 13:14:42 +0000
treeherder mozilla-central@9a76e03c8ee6 [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers jmuizelaar
bugs 487900
milestone 1.9.2a1pre
Bug 487900 - Enable SSE2 code for x86_64 in qcms. r=jmuizelaar
gfx/qcms/transform.c
--- a/gfx/qcms/transform.c
+++ b/gfx/qcms/transform.c
@@ -20,17 +20,17 @@
 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 
 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #include <stdlib.h>
 #include <math.h>
 #include <assert.h>
 #include "qcmsint.h"
 
-#if defined(_M_IX86) || defined(__i386__)
+#if defined(_M_IX86) || defined(__i386__) || defined(__x86_64__) || defined(_M_AMD64)
 #define X86
 #endif
 
 //XXX: could use a bettername
 typedef uint16_t uint16_fract_t;
 
 /* value must be a value between 0 and 1 */
 //XXX: is the above a good restriction to have?
@@ -755,16 +755,21 @@ void qcms_transform_data_rgb_out_lut_sse
 		_mm_store_si128((__m128i*)input, out);
 
 		*dest++ = transform->output_table_r->data[output[0]];
 		*dest++ = transform->output_table_g->data[output[1]];
 		*dest++ = transform->output_table_b->data[output[2]];
 	}
 }
 #endif
+
+#if defined(_MSC_VER) && defined(_M_AMD64)
+#include <emmintrin.h>
+#endif
+
 static void qcms_transform_data_rgb_out_lut_sse(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
 {
 	int i;
 	float (*mat)[4] = transform->matrix;
         char input_back[32];
 	/* Ensure we have a buffer that's 16 byte aligned regardless of the original
 	 * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
 	 * because they don't work on stack variables. gcc 4.4 does do the right thing 
@@ -822,17 +827,17 @@ static void qcms_transform_data_rgb_out_
                       : "r" (mat), "r" (clampMax), "r" (&floatScale), "r" (input)
                       : "memory"
 /* older versions of gcc don't know about these registers so only include them as constraints
    if gcc knows about them */
 #ifdef __SSE2__
                         , "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
 #endif
                       );
-#else
+#elif defined(_MSC_VER) && defined(_M_IX86)
                 __asm {
                       mov      eax, mat
                       mov      ecx, clampMax
                       mov      edx, floatScaleAddr
 		      mov      ebx, input
 
                       movaps   xmm1, [eax]
                       movaps   xmm2, [eax + 16]
@@ -858,16 +863,43 @@ static void qcms_transform_data_rgb_out_
                       xorps    xmm6, xmm6
                       maxps    xmm1, xmm6
                       movss    xmm5, [edx]
                       shufps   xmm5, xmm5, 0
                       mulps    xmm1, xmm5
                       cvtps2dq xmm1, xmm1
                       movdqa   [ebx], xmm1
                 }
+#elif defined(_MSC_VER) && defined(_M_AMD64)
+                {
+                        __m128 xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm7;
+                        /* SSE2 intrinsics version of the x86 inline asm above (MSVC has no inline asm on x64). */
+                        xmm1 = _mm_load_ps((const float *)mat);       /* mat[0] */
+                        xmm2 = _mm_load_ps((const float *)mat + 4);   /* mat[1] */
+                        xmm3 = _mm_load_ps((const float *)mat + 8);   /* mat[2] */
+                        xmm0 = _mm_load_ps((const float *)input);
+                        /* Multiply mat[i] by the i-th input float broadcast to all four lanes. */
+                        xmm1 = _mm_mul_ps(xmm1, _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(0,0,0,0)));
+                        xmm2 = _mm_mul_ps(xmm2, _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(1,1,1,1)));
+                        xmm3 = _mm_mul_ps(xmm3, _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(2,2,2,2)));
+                        /* Sum the three scaled vectors. */
+                        xmm1 = _mm_add_ps(xmm1, _mm_add_ps(xmm2, xmm3));
+                        /* Clamp to [0, *clampMax], scale by floatScale, convert to int32, store over input. */
+                        xmm7 = _mm_load_ss(clampMax);
+                        xmm7 = _mm_shuffle_ps(xmm7, xmm7, _MM_SHUFFLE(0,0,0,0));
+                        xmm1 = _mm_min_ps(xmm1, xmm7);
+                        xmm6 = _mm_setzero_ps(); /* explicit zero: never read xmm6 while uninitialized */
+                        xmm1 = _mm_max_ps(xmm1, xmm6);
+                        xmm5 = _mm_load_ss(&floatScale);
+                        xmm5 = _mm_shuffle_ps(xmm5, xmm5, _MM_SHUFFLE(0,0,0,0));
+                        xmm1 = _mm_mul_ps(xmm1, xmm5);
+                        _mm_store_si128((__m128i*)input, _mm_cvtps_epi32(xmm1));
+                }
+#else
+#error "Unknown platform"
 #endif
 
 		*dest++ = transform->output_table_r->data[output[0]];
 		*dest++ = transform->output_table_g->data[output[1]];
 		*dest++ = transform->output_table_b->data[output[2]];
 	}
 }
 
@@ -931,17 +963,17 @@ static void qcms_transform_data_rgba_out
                       : "r" (mat), "r" (clampMax), "r" (&floatScale), "r" (input)
                       : "memory"
 /* older versions of gcc don't know about these registers so only include them as constraints
    if gcc knows about them */
 #ifdef __SSE2__
                         , "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
 #endif
                       );
-#else
+#elif defined(_MSC_VER) && defined(_M_IX86)
                 __asm {
                       mov      eax, mat
                       mov      ecx, clampMax
                       mov      edx, floatScaleAddr
 		      mov      ebx, input
 
                       movaps   xmm1, [eax]
                       movaps   xmm2, [eax + 16]
@@ -967,16 +999,43 @@ static void qcms_transform_data_rgba_out
                       xorps    xmm6, xmm6
                       maxps    xmm1, xmm6
                       movss    xmm5, [edx]
                       shufps   xmm5, xmm5, 0
                       mulps    xmm1, xmm5
                       cvtps2dq xmm1, xmm1
                       movdqa   [ebx], xmm1
                 }
+#elif defined(_MSC_VER) && defined(_M_AMD64)
+                {
+                        __m128 xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm7;
+                        /* SSE2 intrinsics version of the x86 inline asm above (MSVC has no inline asm on x64). */
+                        xmm1 = _mm_load_ps((const float *)mat);       /* first 4 floats of mat */
+                        xmm2 = _mm_load_ps((const float *)mat + 4);   /* next 4 floats */
+                        xmm3 = _mm_load_ps((const float *)mat + 8);   /* next 4 floats */
+                        xmm0 = _mm_load_ps((const float *)input);
+                        /* Multiply each matrix vector by the matching input float broadcast to all lanes. */
+                        xmm1 = _mm_mul_ps(xmm1, _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(0,0,0,0)));
+                        xmm2 = _mm_mul_ps(xmm2, _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(1,1,1,1)));
+                        xmm3 = _mm_mul_ps(xmm3, _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(2,2,2,2)));
+                        /* Sum the three scaled vectors. */
+                        xmm1 = _mm_add_ps(xmm1, _mm_add_ps(xmm2, xmm3));
+                        /* Clamp to [0, *clampMax], scale by floatScale, convert to int32, store over input. */
+                        xmm7 = _mm_load_ss(clampMax);
+                        xmm7 = _mm_shuffle_ps(xmm7, xmm7, _MM_SHUFFLE(0,0,0,0));
+                        xmm1 = _mm_min_ps(xmm1, xmm7);
+                        xmm6 = _mm_setzero_ps(); /* explicit zero: never read xmm6 while uninitialized */
+                        xmm1 = _mm_max_ps(xmm1, xmm6);
+                        xmm5 = _mm_load_ss(&floatScale);
+                        xmm5 = _mm_shuffle_ps(xmm5, xmm5, _MM_SHUFFLE(0,0,0,0));
+                        xmm1 = _mm_mul_ps(xmm1, xmm5);
+                        _mm_store_si128((__m128i*)input, _mm_cvtps_epi32(xmm1));
+                }
+#else
+#error "Unknown platform"
 #endif
 
 		*dest++ = transform->output_table_r->data[output[0]];
 		*dest++ = transform->output_table_g->data[output[1]];
 		*dest++ = transform->output_table_b->data[output[2]];
 		*dest++ = alpha;
 	}
 }
@@ -1318,17 +1377,19 @@ static void cpuid(uint32_t fxn, uint32_t
 }
 #endif
 
 // -------------------------Runtime SSE2 Detection-----------------------------
 
 #define SSE2_EDX_MASK (1UL << 26)
 static qcms_bool sse2_available(void)
 {
-#ifdef HAS_CPUID
+#if defined(__x86_64__) || defined(_M_AMD64)
+       return true;
+#elif defined(HAS_CPUID)
        static int has_sse2 = -1;
        uint32_t a, b, c, d;
        uint32_t function = 0x00000001;
 
        if (has_sse2 == -1) {
               has_sse2 = 0;
 	      cpuid(function, &a, &b, &c, &d);
               if (d & SSE2_EDX_MASK)