Backed out changeset 6eec92f9276a from bug 416157 because the image tests need to be updated before it can be landed.
authorReed Loden <reed@reedloden.com>
Wed, 20 Aug 2008 02:03:01 -0500
changeset 17144 8fb5195b77ab7fb6207127db03260b8e7942f6e0
parent 17112 6eec92f9276a0fd425f5fd9419040859516e192b
child 17145 8ab229794bb6f8e64cede37755afba84a02fa50a
push id1441
push userreed@reedloden.com
push dateWed, 20 Aug 2008 07:04:39 +0000
treeherdermozilla-central@82d5560d2601 [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
bugs416157
milestone1.9.1a2pre
backs out6eec92f9276a0fd425f5fd9419040859516e192b
Backed out changeset 6eec92f9276a from bug 416157 because the image tests need to be updated before it can be landed.
gfx/thebes/public/gfxPlatform.h
jpeg/jconfig.h
jpeg/jdapimin.c
jpeg/jdcoefct.c
jpeg/jddctmgr.c
jpeg/jidctfst.c
jpeg/jmemmgr.c
jpeg/jmorecfg.h
modules/libpr0n/decoders/jpeg/nsJPEGDecoder.cpp
--- a/gfx/thebes/public/gfxPlatform.h
+++ b/gfx/thebes/public/gfxPlatform.h
@@ -44,42 +44,16 @@
 
 #include "gfxTypes.h"
 #include "gfxASurface.h"
 
 #ifdef XP_OS2
 #undef OS2EMX_PLAIN_CHAR
 #endif
 
-/*
- * SSE2 Support Defines
- *
- * GFX_SSE2_POSSIBLE is defined if SSE2 support may be available.
- * On Mac/OSX Intel, SSE2 is always available. On Windows, SSE2
- * support is possible if the chip supports it so GFX_SSE2_POSSIBLE
- * is defined for Windows.
- *
- * GFX_SSE2_AVAILABLE indicates whether runtime SSE2 support is
- * available. On Mac/OSX Intel, it is defined to 1 (TRUE). On Windows
- * __sse2_available indicates whether support is available or not.
-*/
-
-#if defined(XP_WIN32) && defined(_M_IX86) && !defined(__GNUC__)
-#define GFX_SSE2_POSSIBLE
-#define GFX_SSE2_AVAILABLE __sse2_available
-#define GFX_SSE2_ALIGN __declspec(align(16))
-extern "C" int __sse2_available;
-#endif
-
-#if defined(__GNUC__) && defined(__i386__) && defined(XP_MACOSX)
-#define GFX_SSE2_POSSIBLE
-#define GFX_SSE2_AVAILABLE 1
-#define GFX_SSE2_ALIGN __attribute__ ((aligned (16)))
-#endif
-
 typedef void* cmsHPROFILE;
 typedef void* cmsHTRANSFORM;
 
 class gfxImageSurface;
 class gfxFont;
 class gfxFontGroup;
 struct gfxFontStyle;
 
--- a/jpeg/jconfig.h
+++ b/jpeg/jconfig.h
@@ -90,8 +90,18 @@
 
 #undef TWO_FILE_COMMANDLINE
 #undef NEED_SIGNAL_CATCHER
 #undef DONT_USE_B_MODE
 #undef PROGRESS_REPORT
 
 #endif /* JPEG_CJPEG_DJPEG */
 
+/* SSE* alignment support - only use on platforms that support declspec and __attribute__ */
+
+#if defined(XP_WIN32) && defined(_M_IX86) && !defined(__GNUC__)
+#define ALIGN16_const_vector_short(name) __declspec(align(16)) const short name[8]
+#define ALIGN16_const_vector_uchar(name) __declspec(align(16)) const unsigned char name[16]
+#else
+#define ALIGN16_const_vector_short(name) const short name[8] __attribute__ ((aligned (16)))
+#define ALIGN16_const_vector_uchar(name) const unsigned char name[16] __attribute__ ((aligned (16)))
+#endif /* ! XP_WIN32 && _M_IX86 && !__GNUC */
+
--- a/jpeg/jdapimin.c
+++ b/jpeg/jdapimin.c
@@ -62,20 +62,24 @@ NOT_SUPPORTED:
   CPUInfo[3] = my_edx;
 }
 #endif /* _MSC_VER >= 1400 */
 
 int MMXAvailable;
 static int mmxsupport();
 #endif
 
-#ifdef HAVE_SSE2_INTEL_MNEMONICS
+#ifdef HAVE_SSE2_INTRINSICS
 int SSE2Available = 0;
+#ifdef HAVE_SSE2_INTEL_MNEMONICS
 static int sse2support();
+#else
+static int sse2supportGCC();
 #endif /* HAVE_SSE2_INTEL_MNEMONICS */
+#endif /* HAVE_SSE2_INTRINSICS */
 
 
 /*
  * Initialization of a JPEG decompression object.
  * The error manager must already be set up (in case memory manager fails).
  */
 
 GLOBAL(void)
@@ -83,27 +87,37 @@ jpeg_CreateDecompress (j_decompress_ptr 
 {
   int i;
 
 #ifdef HAVE_MMX_INTEL_MNEMONICS
   static int cpuidDetected = 0;
 
   if(!cpuidDetected)
   {
-        MMXAvailable = mmxsupport();
+	MMXAvailable = mmxsupport();
 
 #ifdef HAVE_SSE2_INTEL_MNEMONICS
 	/* only do the sse2 support check if mmx is supported (so
 	   we know the processor supports cpuid) */
 	if (MMXAvailable)
 	    SSE2Available = sse2support();
-#endif /* HAVE_SSE2_INTEL_MNEMONICS */
+#endif
 
 	cpuidDetected = 1;
   }
+#else
+#ifdef HAVE_SSE2_INTRINSICS
+  static int cpuidDetected = 0;
+
+  if(!cpuidDetected) {
+    SSE2Available = sse2supportGCC();
+    cpuidDetected = 1;
+  }
+
+#endif /* HAVE_SSE2_INTRINSICS */
 #endif /* HAVE_MMX_INTEL_MNEMONICS */
 
   /* For debugging purposes, zero the whole master structure.
    * But error manager pointer is already there, so save and restore it.
    */
 
   /* Guard against version mismatches between library and caller. */
   cinfo->mem = NULL;		/* so jpeg_destroy knows mem mgr not called */
@@ -474,23 +488,43 @@ static int mmxsupport()
   int CPUInfo[4];
 
   __cpuid(CPUInfo, 1);
   if (CPUInfo[3] & (0x1 << 23))
     return 1;
   else
     return 0;
 }
-#endif /* HAVE_MMX_INTEL_MNEMONICS */
+#endif
 
 #ifdef HAVE_SSE2_INTEL_MNEMONICS
 static int sse2support()
 {
   int CPUInfo[4];
 
   __cpuid(CPUInfo, 1);
   if (CPUInfo[3] & (0x1 << 26))
     return 1;
   else
     return 2;
 }
+#else
+#ifdef HAVE_SSE2_INTRINSICS
+static int sse2supportGCC()
+{
+
+  /* Mac Intel started with Core Duo chips which have SSE2 Support */
+
+#if defined(__GNUC__) && defined(__i386__)
+#if defined(XP_MACOSX)
+  return 1;
+#endif /* XP_MACOSX */
+#endif /* GNUC && i386 */
+
+  /* Add checking for SSE2 support for other platforms here */
+
+  /* We don't have SSE2 intrinsics support */
+
+  return 2;
+}
+#endif /* HAVE_SSE2_INTRINSICS */
 #endif /* HAVE_SSE2_INTEL_MNEMONICS */
 
--- a/jpeg/jdcoefct.c
+++ b/jpeg/jdcoefct.c
@@ -40,21 +40,21 @@ typedef struct {
    * We allocate a workspace of D_MAX_BLOCKS_IN_MCU coefficient blocks,
    * and let the entropy decoder write into that workspace each time.
    * (On 80x86, the workspace is FAR even though it's not really very big;
    * this is to keep the module interfaces unchanged when a large coefficient
    * buffer is necessary.)
    * In multi-pass modes, this array points to the current MCU's blocks
    * within the virtual arrays; it is used only by the input side.
    */
-  SSE2_ALIGN JBLOCKROW MCU_buffer[D_MAX_BLOCKS_IN_MCU];
+  JBLOCKROW MCU_buffer[D_MAX_BLOCKS_IN_MCU];
 
 #ifdef D_MULTISCAN_FILES_SUPPORTED
   /* In multi-pass modes, we need a virtual block array for each component. */
-  SSE2_ALIGN jvirt_barray_ptr whole_image[MAX_COMPONENTS];
+  jvirt_barray_ptr whole_image[MAX_COMPONENTS];
 #endif
 
 #ifdef BLOCK_SMOOTHING_SUPPORTED
   /* When doing block smoothing, we latch coefficient Al values here */
   int * coef_bits_latch;
 #define SAVED_COEFS  6		/* we save coef_bits[0..5] */
 #endif
 } my_coef_controller;
@@ -466,17 +466,17 @@ decompress_smooth_data (j_decompress_ptr
   int ci, block_row, block_rows, access_rows;
   JBLOCKARRAY buffer;
   JBLOCKROW buffer_ptr, prev_block_row, next_block_row;
   JSAMPARRAY output_ptr;
   JDIMENSION output_col;
   jpeg_component_info *compptr;
   inverse_DCT_method_ptr inverse_DCT;
   boolean first_row, last_row;
-  SSE2_ALIGN JBLOCK workspace;
+  JBLOCK workspace;
   int *coef_bits;
   JQUANT_TBL *quanttbl;
   INT32 Q00,Q01,Q02,Q10,Q11,Q20, num;
   int DC1,DC2,DC3,DC4,DC5,DC6,DC7,DC8,DC9;
   int Al, pred;
 
   /* Force some input to be done if we are getting ahead of the input. */
   while (cinfo->input_scan_number <= cinfo->output_scan_number &&
@@ -718,31 +718,19 @@ jinit_d_coef_controller (j_decompress_pt
 #else
     ERREXIT(cinfo, JERR_NOT_COMPILED);
 #endif
   } else {
     /* We only need a single-MCU buffer. */
     JBLOCKROW buffer;
     int i;
 
-#ifdef HAVE_SSE2_INTRINSICS
-    /* Align on 16-byte boundaries for the SSE2 code. */
-    /* This won't work for 64-bit systems so conditional code it. */
-
-    PRUptrdiff buffer_shift;
-
-    buffer_shift = (PRUptrdiff)
-      (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                  D_MAX_BLOCKS_IN_MCU * SIZEOF(JBLOCK) + 15);
-    buffer = (JBLOCKROW) ((buffer_shift + 15) & ~15);
-#else
     buffer = (JBLOCKROW)
       (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
 				  D_MAX_BLOCKS_IN_MCU * SIZEOF(JBLOCK));
-#endif /* HAVE_SSE2_INTRINSICS */
     for (i = 0; i < D_MAX_BLOCKS_IN_MCU; i++) {
       coef->MCU_buffer[i] = buffer + i;
     }
     coef->pub.consume_data = dummy_consume_data;
     coef->pub.decompress_data = decompress_onepass;
     coef->pub.coef_arrays = NULL; /* flag for no virtual arrays */
   }
 }
--- a/jpeg/jddctmgr.c
+++ b/jpeg/jddctmgr.c
@@ -14,17 +14,17 @@
  * dequantization as well as the IDCT proper.  This module sets up the
  * dequantization multiplier table needed by the IDCT routine.
  */
 
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
 #include "jdct.h"		/* Private declarations for DCT subsystem */
-#ifdef HAVE_SSE2_INTEL_MNEMONICS
+#ifdef HAVE_SSE2_INTRINSICS
 extern int SSE2Available;
 #endif
 
 /*
  * The decompressor input side (jdinput.c) saves away the appropriate
  * quantization table for each component at the start of the first scan
  * involving that component.  (This is necessary in order to correctly
  * decode files that reuse Q-table slots.)
@@ -75,34 +75,24 @@ typedef union {
 #ifdef DCT_ISLOW_SUPPORTED
 #define PROVIDE_ISLOW_TABLES
 #else
 #ifdef IDCT_SCALING_SUPPORTED
 #define PROVIDE_ISLOW_TABLES
 #endif
 #endif
 
-#ifdef HAVE_SSE2_INTEL_MNEMONICS
 GLOBAL(void)
 jpeg_idct_islow_sse2 (
 	j_decompress_ptr cinfo, 
 	jpeg_component_info * compptr,
 	JCOEFPTR coef_block,
 	JSAMPARRAY output_buf, 
 	JDIMENSION output_col);
-#endif /* HAVE_SSE2_INTEL_MNEMONICS */
 
-#ifdef HAVE_SSE2_INTRINSICS
-jpeg_idct_ifast_sse2 (
-        j_decompress_ptr cinfo, 
-        jpeg_component_info * compptr,
-        JCOEFPTR coef_block,
-        JSAMPARRAY output_buf, 
-        JDIMENSION output_col);
-#endif /* HAVE_SSE2_INTRINSICS */
 
 /*
  * Prepare for an output pass.
  * Here we select the proper IDCT routine for each component and build
  * a matching multiplier table.
  */
 
 METHODDEF(void)
@@ -152,23 +142,31 @@ start_pass (j_decompress_ptr cinfo)
 		method_ptr = jpeg_idct_islow;
 		method = JDCT_ISLOW;
 		  
 #endif /* HAVE_SSE2_INTEL_MNEMONICS */
 	break;
 #endif
 #ifdef DCT_IFAST_SUPPORTED
       case JDCT_IFAST:
-#ifdef HAVE_SSE2_INTRINSICS
-        method_ptr = jpeg_idct_ifast_sse2;
-        method = JDCT_IFAST;
+#ifdef HAVE_SSE2_INTEL_MNEMONICS
+		if (SSE2Available==1) 
+		{
+			method_ptr = jpeg_idct_islow_sse2;
+			method = JDCT_ISLOW;
+		}
+		else
+		{
+			method_ptr = jpeg_idct_ifast;
+			method = JDCT_IFAST;
+		}
 #else
-        method_ptr = jpeg_idct_ifast;
-        method = JDCT_IFAST;
-#endif /* HAVE_SSE2_INTRINSICS */
+		method_ptr = jpeg_idct_ifast;
+		method = JDCT_IFAST;
+#endif /* HAVE_SSE2_INTEL_MNEMONICS */
 	break;
 
 #endif
 #ifdef DCT_FLOAT_SUPPORTED
       case JDCT_FLOAT:
 	method_ptr = jpeg_idct_float;
 	method = JDCT_FLOAT;
 	break;
@@ -294,28 +292,16 @@ jinit_inverse_dct (j_decompress_ptr cinf
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
 				SIZEOF(my_idct_controller));
   cinfo->idct = (struct jpeg_inverse_dct *) idct;
   idct->pub.start_pass = start_pass;
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
     /* Allocate and pre-zero a multiplier table for each component */
-#ifdef HAVE_SSE2_INTRINSICS
-
-    /* Align dct_table on 16 bytes for SSE2 IDCT IFAST optimization */
-
-    PRUptrdiff buffer_pointer;
-
-    buffer_pointer = (PRUptrdiff)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                  SIZEOF(multiplier_table) + 15);
-    compptr->dct_table = (void *) ((buffer_pointer + 15) & ~15);
-#else
     compptr->dct_table =
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
 				  SIZEOF(multiplier_table));
-#endif /* HAVE_SSE2_INTRINSICS */
     MEMZERO(compptr->dct_table, SIZEOF(multiplier_table));
     /* Mark multiplier table not yet set up for any method */
     idct->cur_method[ci] = -1;
   }
 }
--- a/jpeg/jidctfst.c
+++ b/jpeg/jidctfst.c
@@ -166,23 +166,16 @@ jpeg_idct_ifast_mmx (j_decompress_ptr ci
 		 JCOEFPTR coef_block,
 		 JSAMPARRAY output_buf, JDIMENSION output_col);
 __inline GLOBAL(void)
 jpeg_idct_ifast_orig (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 		 JCOEFPTR coef_block,
 		 JSAMPARRAY output_buf, JDIMENSION output_col);
 #endif
 
-#ifdef HAVE_SSE2_INTRINSICS
-GLOBAL(void)
-jpeg_idct_ifast_sse2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-                 JCOEFPTR coef_block,
-                 JSAMPARRAY output_buf, JDIMENSION output_col);
-#endif /* HAVE_SSE2_INTRINSICS */
-
 GLOBAL(void)
 jpeg_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info * compptr,
 		 JCOEFPTR coef_block,
 		 JSAMPARRAY output_buf, JDIMENSION output_col);
 
 
 #ifdef HAVE_MMX_INTEL_MNEMONICS
 GLOBAL(void)
@@ -1649,463 +1642,9 @@ jpeg_idct_ifast_mmx (j_decompress_ptr ci
 
 	movq		[ebx], mm3
 
 	emms
 	}
 }
 #endif
 
-#ifdef HAVE_SSE2_INTRINSICS
-#include "xmmintrin.h"
-#include "emmintrin.h"
-
-/* These values are taken from the decimal shifted left 14. When multiplying */
-/* by these constants, we have to shift left the other multiplier so that we */
-/* we can use pmulhw which will descale us by the 16 bits.                   */
-
-SSE2_ALIGN unsigned short SSE2_1_414213562[8] =
-  {23170, 23170, 23170, 23170, 23170, 23170, 23170, 23170};
-SSE2_ALIGN unsigned short SSE2_1_847759065[8] =
-  {30274, 30274, 30274, 30274, 30274, 30274, 30274, 30274};
-SSE2_ALIGN unsigned short SSE2_1_082392200[8] =
-  {17734, 17734, 17734, 17734, 17734, 17734, 17734, 17734};
-SSE2_ALIGN unsigned short SSE2_NEG_0_765366865[8] =
-  {52996, 52996, 52996, 52996, 52996, 52996, 52996, 52996};
-
-/* This following is to compensate for range_limit and is shifted five to */
-/* the left as that's what we're scaled for at the time.                  */
-
-SSE2_ALIGN unsigned short SSE2_ADD_128[8] =
-  {4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096};
-
-/*---------------------------------------------------------------------------*/
-
-inline GLOBAL(void)
-     jpeg_idct_ifast_sse2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-                           JCOEFPTR coef_block,
-                           JSAMPARRAY output_buf, JDIMENSION output_col)
-{
-  __m128i row0, row1, row2, row3, row4, row5, row6, row7;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  __m128i tmp10, tmp11, tmp12, tmp13, z10, z11, z12, z13;
-  __m128i xps0, xps1, xps2, xps3, rowa;
-  __m128i * quantptr;
-  __m128i * coefptr;
-  JSAMPROW outptr0, outptr1, outptr2, outptr3, outptr4, outptr5, outptr6, outptr7;
-
-  /* ====================================================================== */
-  /* Load registers from the coefficient block                              */
-  /* ====================================================================== */
-
-  quantptr = (__m128i *) compptr->dct_table;
-  coefptr  = (__m128i *) coef_block;
-
-  row0 = *(coefptr+0);
-  row1 = *(coefptr+1);
-  row2 = *(coefptr+2);
-  row3 = *(coefptr+3);
-  row4 = *(coefptr+4);
-  row5 = *(coefptr+5);
-  row6 = *(coefptr+6);
-  row7 = *(coefptr+7);
-  row0 = _mm_mullo_epi16(row0, *(quantptr+0));
-  row1 = _mm_mullo_epi16(row1, *(quantptr+1));
-  row2 = _mm_mullo_epi16(row2, *(quantptr+2));
-  row3 = _mm_mullo_epi16(row3, *(quantptr+3));
-  row4 = _mm_mullo_epi16(row4, *(quantptr+4));
-  row5 = _mm_mullo_epi16(row5, *(quantptr+5));
-  row6 = _mm_mullo_epi16(row6, *(quantptr+6));
-  row7 = _mm_mullo_epi16(row7, *(quantptr+7));
-
-  /* ====================================================================== */
-  /* Inverse Discrete Cosine Transform Column Processing                    */
-  /* ====================================================================== */
-
-  /* Even part */
-
-  /*  tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]); */
-  /*  tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]); */
-
-  tmp10 = _mm_add_epi16(row0, row4);
-  tmp11 = _mm_sub_epi16(row0, row4);
-
-  /*  tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]); */
-  /*  tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562) */
-  /*        - tmp13; */
-
-  tmp13 = _mm_add_epi16(row2, row6);
-  tmp12 = _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(row2, row6), 2),
-                          *((__m128i *) SSE2_1_414213562));
-  tmp12 = _mm_sub_epi16(tmp12, tmp13);
-
-  /*  tmp0 = tmp10 + tmp13; */
-  /*  tmp3 = tmp10 - tmp13; */
-  /*  tmp1 = tmp11 + tmp12; */
-  /*  tmp2 = tmp11 - tmp12; */
-
-  tmp0 = _mm_add_epi16(tmp10, tmp13);
-  tmp3 = _mm_sub_epi16(tmp10, tmp13);
-  tmp1 = _mm_add_epi16(tmp11, tmp12);
-  tmp2 = _mm_sub_epi16(tmp11, tmp12);
-
-  /* Odd part */
-
-  /*  z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3]; */
-  /*  z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3]; */
-  /*  z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7]; */
-  /*  z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7]; */
-
-  z13 = _mm_add_epi16(row5, row3);
-  z10 = _mm_sub_epi16(row5, row3);
-  z11 = _mm_add_epi16(row1, row7);
-  z12 = _mm_sub_epi16(row1, row7);
-
-  /* phase 5 */
-
-  /*  tmp7 = z11 + z13; */
-  /*  tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); */
-
-  tmp7 = _mm_add_epi16(z11, z13);
-  tmp11 = _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(z11, z13), 2),
-                          *((__m128i *) SSE2_1_414213562));
-
-  /*  z5 = MULTIPLY(z10 + z12, FIX_1_847759065); */
-  /*  tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; */
-  /*  tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; */
-
-  /* This is really a mess as the negative 2.61 doesn't work well */
-  /* as we're working with 16-bit integers. So we're going to apply */
-  /* a little algebra to get: */
-
-  /*  tmp10 = z12 * (FIX_1_082392200 - FIX_1_847759065) - z10*FIX_1_847759065 */
-  /*  tmp12 = z10 * (FIX_1_847759065 - FIX_2_613125930) + z12*FIX_1_847759065 */
-
-  /* or */
-
-  /*  tmp10 = z12 * FIX_NEG_0_765366865 - z10 * FIX_1_847759065 */
-  /*  tmp12 = z10 * FIX_NEG_0_765366865 + z12 * FIX_1_847759065 */
-
-  z10 = _mm_slli_epi16(z10, 2);
-  z12 = _mm_slli_epi16(z12, 2);
-
-  tmp10 = _mm_sub_epi16(_mm_mulhi_epi16(z12, *((__m128i *)SSE2_NEG_0_765366865)),
-                        _mm_mulhi_epi16(z10, *((__m128i *)SSE2_1_847759065)));
-  tmp12 = _mm_add_epi16(_mm_mulhi_epi16(z10, *((__m128i *)SSE2_NEG_0_765366865)),
-                        _mm_mulhi_epi16(z12, *((__m128i *)SSE2_1_847759065)));
-
-  /* phase 2 */
-
-  /*  tmp6 = tmp12 - tmp7; */
-  /*  tmp5 = tmp11 - tmp6; */
-  /*  tmp4 = tmp10 + tmp5; */
-
-  tmp6 = _mm_sub_epi16(tmp12, tmp7);
-  tmp5 = _mm_sub_epi16(tmp11, tmp6);
-  tmp4 = _mm_add_epi16(tmp10, tmp5);
-
-  /*  outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)  */
-  /*                        & RANGE_MASK]; */
-  /*  outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) */
-  /*                        & RANGE_MASK]; */
-
-  row0 = _mm_add_epi16(tmp0, tmp7);
-  row7 = _mm_sub_epi16(tmp0, tmp7);
-
-  /*  outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) */
-  /*                        & RANGE_MASK]; */
-  /*  outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) */
-  /*                        & RANGE_MASK]; */
-
-  row1 = _mm_add_epi16(tmp1, tmp6);
-  row6 = _mm_sub_epi16(tmp1, tmp6);
-
-  /*  outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) */
-  /*                        & RANGE_MASK]; */
-  /*  outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) */
-  /*                        & RANGE_MASK]; */
-
-  row2 = _mm_add_epi16(tmp2, tmp5);
-  row5 = _mm_sub_epi16(tmp2, tmp5);
-
-  /*  outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) */
-  /*                        & RANGE_MASK]; */
-  /*  outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) */
-  /*                        & RANGE_MASK]; */
-
-  row4 = _mm_add_epi16(tmp3, tmp4);
-  row3 = _mm_sub_epi16(tmp3, tmp4);
-
-  /* ====================================================================== */
-  /* Transpose Matrix (Word)                                                */
-  /* ====================================================================== */
-
-  /* Save off the low quadwords from the even rows */
-
-  xps0 = _mm_move_epi64(row0);
-  xps1 = _mm_move_epi64(row2);
-  xps2 = _mm_move_epi64(row4);
-  xps3 = _mm_move_epi64(row6);
-
-  /* Interleave the high quadwords */
-
-  row0 = _mm_unpackhi_epi16(row0, row1);    /* 04 14 05 15 06 16 07 17 */
-  row2 = _mm_unpackhi_epi16(row2, row3);    /* 24 34 25 35 26 36 27 37 */
-  row4 = _mm_unpackhi_epi16(row4, row5);    /* 44 54 45 55 46 56 47 57 */
-  row6 = _mm_unpackhi_epi16(row6, row7);    /* 64 74 65 75 66 76 67 77 */ 
-
-  rowa = row0;
-  row0 = _mm_unpacklo_epi32(row0, row2);    /* 04 14 24 34 05 15 25 35 */
-  rowa = _mm_unpackhi_epi32(rowa, row2);    /* 06 16 26 36 07 17 27 37 */
-
-  row2 = row4;
-  row4 = _mm_unpacklo_epi32(row4, row6);    /* 44 54 64 74 45 55 65 75 */
-  row2 = _mm_unpackhi_epi32(row2, row6);    /* 46 56 66 76 47 57 67 77 */
-
-  row6 = row0;
-  row0 = _mm_unpacklo_epi64(row0, row4);    /* 04 14 24 34 44 54 64 74 */
-  row6 = _mm_unpackhi_epi64(row6, row4);    /* 05 15 25 35 45 55 65 75 */
-
-  row4 = rowa;
-  rowa = _mm_unpacklo_epi64(rowa, row2);    /* 06 16 26 36 46 56 66 76 */
-  row4 = _mm_unpackhi_epi64(row4, row2);    /* 07 17 27 37 47 57 67 77 */
-
-  /* row0, row6, rowa, row4 now contain *4, *5, *6, *7 */
-
-  /* Interleave the low quadwords */
-
-  xps0 = _mm_unpacklo_epi16(xps0, row1);    /* 00 10 01 11 02 12 03 13 */
-  xps1 = _mm_unpacklo_epi16(xps1, row3);    /* 20 30 21 31 22 32 23 33 */
-  xps2 = _mm_unpacklo_epi16(xps2, row5);    /* 40 50 41 51 42 52 43 43 */
-  xps3 = _mm_unpacklo_epi16(xps3, row7);    /* 60 70 61 71 62 72 63 73 */
-
-  row2 = xps0;
-  xps0 = _mm_unpacklo_epi32(xps0, xps1);    /* 00 10 20 30 01 11 21 31 */
-  row2 = _mm_unpackhi_epi32(row2, xps1);    /* 02 12 22 32 03 13 23 33 */
-
-  xps1 = xps2;
-  xps2 = _mm_unpacklo_epi32(xps2, xps3);    /* 40 50 60 70 41 51 61 71 */
-  xps1 = _mm_unpackhi_epi32(xps1, xps3);    /* 42 52 62 72 43 53 63 73 */
-
-  xps3 = xps0;
-  xps0 = _mm_unpacklo_epi64(xps0, xps2);    /* 00 10 20 30 40 50 60 70 */
-  xps3 = _mm_unpackhi_epi64(xps3, xps2);    /* 01 11 21 31 41 51 61 71 */
-
-  xps2 = row2;
-  row2 = _mm_unpacklo_epi64(row2, xps1);    /* 02 12 22 32 42 52 62 72 */
-  xps2 = _mm_unpackhi_epi64(xps2, xps1);    /* 03 13 23 33 43 53 63 73 */
-
-  /* xps0, xps3, row2, xps2 now contain *0, *1, *2, *3 */
-
-  /* Restructure back to row0 to row7. */
-
-  row7 = row4;
-  row5 = row6;
-  row6 = rowa;
-  row4 = row0;
-  row0 = xps0;
-  row1 = xps3;
-  row3 = xps2;
-
-  /* ====================================================================== */
-  /* Inverse Discrete Cosine Transform Row Processing                       */
-  /* ====================================================================== */
-
-  /* Even part */
-
-  /*  tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]); */
-  /*  tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]); */
-
-  tmp10 = _mm_add_epi16(row0, row4);
-  tmp11 = _mm_sub_epi16(row0, row4);
-
-  /*  tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]); */
-  /*  tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562) */
-  /*        - tmp13; */
-
-  tmp13 = _mm_add_epi16(row2, row6);
-  tmp12 = _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(row2, row6), 2),
-                          *((__m128i *) SSE2_1_414213562));
-  tmp12 = _mm_sub_epi16(tmp12, tmp13);
-
-  /*  tmp0 = tmp10 + tmp13; */
-  /*  tmp3 = tmp10 - tmp13; */
-  /*  tmp1 = tmp11 + tmp12; */
-  /*  tmp2 = tmp11 - tmp12; */
-
-  tmp0 = _mm_add_epi16(tmp10, tmp13);
-  tmp3 = _mm_sub_epi16(tmp10, tmp13);
-  tmp1 = _mm_add_epi16(tmp11, tmp12);
-  tmp2 = _mm_sub_epi16(tmp11, tmp12);
-
-  /* Odd part */
-
-  /*  z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3]; */
-  /*  z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3]; */
-  /*  z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7]; */
-  /*  z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7]; */
-
-  z13 = _mm_add_epi16(row5, row3);
-  z10 = _mm_sub_epi16(row5, row3);
-  z11 = _mm_add_epi16(row1, row7);
-  z12 = _mm_sub_epi16(row1, row7);
-
-  /* phase 5 */
-
-  /*  tmp7 = z11 + z13; */
-  /*  tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); */
-
-  tmp7 = _mm_add_epi16(z11, z13);
-  tmp11 = _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(z11, z13), 2),
-                          *((__m128i *) SSE2_1_414213562));
-
-  /*  z5 = MULTIPLY(z10 + z12, FIX_1_847759065); */
-  /*  tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; */
-  /*  tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; */
-
-  /* This is really a mess as the negative 2.61 doesn't work well */
-  /* as we're working with 16-bit integers. So we're going to apply */
-  /* a little algebra to get: */
-
-  /*  tmp10 = z12 * (FIX_1_082392200 - FIX_1_847759065) - z10*FIX_1_847759065 */
-  /*  tmp12 = z10 * (FIX_1_847759065 - FIX_2_613125930) + z12*FIX_1_847759065 */
-
-  /* or */
-
-  /*  tmp10 = z12 * FIX_NEG_0_765366865 - z10 * FIX_1_847759065 */
-  /*  tmp12 = z10 * FIX_NEG_0_765366865 + z12 * FIX_1_847759065 */
-
-  z10 = _mm_slli_epi16(z10, 2);
-  z12 = _mm_slli_epi16(z12, 2);
-
-  tmp10 = _mm_sub_epi16(_mm_mulhi_epi16(z12, *((__m128i *)SSE2_NEG_0_765366865)),
-                        _mm_mulhi_epi16(z10, *((__m128i *)SSE2_1_847759065)));
-  tmp12 = _mm_add_epi16(_mm_mulhi_epi16(z10, *((__m128i *)SSE2_NEG_0_765366865)),
-                        _mm_mulhi_epi16(z12, *((__m128i *)SSE2_1_847759065)));
-
-  /* phase 2 */
-
-  /*  tmp6 = tmp12 - tmp7; */
-  /*  tmp5 = tmp11 - tmp6; */
-  /*  tmp4 = tmp10 + tmp5; */
-
-  tmp6 = _mm_sub_epi16(tmp12, tmp7);
-  tmp5 = _mm_sub_epi16(tmp11, tmp6);
-  tmp4 = _mm_add_epi16(tmp10, tmp5);
-
-  /*  outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)  */
-  /*                        & RANGE_MASK]; */
-  /*  outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) */
-  /*                        & RANGE_MASK]; */
-
-  row0 = _mm_add_epi16(tmp0, tmp7);
-  row7 = _mm_sub_epi16(tmp0, tmp7);
-
-  /*  outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) */
-  /*                        & RANGE_MASK]; */
-  /*  outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) */
-  /*                        & RANGE_MASK]; */
-
-  row1 = _mm_add_epi16(tmp1, tmp6);
-  row6 = _mm_sub_epi16(tmp1, tmp6);
-
-  /*  outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) */
-  /*                        & RANGE_MASK]; */
-  /*  outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) */
-  /*                        & RANGE_MASK]; */
-
-  row2 = _mm_add_epi16(tmp2, tmp5);
-  row5 = _mm_sub_epi16(tmp2, tmp5);
-
-  /*  outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) */
-  /*                        & RANGE_MASK]; */
-  /*  outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) */
-  /*                        & RANGE_MASK]; */
-
-  row4 = _mm_add_epi16(tmp3, tmp4);
-  row3 = _mm_sub_epi16(tmp3, tmp4);
-
-  /* ====================================================================== */
-  /* Transpose Matrix (Byte)                                                */
-  /* ====================================================================== */
-
-  /* Shift right after adding half to descale */
-
-  row0 = _mm_add_epi16(row0, *((__m128i *) SSE2_ADD_128));
-  row1 = _mm_add_epi16(row1, *((__m128i *) SSE2_ADD_128));
-  row2 = _mm_add_epi16(row2, *((__m128i *) SSE2_ADD_128));
-  row3 = _mm_add_epi16(row3, *((__m128i *) SSE2_ADD_128));
-  row4 = _mm_add_epi16(row4, *((__m128i *) SSE2_ADD_128));
-  row5 = _mm_add_epi16(row5, *((__m128i *) SSE2_ADD_128));
-  row6 = _mm_add_epi16(row6, *((__m128i *) SSE2_ADD_128));
-  row7 = _mm_add_epi16(row7, *((__m128i *) SSE2_ADD_128));
-
-  row0 = _mm_srai_epi16(row0, 5);
-  row1 = _mm_srai_epi16(row1, 5);
-  row2 = _mm_srai_epi16(row2, 5);
-  row3 = _mm_srai_epi16(row3, 5);
-  row4 = _mm_srai_epi16(row4, 5);
-  row5 = _mm_srai_epi16(row5, 5);
-  row6 = _mm_srai_epi16(row6, 5);
-  row7 = _mm_srai_epi16(row7, 5);
-
-  /* Range Limit and convert Words to Bytes */
-
-  row0 = _mm_packus_epi16(row0, row4);
-  row1 = _mm_packus_epi16(row1, row5);
-  row2 = _mm_packus_epi16(row2, row6);
-  row3 = _mm_packus_epi16(row3, row7);
-
-  /* Transpose from rows back to columns */
-
-  row4 = _mm_unpackhi_epi8(row0, row1);   /* 40 50 41 51 42 52 43 43 ... */
-  row5 = _mm_unpackhi_epi8(row2, row3);   /* 60 70 61 71 62 72 63 73 ... */
-  row0 = _mm_unpacklo_epi8(row0, row1);   /* 00 10 01 11 02 12 03 13 ... */
-  row2 = _mm_unpacklo_epi8(row2, row3);   /* 20 30 21 31 22 32 23 33 ... */
-
-  row6 = _mm_unpackhi_epi16(row0, row2);  /* 04 14 24 34 05 15 25 35 ... */
-  row7 = _mm_unpackhi_epi16(row4, row5);  /* 44 54 64 74 45 55 65 75 ... */
-  row0 = _mm_unpacklo_epi16(row0, row2);  /* 00 10 20 30 01 11 21 31 ... */
-  row4 = _mm_unpacklo_epi16(row4, row5);  /* 40 50 60 70 41 51 61 71 ... */
-
-  row1 = _mm_unpackhi_epi32(row0, row4);  /* 02 12 22 32 42 52 62 72 ... */
-  row2 = _mm_unpackhi_epi32(row6, row7);  /* 06 16 26 36 46 56 66 76 ... */
-  row0 = _mm_unpacklo_epi32(row0, row4);  /* 00 10 20 30 40 50 60 70 ... */
-  row6 = _mm_unpacklo_epi32(row6, row7);  /* 04 14 24 34 55 54 64 74 ... */
-
-  /* Rearrange for readability */
-
-  row3 = row2;
-  row2 = row6;
-
-  /* ====================================================================== */
-  /* Final output                                                           */
-  /* ====================================================================== */
-
-  outptr0 = output_buf[0] + output_col;
-  outptr1 = output_buf[1] + output_col;
-  outptr2 = output_buf[2] + output_col;
-  outptr3 = output_buf[3] + output_col;
-  outptr4 = output_buf[4] + output_col;
-  outptr5 = output_buf[5] + output_col;
-  outptr6 = output_buf[6] + output_col;
-  outptr7 = output_buf[7] + output_col;
-
-  /* We can't use the faster FP instructions to write out the high quad */
-  /* so do a rotate right and write them out as the low quad            */
-
-  row4 = _mm_srli_si128(row0, 8);
-  row5 = _mm_srli_si128(row1, 8);
-  row6 = _mm_srli_si128(row2, 8);
-  row7 = _mm_srli_si128(row3, 8);
-
-  _mm_storel_epi64((__m128i *) outptr0, row0);
-  _mm_storel_epi64((__m128i *) outptr2, row1);
-  _mm_storel_epi64((__m128i *) outptr4, row2);
-  _mm_storel_epi64((__m128i *) outptr6, row3);
-  _mm_storel_epi64((__m128i *) outptr1, row4);
-  _mm_storel_epi64((__m128i *) outptr3, row5);
-  _mm_storel_epi64((__m128i *) outptr5, row6);
-  _mm_storel_epi64((__m128i *) outptr7, row7);
-}
-#endif /* HAVE_SSE2_INTRINSICS */
-
 #endif /* DCT_IFAST_SUPPORTED */
--- a/jpeg/jmemmgr.c
+++ b/jpeg/jmemmgr.c
@@ -444,19 +444,16 @@ alloc_barray (j_common_ptr cinfo, int po
 	      JDIMENSION blocksperrow, JDIMENSION numrows)
 /* Allocate a 2-D coefficient-block array */
 {
   my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
   JBLOCKARRAY result;
   JBLOCKROW workspace;
   JDIMENSION rowsperchunk, currow, i;
   long ltemp;
-#ifdef HAVE_SSE2_INTRINSICS
-  PRUptrdiff workspace_raw;
-#endif
 
   /* Calculate max # of rows allowed in one allocation chunk */
   ltemp = (MAX_ALLOC_CHUNK-SIZEOF(large_pool_hdr)) /
 	  ((long) blocksperrow * SIZEOF(JBLOCK));
   if (ltemp <= 0)
     ERREXIT(cinfo, JERR_WIDTH_OVERFLOW);
   if (ltemp < (long) numrows)
     rowsperchunk = (JDIMENSION) ltemp;
@@ -467,26 +464,19 @@ alloc_barray (j_common_ptr cinfo, int po
   /* Get space for row pointers (small object) */
   result = (JBLOCKARRAY) alloc_small(cinfo, pool_id,
 				     (size_t) (numrows * SIZEOF(JBLOCKROW)));
 
   /* Get the rows themselves (large objects) */
   currow = 0;
   while (currow < numrows) {
     rowsperchunk = MIN(rowsperchunk, numrows - currow);
-#ifdef HAVE_SSE2_INTRINSICS
-    workspace_raw = (JBLOCKROW) alloc_large(cinfo, pool_id,
-        (size_t) ((size_t) rowsperchunk * (size_t) blocksperrow
-                  * SIZEOF(JBLOCK) + 15));
-    workspace = (JBLOCKROW) ((workspace_raw + 15) & ~15);
-#else
     workspace = (JBLOCKROW) alloc_large(cinfo, pool_id,
 	(size_t) ((size_t) rowsperchunk * (size_t) blocksperrow
 		  * SIZEOF(JBLOCK)));
-#endif /* HAVE_SSE2_INTRINSICS */
     for (i = rowsperchunk; i > 0; i--) {
       result[currow++] = workspace;
       workspace += blocksperrow;
     }
   }
 
   return result;
 }
--- a/jpeg/jmorecfg.h
+++ b/jpeg/jmorecfg.h
@@ -102,40 +102,32 @@ typedef short JSAMPLE;
  * Again, we allocate large arrays of these, but you can change to int
  * if you have memory to burn and "short" is really slow.
  */
 
 typedef short JCOEF;
 
 /* Defines for MMX/SSE2 support. */
 
-/* HAVE_SSE2_INTEL_MNEMONICS means Win32 and MSVC. HAVE_SSE2_INTRINSICS */
-/* means that the platform supports SSE2 or higher so that we don't     */
-/* have to check runtime CPU capabilities.                              */
+#if defined(XP_WIN32) && defined(_M_IX86) && !defined(__GNUC__)
+#define HAVE_MMX_INTEL_MNEMONICS 
 
-#if defined(XP_WIN32) && defined(_M_IX86) && !defined(__GNUC__)
-#define HAVE_MMX_INTEL_MNEMONICS
+/* SSE2 code appears broken for some cpus (bug 247437) */
 #define HAVE_SSE2_INTEL_MNEMONICS
+#define HAVE_SSE2_INTRINSICS
 #endif
 
-#if defined(__GNUC__) && defined(__i386__) && defined(XP_MACOSX)
+#if defined(__GNUC__) && defined(__i386__)
+#if defined(XP_MACOSX)
 #define HAVE_SSE2_INTRINSICS
-#endif /* GNUC && i386 && XP_MACOSX */
+#endif /* ! XP_MACOSX */
+#endif /* ! GNUC && i386 */
 
 /* Add support for other platforms here */
 
-/* SSE* alignment support - only use on platforms that support declspec and __attribute__ */
-/* We're only aligning Mac OSX structures right now */
-
-#ifdef HAVE_SSE2_INTRINSICS
-#define SSE2_ALIGN __attribute__ ((aligned (16)))
-#else
-#define SSE2_ALIGN
-#endif
-
 
 /* Compressed datastreams are represented as arrays of JOCTET.
  * These must be EXACTLY 8 bits wide, at least once they are written to
  * external storage.  Note that when using the stdio data source/destination
  * managers, this is also the data type passed to fread/fwrite.
  */
 
 #ifdef HAVE_UNSIGNED_CHAR
@@ -323,17 +315,17 @@ typedef unsigned char boolean;
  * QUANT_1PASS_SUPPORTED QUANT_2PASS_SUPPORTED
  */
 
 /* Arithmetic coding is unsupported for legal reasons.  Complaints to IBM. */
 
 /* Capability options common to encoder and decoder: */
 
 #define DCT_ISLOW_SUPPORTED	/* slow but accurate integer algorithm */
-#define DCT_IFAST_SUPPORTED     /* faster, less accurate integer method */
+#undef  DCT_IFAST_SUPPORTED	/* faster, less accurate integer method */
 #undef  DCT_FLOAT_SUPPORTED	/* floating-point: accurate, fast on fast HW */
 
 /* Encoder capability options: */
 
 #undef  C_ARITH_CODING_SUPPORTED    /* Arithmetic coding back end? */
 #define C_MULTISCAN_FILES_SUPPORTED /* Multiple-scan JPEG files? */
 #define C_PROGRESSIVE_SUPPORTED	    /* Progressive JPEG? (Requires MULTISCAN)*/
 #define ENTROPY_OPT_SUPPORTED	    /* Optimization of entropy coding parms? */
--- a/modules/libpr0n/decoders/jpeg/nsJPEGDecoder.cpp
+++ b/modules/libpr0n/decoders/jpeg/nsJPEGDecoder.cpp
@@ -51,20 +51,16 @@
 #include "nsIImage.h"
 #include "nsIInterfaceRequestorUtils.h"
 #include "gfxColor.h"
 
 #include "jerror.h"
 
 #include "gfxPlatform.h"
 
-#if defined(GFX_SSE2_POSSIBLE)
-#include "emmintrin.h"
-#endif /* GFX_SSE2_POSSIBLE */
-
 extern "C" {
 #include "iccjpeg.h"
 
 /* Colorspace conversion (copied from jpegint.h) */
 struct jpeg_color_deconverter {
   JMETHOD(void, start_pass, (j_decompress_ptr cinfo));
   JMETHOD(void, color_convert, (j_decompress_ptr cinfo,
 				JSAMPIMAGE input_buf, JDIMENSION input_row,
@@ -533,21 +529,17 @@ nsresult nsJPEGDecoder::ProcessData(cons
   case JPEG_START_DECOMPRESS:
   {
     LOG_SCOPE(gJPEGlog, "nsJPEGDecoder::ProcessData -- entering JPEG_START_DECOMPRESS case");
     /* Step 4: set parameters for decompression */
 
     /* FIXME -- Should reset dct_method and dither mode
      * for final pass of progressive JPEG
      */
-#if defined(__GNUC__) && defined(__i386__) && defined(XP_MACOSX)
-    mInfo.dct_method =  JDCT_IFAST;
-#else
     mInfo.dct_method =  JDCT_ISLOW;
-#endif /* __GNUC__ & __i386__ & XP_MACOSX */
     mInfo.dither_mode = JDITHER_FS;
     mInfo.do_fancy_upsampling = TRUE;
     mInfo.enable_2pass_quant = FALSE;
     mInfo.do_block_smoothing = TRUE;
 
     /* Step 5: Start decompressor */
     if (jpeg_start_decompress(&mInfo) == FALSE) {
       PR_LOG(gJPEGDecoderAccountingLog, PR_LOG_DEBUG,
@@ -1263,171 +1255,50 @@ const int Cb_g_tab[(MAXJSAMPLE+1) * size
 	((shift_temp = (x)) < 0 ? \
 	 (shift_temp >> (shft)) | ((~((INT32) 0)) << (32-(shft))) : \
 	 (shift_temp >> (shft)))
 #else
 #define SHIFT_TEMPS
 #define RIGHT_SHIFT(x,shft)	((x) >> (shft))
 #endif
 
-/* Constants for SSE2 support */
-
-#ifdef GFX_SSE2_POSSIBLE
-GFX_SSE2_ALIGN const short color_constants_1[8] = {128, 128, 128, 128, 128, 128, 128, 128};
-GFX_SSE2_ALIGN const short color_constants_2[8] = {359, 359, 359, 359, 359, 359, 359, 359};
-GFX_SSE2_ALIGN const short color_constants_3[8] = {88, 88, 88, 88, 88, 88, 88, 88};
-GFX_SSE2_ALIGN const short color_constants_4[8] = {183, 183, 183, 183, 183, 183, 183, 183};
-GFX_SSE2_ALIGN const short color_constants_5[8] = {454, 454, 454, 454, 454, 454, 454, 454};
-GFX_SSE2_ALIGN const short color_constants_6[8] = {11382, 11382, 11382, 11382, 11382, 11382, 11382, 11382};
-#endif /* GFX_SSE2_POSSIBLE */
 
 METHODDEF(void)
 ycc_rgb_convert_argb (j_decompress_ptr cinfo,
                  JSAMPIMAGE input_buf, JDIMENSION input_row,
                  JSAMPARRAY output_buf, int num_rows)
 {
   JDIMENSION num_cols = cinfo->output_width;
   JSAMPLE * range_limit = cinfo->sample_range_limit;
 
-/* Locals for SSE2 support */
-
-#ifdef GFX_SSE2_POSSIBLE
-  JSAMPROW inptr0, inptr1, inptr2, outptr;
-  JDIMENSION col;
-  __m128i row_Y, row_cb, row_cr, zeroes, ones;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
-  __m128i row_B, row_G, row_R;
-#endif /* GFX_SSE2_POSSIBLE */
-
   SHIFT_TEMPS
 
-#ifdef GFX_SSE2_POSSIBLE
-  if (GFX_SSE2_AVAILABLE == 1) {
-
-    /* Load registers with constants if we're going to do at least one quad */
-    
-    if (num_cols >= 8) {
-      zeroes = _mm_setzero_si128();
-      ones = _mm_cmpeq_epi8(zeroes, zeroes);
-    }
-    
-    while (--num_rows >= 0) {
-      inptr0 = input_buf[0][input_row];
-      inptr1 = input_buf[1][input_row];
-      inptr2 = input_buf[2][input_row];
-      input_row++;
-      outptr = *output_buf++;
-      
-      /* YCbCr -> RGB using scaled multiplication */
-      
-      for (col = 0; col + 8 <= num_cols; col += 8) {
-        
-        /* Read a row of each color component */
-
-        row_Y  = _mm_loadl_epi64((__m128i *) (inptr0 + col));
-        row_cb = _mm_loadl_epi64((__m128i *) inptr1);
-        row_cr = _mm_loadl_epi64((__m128i *) inptr2);
-        
-        /* Convert bytes to words */
-        
-        row_Y  = _mm_unpacklo_epi8(row_Y, zeroes);
-        row_cb = _mm_unpacklo_epi8(row_cb, zeroes);
-        row_cr = _mm_unpacklo_epi8(row_cr, zeroes);
-
-        /* Compute B and R */
-
-        tmp0 = _mm_sub_epi16(row_cb, *((__m128i *) color_constants_1));    /* cb - 128 */
-        tmp1 = _mm_sub_epi16(row_cr, *((__m128i *) color_constants_1));    /* cr - 128 */
-        tmp2 = _mm_slli_epi16(tmp0, 8);                                    /* (cb - 128) << 8 */
-        tmp3 = _mm_slli_epi16(tmp1, 8);                                    /* (cr - 128) << 8 */
-        tmp2 = _mm_mulhi_epi16(tmp2, *((__m128i *) color_constants_5));    /* (454 * ((cb - 128) << 8)) >> 16 */
-        tmp3 = _mm_mulhi_epi16(tmp3, *((__m128i *) color_constants_2));    /* (359 * ((cr - 128) << 8)) >> 16 */
-        row_B = _mm_add_epi16(tmp2, row_Y);                                /* B = y + (454 * ((cb - 128) << 8)) >> 16 */
-        row_R = _mm_add_epi16(tmp3, row_Y);                                /* R = y + (359 * ((cr - 128) << 8)) >> 16 */
-
-        /* Compute G */
-
-        tmp4 = _mm_mullo_epi16(row_cb, *((__m128i *) color_constants_3));    /* 88 * cb */
-        tmp5 = _mm_mullo_epi16(tmp1, *((__m128i *) color_constants_4));      /* 183 * (cr - 128) */
-        tmp4 = _mm_sub_epi16(tmp4, *((__m128i *) color_constants_6));        /* 88 * cb - (34806 - 128*256) */
-        tmp4 = _mm_add_epi16(tmp4, tmp5);                                    /* (88 * cb) + 183 * (cr - 128) - (34806 - 128*256) */
-        tmp4 = _mm_srai_epi16(tmp4, 8);                                      /* Shift the above right by 8 to divide by 256 */
-        row_G = _mm_sub_epi16(row_Y, tmp4);                                  /* G */
-
-        /* Pack word to byte with unsigned saturation which range limits too */
-
-        row_R = _mm_packus_epi16(row_R, zeroes);
-        row_G = _mm_packus_epi16(row_G, zeroes);
-        row_B = _mm_packus_epi16(row_B, zeroes);
+  /* This is used if we don't have SSE2 */
 
-        /* Pack horizontally */
-
-        tmp0 = _mm_unpacklo_epi8(row_B, row_G);          /* ABAB ABAB ABAB ABAB        */
-        tmp1 = _mm_unpacklo_epi8(row_R, ones);           /* GRGR GRGR GRGR GRGR        */
-        tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);           /* ARGB ARGB ARGB ARGB (low)  */
-        tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);           /* ARGB ARGB ARGB ARGB (high) */
-
-        /* Write out 32 bytes of ARGB */
-        
-        _mm_storel_epi64((__m128i *) outptr, tmp2);
-        _mm_storel_epi64((__m128i *) (outptr + 8), _mm_srli_si128(tmp2, 8));
-        _mm_storel_epi64((__m128i *) (outptr + 16), tmp3);
-        _mm_storel_epi64((__m128i *) (outptr + 24), _mm_srli_si128(tmp3, 8));
-        
-        /* Slide the pointers */
-        
-        /* inptr0 += 8; */
-        inptr1 += 8;
-        inptr2 += 8;
-        outptr += 32;
-      }
-
-      /* Process remainder using scalar code */
-      for (; col < num_cols; col++) {
-        int y  = GETJSAMPLE(inptr0[col]);
-        int cb = GETJSAMPLE(inptr1[0]);
-        int cr = GETJSAMPLE(inptr2[0]);
-        inptr1++;
-        inptr2++;
-        JSAMPLE * range_limit_y = range_limit + y;
-        /* Range-limiting is essential due to noise introduced by DCT losses. */
-        *((PRUint32 *) outptr) = 0xFF000000 |
-                                 ( range_limit_y[Cr_r_tab[cr]] << 16 ) |
-                                 ( range_limit_y[((int) RIGHT_SHIFT(Cb_g_tab[cb] + Cr_g_tab[cr], SCALEBITS))] << 8 ) |
-                                 ( range_limit_y[Cb_b_tab[cb]] );
-        outptr += 4;
-      }
-    }
-
-  } else 
-#endif /* GFX_SSE2_POSSIBLE */
-
-  /* This is used if we don't have SSE2 */
-  {
-    while (--num_rows >= 0) {
-      JSAMPROW inptr0 = input_buf[0][input_row];
-      JSAMPROW inptr1 = input_buf[1][input_row];
-      JSAMPROW inptr2 = input_buf[2][input_row];
-      input_row++;
-      PRUint32 *outptr = (PRUint32 *) *output_buf++;
-      for (JDIMENSION col = 0; col < num_cols; col++) {
-        int y  = GETJSAMPLE(inptr0[col]);
-        int cb = GETJSAMPLE(inptr1[col]);
-        int cr = GETJSAMPLE(inptr2[col]);
-        JSAMPLE * range_limit_y = range_limit + y;
-        /* Range-limiting is essential due to noise introduced by DCT losses. */
-        outptr[col] = 0xFF000000 |
-          ( range_limit_y[Cr_r_tab[cr]] << 16 ) |
-          ( range_limit_y[((int) RIGHT_SHIFT(Cb_g_tab[cb] + Cr_g_tab[cr], SCALEBITS))] << 8 ) |
-          ( range_limit_y[Cb_b_tab[cb]] );
-      }
+  while (--num_rows >= 0) {
+    JSAMPROW inptr0 = input_buf[0][input_row];
+    JSAMPROW inptr1 = input_buf[1][input_row];
+    JSAMPROW inptr2 = input_buf[2][input_row];
+    input_row++;
+    PRUint32 *outptr = (PRUint32 *) *output_buf++;
+    for (JDIMENSION col = 0; col < num_cols; col++) {
+      int y  = GETJSAMPLE(inptr0[col]);
+      int cb = GETJSAMPLE(inptr1[col]);
+      int cr = GETJSAMPLE(inptr2[col]);
+      JSAMPLE * range_limit_y = range_limit + y;
+      /* Range-limiting is essential due to noise introduced by DCT losses. */
+      outptr[col] = 0xFF000000 |
+                    ( range_limit_y[Cr_r_tab[cr]] << 16 ) |
+                    ( range_limit_y[((int) RIGHT_SHIFT(Cb_g_tab[cb] + Cr_g_tab[cr], SCALEBITS))] << 8 ) |
+                    ( range_limit_y[Cb_b_tab[cb]] );
     }
   }
 }
 
+
 /**************** Inverted CMYK -> RGB conversion **************/
 /*
  * Input is (Inverted) CMYK stored as 4 bytes per pixel.
  * Output is RGB stored as 3 bytes per pixel.
  * @param row Points to row buffer containing the CMYK bytes for each pixel in the row.
  * @param width Number of pixels in the row.
  */
 static void cmyk_convert_rgb(JSAMPROW row, JDIMENSION width)