Backed out changeset 6eec92f9276a from bug 416157 because the image tests need to be updated before it can be landed.
authorReed Loden <reed@reedloden.com>
Wed, 20 Aug 2008 02:03:01 -0500
changeset 17144 8fb5195b77ab7fb6207127db03260b8e7942f6e0
parent 17112 6eec92f9276a0fd425f5fd9419040859516e192b
child 17145 8ab229794bb6f8e64cede37755afba84a02fa50a
push idunknown
push userunknown
push dateunknown
bugs416157
milestone1.9.1a2pre
backs out6eec92f9276a0fd425f5fd9419040859516e192b
Backed out changeset 6eec92f9276a from bug 416157 because the image tests need to be updated before it can be landed.
gfx/thebes/public/gfxPlatform.h
jpeg/jconfig.h
jpeg/jdapimin.c
jpeg/jdcoefct.c
jpeg/jddctmgr.c
jpeg/jidctfst.c
jpeg/jmemmgr.c
jpeg/jmorecfg.h
modules/libpr0n/decoders/jpeg/nsJPEGDecoder.cpp
--- a/gfx/thebes/public/gfxPlatform.h
+++ b/gfx/thebes/public/gfxPlatform.h
@@ -44,42 +44,16 @@
 
 #include "gfxTypes.h"
 #include "gfxASurface.h"
 
 #ifdef XP_OS2
 #undef OS2EMX_PLAIN_CHAR
 #endif
 
-/*
- * SSE2 Support Defines
- *
- * GFX_SSE2_POSSIBLE is defined if SSE2 support may be available.
- * On Mac/OSX Intel, SSE2 is always available. On Windows, SSE2
- * support is possible if the chip supports it so GFX_SSE2_POSSIBLE
- * is defined for Windows.
- *
- * GFX_SSE2_AVAILABLE indicates whether runtime SSE2 support is
- * available. On Mac/OSX Intel, it is defined to 1 (TRUE). On Windows
- * __sse2_available indicates whether support is available or not.
-*/
-
-#if defined(XP_WIN32) && defined(_M_IX86) && !defined(__GNUC__)
-#define GFX_SSE2_POSSIBLE
-#define GFX_SSE2_AVAILABLE __sse2_available
-#define GFX_SSE2_ALIGN __declspec(align(16))
-extern "C" int __sse2_available;
-#endif
-
-#if defined(__GNUC__) && defined(__i386__) && defined(XP_MACOSX)
-#define GFX_SSE2_POSSIBLE
-#define GFX_SSE2_AVAILABLE 1
-#define GFX_SSE2_ALIGN __attribute__ ((aligned (16)))
-#endif
-
 typedef void* cmsHPROFILE;
 typedef void* cmsHTRANSFORM;
 
 class gfxImageSurface;
 class gfxFont;
 class gfxFontGroup;
 struct gfxFontStyle;
 
--- a/jpeg/jconfig.h
+++ b/jpeg/jconfig.h
@@ -90,8 +90,18 @@
 
 #undef TWO_FILE_COMMANDLINE
 #undef NEED_SIGNAL_CATCHER
 #undef DONT_USE_B_MODE
 #undef PROGRESS_REPORT
 
 #endif /* JPEG_CJPEG_DJPEG */
 
+/* SSE* alignment support - only use on platforms that support declspec and __attribute__ */
+
+#if defined(XP_WIN32) && defined(_M_IX86) && !defined(__GNUC__)
+#define ALIGN16_const_vector_short(name) __declspec(align(16)) const short name[8]
+#define ALIGN16_const_vector_uchar(name) __declspec(align(16)) const unsigned char name[16]
+#else
+#define ALIGN16_const_vector_short(name) const short name[8] __attribute__ ((aligned (16)))
+#define ALIGN16_const_vector_uchar(name) const unsigned char name[16] __attribute__ ((aligned (16)))
+#endif /* ! XP_WIN32 && _M_IX86 && !__GNUC */
+
--- a/jpeg/jdapimin.c
+++ b/jpeg/jdapimin.c
@@ -62,20 +62,24 @@ NOT_SUPPORTED:
   CPUInfo[3] = my_edx;
 }
 #endif /* _MSC_VER >= 1400 */
 
 int MMXAvailable;
 static int mmxsupport();
 #endif
 
-#ifdef HAVE_SSE2_INTEL_MNEMONICS
+#ifdef HAVE_SSE2_INTRINSICS
 int SSE2Available = 0;
+#ifdef HAVE_SSE2_INTEL_MNEMONICS
 static int sse2support();
+#else
+static int sse2supportGCC();
 #endif /* HAVE_SSE2_INTEL_MNEMONICS */
+#endif /* HAVE_SSE2_INTRINSICS */
 
 
 /*
  * Initialization of a JPEG decompression object.
  * The error manager must already be set up (in case memory manager fails).
  */
 
 GLOBAL(void)
@@ -83,27 +87,37 @@ jpeg_CreateDecompress (j_decompress_ptr 
 {
   int i;
 
 #ifdef HAVE_MMX_INTEL_MNEMONICS
   static int cpuidDetected = 0;
 
   if(!cpuidDetected)
   {
-        MMXAvailable = mmxsupport();
+	MMXAvailable = mmxsupport();
 
 #ifdef HAVE_SSE2_INTEL_MNEMONICS
 	/* only do the sse2 support check if mmx is supported (so
 	   we know the processor supports cpuid) */
 	if (MMXAvailable)
 	    SSE2Available = sse2support();
-#endif /* HAVE_SSE2_INTEL_MNEMONICS */
+#endif
 
 	cpuidDetected = 1;
   }
+#else
+#ifdef HAVE_SSE2_INTRINSICS
+  static int cpuidDetected = 0;
+
+  if(!cpuidDetected) {
+    SSE2Available = sse2supportGCC();
+    cpuidDetected = 1;
+  }
+
+#endif /* HAVE_SSE2_INTRINSICS */
 #endif /* HAVE_MMX_INTEL_MNEMONICS */
 
   /* For debugging purposes, zero the whole master structure.
    * But error manager pointer is already there, so save and restore it.
    */
 
   /* Guard against version mismatches between library and caller. */
   cinfo->mem = NULL;		/* so jpeg_destroy knows mem mgr not called */
@@ -474,23 +488,43 @@ static int mmxsupport()
   int CPUInfo[4];
 
   __cpuid(CPUInfo, 1);
   if (CPUInfo[3] & (0x1 << 23))
     return 1;
   else
     return 0;
 }
-#endif /* HAVE_MMX_INTEL_MNEMONICS */
+#endif
 
 #ifdef HAVE_SSE2_INTEL_MNEMONICS
 static int sse2support()
 {
   int CPUInfo[4];
 
   __cpuid(CPUInfo, 1);
   if (CPUInfo[3] & (0x1 << 26))
     return 1;
   else
     return 2;
 }
+#else
+#ifdef HAVE_SSE2_INTRINSICS
+static int sse2supportGCC()
+{
+
+  /* Mac Intel started with Core Duo chips which have SSE2 Support */
+
+#if defined(__GNUC__) && defined(__i386__)
+#if defined(XP_MACOSX)
+  return 1;
+#endif /* XP_MACOSX */
+#endif /* GNUC && i386 */
+
+  /* Add checking for SSE2 support for other platforms here */
+
+  /* We don't have SSE2 intrinsics support */
+
+  return 2;
+}
+#endif /* HAVE_SSE2_INTRINSICS */
 #endif /* HAVE_SSE2_INTEL_MNEMONICS */
 
--- a/jpeg/jdcoefct.c
+++ b/jpeg/jdcoefct.c
@@ -40,21 +40,21 @@ typedef struct {
    * We allocate a workspace of D_MAX_BLOCKS_IN_MCU coefficient blocks,
    * and let the entropy decoder write into that workspace each time.
    * (On 80x86, the workspace is FAR even though it's not really very big;
    * this is to keep the module interfaces unchanged when a large coefficient
    * buffer is necessary.)
    * In multi-pass modes, this array points to the current MCU's blocks
    * within the virtual arrays; it is used only by the input side.
    */
-  SSE2_ALIGN JBLOCKROW MCU_buffer[D_MAX_BLOCKS_IN_MCU];
+  JBLOCKROW MCU_buffer[D_MAX_BLOCKS_IN_MCU];
 
 #ifdef D_MULTISCAN_FILES_SUPPORTED
   /* In multi-pass modes, we need a virtual block array for each component. */
-  SSE2_ALIGN jvirt_barray_ptr whole_image[MAX_COMPONENTS];
+  jvirt_barray_ptr whole_image[MAX_COMPONENTS];
 #endif
 
 #ifdef BLOCK_SMOOTHING_SUPPORTED
   /* When doing block smoothing, we latch coefficient Al values here */
   int * coef_bits_latch;
 #define SAVED_COEFS  6		/* we save coef_bits[0..5] */
 #endif
 } my_coef_controller;
@@ -466,17 +466,17 @@ decompress_smooth_data (j_decompress_ptr
   int ci, block_row, block_rows, access_rows;
   JBLOCKARRAY buffer;
   JBLOCKROW buffer_ptr, prev_block_row, next_block_row;
   JSAMPARRAY output_ptr;
   JDIMENSION output_col;
   jpeg_component_info *compptr;
   inverse_DCT_method_ptr inverse_DCT;
   boolean first_row, last_row;
-  SSE2_ALIGN JBLOCK workspace;
+  JBLOCK workspace;
   int *coef_bits;
   JQUANT_TBL *quanttbl;
   INT32 Q00,Q01,Q02,Q10,Q11,Q20, num;
   int DC1,DC2,DC3,DC4,DC5,DC6,DC7,DC8,DC9;
   int Al, pred;
 
   /* Force some input to be done if we are getting ahead of the input. */
   while (cinfo->input_scan_number <= cinfo->output_scan_number &&
@@ -718,31 +718,19 @@ jinit_d_coef_controller (j_decompress_pt
 #else
     ERREXIT(cinfo, JERR_NOT_COMPILED);
 #endif
   } else {
     /* We only need a single-MCU buffer. */
     JBLOCKROW buffer;
     int i;
 
-#ifdef HAVE_SSE2_INTRINSICS
-    /* Align on 16-byte boundaries for the SSE2 code. */
-    /* This won't work for 64-bit systems so conditional code it. */
-
-    PRUptrdiff buffer_shift;
-
-    buffer_shift = (PRUptrdiff)
-      (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                  D_MAX_BLOCKS_IN_MCU * SIZEOF(JBLOCK) + 15);
-    buffer = (JBLOCKROW) ((buffer_shift + 15) & ~15);
-#else
     buffer = (JBLOCKROW)
       (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
 				  D_MAX_BLOCKS_IN_MCU * SIZEOF(JBLOCK));
-#endif /* HAVE_SSE2_INTRINSICS */
     for (i = 0; i < D_MAX_BLOCKS_IN_MCU; i++) {
       coef->MCU_buffer[i] = buffer + i;
     }
     coef->pub.consume_data = dummy_consume_data;
     coef->pub.decompress_data = decompress_onepass;
     coef->pub.coef_arrays = NULL; /* flag for no virtual arrays */
   }
 }
--- a/jpeg/jddctmgr.c
+++ b/jpeg/jddctmgr.c
@@ -14,17 +14,17 @@
  * dequantization as well as the IDCT proper.  This module sets up the
  * dequantization multiplier table needed by the IDCT routine.
  */
 
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
 #include "jdct.h"		/* Private declarations for DCT subsystem */
-#ifdef HAVE_SSE2_INTEL_MNEMONICS
+#ifdef HAVE_SSE2_INTRINSICS
 extern int SSE2Available;
 #endif
 
 /*
  * The decompressor input side (jdinput.c) saves away the appropriate
  * quantization table for each component at the start of the first scan
  * involving that component.  (This is necessary in order to correctly
  * decode files that reuse Q-table slots.)
@@ -75,34 +75,24 @@ typedef union {
 #ifdef DCT_ISLOW_SUPPORTED
 #define PROVIDE_ISLOW_TABLES
 #else
 #ifdef IDCT_SCALING_SUPPORTED
 #define PROVIDE_ISLOW_TABLES
 #endif
 #endif
 
-#ifdef HAVE_SSE2_INTEL_MNEMONICS
 GLOBAL(void)
 jpeg_idct_islow_sse2 (
 	j_decompress_ptr cinfo, 
 	jpeg_component_info * compptr,
 	JCOEFPTR coef_block,
 	JSAMPARRAY output_buf, 
 	JDIMENSION output_col);
-#endif /* HAVE_SSE2_INTEL_MNEMONICS */
 
-#ifdef HAVE_SSE2_INTRINSICS
-jpeg_idct_ifast_sse2 (
-        j_decompress_ptr cinfo, 
-        jpeg_component_info * compptr,
-        JCOEFPTR coef_block,
-        JSAMPARRAY output_buf, 
-        JDIMENSION output_col);
-#endif /* HAVE_SSE2_INTRINSICS */
 
 /*
  * Prepare for an output pass.
  * Here we select the proper IDCT routine for each component and build
  * a matching multiplier table.
  */
 
 METHODDEF(void)
@@ -152,23 +142,31 @@ start_pass (j_decompress_ptr cinfo)
 		method_ptr = jpeg_idct_islow;
 		method = JDCT_ISLOW;
 		  
 #endif /* HAVE_SSE2_INTEL_MNEMONICS */
 	break;
 #endif
 #ifdef DCT_IFAST_SUPPORTED
       case JDCT_IFAST:
-#ifdef HAVE_SSE2_INTRINSICS
-        method_ptr = jpeg_idct_ifast_sse2;
-        method = JDCT_IFAST;
+#ifdef HAVE_SSE2_INTEL_MNEMONICS
+		if (SSE2Available==1) 
+		{
+			method_ptr = jpeg_idct_islow_sse2;
+			method = JDCT_ISLOW;
+		}
+		else
+		{
+			method_ptr = jpeg_idct_ifast;
+			method = JDCT_IFAST;
+		}
 #else
-        method_ptr = jpeg_idct_ifast;
-        method = JDCT_IFAST;
-#endif /* HAVE_SSE2_INTRINSICS */
+		method_ptr = jpeg_idct_ifast;
+		method = JDCT_IFAST;
+#endif /* HAVE_SSE2_INTEL_MNEMONICS */
 	break;
 
 #endif
 #ifdef DCT_FLOAT_SUPPORTED
       case JDCT_FLOAT:
 	method_ptr = jpeg_idct_float;
 	method = JDCT_FLOAT;
 	break;
@@ -294,28 +292,16 @@ jinit_inverse_dct (j_decompress_ptr cinf
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
 				SIZEOF(my_idct_controller));
   cinfo->idct = (struct jpeg_inverse_dct *) idct;
   idct->pub.start_pass = start_pass;
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
     /* Allocate and pre-zero a multiplier table for each component */
-#ifdef HAVE_SSE2_INTRINSICS
-
-    /* Align dct_table on 16 bytes for SSE2 IDCT IFAST optimization */
-
-    PRUptrdiff buffer_pointer;
-
-    buffer_pointer = (PRUptrdiff)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                  SIZEOF(multiplier_table) + 15);
-    compptr->dct_table = (void *) ((buffer_pointer + 15) & ~15);
-#else
     compptr->dct_table =
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
 				  SIZEOF(multiplier_table));
-#endif /* HAVE_SSE2_INTRINSICS */
     MEMZERO(compptr->dct_table, SIZEOF(multiplier_table));
     /* Mark multiplier table not yet set up for any method */
     idct->cur_method[ci] = -1;
   }
 }
--- a/jpeg/jidctfst.c
+++ b/jpeg/jidctfst.c
@@ -166,23 +166,16 @@ jpeg_idct_ifast_mmx (j_decompress_ptr ci
 		 JCOEFPTR coef_block,
 		 JSAMPARRAY output_buf, JDIMENSION output_col);
 __inline GLOBAL(void)
 jpeg_idct_ifast_orig (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 		 JCOEFPTR coef_block,
 		 JSAMPARRAY output_buf, JDIMENSION output_col);
 #endif
 
-#ifdef HAVE_SSE2_INTRINSICS
-GLOBAL(void)
-jpeg_idct_ifast_sse2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-                 JCOEFPTR coef_block,
-                 JSAMPARRAY output_buf, JDIMENSION output_col);
-#endif /* HAVE_SSE2_INTRINSICS */
-
 GLOBAL(void)
 jpeg_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info * compptr,
 		 JCOEFPTR coef_block,
 		 JSAMPARRAY output_buf, JDIMENSION output_col);
 
 
 #ifdef HAVE_MMX_INTEL_MNEMONICS
 GLOBAL(void)
@@ -1649,463 +1642,9 @@ jpeg_idct_ifast_mmx (j_decompress_ptr ci
 
 	movq		[ebx], mm3
 
 	emms
 	}
 }
 #endif
 
-#ifdef HAVE_SSE2_INTRINSICS
-#include "xmmintrin.h"
-#include "emmintrin.h"
-
-/* These values are taken from the decimal shifted left 14. When multiplying */
-/* by these constants, we have to shift left the other multiplier so that we */
-/* we can use pmulhw which will descale us by the 16 bits.                   */
-
-SSE2_ALIGN unsigned short SSE2_1_414213562[8] =
-  {23170, 23170, 23170, 23170, 23170, 23170, 23170, 23170};
-SSE2_ALIGN unsigned short SSE2_1_847759065[8] =
-  {30274, 30274, 30274, 30274, 30274, 30274, 30274, 30274};
-SSE2_ALIGN unsigned short SSE2_1_082392200[8] =
-  {17734, 17734, 17734, 17734, 17734, 17734, 17734, 17734};
-SSE2_ALIGN unsigned short SSE2_NEG_0_765366865[8] =
-  {52996, 52996, 52996, 52996, 52996, 52996, 52996, 52996};
-
-/* This following is to compensate for range_limit and is shifted five to */
-/* the left as that's what we're scaled for at the time.                  */
-
-SSE2_ALIGN unsigned short SSE2_ADD_128[8] =
-  {4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096};
-
-/*---------------------------------------------------------------------------*/
-
-inline GLOBAL(void)
-     jpeg_idct_ifast_sse2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-                           JCOEFPTR coef_block,
-                           JSAMPARRAY output_buf, JDIMENSION output_col)
-{
-  __m128i row0, row1, row2, row3, row4, row5, row6, row7;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  __m128i tmp10, tmp11, tmp12, tmp13, z10, z11, z12, z13;
-  __m128i xps0, xps1, xps2, xps3, rowa;
-  __m128i * quantptr;
-  __m128i * coefptr;
-  JSAMPROW outptr0, outptr1, outptr2, outptr3, outptr4, outptr5, outptr6, outptr7;
-
-  /* ====================================================================== */
-  /* Load registers from the coefficient block                              */
-  /* ====================================================================== */
-
-  quantptr = (__m128i *) compptr->dct_table;
-  coefptr  = (__m128i *) coef_block;
-
-  row0 = *(coefptr+0);
-  row1 = *(coefptr+1);
-  row2 = *(coefptr+2);
-  row3 = *(coefptr+3);
-  row4 = *(coefptr+4);
-  row5 = *(coefptr+5);
-  row6 = *(coefptr+6);
-  row7 = *(coefptr+7);
-  row0 = _mm_mullo_epi16(row0, *(quantptr+0));
-  row1 = _mm_mullo_epi16(row1, *(quantptr+1));
-  row2 = _mm_mullo_epi16(row2, *(quantptr+2));
-  row3 = _mm_mullo_epi16(row3, *(quantptr+3));
-  row4 = _mm_mullo_epi16(row4, *(quantptr+4));
-  row5 = _mm_mullo_epi16(row5, *(quantptr+5));
-  row6 = _mm_mullo_epi16(row6, *(quantptr+6));
-  row7 = _mm_mullo_epi16(row7, *(quantptr+7));
-
-  /* ====================================================================== */
-  /* Inverse Discrete Cosine Transform Column Processing                    */
-  /* ====================================================================== */
-
-  /* Even part */
-
-  /*  tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]); */
-  /*  tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]); */
-
-  tmp10 = _mm_add_epi16(row0, row4);
-  tmp11 = _mm_sub_epi16(row0, row4);
-
-  /*  tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]); */
-  /*  tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562) */
-  /*        - tmp13; */
-
-  tmp13 = _mm_add_epi16(row2, row6);
-  tmp12 = _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(row2, row6), 2),
-                          *((__m128i *) SSE2_1_414213562));
-  tmp12 = _mm_sub_epi16(tmp12, tmp13);
-
-  /*  tmp0 = tmp10 + tmp13; */
-  /*  tmp3 = tmp10 - tmp13; */
-  /*  tmp1 = tmp11 + tmp12; */
-  /*  tmp2 = tmp11 - tmp12; */
-
-  tmp0 = _mm_add_epi16(tmp10, tmp13);
-  tmp3 = _mm_sub_epi16(tmp10, tmp13);
-  tmp1 = _mm_add_epi16(tmp11, tmp12);
-  tmp2 = _mm_sub_epi16(tmp11, tmp12);
-
-  /* Odd part */
-
-  /*  z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3]; */
-  /*  z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3]; */
-  /*  z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7]; */
-  /*  z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7]; */
-
-  z13 = _mm_add_epi16(row5, row3);
-  z10 = _mm_sub_epi16(row5, row3);
-  z11 = _mm_add_epi16(row1, row7);
-  z12 = _mm_sub_epi16(row1, row7);
-
-  /* phase 5 */
-
-  /*  tmp7 = z11 + z13; */
-  /*  tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); */
-
-  tmp7 = _mm_add_epi16(z11, z13);
-  tmp11 = _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(z11, z13), 2),
-                          *((__m128i *) SSE2_1_414213562));
-
-  /*  z5 = MULTIPLY(z10 + z12, FIX_1_847759065); */
-  /*  tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; */
-  /*  tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; */
-
-  /* This is really a mess as the negative 2.61 doesn't work well */
-  /* as we're working with 16-bit integers. So we're going to apply */
-  /* a little algebra to get: */
-
-  /*  tmp10 = z12 * (FIX_1_082392200 - FIX_1_847759065) - z10*FIX_1_847759065 */
-  /*  tmp12 = z10 * (FIX_1_847759065 - FIX_2_613125930) + z12*FIX_1_847759065 */
-
-  /* or */
-
-  /*  tmp10 = z12 * FIX_NEG_0_765366865 - z10 * FIX_1_847759065 */
-  /*  tmp12 = z10 * FIX_NEG_0_765366865 + z12 * FIX_1_847759065 */
-
-  z10 = _mm_slli_epi16(z10, 2);
-  z12 = _mm_slli_epi16(z12, 2);
-
-  tmp10 = _mm_sub_epi16(_mm_mulhi_epi16(z12, *((__m128i *)SSE2_NEG_0_765366865)),
-                        _mm_mulhi_epi16(z10, *((__m128i *)SSE2_1_847759065)));
-  tmp12 = _mm_add_epi16(_mm_mulhi_epi16(z10, *((__m128i *)SSE2_NEG_0_765366865)),
-                        _mm_mulhi_epi16(z12, *((__m128i *)SSE2_1_847759065)));
-
-  /* phase 2 */
-
-  /*  tmp6 = tmp12 - tmp7; */
-  /*  tmp5 = tmp11 - tmp6; */
-  /*  tmp4 = tmp10 + tmp5; */
-
-  tmp6 = _mm_sub_epi16(tmp12, tmp7);
-  tmp5 = _mm_sub_epi16(tmp11, tmp6);
-  tmp4 = _mm_add_epi16(tmp10, tmp5);
-
-  /*  outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)  */
-  /*                        & RANGE_MASK]; */
-  /*  outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) */
-  /*                        & RANGE_MASK]; */
-
-  row0 = _mm_add_epi16(tmp0, tmp7);
-  row7 = _mm_sub_epi16(tmp0, tmp7);
-
-  /*  outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) */
-  /*                        & RANGE_MASK]; */
-  /*  outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) */
-  /*                        & RANGE_MASK]; */
-
-  row1 = _mm_add_epi16(tmp1, tmp6);
-  row6 = _mm_sub_epi16(tmp1, tmp6);
-
-  /*  outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) */
-  /*                        & RANGE_MASK]; */
-  /*  outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) */
-  /*                        & RANGE_MASK]; */
-
-  row2 = _mm_add_epi16(tmp2, tmp5);
-  row5 = _mm_sub_epi16(tmp2, tmp5);
-
-  /*  outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) */
-  /*                        & RANGE_MASK]; */
-  /*  outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) */
-  /*                        & RANGE_MASK]; */
-
-  row4 = _mm_add_epi16(tmp3, tmp4);
-  row3 = _mm_sub_epi16(tmp3, tmp4);
-
-  /* ====================================================================== */
-  /* Transpose Matrix (Word)                                                */
-  /* ====================================================================== */
-
-  /* Save off the low quadwords from the even rows */
-
-  xps0 = _mm_move_epi64(row0);
-  xps1 = _mm_move_epi64(row2);
-  xps2 = _mm_move_epi64(row4);
-  xps3 = _mm_move_epi64(row6);
-
-  /* Interleave the high quadwords */
-
-  row0 = _mm_unpackhi_epi16(row0, row1);    /* 04 14 05 15 06 16 07 17 */
-  row2 = _mm_unpackhi_epi16(row2, row3);    /* 24 34 25 35 26 36 27 37 */
-  row4 = _mm_unpackhi_epi16(row4, row5);    /* 44 54 45 55 46 56 47 57 */
-  row6 = _mm_unpackhi_epi16(row6, row7);    /* 64 74 65 75 66 76 67 77 */ 
-
-  rowa = row0;
-  row0 = _mm_unpacklo_epi32(row0, row2);    /* 04 14 24 34 05 15 25 35 */
-  rowa = _mm_unpackhi_epi32(rowa, row2);    /* 06 16 26 36 07 17 27 37 */
-
-  row2 = row4;
-  row4 = _mm_unpacklo_epi32(row4, row6);    /* 44 54 64 74 45 55 65 75 */
-  row2 = _mm_unpackhi_epi32(row2, row6);    /* 46 56 66 76 47 57 67 77 */
-
-  row6 = row0;
-  row0 = _mm_unpacklo_epi64(row0, row4);    /* 04 14 24 34 44 54 64 74 */
-  row6 = _mm_unpackhi_epi64(row6, row4);    /* 05 15 25 35 45 55 65 75 */
-
-  row4 = rowa;
-  rowa = _mm_unpacklo_epi64(rowa, row2);    /* 06 16 26 36 46 56 66 76 */
-  row4 = _mm_unpackhi_epi64(row4, row2);    /* 07 17 27 37 47 57 67 77 */
-
-  /* row0, row6, rowa, row4 now contain *4, *5, *6, *7 */
-
-  /* Interleave the low quadwords */
-
-  xps0 = _mm_unpacklo_epi16(xps0, row1);    /* 00 10 01 11 02 12 03 13 */
-  xps1 = _mm_unpacklo_epi16(xps1, row3);    /* 20 30 21 31 22 32 23 33 */
-  xps2 = _mm_unpacklo_epi16(xps2, row5);    /* 40 50 41 51 42 52 43 43 */
-  xps3 = _mm_unpacklo_epi16(xps3, row7);    /* 60 70 61 71 62 72 63 73 */
-
-  row2 = xps0;
-  xps0 = _mm_unpacklo_epi32(xps0, xps1);    /* 00 10 20 30 01 11 21 31 */
-  row2 = _mm_unpackhi_epi32(row2, xps1);    /* 02 12 22 32 03 13 23 33 */
-
-  xps1 = xps2;
-  xps2 = _mm_unpacklo_epi32(xps2, xps3);    /* 40 50 60 70 41 51 61 71 */
-  xps1 = _mm_unpackhi_epi32(xps1, xps3);    /* 42 52 62 72 43 53 63 73 */
-
-  xps3 = xps0;
-  xps0 = _mm_unpacklo_epi64(xps0, xps2);    /* 00 10 20 30 40 50 60 70 */
-  xps3 = _mm_unpackhi_epi64(xps3, xps2);    /* 01 11 21 31 41 51 61 71 */
-
-  xps2 = row2;
-  row2 = _mm_unpacklo_epi64(row2, xps1);    /* 02 12 22 32 42 52 62 72 */
-  xps2 = _mm_unpackhi_epi64(xps2, xps1);    /* 03 13 23 33 43 53 63 73 */
-
-  /* xps0, xps3, row2, xps2 now contain *0, *1, *2, *3 */
-
-  /* Restructure back to row0 to row7. */
-
-  row7 = row4;
-  row5 = row6;
-  row6 = rowa;
-  row4 = row0;
-  row0 = xps0;
-  row1 = xps3;
-  row3 = xps2;
-
-  /* ====================================================================== */
-  /* Inverse Discrete Cosine Transform Row Processing                       */
-  /* ====================================================================== */
-
-  /* Even part */
-
-  /*  tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]); */
-  /*  tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]); */
-
-  tmp10 = _mm_add_epi16(row0, row4);
-  tmp11 = _mm_sub_epi16(row0, row4);
-
-  /*  tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]); */
-  /*  tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562) */
-  /*        - tmp13; */
-
-  tmp13 = _mm_add_epi16(row2, row6);
-  tmp12 = _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(row2, row6), 2),
-                          *((__m128i *) SSE2_1_414213562));
-  tmp12 = _mm_sub_epi16(tmp12, tmp13);
-
-  /*  tmp0 = tmp10 + tmp13; */
-  /*  tmp3 = tmp10 - tmp13; */
-  /*  tmp1 = tmp11 + tmp12; */
-  /*  tmp2 = tmp11 - tmp12; */
-
-  tmp0 = _mm_add_epi16(tmp10, tmp13);
-  tmp3 = _mm_sub_epi16(tmp10, tmp13);
-  tmp1 = _mm_add_epi16(tmp11, tmp12);
-  tmp2 = _mm_sub_epi16(tmp11, tmp12);
-
-  /* Odd part */
-
-  /*  z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3]; */
-  /*  z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3]; */
-  /*  z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7]; */
-  /*  z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7]; */
-
-  z13 = _mm_add_epi16(row5, row3);
-  z10 = _mm_sub_epi16(row5, row3);
-  z11 = _mm_add_epi16(row1, row7);
-  z12 = _mm_sub_epi16(row1, row7);
-
-  /* phase 5 */
-
-  /*  tmp7 = z11 + z13; */
-  /*  tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); */
-
-  tmp7 = _mm_add_epi16(z11, z13);
-  tmp11 = _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(z11, z13), 2),
-                          *((__m128i *) SSE2_1_414213562));
-
-  /*  z5 = MULTIPLY(z10 + z12, FIX_1_847759065); */
-  /*  tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; */
-  /*  tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; */
-
-  /* This is really a mess as the negative 2.61 doesn't work well */
-  /* as we're working with 16-bit integers. So we're going to apply */
-  /* a little algebra to get: */
-
-  /*  tmp10 = z12 * (FIX_1_082392200 - FIX_1_847759065) - z10*FIX_1_847759065 */
-  /*  tmp12 = z10 * (FIX_1_847759065 - FIX_2_613125930) + z12*FIX_1_847759065 */
-
-  /* or */
-
-  /*  tmp10 = z12 * FIX_NEG_0_765366865 - z10 * FIX_1_847759065 */
-  /*  tmp12 = z10 * FIX_NEG_0_765366865 + z12 * FIX_1_847759065 */
-
-  z10 = _mm_slli_epi16(z10, 2);
-  z12 = _mm_slli_epi16(z12, 2);
-
-  tmp10 = _mm_sub_epi16(_mm_mulhi_epi16(z12, *((__m128i *)SSE2_NEG_0_765366865)),
-                        _mm_mulhi_epi16(z10, *((__m128i *)SSE2_1_847759065)));
-  tmp12 = _mm_add_epi16(_mm_mulhi_epi16(z10, *((__m128i *)SSE2_NEG_0_765366865)),
-                        _mm_mulhi_epi16(z12, *((__m128i *)SSE2_1_847759065)));
-
-  /* phase 2 */
-
-  /*  tmp6 = tmp12 - tmp7; */
-  /*  tmp5 = tmp11 - tmp6; */
-  /*  tmp4 = tmp10 + tmp5; */
-
-  tmp6 = _mm_sub_epi16(tmp12, tmp7);
-  tmp5 = _mm_sub_epi16(tmp11, tmp6);
-  tmp4 = _mm_add_epi16(tmp10, tmp5);
-
-  /*  outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)  */
-  /*                        & RANGE_MASK]; */
-  /*  outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) */
-  /*                        & RANGE_MASK]; */
-
-  row0 = _mm_add_epi16(tmp0, tmp7);
-  row7 = _mm_sub_epi16(tmp0, tmp7);
-
-  /*  outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) */
-  /*                        & RANGE_MASK]; */
-  /*  outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) */
-  /*                        & RANGE_MASK]; */
-
-  row1 = _mm_add_epi16(tmp1, tmp6);
-  row6 = _mm_sub_epi16(tmp1, tmp6);
-
-  /*  outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) */
-  /*                        & RANGE_MASK]; */
-  /*  outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) */
-  /*                        & RANGE_MASK]; */
-
-  row2 = _mm_add_epi16(tmp2, tmp5);
-  row5 = _mm_sub_epi16(tmp2, tmp5);
-
-  /*  outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) */
-  /*                        & RANGE_MASK]; */
-  /*  outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) */
-  /*                        & RANGE_MASK]; */
-
-  row4 = _mm_add_epi16(tmp3, tmp4);
-  row3 = _mm_sub_epi16(tmp3, tmp4);
-
-  /* ====================================================================== */
-  /* Transpose Matrix (Byte)                                                */
-  /* ====================================================================== */
-
-  /* Shift right after adding half to descale */
-
-  row0 = _mm_add_epi16(row0, *((__m128i *) SSE2_ADD_128));
-  row1 = _mm_add_epi16(row1, *((__m128i *) SSE2_ADD_128));
-  row2 = _mm_add_epi16(row2, *((__m128i *) SSE2_ADD_128));
-  row3 = _mm_add_epi16(row3, *((__m128i *) SSE2_ADD_128));
-  row4 = _mm_add_epi16(row4, *((__m128i *) SSE2_ADD_128));
-  row5 = _mm_add_epi16(row5, *((__m128i *) SSE2_ADD_128));
-  row6 = _mm_add_epi16(row6, *((__m128i *) SSE2_ADD_128));
-  row7 = _mm_add_epi16(row7, *((__m128i *) SSE2_ADD_128));
-
-  row0 = _mm_srai_epi16(row0, 5);
-  row1 = _mm_srai_epi16(row1, 5);
-  row2 = _mm_srai_epi16(row2, 5);
-  row3 = _mm_srai_epi16(row3, 5);
-  row4 = _mm_srai_epi16(row4, 5);
-  row5 = _mm_srai_epi16(row5, 5);
-  row6 = _mm_srai_epi16(row6, 5);
-  row7 = _mm_srai_epi16(row7, 5);
-
-  /* Range Limit and convert Words to Bytes */
-
-  row0 = _mm_packus_epi16(row0, row4);
-  row1 = _mm_packus_epi16(row1, row5);
-  row2 = _mm_packus_epi16(row2, row6);
-  row3 = _mm_packus_epi16(row3, row7);
-
-  /* Transpose from rows back to columns */
-
-  row4 = _mm_unpackhi_epi8(row0, row1);   /* 40 50 41 51 42 52 43 43 ... */
-  row5 = _mm_unpackhi_epi8(row2, row3);   /* 60 70 61 71 62 72 63 73 ... */
-  row0 = _mm_unpacklo_epi8(row0, row1);   /* 00 10 01 11 02 12 03 13 ... */
-  row2 = _mm_unpacklo_epi8(row2, row3);   /* 20 30 21 31 22 32 23 33 ... */
-
-  row6 = _mm_unpackhi_epi16(row0, row2);  /* 04 14 24 34 05 15 25 35 ... */
-  row7 = _mm_unpackhi_epi16(row4, row5);  /* 44 54 64 74 45 55 65 75 ... */
-  row0 = _mm_unpacklo_epi16(row0, row2);  /* 00 10 20 30 01 11 21 31 ... */
-  row4 = _mm_unpacklo_epi16(row4, row5);  /* 40 50 60 70 41 51 61 71 ... */
-
-  row1 = _mm_unpackhi_epi32(row0, row4);  /* 02 12 22 32 42 52 62 72 ... */
-  row2 = _mm_unpackhi_epi32(row6, row7);  /* 06 16 26 36 46 56 66 76 ... */
-  row0 = _mm_unpacklo_epi32(row0, row4);  /* 00 10 20 30 40 50 60 70 ... */
-  row6 = _mm_unpacklo_epi32(row6, row7);  /* 04 14 24 34 55 54 64 74 ... */
-
-  /* Rearrange for readability */
-
-  row3 = row2;
-  row2 = row6;
-
-  /* ====================================================================== */
-  /* Final output                                                           */
-  /* ====================================================================== */
-
-  outptr0 = output_buf[0] + output_col;
-  outptr1 = output_buf[1] + output_col;
-  outptr2 = output_buf[2] + output_col;
-  outptr3 = output_buf[3] + output_col;
-  outptr4 = output_buf[4] + output_col;
-  outptr5 = output_buf[5] + output_col;
-  outptr6 = output_buf[6] + output_col;
-  outptr7 = output_buf[7] + output_col;
-
-  /* We can't use the faster FP instructions to write out the high quad */
-  /* so do a rotate right and write them out as the low quad            */
-
-  row4 = _mm_srli_si128(row0, 8);
-  row5 = _mm_srli_si128(row1, 8);
-  row6 = _mm_srli_si128(row2, 8);
-  row7 = _mm_srli_si128(row3, 8);
-
-  _mm_storel_epi64((__m128i *) outptr0, row0);
-  _mm_storel_epi64((__m128i *) outptr2, row1);
-  _mm_storel_epi64((__m128i *) outptr4, row2);
-  _mm_storel_epi64((__m128i *) outptr6, row3);
-  _mm_storel_epi64((__m128i *) outptr1, row4);
-  _mm_storel_epi64((__m128i *) outptr3, row5);
-  _mm_storel_epi64((__m128i *) outptr5, row6);
-  _mm_storel_epi64((__m128i *) outptr7, row7);
-}
-#endif /* HAVE_SSE2_INTRINSICS */
-
 #endif /* DCT_IFAST_SUPPORTED */
--- a/jpeg/jmemmgr.c
+++ b/jpeg/jmemmgr.c
@@ -444,19 +444,16 @@ alloc_barray (j_common_ptr cinfo, int po
 	      JDIMENSION blocksperrow, JDIMENSION numrows)
 /* Allocate a 2-D coefficient-block array */
 {
   my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
   JBLOCKARRAY result;
   JBLOCKROW workspace;
   JDIMENSION rowsperchunk, currow, i;
   long ltemp;
-#ifdef HAVE_SSE2_INTRINSICS
-  PRUptrdiff workspace_raw;
-#endif
 
   /* Calculate max # of rows allowed in one allocation chunk */
   ltemp = (MAX_ALLOC_CHUNK-SIZEOF(large_pool_hdr)) /
 	  ((long) blocksperrow * SIZEOF(JBLOCK));
   if (ltemp <= 0)
     ERREXIT(cinfo, JERR_WIDTH_OVERFLOW);
   if (ltemp < (long) numrows)
     rowsperchunk = (JDIMENSION) ltemp;
@@ -467,26 +464,19 @@ alloc_barray (j_common_ptr cinfo, int po
   /* Get space for row pointers (small object) */
   result = (JBLOCKARRAY) alloc_small(cinfo, pool_id,
 				     (size_t) (numrows * SIZEOF(JBLOCKROW)));
 
   /* Get the rows themselves (large objects) */
   currow = 0;
   while (currow < numrows) {
     rowsperchunk = MIN(rowsperchunk, numrows - currow);
-#ifdef HAVE_SSE2_INTRINSICS
-    workspace_raw = (JBLOCKROW) alloc_large(cinfo, pool_id,
-        (size_t) ((size_t) rowsperchunk * (size_t) blocksperrow
-                  * SIZEOF(JBLOCK) + 15));
-    workspace = (JBLOCKROW) ((workspace_raw + 15) & ~15);
-#else
     workspace = (JBLOCKROW) alloc_large(cinfo, pool_id,
 	(size_t) ((size_t) rowsperchunk * (size_t) blocksperrow
 		  * SIZEOF(JBLOCK)));
-#endif /* HAVE_SSE2_INTRINSICS */
     for (i = rowsperchunk; i > 0; i--) {
       result[currow++] = workspace;
       workspace += blocksperrow;
     }
   }
 
   return result;
 }
--- a/jpeg/jmorecfg.h
+++ b/jpeg/jmorecfg.h
@@ -102,40 +102,32 @@ typedef short JSAMPLE;
  * Again, we allocate large arrays of these, but you can change to int
  * if you have memory to burn and "short" is really slow.
  */
 
 typedef short JCOEF;
 
 /* Defines for MMX/SSE2 support. */
 
-/* HAVE_SSE2_INTEL_MNEMONICS means Win32 and MSVC. HAVE_SSE2_INTRINSICS */
-/* means that the platform supports SSE2 or higher so that we don't     */
-/* have to check runtime CPU capabilities.                              */
+#if defined(XP_WIN32) && defined(_M_IX86) && !defined(__GNUC__)
+#define HAVE_MMX_INTEL_MNEMONICS 
 
-#if defined(XP_WIN32) && defined(_M_IX86) && !defined(__GNUC__)
-#define HAVE_MMX_INTEL_MNEMONICS
+/* SSE2 code appears broken for some cpus (bug 247437) */
 #define HAVE_SSE2_INTEL_MNEMONICS
+#define HAVE_SSE2_INTRINSICS
 #endif
 
-#if defined(__GNUC__) && defined(__i386__) && defined(XP_MACOSX)
+#if defined(__GNUC__) && defined(__i386__)
+#if defined(XP_MACOSX)
 #define HAVE_SSE2_INTRINSICS
-#endif /* GNUC && i386 && XP_MACOSX */
+#endif /* ! XP_MACOSX */
+#endif /* ! GNUC && i386 */
 
 /* Add support for other platforms here */
 
-/* SSE* alignment support - only use on platforms that support declspec and __attribute__ */
-/* We're only aligning Mac OSX structures right now */
-
-#ifdef HAVE_SSE2_INTRINSICS
-#define SSE2_ALIGN __attribute__ ((aligned (16)))
-#else
-#define SSE2_ALIGN
-#endif
-
 
 /* Compressed datastreams are represented as arrays of JOCTET.
  * These must be EXACTLY 8 bits wide, at least once they are written to
  * external storage.  Note that when using the stdio data source/destination
  * managers, this is also the data type passed to fread/fwrite.
  */
 
 #ifdef HAVE_UNSIGNED_CHAR
@@ -323,17 +315,17 @@ typedef unsigned char boolean;
  * QUANT_1PASS_SUPPORTED QUANT_2PASS_SUPPORTED
  */
 
 /* Arithmetic coding is unsupported for legal reasons.  Complaints to IBM. */
 
 /* Capability options common to encoder and decoder: */
 
 #define DCT_ISLOW_SUPPORTED	/* slow but accurate integer algorithm */
-#define DCT_IFAST_SUPPORTED     /* faster, less accurate integer method */
+#undef  DCT_IFAST_SUPPORTED	/* faster, less accurate integer method */
 #undef  DCT_FLOAT_SUPPORTED	/* floating-point: accurate, fast on fast HW */
 
 /* Encoder capability options: */
 
 #undef  C_ARITH_CODING_SUPPORTED    /* Arithmetic coding back end? */
 #define C_MULTISCAN_FILES_SUPPORTED /* Multiple-scan JPEG files? */
 #define C_PROGRESSIVE_SUPPORTED	    /* Progressive JPEG? (Requires MULTISCAN)*/
 #define ENTROPY_OPT_SUPPORTED	    /* Optimization of entropy coding parms? */
--- a/modules/libpr0n/decoders/jpeg/nsJPEGDecoder.cpp
+++ b/modules/libpr0n/decoders/jpeg/nsJPEGDecoder.cpp
@@ -51,20 +51,16 @@
 #include "nsIImage.h"
 #include "nsIInterfaceRequestorUtils.h"
 #include "gfxColor.h"
 
 #include "jerror.h"
 
 #include "gfxPlatform.h"
 
-#if defined(GFX_SSE2_POSSIBLE)
-#include "emmintrin.h"
-#endif /* GFX_SSE2_POSSIBLE */
-
 extern "C" {
 #include "iccjpeg.h"
 
 /* Colorspace conversion (copied from jpegint.h) */
 struct jpeg_color_deconverter {
   JMETHOD(void, start_pass, (j_decompress_ptr cinfo));
   JMETHOD(void, color_convert, (j_decompress_ptr cinfo,
 				JSAMPIMAGE input_buf, JDIMENSION input_row,
@@ -533,21 +529,17 @@ nsresult nsJPEGDecoder::ProcessData(cons
   case JPEG_START_DECOMPRESS:
   {
     LOG_SCOPE(gJPEGlog, "nsJPEGDecoder::ProcessData -- entering JPEG_START_DECOMPRESS case");
     /* Step 4: set parameters for decompression */
 
     /* FIXME -- Should reset dct_method and dither mode
      * for final pass of progressive JPEG
      */
-#if defined(__GNUC__) && defined(__i386__) && defined(XP_MACOSX)
-    mInfo.dct_method =  JDCT_IFAST;
-#else
     mInfo.dct_method =  JDCT_ISLOW;
-#endif /* __GNUC__ & __i386__ & XP_MACOSX */
     mInfo.dither_mode = JDITHER_FS;
     mInfo.do_fancy_upsampling = TRUE;
     mInfo.enable_2pass_quant = FALSE;
     mInfo.do_block_smoothing = TRUE;
 
     /* Step 5: Start decompressor */
     if (jpeg_start_decompress(&mInfo) == FALSE) {
       PR_LOG(gJPEGDecoderAccountingLog, PR_LOG_DEBUG,
@@ -1263,171 +1255,50 @@ const int Cb_g_tab[(MAXJSAMPLE+1) * size
 	((shift_temp = (x)) < 0 ? \
 	 (shift_temp >> (shft)) | ((~((INT32) 0)) << (32-(shft))) : \
 	 (shift_temp >> (shft)))
 #else
 #define SHIFT_TEMPS
 #define RIGHT_SHIFT(x,shft)	((x) >> (shft))
 #endif
 
-/* Constants for SSE2 support */
-
-#ifdef GFX_SSE2_POSSIBLE
-GFX_SSE2_ALIGN const short color_constants_1[8] = {128, 128, 128, 128, 128, 128, 128, 128};
-GFX_SSE2_ALIGN const short color_constants_2[8] = {359, 359, 359, 359, 359, 359, 359, 359};
-GFX_SSE2_ALIGN const short color_constants_3[8] = {88, 88, 88, 88, 88, 88, 88, 88};
-GFX_SSE2_ALIGN const short color_constants_4[8] = {183, 183, 183, 183, 183, 183, 183, 183};
-GFX_SSE2_ALIGN const short color_constants_5[8] = {454, 454, 454, 454, 454, 454, 454, 454};
-GFX_SSE2_ALIGN const short color_constants_6[8] = {11382, 11382, 11382, 11382, 11382, 11382, 11382, 11382};
-#endif /* GFX_SSE2_POSSIBLE */
 
 METHODDEF(void)
 ycc_rgb_convert_argb (j_decompress_ptr cinfo,
                  JSAMPIMAGE input_buf, JDIMENSION input_row,
                  JSAMPARRAY output_buf, int num_rows)
 {
   JDIMENSION num_cols = cinfo->output_width;
   JSAMPLE * range_limit = cinfo->sample_range_limit;
 
-/* Locals for SSE2 support */
-
-#ifdef GFX_SSE2_POSSIBLE
-  JSAMPROW inptr0, inptr1, inptr2, outptr;
-  JDIMENSION col;
-  __m128i row_Y, row_cb, row_cr, zeroes, ones;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
-  __m128i row_B, row_G, row_R;
-#endif /* GFX_SSE2_POSSIBLE */
-
   SHIFT_TEMPS
 
-#ifdef GFX_SSE2_POSSIBLE
-  if (GFX_SSE2_AVAILABLE == 1) {
-
-    /* Load registers with constants if we're going to do at least one quad */
-    
-    if (num_cols >= 8) {
-      zeroes = _mm_setzero_si128();
-      ones = _mm_cmpeq_epi8(zeroes, zeroes);
-    }
-    
-    while (--num_rows >= 0) {
-      inptr0 = input_buf[0][input_row];
-      inptr1 = input_buf[1][input_row];
-      inptr2 = input_buf[2][input_row];
-      input_row++;
-      outptr = *output_buf++;
-      
-      /* YCbCr -> RGB using scaled multiplication */
-      
-      for (col = 0; col + 8 <= num_cols; col += 8) {
-        
-        /* Read a row of each color component */
-
-        row_Y  = _mm_loadl_epi64((__m128i *) (inptr0 + col));
-        row_cb = _mm_loadl_epi64((__m128i *) inptr1);
-        row_cr = _mm_loadl_epi64((__m128i *) inptr2);
-        
-        /* Convert bytes to words */
-        
-        row_Y  = _mm_unpacklo_epi8(row_Y, zeroes);
-        row_cb = _mm_unpacklo_epi8(row_cb, zeroes);
-        row_cr = _mm_unpacklo_epi8(row_cr, zeroes);
-
-        /* Compute B and R */
-
-        tmp0 = _mm_sub_epi16(row_cb, *((__m128i *) color_constants_1));    /* cb - 128 */
-        tmp1 = _mm_sub_epi16(row_cr, *((__m128i *) color_constants_1));    /* cr - 128 */
-        tmp2 = _mm_slli_epi16(tmp0, 8);                                    /* (cb - 128) << 8 */
-        tmp3 = _mm_slli_epi16(tmp1, 8);                                    /* (cr - 128) << 8 */
-        tmp2 = _mm_mulhi_epi16(tmp2, *((__m128i *) color_constants_5));    /* (454 * ((cb - 128) << 8)) >> 16 */
-        tmp3 = _mm_mulhi_epi16(tmp3, *((__m128i *) color_constants_2));    /* (359 * ((cr - 128) << 8)) >> 16 */
-        row_B = _mm_add_epi16(tmp2, row_Y);                                /* B = y + (454 * ((cb - 128) << 8)) >> 16 */
-        row_R = _mm_add_epi16(tmp3, row_Y);                                /* R = y + (359 * ((cr - 128) << 8)) >> 16 */
-
-        /* Compute G */
-
-        tmp4 = _mm_mullo_epi16(row_cb, *((__m128i *) color_constants_3));    /* 88 * cb */
-        tmp5 = _mm_mullo_epi16(tmp1, *((__m128i *) color_constants_4));      /* 183 * (cr - 128) */
-        tmp4 = _mm_sub_epi16(tmp4, *((__m128i *) color_constants_6));        /* 88 * cb - (34806 - 128*256) */
-        tmp4 = _mm_add_epi16(tmp4, tmp5);                                    /* (88 * cb) + 183 * (cr - 128) - (34806 - 128*256) */
-        tmp4 = _mm_srai_epi16(tmp4, 8);                                      /* Shift the above right by 8 to divide by 256 */
-        row_G = _mm_sub_epi16(row_Y, tmp4);                                  /* G */
-
-        /* Pack word to byte with unsigned saturation which range limits too */
-
-        row_R = _mm_packus_epi16(row_R, zeroes);
-        row_G = _mm_packus_epi16(row_G, zeroes);
-        row_B = _mm_packus_epi16(row_B, zeroes);
+  /* This is used if we don't have SSE2 */
 
-        /* Pack horizontally */
-
-        tmp0 = _mm_unpacklo_epi8(row_B, row_G);          /* ABAB ABAB ABAB ABAB        */
-        tmp1 = _mm_unpacklo_epi8(row_R, ones);           /* GRGR GRGR GRGR GRGR        */
-        tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);           /* ARGB ARGB ARGB ARGB (low)  */
-        tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);           /* ARGB ARGB ARGB ARGB (high) */
-
-        /* Write out 32 bytes of ARGB */
-        
-        _mm_storel_epi64((__m128i *) outptr, tmp2);
-        _mm_storel_epi64((__m128i *) (outptr + 8), _mm_srli_si128(tmp2, 8));
-        _mm_storel_epi64((__m128i *) (outptr + 16), tmp3);
-        _mm_storel_epi64((__m128i *) (outptr + 24), _mm_srli_si128(tmp3, 8));
-        
-        /* Slide the pointers */
-        
-        /* inptr0 += 8; */
-        inptr1 += 8;
-        inptr2 += 8;
-        outptr += 32;
-      }
-
-      /* Process remainder using scalar code */
-      for (; col < num_cols; col++) {
-        int y  = GETJSAMPLE(inptr0[col]);
-        int cb = GETJSAMPLE(inptr1[0]);
-        int cr = GETJSAMPLE(inptr2[0]);
-        inptr1++;
-        inptr2++;
-        JSAMPLE * range_limit_y = range_limit + y;
-        /* Range-limiting is essential due to noise introduced by DCT losses. */
-        *((PRUint32 *) outptr) = 0xFF000000 |
-                                 ( range_limit_y[Cr_r_tab[cr]] << 16 ) |
-                                 ( range_limit_y[((int) RIGHT_SHIFT(Cb_g_tab[cb] + Cr_g_tab[cr], SCALEBITS))] << 8 ) |
-                                 ( range_limit_y[Cb_b_tab[cb]] );
-        outptr += 4;
-      }
-    }
-
-  } else 
-#endif /* GFX_SSE2_POSSIBLE */
-
-  /* This is used if we don't have SSE2 */
-  {
-    while (--num_rows >= 0) {
-      JSAMPROW inptr0 = input_buf[0][input_row];
-      JSAMPROW inptr1 = input_buf[1][input_row];
-      JSAMPROW inptr2 = input_buf[2][input_row];
-      input_row++;
-      PRUint32 *outptr = (PRUint32 *) *output_buf++;
-      for (JDIMENSION col = 0; col < num_cols; col++) {
-        int y  = GETJSAMPLE(inptr0[col]);
-        int cb = GETJSAMPLE(inptr1[col]);
-        int cr = GETJSAMPLE(inptr2[col]);
-        JSAMPLE * range_limit_y = range_limit + y;
-        /* Range-limiting is essential due to noise introduced by DCT losses. */
-        outptr[col] = 0xFF000000 |
-          ( range_limit_y[Cr_r_tab[cr]] << 16 ) |
-          ( range_limit_y[((int) RIGHT_SHIFT(Cb_g_tab[cb] + Cr_g_tab[cr], SCALEBITS))] << 8 ) |
-          ( range_limit_y[Cb_b_tab[cb]] );
-      }
+  while (--num_rows >= 0) {
+    JSAMPROW inptr0 = input_buf[0][input_row];
+    JSAMPROW inptr1 = input_buf[1][input_row];
+    JSAMPROW inptr2 = input_buf[2][input_row];
+    input_row++;
+    PRUint32 *outptr = (PRUint32 *) *output_buf++;
+    for (JDIMENSION col = 0; col < num_cols; col++) {
+      int y  = GETJSAMPLE(inptr0[col]);
+      int cb = GETJSAMPLE(inptr1[col]);
+      int cr = GETJSAMPLE(inptr2[col]);
+      JSAMPLE * range_limit_y = range_limit + y;
+      /* Range-limiting is essential due to noise introduced by DCT losses. */
+      outptr[col] = 0xFF000000 |
+                    ( range_limit_y[Cr_r_tab[cr]] << 16 ) |
+                    ( range_limit_y[((int) RIGHT_SHIFT(Cb_g_tab[cb] + Cr_g_tab[cr], SCALEBITS))] << 8 ) |
+                    ( range_limit_y[Cb_b_tab[cb]] );
     }
   }
 }
 
+
 /**************** Inverted CMYK -> RGB conversion **************/
 /*
  * Input is (Inverted) CMYK stored as 4 bytes per pixel.
  * Output is RGB stored as 3 bytes per pixel.
  * @param row Points to row buffer containing the CMYK bytes for each pixel in the row.
  * @param width Number of pixels in the row.
  */
 static void cmyk_convert_rgb(JSAMPROW row, JDIMENSION width)