Bug 1304537 - Update libjpeg-turbo to version 1.5.1. r=jrmuizel, a=ritu
author Ryan VanderMeulen <ryanvm@gmail.com>
Thu, 22 Sep 2016 11:46:52 -0400
changeset 355828 df2427d0d9fe8d48ff24924b5bb915cfe3d20522
parent 355827 35790cfb2c763f26e2271faa20e6b73d75620b2f
child 355829 4699c72738236cec91988f27dd47eabe2c871fd5
push id 6570
push user raliiev@mozilla.com
push date Mon, 14 Nov 2016 12:26:13 +0000
treeherder mozilla-beta@f455459b2ae5 [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers jrmuizel, ritu
bugs 1304537
milestone 51.0a2
Bug 1304537 - Update libjpeg-turbo to version 1.5.1. r=jrmuizel, a=ritu
media/libjpeg/MOZCHANGES
media/libjpeg/jconfig.h
media/libjpeg/jconfigint.h
media/libjpeg/jdarith.c
media/libjpeg/jdhuff.c
media/libjpeg/jdmaster.c
media/libjpeg/jdphuff.c
media/libjpeg/jdsample.c
media/libjpeg/jmemmgr.c
media/libjpeg/jpegint.h
media/libjpeg/mozilla.diff
media/libjpeg/simd/jsimd_arm.c
media/libjpeg/simd/jsimd_arm64.c
media/libjpeg/simd/jsimd_arm64_neon.S
media/libjpeg/simd/jsimd_mips.c
media/libjpeg/simd/jsimd_powerpc.c
--- a/media/libjpeg/MOZCHANGES
+++ b/media/libjpeg/MOZCHANGES
@@ -43,16 +43,20 @@ To upgrade to a new revision of libjpeg-
 * Update jconfig.h and jconfigint.h as noted previously.
 
 * Update moz.build to build any new files.
 
 * Finally, tell hg that we've added or removed some files:
 
     $ hg addremove
 
+== September 22, 2016 (libjpeg-turbo v1.5.1 cb88e5da8003afcdc443b787fdcb77285e5a8a02 2016-09-20) ==
+
+* Updated to v1.5.1 release.
+
 == June 23, 2016 (libjpeg-turbo v1.5.0 3ff13e651bbe6de9c6f15d05235d1d4f26f63ffc 2016-05-31) ==
 
 * Updated to v1.5.0 release.
 
 == October 5, 2015 (libjpeg-turbo v1.4.2 d8da49effe6460d55239c4c009c57f42d8e4a494 2015-09-21) ==
 
 * Updated to v1.4.2 release.
 
--- a/media/libjpeg/jconfig.h
+++ b/media/libjpeg/jconfig.h
@@ -1,16 +1,16 @@
 /* jconfig.h.  Generated from jconfig.h.in by configure, then manually edited
    for Mozilla. */
 
 /* Export libjpeg v6.2's ABI. */
 #define JPEG_LIB_VERSION 62
 
 /* libjpeg-turbo version */
-#define LIBJPEG_TURBO_VERSION 1.4.0
+#define LIBJPEG_TURBO_VERSION 1.5.1
 
 /* Support arithmetic encoding */
 /*#undef C_ARITH_CODING_SUPPORTED */
 
 /* Support arithmetic decoding */
 /*#undef D_ARITH_CODING_SUPPORTED */
 
 /*
--- a/media/libjpeg/jconfigint.h
+++ b/media/libjpeg/jconfigint.h
@@ -1,7 +1,7 @@
-#define VERSION "1.4.0"
-#define BUILD "2015-01-07"
+#define VERSION "1.5.1"
+#define BUILD "2016-09-20"
 #define PACKAGE_NAME "libjpeg-turbo"
 
 /* Need to use Mozilla-specific function inlining. */
 #include "mozilla/Attributes.h"
 #define INLINE MOZ_ALWAYS_INLINE
--- a/media/libjpeg/jdarith.c
+++ b/media/libjpeg/jdarith.c
@@ -1,15 +1,15 @@
 /*
  * jdarith.c
  *
  * This file was part of the Independent JPEG Group's software:
  * Developed 1997-2015 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2015, D. R. Commander.
+ * Copyright (C) 2015-2016, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
  * This file contains portable arithmetic entropy decoding routines for JPEG
  * (implementing the ISO/IEC IS 10918-1 and CCITT Recommendation ITU-T T.81).
  *
  * Both sequential and progressive modes are supported in this single module.
  *
@@ -377,17 +377,17 @@ decode_mcu_AC_first (j_decompress_ptr ci
     }
     v = m;
     /* Figure F.24: Decoding the magnitude bit pattern of v */
     st += 14;
     while (m >>= 1)
       if (arith_decode(cinfo, st)) v |= m;
     v += 1; if (sign) v = -v;
     /* Scale and output coefficient in natural (dezigzagged) order */
-    (*block)[jpeg_natural_order[k]] = (JCOEF) (v << cinfo->Al);
+    (*block)[jpeg_natural_order[k]] = (JCOEF) ((unsigned)v << cinfo->Al);
   }
 
   return TRUE;
 }
 
 
 /*
  * MCU decoding for DC successive approximation refinement scan.
--- a/media/libjpeg/jdhuff.c
+++ b/media/libjpeg/jdhuff.c
@@ -104,19 +104,19 @@ start_pass_huff_decoder (j_decompress_pt
     WARNMS(cinfo, JWRN_NOT_SEQUENTIAL);
 
   for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
     compptr = cinfo->cur_comp_info[ci];
     dctbl = compptr->dc_tbl_no;
     actbl = compptr->ac_tbl_no;
     /* Compute derived values for Huffman tables */
     /* We may do this more than once for a table, but it's not expensive */
-    pdtbl = entropy->dc_derived_tbls + dctbl;
+    pdtbl = (d_derived_tbl **)(entropy->dc_derived_tbls) + dctbl;
     jpeg_make_d_derived_tbl(cinfo, TRUE, dctbl, pdtbl);
-    pdtbl = entropy->ac_derived_tbls + actbl;
+    pdtbl = (d_derived_tbl **)(entropy->ac_derived_tbls) + actbl;
     jpeg_make_d_derived_tbl(cinfo, FALSE, actbl, pdtbl);
     /* Initialize DC predictions to 0 */
     entropy->saved.last_dc_val[ci] = 0;
   }
 
   /* Precalculate decoding info for each block in an MCU of this scan */
   for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
     ci = cinfo->MCU_membership[blkn];
--- a/media/libjpeg/jdmaster.c
+++ b/media/libjpeg/jdmaster.c
@@ -17,16 +17,17 @@
  * pass.
  */
 
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
 #include "jpegcomp.h"
 #include "jdmaster.h"
+#include "jsimd.h"
 
 
 /*
  * Determine whether merged upsample/color conversion should be used.
  * CRUCIAL: this must match the actual capabilities of jdmerge.c!
  */
 
 LOCAL(boolean)
@@ -64,16 +65,27 @@ use_merged_upsample (j_decompress_ptr ci
       cinfo->comp_info[1].v_samp_factor != 1 ||
       cinfo->comp_info[2].v_samp_factor != 1)
     return FALSE;
   /* furthermore, it doesn't work if we've scaled the IDCTs differently */
   if (cinfo->comp_info[0]._DCT_scaled_size != cinfo->_min_DCT_scaled_size ||
       cinfo->comp_info[1]._DCT_scaled_size != cinfo->_min_DCT_scaled_size ||
       cinfo->comp_info[2]._DCT_scaled_size != cinfo->_min_DCT_scaled_size)
     return FALSE;
+#ifdef WITH_SIMD
+  /* If YCbCr-to-RGB color conversion is SIMD-accelerated but merged upsampling
+     isn't, then disabling merged upsampling is likely to be faster when
+     decompressing YCbCr JPEG images. */
+  if (!jsimd_can_h2v2_merged_upsample() && !jsimd_can_h2v1_merged_upsample() &&
+      jsimd_can_ycc_rgb() && cinfo->jpeg_color_space == JCS_YCbCr &&
+      (cinfo->out_color_space == JCS_RGB ||
+       (cinfo->out_color_space >= JCS_EXT_RGB &&
+        cinfo->out_color_space <= JCS_EXT_ARGB)))
+    return FALSE;
+#endif
   /* ??? also need to test for upsample-time rescaling, when & if supported */
   return TRUE;                  /* by golly, it'll work... */
 #else
   return FALSE;
 #endif
 }
 
 
--- a/media/libjpeg/jdphuff.c
+++ b/media/libjpeg/jdphuff.c
@@ -1,15 +1,15 @@
 /*
  * jdphuff.c
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1995-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2015, D. R. Commander.
+ * Copyright (C) 2015-2016, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
  * This file contains Huffman entropy decoding routines for progressive JPEG.
  *
  * Much of the complexity here has to do with supporting input suspension.
  * If the data source module demands suspension, we want to be able to back
  * up to the start of the current MCU.  To do this, we copy state variables
@@ -165,22 +165,22 @@ start_pass_phuff_decoder (j_decompress_p
   for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
     compptr = cinfo->cur_comp_info[ci];
     /* Make sure requested tables are present, and compute derived tables.
      * We may build same derived table more than once, but it's not expensive.
      */
     if (is_DC_band) {
       if (cinfo->Ah == 0) {     /* DC refinement needs no table */
         tbl = compptr->dc_tbl_no;
-        pdtbl = entropy->derived_tbls + tbl;
+        pdtbl = (d_derived_tbl **)(entropy->derived_tbls) + tbl;
         jpeg_make_d_derived_tbl(cinfo, TRUE, tbl, pdtbl);
       }
     } else {
       tbl = compptr->ac_tbl_no;
-      pdtbl = entropy->derived_tbls + tbl;
+      pdtbl = (d_derived_tbl **)(entropy->derived_tbls) + tbl;
       jpeg_make_d_derived_tbl(cinfo, FALSE, tbl, pdtbl);
       /* remember the single active table */
       entropy->ac_derived_tbl = entropy->derived_tbls[tbl];
     }
     /* Initialize DC predictions to 0 */
     entropy->saved.last_dc_val[ci] = 0;
   }
 
--- a/media/libjpeg/jdsample.c
+++ b/media/libjpeg/jdsample.c
@@ -299,16 +299,58 @@ h2v1_fancy_upsample (j_decompress_ptr ci
     invalue = GETJSAMPLE(*inptr);
     *outptr++ = (JSAMPLE) ((invalue * 3 + GETJSAMPLE(inptr[-1]) + 1) >> 2);
     *outptr++ = (JSAMPLE) invalue;
   }
 }
 
 
 /*
+ * Fancy processing for 1:1 horizontal and 2:1 vertical (4:4:0 subsampling).
+ *
+ * This is a less common case, but it can be encountered when losslessly
+ * rotating/transposing a JPEG file that uses 4:2:2 chroma subsampling.
+ */
+
+METHODDEF(void)
+h1v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                     JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr0, inptr1, outptr;
+#if BITS_IN_JSAMPLE == 8
+  int thiscolsum;
+#else
+  JLONG thiscolsum;
+#endif
+  JDIMENSION colctr;
+  int inrow, outrow, v;
+
+  inrow = outrow = 0;
+  while (outrow < cinfo->max_v_samp_factor) {
+    for (v = 0; v < 2; v++) {
+      /* inptr0 points to nearest input row, inptr1 points to next nearest */
+      inptr0 = input_data[inrow];
+      if (v == 0)               /* next nearest is row above */
+        inptr1 = input_data[inrow-1];
+      else                      /* next nearest is row below */
+        inptr1 = input_data[inrow+1];
+      outptr = output_data[outrow++];
+
+      for(colctr = 0; colctr < compptr->downsampled_width; colctr++) {
+        thiscolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
+        *outptr++ = (JSAMPLE) ((thiscolsum + 1) >> 2);
+      }
+    }
+    inrow++;
+  }
+}
+
+
+/*
  * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
  * Again a triangle filter; see comments for h2v1 case, above.
  *
  * It is OK for us to reference the adjacent input rows because we demanded
  * context from the main buffer controller (see initialization code).
  */
 
 METHODDEF(void)
@@ -426,16 +468,21 @@ jinit_upsampler (j_decompress_ptr cinfo)
         else
           upsample->methods[ci] = h2v1_fancy_upsample;
       } else {
         if (jsimd_can_h2v1_upsample())
           upsample->methods[ci] = jsimd_h2v1_upsample;
         else
           upsample->methods[ci] = h2v1_upsample;
       }
+    } else if (h_in_group == h_out_group &&
+               v_in_group * 2 == v_out_group && do_fancy) {
+      /* Non-fancy upsampling is handled by the generic method */
+      upsample->methods[ci] = h1v2_fancy_upsample;
+      upsample->pub.need_context_rows = TRUE;
     } else if (h_in_group * 2 == h_out_group &&
                v_in_group * 2 == v_out_group) {
       /* Special cases for 2h2v upsampling */
       if (do_fancy && compptr->downsampled_width > 2) {
         if (jsimd_can_h2v2_fancy_upsample())
           upsample->methods[ci] = jsimd_h2v2_fancy_upsample;
         else
           upsample->methods[ci] = h2v2_fancy_upsample;
--- a/media/libjpeg/jmemmgr.c
+++ b/media/libjpeg/jmemmgr.c
@@ -27,16 +27,18 @@
  * memory then you shouldn't care about a little bit of unused code...)
  */
 
 #define JPEG_INTERNALS
 #define AM_MEMORY_MANAGER       /* we define jvirt_Xarray_control structs */
 #include "jinclude.h"
 #include "jpeglib.h"
 #include "jmemsys.h"            /* import the system-dependent declarations */
+#include <stdint.h>
+#include <limits.h>             /* some NDKs define SIZE_MAX in limits.h */
 
 #ifndef NO_GETENV
 #ifndef HAVE_STDLIB_H           /* <stdlib.h> should declare getenv() */
 extern char *getenv (const char *name);
 #endif
 #endif
 
 
@@ -645,28 +647,36 @@ realize_virt_arrays (j_common_ptr cinfo)
   /* Compute the minimum space needed (maxaccess rows in each buffer)
    * and the maximum space needed (full image height in each buffer).
    * These may be of use to the system-dependent jpeg_mem_available routine.
    */
   space_per_minheight = 0;
   maximum_space = 0;
   for (sptr = mem->virt_sarray_list; sptr != NULL; sptr = sptr->next) {
     if (sptr->mem_buffer == NULL) { /* if not realized yet */
+      size_t new_space = (long) sptr->rows_in_array *
+                         (long) sptr->samplesperrow * sizeof(JSAMPLE);
+
       space_per_minheight += (long) sptr->maxaccess *
                              (long) sptr->samplesperrow * sizeof(JSAMPLE);
-      maximum_space += (long) sptr->rows_in_array *
-                       (long) sptr->samplesperrow * sizeof(JSAMPLE);
+      if (SIZE_MAX - maximum_space < new_space)
+        out_of_memory(cinfo, 10);
+      maximum_space += new_space;
     }
   }
   for (bptr = mem->virt_barray_list; bptr != NULL; bptr = bptr->next) {
     if (bptr->mem_buffer == NULL) { /* if not realized yet */
+      size_t new_space = (long) bptr->rows_in_array *
+                         (long) bptr->blocksperrow * sizeof(JBLOCK);
+
       space_per_minheight += (long) bptr->maxaccess *
                              (long) bptr->blocksperrow * sizeof(JBLOCK);
-      maximum_space += (long) bptr->rows_in_array *
-                       (long) bptr->blocksperrow * sizeof(JBLOCK);
+      if (SIZE_MAX - maximum_space < new_space)
+        out_of_memory(cinfo, 11);
+      maximum_space += new_space;
     }
   }
 
   if (space_per_minheight <= 0)
     return;                     /* no unrealized arrays, no work */
 
   /* Determine amount of memory to actually use; this is system-dependent. */
   avail_mem = jpeg_mem_available(cinfo, space_per_minheight, maximum_space,
--- a/media/libjpeg/jpegint.h
+++ b/media/libjpeg/jpegint.h
@@ -150,18 +150,18 @@ struct jpeg_decomp_master {
   void (*finish_output_pass) (j_decompress_ptr cinfo);
 
   /* State variables made visible to other modules */
   boolean is_dummy_pass;        /* True during 1st pass for 2-pass quant */
 
   /* Partial decompression variables */
   JDIMENSION first_iMCU_col;
   JDIMENSION last_iMCU_col;
-  JDIMENSION first_MCU_col[MAX_COMPS_IN_SCAN];
-  JDIMENSION last_MCU_col[MAX_COMPS_IN_SCAN];
+  JDIMENSION first_MCU_col[MAX_COMPONENTS];
+  JDIMENSION last_MCU_col[MAX_COMPONENTS];
   boolean jinit_upsampler_no_alloc;
 };
 
 /* Input control module */
 struct jpeg_input_controller {
   int (*consume_input) (j_decompress_ptr cinfo);
   void (*reset_input_controller) (j_decompress_ptr cinfo);
   void (*start_input_pass) (j_decompress_ptr cinfo);
--- a/media/libjpeg/mozilla.diff
+++ b/media/libjpeg/mozilla.diff
@@ -1,8 +1,29 @@
+diff --git jmemmgr.c jmemmgr.c
+--- jmemmgr.c
++++ jmemmgr.c
+@@ -28,16 +28,17 @@
+  */
+ 
+ #define JPEG_INTERNALS
+ #define AM_MEMORY_MANAGER       /* we define jvirt_Xarray_control structs */
+ #include "jinclude.h"
+ #include "jpeglib.h"
+ #include "jmemsys.h"            /* import the system-dependent declarations */
+ #include <stdint.h>
++#include <limits.h>             /* some NDKs define SIZE_MAX in limits.h */
+ 
+ #ifndef NO_GETENV
+ #ifndef HAVE_STDLIB_H           /* <stdlib.h> should declare getenv() */
+ extern char *getenv (const char *name);
+ #endif
+ #endif
+
+
 diff --git jmorecfg.h jmorecfg.h
 --- jmorecfg.h
 +++ jmorecfg.h
 @@ -9,16 +9,17 @@
   * For conditions of distribution and use, see the accompanying README.ijg
   * file.
   *
   * This file contains additional configuration options that customize the
--- a/media/libjpeg/simd/jsimd_arm.c
+++ b/media/libjpeg/simd/jsimd_arm.c
@@ -120,17 +120,17 @@ init_simd (void)
     if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
       break;
   }
 #endif
 
   /* Force different settings through environment variables */
   env = getenv("JSIMD_FORCENEON");
   if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_support &= JSIMD_ARM_NEON;
+    simd_support = JSIMD_ARM_NEON;
   env = getenv("JSIMD_FORCENONE");
   if ((env != NULL) && (strcmp(env, "1") == 0))
     simd_support = 0;
   env = getenv("JSIMD_NOHUFFENC");
   if ((env != NULL) && (strcmp(env, "1") == 0))
     simd_huffman = 0;
 }
 
--- a/media/libjpeg/simd/jsimd_arm64.c
+++ b/media/libjpeg/simd/jsimd_arm64.c
@@ -137,17 +137,17 @@ init_simd (void)
     if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
       break;
   }
 #endif
 
   /* Force different settings through environment variables */
   env = getenv("JSIMD_FORCENEON");
   if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_support &= JSIMD_ARM_NEON;
+    simd_support = JSIMD_ARM_NEON;
   env = getenv("JSIMD_FORCENONE");
   if ((env != NULL) && (strcmp(env, "1") == 0))
     simd_support = 0;
   env = getenv("JSIMD_NOHUFFENC");
   if ((env != NULL) && (strcmp(env, "1") == 0))
     simd_huffman = 0;
   env = getenv("JSIMD_FASTLD3");
   if ((env != NULL) && (strcmp(env, "1") == 0))
--- a/media/libjpeg/simd/jsimd_arm64_neon.S
+++ b/media/libjpeg/simd/jsimd_arm64_neon.S
@@ -205,20 +205,26 @@ asm_function jsimd_idct_islow_neon
     TMP2            .req x1
     TMP3            .req x9
     TMP4            .req x10
     TMP5            .req x11
     TMP6            .req x12
     TMP7            .req x13
     TMP8            .req x14
 
+    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
+       guarantee that the upper (unused) 32 bits of x3 are valid.  This
+       instruction ensures that those bits are set to zero. */
+    uxtw x3, w3
+
     sub             sp, sp, #64
     adr             x15, Ljsimd_idct_islow_neon_consts
-    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
-    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
+    mov             x10, sp
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32
     ld1             {v0.8h, v1.8h}, [x15]
     ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64
     ld1             {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64
     ld1             {v6.8h, v7.8h, v8.8h, v9.8h}, [COEF_BLOCK], #64
     ld1             {v22.8h, v23.8h, v24.8h, v25.8h}, [DCT_TABLE], #64
 
     cmeq            v16.8h, v3.8h, #0
     cmeq            v26.8h, v4.8h, #0
@@ -233,17 +239,16 @@ asm_function jsimd_idct_islow_neon
     and             v12.16b, v29.16b, v30.16b
     and             v13.16b, v31.16b, v10.16b
     and             v14.16b, v11.16b, v12.16b
     mul             v2.8h, v2.8h, v18.8h
     and             v15.16b, v13.16b, v14.16b
     shl             v10.8h, v2.8h, #(PASS1_BITS)
     sqxtn           v16.8b, v15.8h
     mov             TMP1, v16.d[0]
-    sub             sp, sp, #64
     mvn             TMP2, TMP1
 
     cbnz            TMP2, 2f
     /* case all AC coeffs are zeros */
     dup             v2.2d, v10.d[0]
     dup             v6.2d, v10.d[1]
     mov             v3.16b, v2.16b
     mov             v7.16b, v6.16b
@@ -802,16 +807,21 @@ asm_function jsimd_idct_ifast_neon
     TMP2            .req x1
     TMP3            .req x9
     TMP4            .req x10
     TMP5            .req x11
     TMP6            .req x12
     TMP7            .req x13
     TMP8            .req x14
 
+    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
+       guarantee that the upper (unused) 32 bits of x3 are valid.  This
+       instruction ensures that those bits are set to zero. */
+    uxtw x3, w3
+
     /* Load and dequantize coefficients into NEON registers
      * with the following allocation:
      *       0 1 2 3 | 4 5 6 7
      *      ---------+--------
      *   0 | d16     | d17     ( v16.8h )
      *   1 | d18     | d19     ( v17.8h )
      *   2 | d20     | d21     ( v18.8h )
      *   3 | d22     | d23     ( v19.8h )
@@ -1096,29 +1106,28 @@ asm_function jsimd_idct_4x4_neon
     COEF_BLOCK      .req x1
     OUTPUT_BUF      .req x2
     OUTPUT_COL      .req x3
     TMP1            .req x0
     TMP2            .req x1
     TMP3            .req x2
     TMP4            .req x15
 
+    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
+       guarantee that the upper (unused) 32 bits of x3 are valid.  This
+       instruction ensures that those bits are set to zero. */
+    uxtw x3, w3
+
     /* Save all used NEON registers */
-    sub             sp, sp, 272
-    str             x15, [sp], 16
+    sub             sp, sp, 64
+    mov             x9, sp
     /* Load constants (v3.4h is just used for padding) */
     adr             TMP4, Ljsimd_idct_4x4_neon_consts
-    st1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
-    st1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
-    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
-    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
-    st1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
-    st1             {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
-    st1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
-    st1             {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
     ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
 
     /* Load all COEF_BLOCK into NEON registers with the following allocation:
      *       0 1 2 3 | 4 5 6 7
      *      ---------+--------
      *   0 | v4.4h   | v5.4h
      *   1 | v6.4h   | v7.4h
      *   2 | v8.4h   | v9.4h
@@ -1217,26 +1226,18 @@ asm_function jsimd_idct_4x4_neon
     st1             {v27.b}[5], [TMP4], 1
     st1             {v26.b}[6], [TMP2], 1
     st1             {v27.b}[6], [TMP4], 1
     st1             {v26.b}[7], [TMP2], 1
     st1             {v27.b}[7], [TMP4], 1
 #endif
 
     /* vpop            {v8.4h - v15.4h}    ;not available */
-    sub             sp, sp, #272
-    ldr             x15, [sp], 16
-    ld1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
-    ld1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
     ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
     ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
-    ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
-    ld1             {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
-    ld1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
-    ld1             {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
     blr             x30
 
     .unreq          DCT_TABLE
     .unreq          COEF_BLOCK
     .unreq          OUTPUT_BUF
     .unreq          OUTPUT_COL
     .unreq          TMP1
     .unreq          TMP2
@@ -1294,29 +1295,29 @@ asm_function jsimd_idct_2x2_neon
 
     DCT_TABLE       .req x0
     COEF_BLOCK      .req x1
     OUTPUT_BUF      .req x2
     OUTPUT_COL      .req x3
     TMP1            .req x0
     TMP2            .req x15
 
+    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
+       guarantee that the upper (unused) 32 bits of x3 are valid.  This
+       instruction ensures that those bits are set to zero. */
+    uxtw x3, w3
+
     /* vpush           {v8.4h - v15.4h}            ; not available */
-    sub             sp, sp, 208
-    str             x15, [sp], 16
+    sub             sp, sp, 64
+    mov             x9, sp
 
     /* Load constants */
     adr             TMP2, Ljsimd_idct_2x2_neon_consts
-    st1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
-    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
-    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
-    st1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
-    st1             {v21.8b, v22.8b}, [sp], 16
-    st1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
-    st1             {v30.8b, v31.8b}, [sp], 16
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
     ld1             {v14.4h}, [TMP2]
 
     /* Load all COEF_BLOCK into NEON registers with the following allocation:
      *       0 1 2 3 | 4 5 6 7
      *      ---------+--------
      *   0 | v4.4h   | v5.4h
      *   1 | v6.4h   | v7.4h
      *   2 | -       | -
@@ -1406,25 +1407,18 @@ asm_function jsimd_idct_2x2_neon
     add             TMP1, TMP1, OUTPUT_COL
     add             TMP2, TMP2, OUTPUT_COL
 
     st1             {v26.b}[0], [TMP1], 1
     st1             {v27.b}[4], [TMP1], 1
     st1             {v26.b}[1], [TMP2], 1
     st1             {v27.b}[5], [TMP2], 1
 
-    sub             sp, sp, #208
-    ldr             x15, [sp], 16
-    ld1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
     ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
     ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
-    ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
-    ld1             {v21.8b, v22.8b}, [sp], 16
-    ld1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
-    ld1             {v30.8b, v31.8b}, [sp], 16
     blr             x30
 
     .unreq          DCT_TABLE
     .unreq          COEF_BLOCK
     .unreq          OUTPUT_BUF
     .unreq          OUTPUT_COL
     .unreq          TMP1
     .unreq          TMP2
@@ -1683,78 +1677,65 @@ Ljsimd_ycc_\colorid\()_neon_slowst3_cons
   .short -128,  -128,   -128,   -128
   .short -128,  -128,   -128,   -128
 
 .if \fast_st3 == 1
 asm_function jsimd_ycc_\colorid\()_convert_neon
 .else
 asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
 .endif
-    OUTPUT_WIDTH    .req x0
+    OUTPUT_WIDTH    .req w0
     INPUT_BUF       .req x1
-    INPUT_ROW       .req x2
+    INPUT_ROW       .req w2
     OUTPUT_BUF      .req x3
-    NUM_ROWS        .req x4
+    NUM_ROWS        .req w4
 
     INPUT_BUF0      .req x5
     INPUT_BUF1      .req x6
     INPUT_BUF2      .req x1
 
     RGB             .req x7
-    Y               .req x8
-    U               .req x9
-    V               .req x10
-    N               .req x15
-
-    sub             sp, sp, 336
-    str             x15, [sp], 16
+    Y               .req x9
+    U               .req x10
+    V               .req x11
+    N               .req w15
+
+    sub             sp, sp, 64
+    mov             x9, sp
 
     /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
     .if \fast_st3 == 1
       adr           x15, Ljsimd_ycc_\colorid\()_neon_consts
     .else
       adr           x15, Ljsimd_ycc_\colorid\()_neon_slowst3_consts
     .endif
 
     /* Save NEON registers */
-    st1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
-    st1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
-    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
-    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
-    st1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
-    st1             {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
-    st1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
-    st1             {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
     ld1             {v0.4h, v1.4h}, [x15], 16
     ld1             {v2.8h}, [x15]
 
-    /* Save ARM registers and handle input arguments */
-    /* push            {x4, x5, x6, x7, x8, x9, x10, x30} */
-    stp             x4, x5, [sp], 16
-    stp             x6, x7, [sp], 16
-    stp             x8, x9, [sp], 16
-    stp             x10, x30, [sp], 16
     ldr             INPUT_BUF0, [INPUT_BUF]
     ldr             INPUT_BUF1, [INPUT_BUF, #8]
     ldr             INPUT_BUF2, [INPUT_BUF, #16]
     .unreq          INPUT_BUF
 
     /* Initially set v10, v11.4h, v12.8b, d13 to 0xFF */
     movi            v10.16b, #255
     movi            v13.16b, #255
 
     /* Outer loop over scanlines */
     cmp             NUM_ROWS, #1
     b.lt            9f
 0:
-    lsl             x16, INPUT_ROW, #3
-    ldr             Y, [INPUT_BUF0, x16]
-    ldr             U, [INPUT_BUF1, x16]
+    ldr             Y, [INPUT_BUF0, INPUT_ROW, uxtw #3]
+    ldr             U, [INPUT_BUF1, INPUT_ROW, uxtw #3]
     mov             N, OUTPUT_WIDTH
-    ldr             V, [INPUT_BUF2, x16]
+    ldr             V, [INPUT_BUF2, INPUT_ROW, uxtw #3]
     add             INPUT_ROW, INPUT_ROW, #1
     ldr             RGB, [OUTPUT_BUF], #8
 
     /* Inner loop over pixels */
     subs            N, N, #8
     b.lt            3f
     do_load         8
     do_yuv_to_rgb_stage1
@@ -1794,31 +1775,18 @@ 7:
     tst             N, #1
     b.eq            8f
     do_store        \bpp, 1, \fast_st3
 8:
     subs            NUM_ROWS, NUM_ROWS, #1
     b.gt            0b
 9:
     /* Restore all registers and return */
-    sub             sp, sp, #336
-    ldr             x15, [sp], 16
-    ld1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
-    ld1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
     ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
     ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
-    ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
-    ld1             {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
-    ld1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
-    ld1             {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
-    /* pop             {r4, r5, r6, r7, r8, r9, r10, pc} */
-    ldp             x4, x5, [sp], 16
-    ldp             x6, x7, [sp], 16
-    ldp             x8, x9, [sp], 16
-    ldp             x10, x30, [sp], 16
     br              x30
     .unreq          OUTPUT_WIDTH
     .unreq          INPUT_ROW
     .unreq          OUTPUT_BUF
     .unreq          NUM_ROWS
     .unreq          INPUT_BUF0
     .unreq          INPUT_BUF1
     .unreq          INPUT_BUF2
@@ -2049,18 +2017,18 @@ Ljsimd_\colorid\()_ycc_neon_slowld3_cons
 .if \fast_ld3 == 1
 asm_function jsimd_\colorid\()_ycc_convert_neon
 .else
 asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
 .endif
     OUTPUT_WIDTH    .req w0
     INPUT_BUF       .req x1
     OUTPUT_BUF      .req x2
-    OUTPUT_ROW      .req x3
-    NUM_ROWS        .req x4
+    OUTPUT_ROW      .req w3
+    NUM_ROWS        .req w4
 
     OUTPUT_BUF0     .req x5
     OUTPUT_BUF1     .req x6
     OUTPUT_BUF2     .req x2  /* OUTPUT_BUF */
 
     RGB             .req x7
     Y               .req x9
     U               .req x10
@@ -2077,27 +2045,28 @@ asm_function jsimd_\colorid\()_ycc_conve
 
     ldr             OUTPUT_BUF0, [OUTPUT_BUF]
     ldr             OUTPUT_BUF1, [OUTPUT_BUF, #8]
     ldr             OUTPUT_BUF2, [OUTPUT_BUF, #16]
     .unreq          OUTPUT_BUF
 
     /* Save NEON registers */
     sub             sp, sp, #64
-    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
-    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+    mov             x9, sp
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
 
     /* Outer loop over scanlines */
     cmp             NUM_ROWS, #1
     b.lt            9f
 0:
-    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #3]
-    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #3]
+    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, uxtw #3]
+    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, uxtw #3]
     mov             N, OUTPUT_WIDTH
-    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #3]
+    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, uxtw #3]
     add             OUTPUT_ROW, OUTPUT_ROW, #1
     ldr             RGB, [INPUT_BUF], #8
 
     /* Inner loop over pixels */
     subs            N, N, #8
     b.lt            3f
     do_load         \bpp, 8, \fast_ld3
     do_rgb_to_yuv_stage1
@@ -2131,17 +2100,16 @@ 6:
 7:
     tbz             N, #0, 8f
     do_store        1
 8:
     subs            NUM_ROWS, NUM_ROWS, #1
     b.gt            0b
 9:
     /* Restore all registers and return */
-    sub             sp, sp, #64
     ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
     ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
     br              x30
 
     .unreq          OUTPUT_WIDTH
     .unreq          OUTPUT_ROW
     .unreq          INPUT_BUF
     .unreq          NUM_ROWS
@@ -2194,16 +2162,21 @@ asm_function jsimd_convsamp_neon
     TMP3            .req x11
     TMP4            .req x12
     TMP5            .req x13
     TMP6            .req x14
     TMP7            .req x15
     TMP8            .req x4
     TMPDUP          .req w3
 
+    /* START_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
+       guarantee that the upper (unused) 32 bits of x1 are valid.  This
+       instruction ensures that those bits are set to zero. */
+    uxtw x1, w1
+
     mov             TMPDUP, #128
     ldp             TMP1, TMP2, [SAMPLE_DATA], 16
     ldp             TMP3, TMP4, [SAMPLE_DATA], 16
     dup             v0.8b, TMPDUP
     add             TMP1, TMP1, START_COL
     add             TMP2, TMP2, START_COL
     ldp             TMP5, TMP6, [SAMPLE_DATA], 16
     add             TMP3, TMP3, START_COL
@@ -2330,18 +2303,19 @@ asm_function jsimd_fdct_islow_neon
     TMP             .req x9
 
     /* Load constants */
     adr             TMP, Ljsimd_fdct_islow_neon_consts
     ld1             {v0.8h, v1.8h}, [TMP]
 
     /* Save NEON registers */
     sub             sp, sp, #64
-    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
-    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+    mov             x10, sp
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32
 
     /* Load all DATA into NEON registers with the following allocation:
      *       0 1 2 3 | 4 5 6 7
      *      ---------+--------
      *   0 | d16     | d17    | v16.8h
      *   1 | d18     | d19    | v17.8h
      *   2 | d20     | d21    | v18.8h
      *   3 | d22     | d23    | v19.8h
@@ -2561,17 +2535,16 @@ asm_function jsimd_fdct_islow_neon
     rshrn2          v19.8h, v26.4s, #DESCALE_P2  /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
     rshrn2          v17.8h, v27.4s, #DESCALE_P2  /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
 
     /* store results */
     st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
     st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
 
     /* Restore NEON registers */
-    sub             sp, sp, #64
     ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
     ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
 
     br              x30
 
     .unreq          DATA
     .unreq          TMP
 
@@ -3075,17 +3048,17 @@ Ljsimd_huff_encode_one_block_neon_slowtb
 .if \fast_tbl == 1
 asm_function jsimd_huff_encode_one_block_neon
 .else
 asm_function jsimd_huff_encode_one_block_neon_slowtbl
 .endif
     sub             sp, sp, 272
     sub             BUFFER, BUFFER, #0x1    /* BUFFER=buffer-- */
     /* Save ARM registers */
-    stp             x19, x20, [sp], 16
+    stp             x19, x20, [sp]
 .if \fast_tbl == 1
     adr             x15, Ljsimd_huff_encode_one_block_neon_consts
 .else
     adr             x15, Ljsimd_huff_encode_one_block_neon_slowtbl_consts
 .endif
     ldr             PUT_BUFFER, [x0, #0x10]
     ldr             PUT_BITSw, [x0, #0x18]
     ldrsh           w12, [x2]               /* load DC coeff in w12 */
@@ -3289,17 +3262,17 @@ asm_function jsimd_huff_encode_one_block
       sub             w12, w14, #32
     xtn2            v22.16b, v17.8h
       lsr             w13, w13, w14
     and             v16.16b, v16.16b, v23.16b
       neg             w12, w12
     and             v18.16b, v18.16b, v23.16b
       add             x3, x4, #0x400           /* r1 = dctbl->ehufsi */
     and             v20.16b, v20.16b, v23.16b
-      add             x15, sp, #0x80           /* x15 = t2 */
+      add             x15, sp, #0x90           /* x15 = t2 */
     and             v22.16b, v22.16b, v23.16b
       ldr             w10, [x4, x12, lsl #2]
     addp            v16.16b, v16.16b, v18.16b
       ldrb            w11, [x3, x12]
     addp            v20.16b, v20.16b, v22.16b
       checkbuf47
     addp            v16.16b, v16.16b, v20.16b
       put_bits        x10, x11
@@ -3312,17 +3285,17 @@ asm_function jsimd_huff_encode_one_block
     addv            B18, v17.8b
       add             x4, x5, #0x400   /* x4 = actbl->ehufsi */
     umov            w12, v18.b[0]
       lsr             x9, x9, #0x1     /* clear AC coeff */
     ldr             w13, [x5, #0x3c0]  /* x13 = actbl->ehufco[0xf0] */
     rbit            x9, x9             /* x9 = index0 */
     ldrb            w14, [x4, #0xf0]   /* x14 = actbl->ehufsi[0xf0] */
     cmp             w12, #(64-8)
-    mov             x11, sp
+    add             x11, sp, #16
     b.lt            4f
     cbz             x9, 6f
     st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
     st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
     st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
     st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
 1:
     clz             x2, x9
@@ -3416,25 +3389,24 @@ 3:
     lsl             x9, x9, #0x1
     ldr             w12, [x5, x2, lsl #2]
     ldrb            w10, [x4, x2]
     checkbuf31
     put_bits        x12, x10
     put_bits        x3, x11
     cbnz            x9, 1b
 6:
-    add             x13, sp, #0xfe
+    add             x13, sp, #0x10e
     cmp             x15, x13
     b.hs            1f
     ldr             w12, [x5]
     ldrb            w14, [x4]
     checkbuf47
     put_bits        x12, x14
 1:
-    sub             sp, sp, 16
     str             PUT_BUFFER, [x0, #0x10]
     str             PUT_BITSw, [x0, #0x18]
     ldp             x19, x20, [sp], 16
     add             x0, BUFFER, #0x1
     add             sp, sp, 256
     br              x30
 
 .endm
--- a/media/libjpeg/simd/jsimd_mips.c
+++ b/media/libjpeg/simd/jsimd_mips.c
@@ -1,13 +1,13 @@
 /*
  * jsimd_mips.c
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009-2011, 2014, D. R. Commander.
+ * Copyright (C) 2009-2011, 2014, 2016, D. R. Commander.
  * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
  * Copyright (C) 2015, Matthieu Darbois.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
  * For conditions of distribution and use, see copyright notice in jsimdext.inc
  *
  * This file contains the interface between the "normal" portions
@@ -72,16 +72,24 @@ init_simd (void)
   simd_support |= JSIMD_MIPS_DSPR2;
 #elif defined(__linux__)
   /* We still have a chance to use MIPS DSPR2 regardless of globally used
    * -mdspr2 options passed to gcc by performing runtime detection via
    * /proc/cpuinfo parsing on linux */
   if (!parse_proc_cpuinfo("MIPS 74K"))
     return;
 #endif
+
+  /* Force different settings through environment variables */
+  env = getenv("JSIMD_FORCEDSPR2");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = JSIMD_MIPS_DSPR2;
+  env = getenv("JSIMD_FORCENONE");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = 0;
 }
 
 static const int mips_idct_ifast_coefs[4] = {
   0x45404540,           // FIX( 1.082392200 / 2) =  17734 = 0x4546
   0x5A805A80,           // FIX( 1.414213562 / 2) =  23170 = 0x5A82
   0x76407640,           // FIX( 1.847759065 / 2) =  30274 = 0x7642
   0xAC60AC60            // FIX(-2.613125930 / 4) = -21407 = 0xAC61
 };
--- a/media/libjpeg/simd/jsimd_powerpc.c
+++ b/media/libjpeg/simd/jsimd_powerpc.c
@@ -1,13 +1,13 @@
 /*
  * jsimd_powerpc.c
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009-2011, 2014-2015, D. R. Commander.
+ * Copyright (C) 2009-2011, 2014-2016, D. R. Commander.
  * Copyright (C) 2015, Matthieu Darbois.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
  * For conditions of distribution and use, see copyright notice in jsimdext.inc
  *
  * This file contains the interface between the "normal" portions
  * of the library and the SIMD implementations when running on a
@@ -17,29 +17,116 @@
 #define JPEG_INTERNALS
 #include "../jinclude.h"
 #include "../jpeglib.h"
 #include "../jsimd.h"
 #include "../jdct.h"
 #include "../jsimddct.h"
 #include "jsimd.h"
 
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
 static unsigned int simd_support = ~0;
 
+#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+
+#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
+
+/* Return 1 if 'buffer' starts with "cpu" and contains 'feature' as a
+   whitespace-delimited word after that prefix, 0 otherwise. */
+LOCAL(int)
+check_feature (char *buffer, char *feature)
+{
+  char *start, *p;
+  size_t len = strlen(feature);
+
+  if (len == 0 || strncmp(buffer, "cpu", 3) != 0)
+    return 0;
+  /* <ctype.h> functions need an unsigned char value; a negative plain char
+     is undefined behavior */
+  for (start = buffer + 3; isspace((unsigned char) *start); start++)
+    ;
+
+  /* Check if 'feature' is present in the buffer as a separate word.  The
+     left-boundary test is made against the fixed start of the searched text,
+     so that a suffix match such as "altivec" in "noaltivec" is rejected. */
+  for (p = strstr(start, feature); p != NULL; p = strstr(p + 1, feature)) {
+    if (p > start && !isspace((unsigned char) p[-1]))
+      continue;
+    if (p[len] != 0 && !isspace((unsigned char) p[len]))
+      continue;
+    return 1;
+  }
+  return 0;
+}
+
+/* Scan /proc/cpuinfo with a line buffer of 'bufsize' bytes and rebuild
+   simd_support.  Returns 0 if the buffer could not be allocated or was too
+   small for some line, 1 otherwise (including when the file can't be opened). */
+LOCAL(int)
+parse_proc_cpuinfo (int bufsize)
+{
+  int complete = 1;
+  FILE *cpuinfo;
+  char *line = (char *)malloc(bufsize);
+
+  simd_support = 0;
+  if (line == NULL)
+    return 0;
+
+  cpuinfo = fopen("/proc/cpuinfo", "r");
+  while (cpuinfo != NULL && complete && fgets(line, bufsize, cpuinfo) != NULL) {
+    /* A line without '\n' that isn't the last one did not fit in 'line' */
+    if (strchr(line, '\n') == NULL && !feof(cpuinfo))
+      complete = 0;
+    else if (check_feature(line, "altivec"))
+      simd_support |= JSIMD_ALTIVEC;
+  }
+  if (cpuinfo != NULL)
+    fclose(cpuinfo);
+  free(line);
+  return complete;
+}
+
+#endif
+
+/*
+ * Check what SIMD accelerations are supported and cache the result in
+ * simd_support; JSIMD_FORCEALTIVEC=1 / JSIMD_FORCENONE=1 override detection.
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
 LOCAL(void)
 init_simd (void)
 {
   char *env = NULL;
+#if !defined(__ALTIVEC__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
+  int bufsize = 1024; /* an initial guess for the line buffer size limit */
+#endif
 
   if (simd_support != ~0U)
     return;
 
-  simd_support = JSIMD_ALTIVEC;
+  simd_support = 0;  /* nothing detected yet; bits are ORed in below */
+
+#if defined(__ALTIVEC__) || defined(__APPLE__)
+  simd_support |= JSIMD_ALTIVEC;  /* decided at compile time, no probing */
+#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+  while (!parse_proc_cpuinfo(bufsize)) {  /* 0: retry with a larger buffer */
+    bufsize *= 2;
+    if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
+      break;
+  }
+#endif
 
   /* Force different settings through environment variables */
+  env = getenv("JSIMD_FORCEALTIVEC");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = JSIMD_ALTIVEC;  /* replaces the detected set */
   env = getenv("JSIMD_FORCENONE");
   if ((env != NULL) && (strcmp(env, "1") == 0))
     simd_support = 0;
 }
 
 GLOBAL(int)
 jsimd_can_rgb_ycc (void)
 {