Bug 693057 - Add libvpx's encoder support (disabled), r=cpearce,khuey
author: Timothy B. Terriberry <tterribe@vt.edu>
Thu, 13 Oct 2011 17:37:34 -0700
changeset 80068 c7542ce9069aa584190e6795e549d76dcf0a07bc
parent 80067 f5eded31718ae2fd1edd6955f3ec533c4951e853
child 80069 411c5d481dc15a1dfefa4b3ab01bca95d7a74922
push id: unknown
push user: unknown
push date: unknown
reviewers: cpearce, khuey
bugs: 693057
milestone: 10.0a1
Bug 693057 - Add libvpx's encoder support (disabled), r=cpearce,khuey
config/autoconf.mk.in
configure.in
media/libvpx/Makefile.in
media/libvpx/update.sh
media/libvpx/vp8/common/asm_com_offsets.c
media/libvpx/vp8/encoder/arm/arm_csystemdependent.c
media/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm
media/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
media/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
media/libvpx/vp8/encoder/arm/armv6/vp8_fast_fdct4x4_armv6.asm
media/libvpx/vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm
media/libvpx/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm
media/libvpx/vp8/encoder/arm/armv6/vp8_sad16x16_armv6.asm
media/libvpx/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm
media/libvpx/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
media/libvpx/vp8/encoder/arm/armv6/vp8_variance8x8_armv6.asm
media/libvpx/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
media/libvpx/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
media/libvpx/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
media/libvpx/vp8/encoder/arm/armv6/walsh_v6.asm
media/libvpx/vp8/encoder/arm/dct_arm.c
media/libvpx/vp8/encoder/arm/dct_arm.h
media/libvpx/vp8/encoder/arm/encodemb_arm.h
media/libvpx/vp8/encoder/arm/neon/fastfdct4x4_neon.asm
media/libvpx/vp8/encoder/arm/neon/fastfdct8x4_neon.asm
media/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.asm
media/libvpx/vp8/encoder/arm/neon/sad16_neon.asm
media/libvpx/vp8/encoder/arm/neon/sad8_neon.asm
media/libvpx/vp8/encoder/arm/neon/shortfdct_neon.asm
media/libvpx/vp8/encoder/arm/neon/subtract_neon.asm
media/libvpx/vp8/encoder/arm/neon/variance_neon.asm
media/libvpx/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
media/libvpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm
media/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm
media/libvpx/vp8/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm
media/libvpx/vp8/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm
media/libvpx/vp8/encoder/arm/neon/vp8_subpixelvariance8x8_neon.asm
media/libvpx/vp8/encoder/arm/picklpf_arm.c
media/libvpx/vp8/encoder/arm/quantize_arm.h
media/libvpx/vp8/encoder/arm/variance_arm.c
media/libvpx/vp8/encoder/arm/variance_arm.h
media/libvpx/vp8/encoder/asm_enc_offsets.c
media/libvpx/vp8/encoder/bitstream.c
media/libvpx/vp8/encoder/bitstream.h
media/libvpx/vp8/encoder/block.h
media/libvpx/vp8/encoder/boolhuff.c
media/libvpx/vp8/encoder/boolhuff.h
media/libvpx/vp8/encoder/dct.c
media/libvpx/vp8/encoder/dct.h
media/libvpx/vp8/encoder/encodeframe.c
media/libvpx/vp8/encoder/encodeintra.c
media/libvpx/vp8/encoder/encodeintra.h
media/libvpx/vp8/encoder/encodemb.c
media/libvpx/vp8/encoder/encodemb.h
media/libvpx/vp8/encoder/encodemv.c
media/libvpx/vp8/encoder/encodemv.h
media/libvpx/vp8/encoder/ethreading.c
media/libvpx/vp8/encoder/firstpass.c
media/libvpx/vp8/encoder/firstpass.h
media/libvpx/vp8/encoder/generic/csystemdependent.c
media/libvpx/vp8/encoder/lookahead.c
media/libvpx/vp8/encoder/lookahead.h
media/libvpx/vp8/encoder/mcomp.c
media/libvpx/vp8/encoder/mcomp.h
media/libvpx/vp8/encoder/modecosts.c
media/libvpx/vp8/encoder/modecosts.h
media/libvpx/vp8/encoder/onyx_if.c
media/libvpx/vp8/encoder/onyx_int.h
media/libvpx/vp8/encoder/pickinter.c
media/libvpx/vp8/encoder/pickinter.h
media/libvpx/vp8/encoder/picklpf.c
media/libvpx/vp8/encoder/psnr.c
media/libvpx/vp8/encoder/psnr.h
media/libvpx/vp8/encoder/quantize.c
media/libvpx/vp8/encoder/quantize.h
media/libvpx/vp8/encoder/ratectrl.c
media/libvpx/vp8/encoder/ratectrl.h
media/libvpx/vp8/encoder/rdopt.c
media/libvpx/vp8/encoder/rdopt.h
media/libvpx/vp8/encoder/sad_c.c
media/libvpx/vp8/encoder/segmentation.c
media/libvpx/vp8/encoder/segmentation.h
media/libvpx/vp8/encoder/temporal_filter.c
media/libvpx/vp8/encoder/temporal_filter.h
media/libvpx/vp8/encoder/tokenize.c
media/libvpx/vp8/encoder/tokenize.h
media/libvpx/vp8/encoder/treewriter.c
media/libvpx/vp8/encoder/treewriter.h
media/libvpx/vp8/encoder/variance.h
media/libvpx/vp8/encoder/variance_c.c
media/libvpx/vp8/encoder/x86/dct_mmx.asm
media/libvpx/vp8/encoder/x86/dct_sse2.asm
media/libvpx/vp8/encoder/x86/dct_x86.h
media/libvpx/vp8/encoder/x86/encodemb_x86.h
media/libvpx/vp8/encoder/x86/encodeopt.asm
media/libvpx/vp8/encoder/x86/fwalsh_sse2.asm
media/libvpx/vp8/encoder/x86/mcomp_x86.h
media/libvpx/vp8/encoder/x86/quantize_mmx.asm
media/libvpx/vp8/encoder/x86/quantize_sse2.asm
media/libvpx/vp8/encoder/x86/quantize_sse4.asm
media/libvpx/vp8/encoder/x86/quantize_ssse3.asm
media/libvpx/vp8/encoder/x86/quantize_x86.h
media/libvpx/vp8/encoder/x86/sad_mmx.asm
media/libvpx/vp8/encoder/x86/sad_sse2.asm
media/libvpx/vp8/encoder/x86/sad_sse3.asm
media/libvpx/vp8/encoder/x86/sad_sse4.asm
media/libvpx/vp8/encoder/x86/sad_ssse3.asm
media/libvpx/vp8/encoder/x86/subtract_mmx.asm
media/libvpx/vp8/encoder/x86/subtract_sse2.asm
media/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm
media/libvpx/vp8/encoder/x86/temporal_filter_x86.h
media/libvpx/vp8/encoder/x86/variance_impl_mmx.asm
media/libvpx/vp8/encoder/x86/variance_impl_sse2.asm
media/libvpx/vp8/encoder/x86/variance_impl_ssse3.asm
media/libvpx/vp8/encoder/x86/variance_mmx.c
media/libvpx/vp8/encoder/x86/variance_sse2.c
media/libvpx/vp8/encoder/x86/variance_ssse3.c
media/libvpx/vp8/encoder/x86/variance_x86.h
media/libvpx/vp8/encoder/x86/x86_csystemdependent.c
media/libvpx/vp8/vp8_cx_iface.c
media/libvpx/vpx_config.h
media/libvpx/vpx_ports/asm_offsets.h
media/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_copyframeyonly_neon.asm
--- a/config/autoconf.mk.in
+++ b/config/autoconf.mk.in
@@ -149,23 +149,25 @@ MOZ_OGG = @MOZ_OGG@
 MOZ_RAW = @MOZ_RAW@
 MOZ_SYDNEYAUDIO = @MOZ_SYDNEYAUDIO@
 MOZ_WAVE = @MOZ_WAVE@
 MOZ_MEDIA = @MOZ_MEDIA@
 MOZ_VORBIS = @MOZ_VORBIS@
 MOZ_TREMOR = @MOZ_TREMOR@
 MOZ_WEBM = @MOZ_WEBM@
 MOZ_VP8_ERROR_CONCEALMENT = @MOZ_VP8_ERROR_CONCEALMENT@
+MOZ_VP8_ENCODER = @MOZ_VP8_ENCODER@
 VPX_AS = @VPX_AS@
 VPX_ASFLAGS = @VPX_ASFLAGS@
 VPX_DASH_C_FLAG = @VPX_DASH_C_FLAG@
 VPX_AS_CONVERSION = @VPX_AS_CONVERSION@
 VPX_ASM_SUFFIX = @VPX_ASM_SUFFIX@
 VPX_X86_ASM = @VPX_X86_ASM@
 VPX_ARM_ASM = @VPX_ARM_ASM@
+VPX_NEED_OBJ_INT_EXTRACT = @VPX_NEED_OBJ_INT_EXTRACT@
 LIBJPEG_TURBO_AS = @LIBJPEG_TURBO_AS@
 LIBJPEG_TURBO_ASFLAGS = @LIBJPEG_TURBO_ASFLAGS@
 LIBJPEG_TURBO_X86_ASM = @LIBJPEG_TURBO_X86_ASM@
 LIBJPEG_TURBO_X64_ASM = @LIBJPEG_TURBO_X64_ASM@
 NS_PRINTING = @NS_PRINTING@
 MOZ_PDF_PRINTING = @MOZ_PDF_PRINTING@
 MOZ_CRASHREPORTER = @MOZ_CRASHREPORTER@
 MOZ_HELP_VIEWER = @MOZ_HELP_VIEWER@
--- a/configure.in
+++ b/configure.in
@@ -4321,16 +4321,17 @@ MOZ_OGG=1
 MOZ_RAW=
 MOZ_SYDNEYAUDIO=
 MOZ_VORBIS=
 MOZ_TREMOR=
 MOZ_WAVE=1
 MOZ_MEDIA=
 MOZ_WEBM=1
 MOZ_VP8_ERROR_CONCEALMENT=
+MOZ_VP8_ENCODER=
 VPX_AS=
 VPX_ASFLAGS=
 VPX_AS_DASH_C_FLAG=
 VPX_AS_CONVERSION=
 VPX_ASM_SUFFIX=
 VPX_X86_ASM=
 VPX_ARM_ASM=
 LIBJPEG_TURBO_AS=
@@ -5345,16 +5346,19 @@ MOZ_NATIVE_LIBVPX=
 MOZ_LIBVPX_INCLUDES=
 MOZ_LIBVPX_LIBS=
 
 if test -n "$MOZ_WEBM"; then
     AC_DEFINE(MOZ_WEBM)
     if test -n "$MOZ_VP8_ERROR_CONCEALMENT" ; then
         AC_DEFINE(MOZ_VP8_ERROR_CONCEALMENT)
     fi
+    if test -n "$MOZ_VP8_ENCODER" ; then
+        AC_DEFINE(MOZ_VP8_ENCODER)
+    fi
 
     if test -n "$LIBVPX_DIR" -a "$LIBVPX_DIR" != no; then
         _SAVE_CFLAGS=$CFLAGS
         _SAVE_LDFLAGS=$LDFLAGS
         _SAVE_LIBS=$LIBS
         if test "${LIBVPX_DIR}" = "yes"; then
             LIBVPX_DIR=/usr
         fi
@@ -5418,16 +5422,17 @@ if test -n "$MOZ_WEBM" -a -z "$MOZ_NATIV
     esac
 
 
     dnl Detect if we can use an assembler to compile optimized assembly for libvpx.
     dnl We currently require yasm on all x86 platforms and require yasm 1.1.0 on Win32.
     dnl We currently require gcc on all arm platforms.
     VPX_AS=$YASM
     VPX_ASM_SUFFIX=asm
+    VPX_NEED_OBJ_INT_EXTRACT=
 
     dnl See if we have assembly on this platform.
     case "$OS_ARCH:$CPU_ARCH" in
     Linux:x86)
       VPX_ASFLAGS="-f elf32 -rnasm -pnasm"
       VPX_X86_ASM=1
     ;;
     Linux:x86_64)
@@ -5461,16 +5466,17 @@ if test -n "$MOZ_WEBM" -a -z "$MOZ_NATIV
         dnl Check for yasm 1.1 or greater.
         if test -n "$COMPILE_ENVIRONMENT" -a -z "$YASM"; then
           AC_MSG_ERROR([yasm 1.1 or greater is required to build libvpx on Win32, but it appears not to be installed.  Install it (included in MozillaBuild 1.5.1 and newer) or configure with --disable-webm (which disables the WebM video format). See https://developer.mozilla.org/en/YASM for more details.])
         elif test -n "$COMPILE_ENVIRONMENT" -a "$_YASM_MAJOR_VERSION" -lt "1" -o \( "$_YASM_MAJOR_VERSION" -eq "1" -a "$_YASM_MINOR_VERSION" -lt "1" \) ; then
           AC_MSG_ERROR([yasm 1.1 or greater is required to build libvpx on Win32, but you appear to have version $_YASM_MAJOR_VERSION.$_YASM_MINOR_VERSION.  Upgrade to the newest version (included in MozillaBuild 1.5.1 and newer) or configure with --disable-webm (which disables the WebM video format). See https://developer.mozilla.org/en/YASM for more details.])
         else
           VPX_ASFLAGS="-f win32 -rnasm -pnasm -DPIC"
           VPX_X86_ASM=1
+          dnl The encoder needs obj_int_extract to get asm offsets.
         fi
       fi
     ;;
     *:arm*)
       if test -n "$GNU_AS" ; then
         VPX_AS=$AS
         dnl These flags are a lie; they're just used to enable the requisite
         dnl opcodes; actual arch detection is done at runtime.
@@ -5481,16 +5487,26 @@ if test -n "$MOZ_WEBM" -a -z "$MOZ_NATIV
         VPX_ARM_ASM=1
       fi
     esac
 
     if test -n "$COMPILE_ENVIRONMENT" -a -n "$VPX_X86_ASM" -a -z "$VPX_AS"; then
       AC_MSG_ERROR([yasm is a required build tool for this architecture when webm is enabled. You may either install yasm or --disable-webm (which disables the WebM video format). See https://developer.mozilla.org/en/YASM for more details.])
     fi
 
+    if test -n "$MOZ_VP8_ENCODER" -a \
+            -z "$GNU_CC" -a -z "$INTEL_CC" -a -z "$CLANG_CC" ; then
+      dnl We prefer to get asm offsets using inline assembler, which the above
+      dnl compilers can do. When we're not using one of those, we have to fall
+      dnl back to obj_int_extract, which reads them from a compiled object
+      dnl file. Unfortunately, that only works if we're compiling on a system
+      dnl with the header files for the appropriate object file format.
+      VPX_NEED_OBJ_INT_EXTRACT=1
+    fi
+
     if test -n "$VPX_X86_ASM"; then
       AC_DEFINE(VPX_X86_ASM)
     elif test -n "$VPX_ARM_ASM"; then
       AC_DEFINE(VPX_ARM_ASM)
     else
       AC_MSG_WARN([No assembler or assembly support for libvpx. Using unoptimized C routines.])
     fi
 fi
@@ -8495,25 +8511,27 @@ AC_SUBST(MOZ_APP_EXTRA_LIBS)
 
 AC_SUBST(MOZ_MEDIA)
 AC_SUBST(MOZ_SYDNEYAUDIO)
 AC_SUBST(MOZ_WAVE)
 AC_SUBST(MOZ_VORBIS)
 AC_SUBST(MOZ_TREMOR)
 AC_SUBST(MOZ_WEBM)
 AC_SUBST(MOZ_VP8_ERROR_CONCEALMENT)
+AC_SUBST(MOZ_VP8_ENCODER)
 AC_SUBST(MOZ_OGG)
 AC_SUBST(MOZ_ALSA_LIBS)
 AC_SUBST(VPX_AS)
 AC_SUBST(VPX_ASFLAGS)
 AC_SUBST(VPX_DASH_C_FLAG)
 AC_SUBST(VPX_AS_CONVERSION)
 AC_SUBST(VPX_ASM_SUFFIX)
 AC_SUBST(VPX_X86_ASM)
 AC_SUBST(VPX_ARM_ASM)
+AC_SUBST(VPX_NEED_OBJ_INT_EXTRACT)
 AC_SUBST(MOZ_INSTRUMENT_EVENT_LOOP)
 AC_SUBST(LIBJPEG_TURBO_AS)
 AC_SUBST(LIBJPEG_TURBO_ASFLAGS)
 AC_SUBST(LIBJPEG_TURBO_X86_ASM)
 AC_SUBST(LIBJPEG_TURBO_X64_ASM)
 
 AC_MSG_CHECKING([for posix_fallocate])
 AC_TRY_LINK([#define _XOPEN_SOURCE 600
--- a/media/libvpx/Makefile.in
+++ b/media/libvpx/Makefile.in
@@ -179,16 +179,60 @@ CSRCS += \
   $(NULL)
 
 ifdef MOZ_VP8_ERROR_CONCEALMENT
 CSRCS += \
   error_concealment.c \
   $(NULL)
 endif
 
+ifdef MOZ_VP8_ENCODER
+VPATH += \
+  $(srcdir)/vp8/encoder \
+  $(srcdir)/vp8/encoder/arm \
+  $(srcdir)/vp8/encoder/arm/armv5te \
+  $(srcdir)/vp8/encoder/arm/armv6 \
+  $(srcdir)/vp8/encoder/arm/neon \
+  $(srcdir)/vp8/encoder/generic \
+  $(srcdir)/vp8/encoder/x86 \
+  $(srcdir)/vpx_scale/arm \
+  $(srcdir)/vpx_scale/arm/neon \
+  $(NULL)
+
+CSRCS += \
+  vp8_cx_iface.c \
+  bitstream.c \
+  boolhuff.c \
+  dct.c \
+  encodeframe.c \
+  encodeintra.c \
+  encodemb.c \
+  encodemv.c \
+  ethreading.c \
+  firstpass.c \
+  lookahead.c \
+  mcomp.c \
+  modecosts.c \
+  onyx_if.c \
+  picklpf.c \
+  pickinter.c \
+  psnr.c \
+  quantize.c \
+  ratectrl.c \
+  rdopt.c \
+  sad_c.c \
+  segmentation.c \
+  temporal_filter.c \
+  tokenize.c \
+  treewriter.c \
+  variance_c.c \
+  csystemdependent.c \
+  $(NULL)
+endif
+
 ifdef VPX_X86_ASM
 # Building on an x86 platform with a supported assembler, include
 # the optimized assembly in the build.
 
 CSRCS += \
   idct_blk_mmx.c \
   idct_blk_sse2.c \
   loopfilter_x86.c \
@@ -211,16 +255,56 @@ ASFILES += \
   recon_sse2.asm \
   subpixel_mmx.asm \
   subpixel_sse2.asm \
   subpixel_ssse3.asm \
   dequantize_mmx.asm \
   emms.asm \
   $(NULL)
 
+ifdef MOZ_VP8_ENCODER
+
+CSRCS += \
+  variance_mmx.c \
+  variance_sse2.c \
+  variance_ssse3.c \
+  x86_csystemdependent.c \
+  $(NULL)
+
+ASFILES += \
+  dct_mmx.asm \
+  dct_sse2.asm \
+  encodeopt.asm \
+  fwalsh_sse2.asm \
+  quantize_mmx.asm \
+  quantize_sse2.asm \
+  quantize_ssse3.asm \
+  quantize_sse4.asm \
+  sad_mmx.asm \
+  sad_sse2.asm \
+  sad_sse3.asm \
+  sad_ssse3.asm \
+  sad_sse4.asm \
+  subtract_mmx.asm \
+  subtract_sse2.asm \
+  temporal_filter_apply_sse2.asm \
+  variance_impl_mmx.asm \
+  variance_impl_sse2.asm \
+  variance_impl_ssse3.asm \
+  $(NULL)
+
+# Files which depend on asm_enc_offsets.asm
+VPX_ASM_ENC_OFFSETS_SRCS = \
+  quantize_sse2.asm \
+  quantize_ssse3.asm \
+  quantize_sse4.asm \
+  $(NULL)
+
+endif
+
 endif
 
 ifdef VPX_ARM_ASM
 # Building on an ARM platform with a supported assembler, include
 # the optimized assembly in the build.
 
 # The Android NDK doesn't pre-define anything to indicate the OS it's on, so
 # do it for them.
@@ -286,35 +370,166 @@ VPX_ASFILES = \
   idct_dequant_dc_full_2x_neon.asm \
   idct_dequant_dc_0_2x_neon.asm \
   dequant_idct_neon.asm \
   idct_dequant_full_2x_neon.asm \
   idct_dequant_0_2x_neon.asm \
   dequantizeb_neon.asm \
   $(NULL)
 
+ifdef MOZ_VP8_ENCODER
+CSRCS += \
+  arm_csystemdependent.c \
+  dct_arm.c \
+  picklpf_arm.c \
+  variance_arm.c \
+  $(NULL)
+
+VPX_ASFILES += \
+  vp8_packtokens_armv5.asm \
+  vp8_packtokens_mbrow_armv5.asm \
+  vp8_packtokens_partitions_armv5.asm \
+  vp8_fast_fdct4x4_armv6.asm \
+  vp8_fast_quantize_b_armv6.asm \
+  vp8_mse16x16_armv6.asm \
+  vp8_sad16x16_armv6.asm \
+  vp8_subtract_armv6.asm \
+  vp8_variance16x16_armv6.asm \
+  vp8_variance8x8_armv6.asm \
+  vp8_variance_halfpixvar16x16_h_armv6.asm \
+  vp8_variance_halfpixvar16x16_hv_armv6.asm \
+  vp8_variance_halfpixvar16x16_v_armv6.asm \
+  walsh_v6.asm \
+  fastfdct4x4_neon.asm \
+  fastfdct8x4_neon.asm \
+  fastquantizeb_neon.asm \
+  sad16_neon.asm \
+  sad8_neon.asm \
+  shortfdct_neon.asm \
+  subtract_neon.asm \
+  variance_neon.asm \
+  vp8_memcpy_neon.asm \
+  vp8_mse16x16_neon.asm \
+  vp8_shortwalsh4x4_neon.asm \
+  vp8_subpixelvariance8x8_neon.asm \
+  vp8_subpixelvariance16x16_neon.asm \
+  vp8_subpixelvariance16x16s_neon.asm \
+  vp8_vpxyv12_copyframeyonly_neon.asm \
+  $(NULL)
+
+# Files which depend on asm_com_offsets.asm
+VPX_ASM_COM_OFFSETS_SRCS = \
+  vp8_vpxyv12_copyframeyonly_neon.asm \
+  $(NULL)
+
+# Files which depend on asm_enc_offsets.asm
+VPX_ASM_ENC_OFFSETS_SRCS = \
+  vp8_packtokens_armv5.asm \
+  vp8_packtokens_mbrow_armv5.asm \
+  vp8_packtokens_partitions_armv5.asm \
+  vp8_fast_quantize_b_armv6.asm \
+  vp8_subtract_armv6.asm \
+  fastquantizeb_neon.asm \
+  subtract_neon.asm \
+  $(NULL)
+
+endif
+
 ifdef VPX_AS_CONVERSION
 # The ARM asm is written in ARM RVCT syntax, but we actually build it with
 # gas using GNU syntax. Add some rules to perform the conversion.
 VPX_CONVERTED_ASFILES = $(addsuffix .$(ASM_SUFFIX), $(VPX_ASFILES))
 
 ASFILES += $(VPX_CONVERTED_ASFILES)
 GARBAGE += $(VPX_CONVERTED_ASFILES)
 
 %.asm.$(ASM_SUFFIX): %.asm
 	$(VPX_AS_CONVERSION) < $< > $@
 else
 ASFILES += $(VPX_ASFILES)
 endif
 
 endif
 
+ifdef MOZ_VP8_ENCODER
+ifdef VPX_NEED_OBJ_INT_EXTRACT
+
+# We don't have a compiler that supports a compatible inline asm syntax, so we
+# have to resort to extracting asm offsets from a compiled object. This only
+# works if we have the appropriate system headers obj_int_extract needs to
+# parse that format, and so only has limited support for cross-compilation.
+
+ifdef VPX_ARM_ASM
+VPX_OIE_FORMAT := rvds
+else
+VPX_OIE_FORMAT := gas
+endif
+
+HOST_CSRCS = obj_int_extract.c
+HOST_PROGRAM = host_obj_int_extract$(HOST_BIN_SUFFIX)
+
+GARBAGE += \
+  asm_com_offsets.$(OBJ_SUFFIX) asm_com_offsets.asm \
+  asm_enc_offsets.$(OBJ_SUFFIX) asm_enc_offsets.asm \
+  $(NULL)
+
+else
+
+# We can extract the asm offsets directly from generated assembly using inline
+# asm. This is the preferred method.
+
+asm_com_offsets.s: CFLAGS += -DINLINE_ASM
+asm_enc_offsets.s: CFLAGS += -DINLINE_ASM
+
+asm_com_offsets.asm: asm_com_offsets.s
+	grep \\\<EQU\\\> $< | sed -e 's/[$$\#]//g' \
+	    $(if $(VPX_AS_CONVERSION),| $(VPX_AS_CONVERSION)) > $@
+
+asm_enc_offsets.asm: asm_enc_offsets.s
+	grep \\\<EQU\\\> $< | sed -e 's/[$$\#]//g' \
+	    $(if $(VPX_AS_CONVERSION),| $(VPX_AS_CONVERSION)) > $@
+
+GARBAGE += \
+  asm_com_offsets.s asm_com_offsets.asm \
+  asm_enc_offsets.s asm_enc_offsets.asm \
+  $(NULL)
+
+endif
+endif
+
 include $(topsrcdir)/config/rules.mk
 
+# This must be after rules.mk in order to use $(OBJ_SUFFIX) outside a
+# recursively-expanded variable.
+ifdef MOZ_VP8_ENCODER
+
+ifdef VPX_NEED_OBJ_INT_EXTRACT
+
+asm_com_offsets.asm: asm_com_offsets.$(OBJ_SUFFIX) $(HOST_PROGRAM)
+	./$(HOST_PROGRAM) $(VPX_OIE_FORMAT) $< \
+	    $(if $(VPX_AS_CONVERSION),| $(VPX_AS_CONVERSION)) > $@
+
+asm_enc_offsets.asm: asm_enc_offsets.$(OBJ_SUFFIX) $(HOST_PROGRAM)
+	./$(HOST_PROGRAM) $(VPX_OIE_FORMAT) $< \
+	    $(if $(VPX_AS_CONVERSION),| $(VPX_AS_CONVERSION)) > $@
+
+endif
+
+# These dependencies are not generated automatically, so do it manually.
+ifdef VPX_AS_CONVERSION
+$(addsuffix .$(OBJ_SUFFIX), $(VPX_ASM_COM_OFFSETS_SRCS)): asm_com_offsets.asm
+$(addsuffix .$(OBJ_SUFFIX), $(VPX_ASM_ENC_OFFSETS_SRCS)): asm_enc_offsets.asm
+else
+$(patsubst %.$(ASM_SUFFIX),%.$(OBJ_SUFFIX), $(VPX_ASM_COM_OFFSETS_SRCS)): asm_com_offsets.asm
+$(patsubst %.$(ASM_SUFFIX),%.$(OBJ_SUFFIX), $(VPX_ASM_ENC_OFFSETS_SRCS)): asm_enc_offsets.asm
+endif
+
+endif
+
 # Workaround a bug of Sun Studio (CR 6963410)
 ifdef SOLARIS_SUNPRO_CC
 ifeq (86,$(findstring 86,$(OS_TEST)))
-filter_c.o: filter_c.c Makefile.in
+filter.o: filter.c Makefile.in
 	$(REPORT_BUILD)
 	@$(MAKE_DEPS_AUTO_CC)
 	$(CC) -o $@ -c $(patsubst -xO[45],-xO3,$(COMPILE_CFLAGS)) $<
 endif
 endif
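
The asm_com_offsets/asm_enc_offsets rules above (and the VPX_NEED_OBJ_INT_EXTRACT check in configure.in) work because, when INLINE_ASM is defined, the DEFINE() macros in the asm_*_offsets.c files emit symbolic "EQU" lines into the compiler's assembly output; the grep/sed rule then collects those lines into an .asm file that the RVCT-syntax sources INCLUDE. Below is a minimal sketch of that technique; DEFINE_SKETCH, writer_sketch, and emit_offsets are hypothetical names used for illustration, not the actual contents of vpx_ports/asm_offsets.h.

/* Minimal sketch of extracting structure offsets via inline asm.
 * Hypothetical macro and struct names; the real definitions live in
 * vpx_ports/asm_offsets.h and vp8/encoder/asm_enc_offsets.c.
 * Intended to be compiled with "gcc -S -DINLINE_ASM"; the generated .s
 * file is post-processed (grep EQU | sed 's/[$#]//g'), not assembled. */
#include <stddef.h>

#define DEFINE_SKETCH(sym, val) \
    __asm__ volatile("\n" #sym " EQU %0" : : "i"((long)(val)))

struct writer_sketch {        /* illustrative stand-in, not vp8_writer */
    unsigned int lowvalue;
    unsigned int range;
    int count;
};

void emit_offsets(void)
{
    /* The compiler prints the immediate for %0, giving .s lines such as
     *     vp8_writer_lowvalue EQU $0   (x86)  or  ... EQU #0   (ARM);
     * the sed step strips the '$'/'#' so the result can be INCLUDEd by
     * the RVCT-syntax .asm files (e.g. vp8_packtokens_armv5.asm). */
    DEFINE_SKETCH(vp8_writer_lowvalue, offsetof(struct writer_sketch, lowvalue));
    DEFINE_SKETCH(vp8_writer_range,    offsetof(struct writer_sketch, range));
    DEFINE_SKETCH(vp8_writer_count,    offsetof(struct writer_sketch, count));
}
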
--- a/media/libvpx/update.sh
+++ b/media/libvpx/update.sh
@@ -46,18 +46,20 @@ if [ $# -lt 1 ]; then
   echo You can configure these from objdir/$target with the following command:
   echo $ ..configure --target=$target --disable-vp8-encoder --disable-examples --disable-install-docs
   echo On Mac, you also need --enable-pic
   exit -1
 fi
 
 # These are relative to SDK source dir.
 commonFiles=(
+  vp8/vp8_cx_iface.c
   vp8/vp8_dx_iface.c
   vp8/common/alloccommon.c
+  vp8/common/asm_com_offsets.c
   vp8/common/blockd.c
   vp8/common/debugmodes.c
   vp8/common/defaultcoefcounts.c
   vp8/common/entropy.c
   vp8/common/entropymode.c
   vp8/common/entropymv.c
   vp8/common/extend.c
   vp8/common/filter.c
@@ -103,16 +105,51 @@ commonFiles=(
   vp8/decoder/arm/arm_dsystemdependent.c
   vp8/decoder/arm/dequantize_arm.c
   vp8/decoder/arm/armv6/idct_blk_v6.c
   vp8/decoder/arm/neon/idct_blk_neon.c
   vp8/decoder/generic/dsystemdependent.c
   vp8/decoder/x86/idct_blk_mmx.c
   vp8/decoder/x86/idct_blk_sse2.c
   vp8/decoder/x86/x86_dsystemdependent.c
+  vp8/encoder/asm_enc_offsets.c
+  vp8/encoder/bitstream.c
+  vp8/encoder/boolhuff.c
+  vp8/encoder/dct.c
+  vp8/encoder/encodeframe.c
+  vp8/encoder/encodeintra.c
+  vp8/encoder/encodemb.c
+  vp8/encoder/encodemv.c
+  vp8/encoder/ethreading.c
+  vp8/encoder/firstpass.c
+  vp8/encoder/lookahead.c
+  vp8/encoder/mcomp.c
+  vp8/encoder/modecosts.c
+  vp8/encoder/onyx_if.c
+  vp8/encoder/picklpf.c
+  vp8/encoder/pickinter.c
+  vp8/encoder/psnr.c
+  vp8/encoder/quantize.c
+  vp8/encoder/ratectrl.c
+  vp8/encoder/rdopt.c
+  vp8/encoder/sad_c.c
+  vp8/encoder/segmentation.c
+  vp8/encoder/temporal_filter.c
+  vp8/encoder/tokenize.c
+  vp8/encoder/treewriter.c
+  vp8/encoder/variance_c.c
+  vp8/encoder/arm/arm_csystemdependent.c
+  vp8/encoder/arm/dct_arm.c
+  vp8/encoder/arm/picklpf_arm.c
+  vp8/encoder/arm/variance_arm.c
+  vp8/encoder/generic/csystemdependent.c
+  vp8/encoder/x86/variance_mmx.c
+  vp8/encoder/x86/variance_sse2.c
+  vp8/encoder/x86/variance_ssse3.c
+  vp8/encoder/x86/x86_csystemdependent.c
   vpx/src/vpx_codec.c
   vpx/src/vpx_decoder.c
   vpx/src/vpx_decoder_compat.c
   vpx/src/vpx_encoder.c
   vpx/src/vpx_image.c
   vpx_mem/vpx_mem.c
   vpx_scale/generic/gen_scalers.c
   vpx_scale/generic/scalesystemdependent.c
@@ -173,32 +210,66 @@ commonFiles=(
   vp8/decoder/detokenize.h
   vp8/decoder/ec_types.h
   vp8/decoder/error_concealment.h
   vp8/decoder/onyxd_int.h
   vp8/decoder/reconintra_mt.h
   vp8/decoder/treereader.h
   vp8/decoder/arm/dequantize_arm.h
   vp8/decoder/x86/dequantize_x86.h
+  vp8/encoder/asm_enc_offsets.h
+  vp8/encoder/bitstream.h
+  vp8/encoder/boolhuff.h
+  vp8/encoder/block.h
+  vp8/encoder/dct.h
+  vp8/encoder/encodeintra.h
+  vp8/encoder/encodemb.h
+  vp8/encoder/encodemv.h
+  vp8/encoder/firstpass.h
+  vp8/encoder/lookahead.h
+  vp8/encoder/mcomp.h
+  vp8/encoder/modecosts.h
+  vp8/encoder/onyx_int.h
+  vp8/encoder/pickinter.h
+  vp8/encoder/psnr.h
+  vp8/encoder/quantize.h
+  vp8/encoder/ratectrl.h
+  vp8/encoder/rdopt.h
+  vp8/encoder/segmentation.h
+  vp8/encoder/temporal_filter.h
+  vp8/encoder/tokenize.h
+  vp8/encoder/treewriter.h
+  vp8/encoder/variance.h
+  vp8/encoder/arm/dct_arm.h
+  vp8/encoder/arm/encodemb_arm.h
+  vp8/encoder/arm/quantize_arm.h
+  vp8/encoder/arm/variance_arm.h
+  vp8/encoder/x86/dct_x86.h
+  vp8/encoder/x86/encodemb_x86.h
+  vp8/encoder/x86/mcomp_x86.h
+  vp8/encoder/x86/quantize_x86.h
+  vp8/encoder/x86/temporal_filter_x86.h
+  vp8/encoder/x86/variance_x86.h
   vpx/internal/vpx_codec_internal.h
   vpx/vp8cx.h
   vpx/vp8dx.h
   vpx/vp8e.h
   vpx/vp8.h
   vpx/vpx_codec.h
   vpx/vpx_codec_impl_bottom.h
   vpx/vpx_codec_impl_top.h
   vpx/vpx_decoder_compat.h
   vpx/vpx_decoder.h
   vpx/vpx_encoder.h
   vpx/vpx_image.h
   vpx/vpx_integer.h
   vpx_mem/include/vpx_mem_intrnl.h
   vpx_mem/vpx_mem.h
   vpx_ports/arm_cpudetect.c
+  vpx_ports/asm_offsets.h
   vpx_ports/config.h
   vpx_ports/mem.h
   vpx_ports/vpx_timer.h
   vpx_ports/arm.h
   vpx_ports/x86.h
   vpx_scale/scale_mode.h
   vpx_scale/vpxscale.h
   vpx_scale/yv12config.h
@@ -258,18 +329,66 @@ commonFiles=(
   vp8/common/x86/postproc_mmx.asm
   vp8/common/x86/postproc_sse2.asm
   vp8/common/x86/recon_mmx.asm
   vp8/common/x86/recon_sse2.asm
   vp8/common/x86/subpixel_mmx.asm
   vp8/common/x86/subpixel_sse2.asm
   vp8/common/x86/subpixel_ssse3.asm
   vp8/decoder/x86/dequantize_mmx.asm
+  vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm
+  vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
+  vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
+  vp8/encoder/arm/armv6/vp8_fast_fdct4x4_armv6.asm
+  vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm
+  vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm
+  vp8/encoder/arm/armv6/vp8_sad16x16_armv6.asm
+  vp8/encoder/arm/armv6/vp8_subtract_armv6.asm
+  vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
+  vp8/encoder/arm/armv6/vp8_variance8x8_armv6.asm
+  vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
+  vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
+  vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
+  vp8/encoder/arm/armv6/walsh_v6.asm
+  vp8/encoder/arm/neon/fastfdct4x4_neon.asm
+  vp8/encoder/arm/neon/fastfdct8x4_neon.asm
+  vp8/encoder/arm/neon/fastquantizeb_neon.asm
+  vp8/encoder/arm/neon/sad16_neon.asm
+  vp8/encoder/arm/neon/sad8_neon.asm
+  vp8/encoder/arm/neon/shortfdct_neon.asm
+  vp8/encoder/arm/neon/subtract_neon.asm
+  vp8/encoder/arm/neon/variance_neon.asm
+  vp8/encoder/arm/neon/vp8_memcpy_neon.asm
+  vp8/encoder/arm/neon/vp8_mse16x16_neon.asm
+  vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm
+  vp8/encoder/arm/neon/vp8_subpixelvariance8x8_neon.asm
+  vp8/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm
+  vp8/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm
+  vp8/encoder/x86/dct_mmx.asm
+  vp8/encoder/x86/dct_sse2.asm
+  vp8/encoder/x86/encodeopt.asm
+  vp8/encoder/x86/fwalsh_sse2.asm
+  vp8/encoder/x86/quantize_mmx.asm
+  vp8/encoder/x86/quantize_sse2.asm
+  vp8/encoder/x86/quantize_ssse3.asm
+  vp8/encoder/x86/quantize_sse4.asm
+  vp8/encoder/x86/sad_mmx.asm
+  vp8/encoder/x86/sad_sse2.asm
+  vp8/encoder/x86/sad_sse3.asm
+  vp8/encoder/x86/sad_ssse3.asm
+  vp8/encoder/x86/sad_sse4.asm
+  vp8/encoder/x86/subtract_mmx.asm
+  vp8/encoder/x86/subtract_sse2.asm
+  vp8/encoder/x86/temporal_filter_apply_sse2.asm
+  vp8/encoder/x86/variance_impl_mmx.asm
+  vp8/encoder/x86/variance_impl_sse2.asm
+  vp8/encoder/x86/variance_impl_ssse3.asm
   vpx_ports/emms.asm
   vpx_ports/x86_abi_support.asm
+  vpx_scale/arm/neon/vp8_vpxyv12_copyframeyonly_neon.asm
   build/make/ads2gas.pl
   build/make/obj_int_extract.c
   LICENSE
   PATENTS
 )
 
 # configure files specific to x86-win32-vs8
 cp $1/objdir/x86-win32-vs8/vpx_config.c vpx_config_x86-win32-vs8.c
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/common/asm_com_offsets.c
@@ -0,0 +1,40 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx/vpx_codec.h"
+#include "vpx_ports/asm_offsets.h"
+#include "vpx_scale/yv12config.h"
+
+BEGIN
+
+/* vpx_scale */
+DEFINE(yv12_buffer_config_y_width,              offsetof(YV12_BUFFER_CONFIG, y_width));
+DEFINE(yv12_buffer_config_y_height,             offsetof(YV12_BUFFER_CONFIG, y_height));
+DEFINE(yv12_buffer_config_y_stride,             offsetof(YV12_BUFFER_CONFIG, y_stride));
+DEFINE(yv12_buffer_config_uv_width,             offsetof(YV12_BUFFER_CONFIG, uv_width));
+DEFINE(yv12_buffer_config_uv_height,            offsetof(YV12_BUFFER_CONFIG, uv_height));
+DEFINE(yv12_buffer_config_uv_stride,            offsetof(YV12_BUFFER_CONFIG, uv_stride));
+DEFINE(yv12_buffer_config_y_buffer,             offsetof(YV12_BUFFER_CONFIG, y_buffer));
+DEFINE(yv12_buffer_config_u_buffer,             offsetof(YV12_BUFFER_CONFIG, u_buffer));
+DEFINE(yv12_buffer_config_v_buffer,             offsetof(YV12_BUFFER_CONFIG, v_buffer));
+DEFINE(yv12_buffer_config_border,               offsetof(YV12_BUFFER_CONFIG, border));
+DEFINE(VP8BORDERINPIXELS_VAL,                   VP8BORDERINPIXELS);
+
+END
+
+/* add asserts for any offset that is not supported by assembly code */
+/* add asserts for any size that is not supported by assembly code */
+
+#if HAVE_ARMV7
+/* vp8_yv12_extend_frame_borders_neon makes several assumptions based on this */
+ct_assert(VP8BORDERINPIXELS_VAL, VP8BORDERINPIXELS == 32)
+#endif
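
The ct_assert() above is a compile-time check that VP8BORDERINPIXELS still matches the value the NEON frame-border code assumes. A minimal sketch of how such a compile-time assertion can be expressed in plain C follows; ct_assert_sketch and BORDER_IN_PIXELS_SKETCH are hypothetical names, not necessarily the implementation in vpx_ports/asm_offsets.h.

/* Hedged sketch of a compile-time assertion: the typedef has a negative
 * array size when the condition is false, so compilation fails. Generic
 * illustration only, not the exact macro used by libvpx. */
#define ct_assert_sketch(name, cond) typedef char assert_##name[(cond) ? 1 : -1];

/* Mirrors the check above under an assumed border size of 32 pixels. */
#define BORDER_IN_PIXELS_SKETCH 32
ct_assert_sketch(border_is_32, BORDER_IN_PIXELS_SKETCH == 32)
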
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/arm_csystemdependent.c
@@ -0,0 +1,137 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_ports/arm.h"
+#include "vp8/encoder/variance.h"
+#include "vp8/encoder/onyx_int.h"
+
+extern void (*vp8_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
+extern void vp8_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
+extern void vpxyv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
+
+void vp8_arch_arm_encoder_init(VP8_COMP *cpi)
+{
+#if CONFIG_RUNTIME_CPU_DETECT
+    int flags = cpi->common.rtcd.flags;
+
+#if HAVE_ARMV5TE
+    if (flags & HAS_EDSP)
+    {
+    }
+#endif
+
+#if HAVE_ARMV6
+    if (flags & HAS_MEDIA)
+    {
+        cpi->rtcd.variance.sad16x16              = vp8_sad16x16_armv6;
+        /*cpi->rtcd.variance.sad16x8               = vp8_sad16x8_c;
+        cpi->rtcd.variance.sad8x16               = vp8_sad8x16_c;
+        cpi->rtcd.variance.sad8x8                = vp8_sad8x8_c;
+        cpi->rtcd.variance.sad4x4                = vp8_sad4x4_c;*/
+
+        /*cpi->rtcd.variance.var4x4                = vp8_variance4x4_c;*/
+        cpi->rtcd.variance.var8x8                = vp8_variance8x8_armv6;
+        /*cpi->rtcd.variance.var8x16               = vp8_variance8x16_c;
+        cpi->rtcd.variance.var16x8               = vp8_variance16x8_c;*/
+        cpi->rtcd.variance.var16x16              = vp8_variance16x16_armv6;
+
+        /*cpi->rtcd.variance.subpixvar4x4          = vp8_sub_pixel_variance4x4_c;*/
+        cpi->rtcd.variance.subpixvar8x8          = vp8_sub_pixel_variance8x8_armv6;
+        /*cpi->rtcd.variance.subpixvar8x16         = vp8_sub_pixel_variance8x16_c;
+        cpi->rtcd.variance.subpixvar16x8         = vp8_sub_pixel_variance16x8_c;*/
+        cpi->rtcd.variance.subpixvar16x16        = vp8_sub_pixel_variance16x16_armv6;
+        cpi->rtcd.variance.halfpixvar16x16_h     = vp8_variance_halfpixvar16x16_h_armv6;
+        cpi->rtcd.variance.halfpixvar16x16_v     = vp8_variance_halfpixvar16x16_v_armv6;
+        cpi->rtcd.variance.halfpixvar16x16_hv    = vp8_variance_halfpixvar16x16_hv_armv6;
+
+        cpi->rtcd.variance.mse16x16              = vp8_mse16x16_armv6;
+        /*cpi->rtcd.variance.getmbss               = vp8_get_mb_ss_c;*/
+
+        /*cpi->rtcd.variance.get4x4sse_cs          = vp8_get4x4sse_cs_c;*/
+
+        /*cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_c;
+        cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_c;*/
+        cpi->rtcd.fdct.fast4x4                   = vp8_fast_fdct4x4_armv6;
+        cpi->rtcd.fdct.fast8x4                   = vp8_fast_fdct8x4_armv6;
+        cpi->rtcd.fdct.walsh_short4x4            = vp8_short_walsh4x4_armv6;
+
+        /*cpi->rtcd.encodemb.berr                  = vp8_block_error_c;
+        cpi->rtcd.encodemb.mberr                 = vp8_mbblock_error_c;
+        cpi->rtcd.encodemb.mbuverr               = vp8_mbuverror_c;*/
+        cpi->rtcd.encodemb.subb                  = vp8_subtract_b_armv6;
+        cpi->rtcd.encodemb.submby                = vp8_subtract_mby_armv6;
+        cpi->rtcd.encodemb.submbuv               = vp8_subtract_mbuv_armv6;
+
+        /*cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b;*/
+        cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_armv6;
+    }
+#endif
+
+#if HAVE_ARMV7
+    if (flags & HAS_NEON)
+    {
+        cpi->rtcd.variance.sad16x16              = vp8_sad16x16_neon;
+        cpi->rtcd.variance.sad16x8               = vp8_sad16x8_neon;
+        cpi->rtcd.variance.sad8x16               = vp8_sad8x16_neon;
+        cpi->rtcd.variance.sad8x8                = vp8_sad8x8_neon;
+        cpi->rtcd.variance.sad4x4                = vp8_sad4x4_neon;
+
+        /*cpi->rtcd.variance.var4x4                = vp8_variance4x4_c;*/
+        cpi->rtcd.variance.var8x8                = vp8_variance8x8_neon;
+        cpi->rtcd.variance.var8x16               = vp8_variance8x16_neon;
+        cpi->rtcd.variance.var16x8               = vp8_variance16x8_neon;
+        cpi->rtcd.variance.var16x16              = vp8_variance16x16_neon;
+
+        /*cpi->rtcd.variance.subpixvar4x4          = vp8_sub_pixel_variance4x4_c;*/
+        cpi->rtcd.variance.subpixvar8x8          = vp8_sub_pixel_variance8x8_neon;
+        /*cpi->rtcd.variance.subpixvar8x16         = vp8_sub_pixel_variance8x16_c;
+        cpi->rtcd.variance.subpixvar16x8         = vp8_sub_pixel_variance16x8_c;*/
+        cpi->rtcd.variance.subpixvar16x16        = vp8_sub_pixel_variance16x16_neon;
+        cpi->rtcd.variance.halfpixvar16x16_h     = vp8_variance_halfpixvar16x16_h_neon;
+        cpi->rtcd.variance.halfpixvar16x16_v     = vp8_variance_halfpixvar16x16_v_neon;
+        cpi->rtcd.variance.halfpixvar16x16_hv    = vp8_variance_halfpixvar16x16_hv_neon;
+
+        cpi->rtcd.variance.mse16x16              = vp8_mse16x16_neon;
+        /*cpi->rtcd.variance.getmbss               = vp8_get_mb_ss_c;*/
+
+        cpi->rtcd.variance.get4x4sse_cs          = vp8_get4x4sse_cs_neon;
+
+        cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_neon;
+        cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_neon;
+        cpi->rtcd.fdct.fast4x4                   = vp8_fast_fdct4x4_neon;
+        cpi->rtcd.fdct.fast8x4                   = vp8_fast_fdct8x4_neon;
+        cpi->rtcd.fdct.walsh_short4x4            = vp8_short_walsh4x4_neon;
+
+        /*cpi->rtcd.encodemb.berr                  = vp8_block_error_c;
+        cpi->rtcd.encodemb.mberr                 = vp8_mbblock_error_c;
+        cpi->rtcd.encodemb.mbuverr               = vp8_mbuverror_c;*/
+        cpi->rtcd.encodemb.subb                  = vp8_subtract_b_neon;
+        cpi->rtcd.encodemb.submby                = vp8_subtract_mby_neon;
+        cpi->rtcd.encodemb.submbuv               = vp8_subtract_mbuv_neon;
+
+        /*cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b;
+        cpi->rtcd.quantize.quantb_pair           = vp8_regular_quantize_b_pair;*/
+        cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_neon;
+        cpi->rtcd.quantize.fastquantb_pair       = vp8_fast_quantize_b_pair_neon;
+    }
+#endif
+
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+    if (flags & HAS_NEON)
+#endif
+    {
+        vp8_yv12_copy_partial_frame_ptr = vpxyv12_copy_partial_frame_neon;
+    }
+#endif
+#endif
+}
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm
@@ -0,0 +1,291 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT |vp8cx_pack_tokens_armv5|
+
+    INCLUDE asm_enc_offsets.asm
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA    |.text|, CODE, READONLY
+
+; r0 vp8_writer *w
+; r1 const TOKENEXTRA *p
+; r2 int xcount
+; r3 vp8_coef_encodings
+; s0 vp8_extra_bits
+; s1 vp8_coef_tree
+|vp8cx_pack_tokens_armv5| PROC
+    push    {r4-r11, lr}
+
+    ; Add size of xcount * sizeof (TOKENEXTRA) to get stop
+    ;  sizeof (TOKENEXTRA) is 8
+    sub     sp, sp, #12
+    add     r2, r1, r2, lsl #3          ; stop = p + xcount*sizeof(TOKENEXTRA)
+    str     r2, [sp, #0]
+    str     r3, [sp, #8]                ; save vp8_coef_encodings
+    ldr     r2, [r0, #vp8_writer_lowvalue]
+    ldr     r5, [r0, #vp8_writer_range]
+    ldr     r3, [r0, #vp8_writer_count]
+    b       check_p_lt_stop
+
+while_p_lt_stop
+    ldrb    r6, [r1, #tokenextra_token] ; t
+    ldr     r4, [sp, #8]                ; vp8_coef_encodings
+    mov     lr, #0
+    add     r4, r4, r6, lsl #3          ; a = vp8_coef_encodings + t
+    ldr     r9, [r1, #tokenextra_context_tree]   ; pp
+
+    ldrb    r7, [r1, #tokenextra_skip_eob_node]
+
+    ldr     r6, [r4, #vp8_token_value]  ; v
+    ldr     r8, [r4, #vp8_token_len]    ; n
+
+    ; vp8 specific skip_eob_node
+    cmp     r7, #0
+    movne   lr, #2                      ; i = 2
+    subne   r8, r8, #1                  ; --n
+
+    rsb     r4, r8, #32                 ; 32-n
+    ldr     r10, [sp, #52]              ; vp8_coef_tree
+
+    ; v is kept in r12 during the token pack loop
+    lsl     r12, r6, r4                ; r12 = v << 32 - n
+
+; loop start
+token_loop
+    ldrb    r4, [r9, lr, asr #1]        ; pp [i>>1]
+    sub     r7, r5, #1                  ; range-1
+
+    ; Decisions are made based on the bit value shifted
+    ; off of v, so set a flag here based on this.
+    ; This value is referred to as "bb"
+    lsls    r12, r12, #1                ; bb = v >> n
+    mul     r4, r4, r7                  ; ((range-1) * pp[i>>1]))
+
+    ; bb can only be 0 or 1.  So only execute this statement
+    ; if bb == 1, otherwise it will act like i + 0
+    addcs   lr, lr, #1                  ; i + bb
+
+    mov     r7, #1
+    ldrsb   lr, [r10, lr]               ; i = vp8_coef_tree[i+bb]
+    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)
+
+    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
+    subcs   r4, r5, r4                  ; if  (bb) range = range-split
+
+    ; Counting the leading zeros is used to normalize range.
+    clz     r6, r4
+    sub     r6, r6, #24                 ; shift
+
+    ; Flag is set on the sum of count.  This flag is used later
+    ; to determine if count >= 0
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     token_count_lt_zero         ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset = shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     token_high_bit_not_set
+
+    ldr     r4, [r0, #vp8_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos-1
+    b       token_zero_while_start
+token_zero_while_loop
+    mov     r10, #0
+    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+token_zero_while_start
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp8_writer_buffer]
+    ldrb    r11, [r7, r4]
+    cmpge   r11, #0xff
+    beq     token_zero_while_loop
+
+    ldr     r7, [r0, #vp8_writer_buffer]
+    ldrb    r10, [r7, r4]               ; w->buffer[x]
+    add     r10, r10, #1
+    strb    r10, [r7, r4]               ; w->buffer[x] + 1
+token_high_bit_not_set
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r10, [r0, #vp8_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp8_writer_pos]   ; w->pos
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r11, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r11, [r0, #vp8_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r10, r4]               ; w->buffer[w->pos++]
+
+    ; r10 is used earlier in the loop, but r10 is used as
+    ; temp variable here.  So after r10 is used, reload
+    ; vp8_coef_tree_dcd into r10
+    ldr     r10, [sp, #52]              ; vp8_coef_tree
+
+token_count_lt_zero
+    lsl     r2, r2, r6                  ; lowvalue <<= shift
+
+    subs    r8, r8, #1                  ; --n
+    bne     token_loop
+
+    ldrb    r6, [r1, #tokenextra_token] ; t
+    ldr     r7, [sp, #48]               ; vp8_extra_bits
+    ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
+    ;  element.  Here vp8_extra_bit_struct == 16
+    add     r12, r7, r6, lsl #4         ; b = vp8_extra_bits + t
+
+    ldr     r4, [r12, #vp8_extra_bit_struct_base_val]
+    cmp     r4, #0
+    beq     skip_extra_bits
+
+;   if( b->base_val)
+    ldr     r8, [r12, #vp8_extra_bit_struct_len] ; L
+    ldrsh   lr, [r1, #tokenextra_extra] ; e = p->Extra
+    cmp     r8, #0                      ; if( L)
+    beq     no_extra_bits
+
+    ldr     r9, [r12, #vp8_extra_bit_struct_prob]
+    asr     r7, lr, #1                  ; v=e>>1
+
+    ldr     r10, [r12, #vp8_extra_bit_struct_tree]
+    str     r10, [sp, #4]               ; b->tree
+
+    rsb     r4, r8, #32
+    lsl     r12, r7, r4
+
+    mov     lr, #0                      ; i = 0
+
+extra_bits_loop
+    ldrb    r4, [r9, lr, asr #1]            ; pp[i>>1]
+    sub     r7, r5, #1                  ; range-1
+    lsls    r12, r12, #1                ; v >> n
+    mul     r4, r4, r7                  ; (range-1) * pp[i>>1]
+    addcs   lr, lr, #1                  ; i + bb
+
+    mov     r7, #1
+    ldrsb   lr, [r10, lr]               ; i = b->tree[i+bb]
+    add     r4, r7, r4, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)
+
+    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
+    subcs   r4, r5, r4                  ; if  (bb) range = range-split
+
+    clz     r6, r4
+    sub     r6, r6, #24
+
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     extra_count_lt_zero         ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset= shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     extra_high_bit_not_set
+
+    ldr     r4, [r0, #vp8_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos - 1
+    b       extra_zero_while_start
+extra_zero_while_loop
+    mov     r10, #0
+    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+extra_zero_while_start
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp8_writer_buffer]
+    ldrb    r11, [r7, r4]
+    cmpge   r11, #0xff
+    beq     extra_zero_while_loop
+
+    ldr     r7, [r0, #vp8_writer_buffer]
+    ldrb    r10, [r7, r4]
+    add     r10, r10, #1
+    strb    r10, [r7, r4]
+extra_high_bit_not_set
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r10, [r0, #vp8_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp8_writer_pos]
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r11, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r11, [r0, #vp8_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r10, r4]               ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
+    ldr     r10, [sp, #4]               ; b->tree
+extra_count_lt_zero
+    lsl     r2, r2, r6
+
+    subs    r8, r8, #1                  ; --n
+    bne     extra_bits_loop             ; while (n)
+
+no_extra_bits
+    ldr     lr, [r1, #4]                ; e = p->Extra
+    add     r4, r5, #1                  ; range + 1
+    tst     lr, #1
+    lsr     r4, r4, #1                  ; split = (range + 1) >> 1
+    addne   r2, r2, r4                  ; lowvalue += split
+    subne   r4, r5, r4                  ; range = range-split
+    tst     r2, #0x80000000             ; lowvalue & 0x80000000
+    lsl     r5, r4, #1                  ; range <<= 1
+    beq     end_high_bit_not_set
+
+    ldr     r4, [r0, #vp8_writer_pos]
+    mov     r7, #0
+    sub     r4, r4, #1
+    b       end_zero_while_start
+end_zero_while_loop
+    strb    r7, [r6, r4]
+    sub     r4, r4, #1                  ; x--
+end_zero_while_start
+    cmp     r4, #0
+    ldrge   r6, [r0, #vp8_writer_buffer]
+    ldrb    r12, [r6, r4]
+    cmpge   r12, #0xff
+    beq     end_zero_while_loop
+
+    ldr     r6, [r0, #vp8_writer_buffer]
+    ldrb    r7, [r6, r4]
+    add     r7, r7, #1
+    strb    r7, [r6, r4]
+end_high_bit_not_set
+    adds    r3, r3, #1                  ; ++count
+    lsl     r2, r2, #1                  ; lowvalue  <<= 1
+    bne     end_count_zero
+
+    ldr     r4, [r0, #vp8_writer_pos]
+    mvn     r3, #7
+    ldr     r7, [r0, #vp8_writer_buffer]
+    lsr     r6, r2, #24                 ; lowvalue >> 24
+    add     r12, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r12, [r0, #0x10]
+    strb    r6, [r7, r4]
+end_count_zero
+skip_extra_bits
+    add     r1, r1, #TOKENEXTRA_SZ      ; ++p
+check_p_lt_stop
+    ldr     r4, [sp, #0]                ; stop
+    cmp     r1, r4                      ; while( p < stop)
+    bcc     while_p_lt_stop
+
+    str     r2, [r0, #vp8_writer_lowvalue]
+    str     r5, [r0, #vp8_writer_range]
+    str     r3, [r0, #vp8_writer_count]
+    add     sp, sp, #12
+    pop     {r4-r11, pc}
+    ENDP
+
+    END
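
The routine above is an ARMv5TE implementation of VP8's boolean (arithmetic) coder; its comments spell out the per-bit split/lowvalue/range/count updates. The C sketch below follows those comments step by step so the register-level code is easier to read; writer_sketch and pack_bool_sketch are simplified stand-ins for illustration, not libvpx's vp8_writer API.

/* C rendering of one boolean-coder bit, following the asm comments above.
 * The struct is a simplified stand-in for vp8_writer; the register names
 * are noted to make the assembly easier to cross-reference. */
typedef struct {
    unsigned int lowvalue;   /* r2 */
    unsigned int range;      /* r5 */
    int          count;      /* r3, initialized to -24 by start_encode */
    unsigned int pos;
    unsigned char *buffer;
} writer_sketch;

static void pack_bool_sketch(writer_sketch *w, int bb, int prob)
{
    unsigned int lowvalue = w->lowvalue;
    unsigned int range    = w->range;
    int          count    = w->count;
    unsigned int split    = 1 + (((range - 1) * prob) >> 8);
    int          shift;

    if (bb) {
        lowvalue += split;           /* "if (bb) lowvalue += split"       */
        range    -= split;           /* "if (bb) range = range - split"   */
    } else {
        range = split;
    }

    /* Normalization amount, computed with clz in the asm ("shift - 24"). */
    shift  = __builtin_clz(range) - 24;
    count += shift;
    range <<= shift;

    if (count >= 0) {                /* enough bits accumulated: flush    */
        int offset = shift - count;

        if ((lowvalue << (offset - 1)) & 0x80000000) {
            /* Carry: walk back over any 0xff bytes already written
             * (in practice the first buffered byte is never 0xff). */
            int x = (int)w->pos - 1;
            while (x >= 0 && w->buffer[x] == 0xff)
                w->buffer[x--] = 0;
            if (x >= 0)
                w->buffer[x] += 1;
        }

        w->buffer[w->pos++] = (unsigned char)(lowvalue >> (24 - offset));
        lowvalue <<= offset;
        lowvalue  &= 0xffffff;
        shift      = count;
        count     -= 8;
    }

    lowvalue <<= shift;              /* "lowvalue <<= shift"              */

    w->lowvalue = lowvalue;
    w->range    = range;
    w->count    = count;
}
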
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
@@ -0,0 +1,327 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT |vp8cx_pack_mb_row_tokens_armv5|
+
+    INCLUDE asm_enc_offsets.asm
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA    |.text|, CODE, READONLY
+
+; r0 VP8_COMP *cpi
+; r1 vp8_writer *w
+; r2 vp8_coef_encodings
+; r3 vp8_extra_bits
+; s0 vp8_coef_tree
+
+|vp8cx_pack_mb_row_tokens_armv5| PROC
+    push    {r4-r11, lr}
+    sub     sp, sp, #24
+
+    ; Compute address of cpi->common.mb_rows
+    ldr     r4, _VP8_COMP_common_
+    ldr     r6, _VP8_COMMON_MBrows_
+    add     r4, r0, r4
+
+    ldr     r5, [r4, r6]                ; load up mb_rows
+
+    str     r2, [sp, #20]               ; save vp8_coef_encodings
+    str     r5, [sp, #12]               ; save mb_rows
+    str     r3, [sp, #8]                ; save vp8_extra_bits
+
+    ldr     r4, _VP8_COMP_tplist_
+    add     r4, r0, r4
+    ldr     r7, [r4, #0]                ; dereference cpi->tp_list
+
+    mov     r0, r1                      ; keep same as other loops
+
+    ldr     r2, [r0, #vp8_writer_lowvalue]
+    ldr     r5, [r0, #vp8_writer_range]
+    ldr     r3, [r0, #vp8_writer_count]
+
+mb_row_loop
+
+    ldr     r1, [r7, #tokenlist_start]
+    ldr     r9, [r7, #tokenlist_stop]
+    str     r9, [sp, #0]                ; save stop for later comparison
+    str     r7, [sp, #16]               ; tokenlist address for next time
+
+    b       check_p_lt_stop
+
+    ; actual work gets done here!
+
+while_p_lt_stop
+    ldrb    r6, [r1, #tokenextra_token] ; t
+    ldr     r4, [sp, #20]               ; vp8_coef_encodings
+    mov     lr, #0
+    add     r4, r4, r6, lsl #3          ; a = vp8_coef_encodings + t
+    ldr     r9, [r1, #tokenextra_context_tree]   ; pp
+
+    ldrb    r7, [r1, #tokenextra_skip_eob_node]
+
+    ldr     r6, [r4, #vp8_token_value]  ; v
+    ldr     r8, [r4, #vp8_token_len]    ; n
+
+    ; vp8 specific skip_eob_node
+    cmp     r7, #0
+    movne   lr, #2                      ; i = 2
+    subne   r8, r8, #1                  ; --n
+
+    rsb     r4, r8, #32                 ; 32-n
+    ldr     r10, [sp, #60]              ; vp8_coef_tree
+
+    ; v is kept in r12 during the token pack loop
+    lsl     r12, r6, r4                 ; r12 = v << 32 - n
+
+; loop start
+token_loop
+    ldrb    r4, [r9, lr, asr #1]        ; pp [i>>1]
+    sub     r7, r5, #1                  ; range-1
+
+    ; Decisions are made based on the bit value shifted
+    ; off of v, so set a flag here based on this.
+    ; This value is referred to as "bb"
+    lsls    r12, r12, #1                ; bb = v >> n
+    mul     r4, r4, r7                  ; ((range-1) * pp[i>>1]))
+
+    ; bb can only be 0 or 1.  So only execute this statement
+    ; if bb == 1, otherwise it will act like i + 0
+    addcs   lr, lr, #1                  ; i + bb
+
+    mov     r7, #1
+    ldrsb   lr, [r10, lr]               ; i = vp8_coef_tree[i+bb]
+    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)
+
+    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
+    subcs   r4, r5, r4                  ; if  (bb) range = range-split
+
+    ; Counting the leading zeros is used to normalize range.
+    clz     r6, r4
+    sub     r6, r6, #24                 ; shift
+
+    ; Flag is set on the sum of count.  This flag is used later
+    ; to determine if count >= 0
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     token_count_lt_zero         ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset = shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     token_high_bit_not_set
+
+    ldr     r4, [r0, #vp8_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos-1
+    b       token_zero_while_start
+token_zero_while_loop
+    mov     r10, #0
+    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+token_zero_while_start
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp8_writer_buffer]
+    ldrb    r11, [r7, r4]
+    cmpge   r11, #0xff
+    beq     token_zero_while_loop
+
+    ldr     r7, [r0, #vp8_writer_buffer]
+    ldrb    r10, [r7, r4]               ; w->buffer[x]
+    add     r10, r10, #1
+    strb    r10, [r7, r4]               ; w->buffer[x] + 1
+token_high_bit_not_set
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r10, [r0, #vp8_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp8_writer_pos]   ; w->pos
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r11, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r11, [r0, #vp8_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r10, r4]               ; w->buffer[w->pos++]
+
+    ; r10 is used earlier in the loop, but r10 is used as
+    ; temp variable here.  So after r10 is used, reload
+    ; vp8_coef_tree_dcd into r10
+    ldr     r10, [sp, #60]              ; vp8_coef_tree
+
+token_count_lt_zero
+    lsl     r2, r2, r6                  ; lowvalue <<= shift
+
+    subs    r8, r8, #1                  ; --n
+    bne     token_loop
+
+    ldrb    r6, [r1, #tokenextra_token] ; t
+    ldr     r7, [sp, #8]                ; vp8_extra_bits
+    ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
+    ;  element.  Here vp8_extra_bit_struct == 16
+    add     r12, r7, r6, lsl #4         ; b = vp8_extra_bits + t
+
+    ldr     r4, [r12, #vp8_extra_bit_struct_base_val]
+    cmp     r4, #0
+    beq     skip_extra_bits
+
+;   if( b->base_val)
+    ldr     r8, [r12, #vp8_extra_bit_struct_len] ; L
+    ldrsh   lr, [r1, #tokenextra_extra] ; e = p->Extra
+    cmp     r8, #0                      ; if( L)
+    beq     no_extra_bits
+
+    ldr     r9, [r12, #vp8_extra_bit_struct_prob]
+    asr     r7, lr, #1                  ; v=e>>1
+
+    ldr     r10, [r12, #vp8_extra_bit_struct_tree]
+    str     r10, [sp, #4]               ; b->tree
+
+    rsb     r4, r8, #32
+    lsl     r12, r7, r4
+
+    mov     lr, #0                      ; i = 0
+
+extra_bits_loop
+    ldrb    r4, [r9, lr, asr #1]            ; pp[i>>1]
+    sub     r7, r5, #1                  ; range-1
+    lsls    r12, r12, #1                ; v >> n
+    mul     r4, r4, r7                  ; (range-1) * pp[i>>1]
+    addcs   lr, lr, #1                  ; i + bb
+
+    mov     r7, #1
+    ldrsb   lr, [r10, lr]               ; i = b->tree[i+bb]
+    add     r4, r7, r4, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)
+
+    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
+    subcs   r4, r5, r4                  ; if  (bb) range = range-split
+
+    clz     r6, r4
+    sub     r6, r6, #24
+
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     extra_count_lt_zero         ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset= shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     extra_high_bit_not_set
+
+    ldr     r4, [r0, #vp8_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos - 1
+    b       extra_zero_while_start
+extra_zero_while_loop
+    mov     r10, #0
+    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+extra_zero_while_start
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp8_writer_buffer]
+    ldrb    r11, [r7, r4]
+    cmpge   r11, #0xff
+    beq     extra_zero_while_loop
+
+    ldr     r7, [r0, #vp8_writer_buffer]
+    ldrb    r10, [r7, r4]
+    add     r10, r10, #1
+    strb    r10, [r7, r4]
+extra_high_bit_not_set
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r10, [r0, #vp8_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp8_writer_pos]
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r11, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r11, [r0, #vp8_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r10, r4]               ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
+    ldr     r10, [sp, #4]               ; b->tree
+extra_count_lt_zero
+    lsl     r2, r2, r6
+
+    subs    r8, r8, #1                  ; --n
+    bne     extra_bits_loop             ; while (n)
+
+no_extra_bits
+    ldr     lr, [r1, #4]                ; e = p->Extra
+    add     r4, r5, #1                  ; range + 1
+    tst     lr, #1
+    lsr     r4, r4, #1                  ; split = (range + 1) >> 1
+    addne   r2, r2, r4                  ; lowvalue += split
+    subne   r4, r5, r4                  ; range = range-split
+    tst     r2, #0x80000000             ; lowvalue & 0x80000000
+    lsl     r5, r4, #1                  ; range <<= 1
+    beq     end_high_bit_not_set
+
+    ldr     r4, [r0, #vp8_writer_pos]
+    mov     r7, #0
+    sub     r4, r4, #1
+    b       end_zero_while_start
+end_zero_while_loop
+    strb    r7, [r6, r4]
+    sub     r4, r4, #1                  ; x--
+end_zero_while_start
+    cmp     r4, #0
+    ldrge   r6, [r0, #vp8_writer_buffer]
+    ldrb    r12, [r6, r4]
+    cmpge   r12, #0xff
+    beq     end_zero_while_loop
+
+    ldr     r6, [r0, #vp8_writer_buffer]
+    ldrb    r7, [r6, r4]
+    add     r7, r7, #1
+    strb    r7, [r6, r4]
+end_high_bit_not_set
+    adds    r3, r3, #1                  ; ++count
+    lsl     r2, r2, #1                  ; lowvalue  <<= 1
+    bne     end_count_zero
+
+    ldr     r4, [r0, #vp8_writer_pos]
+    mvn     r3, #7
+    ldr     r7, [r0, #vp8_writer_buffer]
+    lsr     r6, r2, #24                 ; lowvalue >> 24
+    add     r12, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r12, [r0, #0x10]
+    strb    r6, [r7, r4]
+end_count_zero
+skip_extra_bits
+    add     r1, r1, #TOKENEXTRA_SZ      ; ++p
+check_p_lt_stop
+    ldr     r4, [sp, #0]                ; stop
+    cmp     r1, r4                      ; while( p < stop)
+    bcc     while_p_lt_stop
+
+    ldr     r6, [sp, #12]               ; mb_rows
+    ldr     r7, [sp, #16]               ; tokenlist address
+    subs    r6, r6, #1
+    add     r7, r7, #TOKENLIST_SZ       ; next element in the array
+    str     r6, [sp, #12]
+    bne     mb_row_loop
+
+    str     r2, [r0, #vp8_writer_lowvalue]
+    str     r5, [r0, #vp8_writer_range]
+    str     r3, [r0, #vp8_writer_count]
+    add     sp, sp, #24
+    pop     {r4-r11, pc}
+    ENDP
+
+_VP8_COMP_common_
+    DCD     vp8_comp_common
+_VP8_COMMON_MBrows_
+    DCD     vp8_common_mb_rows
+_VP8_COMP_tplist_
+    DCD     vp8_comp_tplist
+
+    END
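
For reference, the *_zero_while loops in the routine above are the boolean coder's carry
propagation: when a carry comes out of the top bit of lowvalue, trailing 0xff bytes already
written to the output wrap to 0x00 and the first earlier byte below 0xff is incremented. A
rough C sketch of just that step (the function name is illustrative, not an actual libvpx
helper):

    /* Carry propagation through already-written output bytes:
     * trailing 0xff bytes become 0x00 and the first byte below
     * 0xff absorbs the carry. */
    static void propagate_carry(unsigned char *buffer, int pos)
    {
        int x = pos - 1;
        while (x >= 0 && buffer[x] == 0xff) {
            buffer[x] = 0;      /* 0xff + carry wraps to 0x00, keep carrying */
            x--;
        }
        if (x >= 0)
            buffer[x] += 1;     /* absorb the carry here */
    }
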
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
@@ -0,0 +1,465 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT |vp8cx_pack_tokens_into_partitions_armv5|
+
+    INCLUDE asm_enc_offsets.asm
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA    |.text|, CODE, READONLY
+
+; r0 VP8_COMP *cpi
+; r1 unsigned char *cx_data
+; r2 int num_part
+; r3 *size
+; s0 vp8_coef_encodings
+; s1 vp8_extra_bits,
+; s2 const vp8_tree_index *,
+
+|vp8cx_pack_tokens_into_partitions_armv5| PROC
+    push    {r4-r11, lr}
+    sub     sp, sp, #44
+
+    ; Compute address of cpi->common.mb_rows
+    ldr     r4, _VP8_COMP_common_
+    ldr     r6, _VP8_COMMON_MBrows_
+    add     r4, r0, r4
+
+    ldr     r5, [r4, r6]                ; load up mb_rows
+
+    str     r5, [sp, #36]               ; save mb_rows
+    str     r1, [sp, #24]               ; save cx_data
+    str     r2, [sp, #20]               ; save num_part
+    str     r3, [sp, #8]                ; save *size
+
+    ; *size = 3*(num_part -1 );
+    sub     r2, r2, #1                  ; num_part - 1
+    add     r2, r2, r2, lsl #1          ; 3*(num_part - 1)
+    str     r2, [r3]
+
+    add     r2, r2, r1                  ; cx_data + *size
+    str     r2, [sp, #40]               ; ptr
+
+    ldr     r4, _VP8_COMP_tplist_
+    add     r4, r0, r4
+    ldr     r7, [r4, #0]                ; dereference cpi->tp_list
+    str     r7, [sp, #32]               ; store start of cpi->tp_list
+
+    ldr     r11, _VP8_COMP_bc2_         ; load up vp8_writer out of cpi
+    add     r0, r0, r11
+
+    mov     r11, #0
+    str     r11, [sp, #28]              ; i
+
+numparts_loop
+    ldr     r10, [sp, #40]              ; ptr
+    ldr     r5,  [sp, #36]              ; move mb_rows to the counting section
+    sub     r5, r5, r11                 ; row counter for this partition
+                                        ; starts at mb_rows - i
+    str     r5,  [sp, #12]
+
+    ; Reset all of the VP8 Writer data for each partition that
+    ; is processed.
+    ; start_encode
+    mov     r2, #0                      ; vp8_writer_lowvalue
+    mov     r5, #255                    ; vp8_writer_range
+    mvn     r3, #23                     ; vp8_writer_count
+
+    str     r2,  [r0, #vp8_writer_value]
+    str     r2,  [r0, #vp8_writer_pos]
+    str     r10, [r0, #vp8_writer_buffer]
+
+mb_row_loop
+
+    ldr     r1, [r7, #tokenlist_start]
+    ldr     r9, [r7, #tokenlist_stop]
+    str     r9, [sp, #0]                ; save stop for later comparison
+    str     r7, [sp, #16]               ; tokenlist address for next time
+
+    b       check_p_lt_stop
+
+    ; actual work gets done here!
+
+while_p_lt_stop
+    ldrb    r6, [r1, #tokenextra_token] ; t
+    ldr     r4, [sp, #80]               ; vp8_coef_encodings
+    mov     lr, #0
+    add     r4, r4, r6, lsl #3          ; a = vp8_coef_encodings + t
+    ldr     r9, [r1, #tokenextra_context_tree]   ; pp
+
+    ldrb    r7, [r1, #tokenextra_skip_eob_node]
+
+    ldr     r6, [r4, #vp8_token_value]  ; v
+    ldr     r8, [r4, #vp8_token_len]    ; n
+
+    ; vp8 specific skip_eob_node
+    cmp     r7, #0
+    movne   lr, #2                      ; i = 2
+    subne   r8, r8, #1                  ; --n
+
+    rsb     r4, r8, #32                 ; 32-n
+    ldr     r10, [sp, #88]              ; vp8_coef_tree
+
+    ; v is kept in r12 during the token pack loop
+    lsl     r12, r6, r4                ; r12 = v << 32 - n
+
+; loop start
+token_loop
+    ldrb    r4, [r9, lr, asr #1]        ; pp [i>>1]
+    sub     r7, r5, #1                  ; range-1
+
+    ; Decisions are made based on the bit value shifted
+    ; off of v, so set a flag here based on this.
+    ; This value is referred to as "bb"
+    lsls    r12, r12, #1                ; bb = v >> n
+    mul     r4, r4, r7                  ; ((range-1) * pp[i>>1]))
+
+    ; bb can only be 0 or 1.  So only execute this statement
+    ; if bb == 1, otherwise it will act like i + 0
+    addcs   lr, lr, #1                  ; i + bb
+
+    mov     r7, #1
+    ldrsb   lr, [r10, lr]               ; i = vp8_coef_tree[i+bb]
+    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)
+
+    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
+    subcs   r4, r5, r4                  ; if  (bb) range = range-split
+
+    ; Counting the leading zeros is used to normalize range.
+    clz     r6, r4
+    sub     r6, r6, #24                 ; shift
+
+    ; Flag is set on the sum of count.  This flag is used later
+    ; to determine if count >= 0
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     token_count_lt_zero         ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset = shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     token_high_bit_not_set
+
+    ldr     r4, [r0, #vp8_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos-1
+    b       token_zero_while_start
+token_zero_while_loop
+    mov     r10, #0
+    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+token_zero_while_start
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp8_writer_buffer]
+    ldrb    r11, [r7, r4]
+    cmpge   r11, #0xff
+    beq     token_zero_while_loop
+
+    ldr     r7, [r0, #vp8_writer_buffer]
+    ldrb    r10, [r7, r4]               ; w->buffer[x]
+    add     r10, r10, #1
+    strb    r10, [r7, r4]               ; w->buffer[x] + 1
+token_high_bit_not_set
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r10, [r0, #vp8_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp8_writer_pos]   ; w->pos
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r11, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r11, [r0, #vp8_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r10, r4]               ; w->buffer[w->pos++]
+
+    ; r10 holds vp8_coef_tree earlier in the loop, but is used as a
+    ; temp variable here, so reload vp8_coef_tree into r10 afterwards
+    ldr     r10, [sp, #88]              ; vp8_coef_tree
+
+token_count_lt_zero
+    lsl     r2, r2, r6                  ; lowvalue <<= shift
+
+    subs    r8, r8, #1                  ; --n
+    bne     token_loop
+
+    ldrb    r6, [r1, #tokenextra_token] ; t
+    ldr     r7, [sp, #84]                ; vp8_extra_bits
+    ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
+    ;  element.  Here sizeof(vp8_extra_bit_struct) == 16
+    add     r12, r7, r6, lsl #4         ; b = vp8_extra_bits + t
+
+    ldr     r4, [r12, #vp8_extra_bit_struct_base_val]
+    cmp     r4, #0
+    beq     skip_extra_bits
+
+;   if( b->base_val)
+    ldr     r8, [r12, #vp8_extra_bit_struct_len] ; L
+    ldrsh   lr, [r1, #tokenextra_extra] ; e = p->Extra
+    cmp     r8, #0                      ; if( L)
+    beq     no_extra_bits
+
+    ldr     r9, [r12, #vp8_extra_bit_struct_prob]
+    asr     r7, lr, #1                  ; v=e>>1
+
+    ldr     r10, [r12, #vp8_extra_bit_struct_tree]
+    str     r10, [sp, #4]               ; b->tree
+
+    rsb     r4, r8, #32
+    lsl     r12, r7, r4
+
+    mov     lr, #0                      ; i = 0
+
+extra_bits_loop
+    ldrb    r4, [r9, lr, asr #1]        ; pp[i>>1]
+    sub     r7, r5, #1                  ; range-1
+    lsls    r12, r12, #1                ; v >> n
+    mul     r4, r4, r7                  ; (range-1) * pp[i>>1]
+    addcs   lr, lr, #1                  ; i + bb
+
+    mov     r7, #1
+    ldrsb   lr, [r10, lr]               ; i = b->tree[i+bb]
+    add     r4, r7, r4, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)
+
+    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
+    subcs   r4, r5, r4                  ; if  (bb) range = range-split
+
+    clz     r6, r4
+    sub     r6, r6, #24
+
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     extra_count_lt_zero         ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset= shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     extra_high_bit_not_set
+
+    ldr     r4, [r0, #vp8_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos - 1
+    b       extra_zero_while_start
+extra_zero_while_loop
+    mov     r10, #0
+    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+extra_zero_while_start
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp8_writer_buffer]
+    ldrb    r11, [r7, r4]
+    cmpge   r11, #0xff
+    beq     extra_zero_while_loop
+
+    ldr     r7, [r0, #vp8_writer_buffer]
+    ldrb    r10, [r7, r4]
+    add     r10, r10, #1
+    strb    r10, [r7, r4]
+extra_high_bit_not_set
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r10, [r0, #vp8_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp8_writer_pos]
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r11, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r11, [r0, #vp8_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r10, r4]               ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
+    ldr     r10, [sp, #4]               ; b->tree
+extra_count_lt_zero
+    lsl     r2, r2, r6
+
+    subs    r8, r8, #1                  ; --n
+    bne     extra_bits_loop             ; while (n)
+
+no_extra_bits
+    ldr     lr, [r1, #4]                ; e = p->Extra
+    add     r4, r5, #1                  ; range + 1
+    tst     lr, #1
+    lsr     r4, r4, #1                  ; split = (range + 1) >> 1
+    addne   r2, r2, r4                  ; lowvalue += split
+    subne   r4, r5, r4                  ; range = range-split
+    tst     r2, #0x80000000             ; lowvalue & 0x80000000
+    lsl     r5, r4, #1                  ; range <<= 1
+    beq     end_high_bit_not_set
+
+    ldr     r4, [r0, #vp8_writer_pos]
+    mov     r7, #0
+    sub     r4, r4, #1
+    b       end_zero_while_start
+end_zero_while_loop
+    strb    r7, [r6, r4]
+    sub     r4, r4, #1                  ; x--
+end_zero_while_start
+    cmp     r4, #0
+    ldrge   r6, [r0, #vp8_writer_buffer]
+    ldrb    r12, [r6, r4]
+    cmpge   r12, #0xff
+    beq     end_zero_while_loop
+
+    ldr     r6, [r0, #vp8_writer_buffer]
+    ldrb    r7, [r6, r4]
+    add     r7, r7, #1
+    strb    r7, [r6, r4]
+end_high_bit_not_set
+    adds    r3, r3, #1                  ; ++count
+    lsl     r2, r2, #1                  ; lowvalue  <<= 1
+    bne     end_count_zero
+
+    ldr     r4, [r0, #vp8_writer_pos]
+    mvn     r3, #7
+    ldr     r7, [r0, #vp8_writer_buffer]
+    lsr     r6, r2, #24                 ; lowvalue >> 24
+    add     r12, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r12, [r0, #0x10]
+    strb    r6, [r7, r4]
+end_count_zero
+skip_extra_bits
+    add     r1, r1, #TOKENEXTRA_SZ      ; ++p
+check_p_lt_stop
+    ldr     r4, [sp, #0]                ; stop
+    cmp     r1, r4                      ; while( p < stop)
+    bcc     while_p_lt_stop
+
+    ldr     r10, [sp, #20]              ; num_parts
+    mov     r1, #TOKENLIST_SZ
+    mul     r1, r10, r1
+
+    ldr     r6, [sp, #12]               ; mb_rows
+    ldr     r7, [sp, #16]               ; tokenlist address
+    subs    r6, r6, r10
+    add     r7, r7, r1                  ; next element in the array
+    str     r6, [sp, #12]
+    bgt     mb_row_loop
+
+    mov     r12, #32
+
+stop_encode_loop
+    sub     r7, r5, #1                  ; range-1
+
+    mov     r4, r7, lsl #7              ; ((range-1) * 128)
+
+    mov     r7, #1
+    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * 128) >> 8)
+
+    ; Counting the leading zeros is used to normalize range.
+    clz     r6, r4
+    sub     r6, r6, #24                 ; shift
+
+    ; Flag is set on the sum of count.  This flag is used later
+    ; to determine if count >= 0
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     token_count_lt_zero_se      ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset = shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     token_high_bit_not_set_se
+
+    ldr     r4, [r0, #vp8_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos-1
+    b       token_zero_while_start_se
+token_zero_while_loop_se
+    mov     r10, #0
+    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+token_zero_while_start_se
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp8_writer_buffer]
+    ldrb    r11, [r7, r4]
+    cmpge   r11, #0xff
+    beq     token_zero_while_loop_se
+
+    ldr     r7, [r0, #vp8_writer_buffer]
+    ldrb    r10, [r7, r4]               ; w->buffer[x]
+    add     r10, r10, #1
+    strb    r10, [r7, r4]               ; w->buffer[x] + 1
+token_high_bit_not_set_se
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r10, [r0, #vp8_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp8_writer_pos]   ; w->pos
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r11, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r11, [r0, #vp8_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r10, r4]               ; w->buffer[w->pos++]
+
+token_count_lt_zero_se
+    lsl     r2, r2, r6                  ; lowvalue <<= shift
+
+    subs    r12, r12, #1
+    bne     stop_encode_loop
+
+    ldr     r10, [sp, #8]               ; *size
+    ldr     r11, [r10]
+    ldr     r4,  [r0, #vp8_writer_pos]  ; w->pos
+    add     r11, r11, r4                ; *size += w->pos
+    str     r11, [r10]
+
+    ldr     r9, [sp, #20]               ; num_parts
+    sub     r9, r9, #1
+    ldr     r10, [sp, #28]              ; i
+    cmp     r10, r9                     ; if(i<(num_part - 1))
+    bge     skip_write_partition
+
+    ldr     r12, [sp, #40]              ; ptr
+    add     r12, r12, r4                ; ptr += w->pos
+    str     r12, [sp, #40]
+
+    ldr     r9, [sp, #24]               ; cx_data
+    mov     r8, r4, asr #8
+    strb    r4, [r9, #0]
+    strb    r8, [r9, #1]
+    mov     r4, r4, asr #16
+    strb    r4, [r9, #2]
+
+    add     r9, r9, #3                  ; cx_data += 3
+    str     r9, [sp, #24]
+
+skip_write_partition
+
+    ldr     r11, [sp, #28]              ; i
+    ldr     r10, [sp, #20]              ; num_parts
+
+    add     r11, r11, #1                ; i++
+    str     r11, [sp, #28]
+
+    ldr     r7, [sp, #32]               ; cpi->tp_list[i]
+    mov     r1, #TOKENLIST_SZ
+    add     r7, r7, r1                  ; next element in cpi->tp_list
+    str     r7, [sp, #32]               ; cpi->tp_list[i+1]
+
+    cmp     r10, r11
+    bgt     numparts_loop
+
+
+    add     sp, sp, #44
+    pop     {r4-r11, pc}
+    ENDP
+
+_VP8_COMP_common_
+    DCD     vp8_comp_common
+_VP8_COMMON_MBrows_
+    DCD     vp8_common_mb_rows
+_VP8_COMP_tplist_
+    DCD     vp8_comp_tplist
+_VP8_COMP_bc2_
+    DCD     vp8_comp_bc2
+
+    END
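
The bookkeeping in this routine lays the partitions out as a header of 3-byte little-endian
size fields (one per partition except the last) followed by each partition's compressed
bytes, with *size accumulating the total. A rough C sketch of that layout logic only, with
the token packing abstracted behind a hypothetical pack_partition() helper (all names
illustrative):

    /* Sketch of the partition layout only; pack_partition() is a
     * hypothetical stand-in for the token-packing loop above.    */
    unsigned int pack_partition(unsigned char *dst, int part);

    void pack_into_partitions(unsigned char *cx_data, int num_part,
                              unsigned int *size)
    {
        /* compressed data starts after the 3-byte size fields */
        unsigned char *ptr = cx_data + 3 * (num_part - 1);

        *size = 3 * (num_part - 1);

        for (int i = 0; i < num_part; i++) {
            unsigned int pos = pack_partition(ptr, i);  /* bytes written */

            *size += pos;
            if (i < num_part - 1) {
                cx_data[0] = pos & 0xff;          /* little-endian 3-byte size */
                cx_data[1] = (pos >> 8) & 0xff;
                cx_data[2] = (pos >> 16) & 0xff;
                cx_data += 3;
            }
            ptr += pos;
        }
    }
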
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/armv6/vp8_fast_fdct4x4_armv6.asm
@@ -0,0 +1,262 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT |vp8_fast_fdct4x4_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA    |.text|, CODE, READONLY
+; void vp8_fast_fdct4x4_c(short *input, short *output, int pitch)
+|vp8_fast_fdct4x4_armv6| PROC
+
+    stmfd       sp!, {r4 - r12, lr}
+
+    ; PART 1
+
+    ; coeffs 0-3
+    ldrd        r4, r5, [r0]        ; [i1 | i0] [i3 | i2]
+
+    ldr         r10, c7500
+    ldr         r11, c14500
+    ldr         r12, c0x22a453a0    ; [2217*4 | 5352*4]
+    ldr         lr, c0x00080008
+    ror         r5, r5, #16         ; [i2 | i3]
+
+    qadd16      r6, r4, r5          ; [i1+i2 | i0+i3] = [b1 | a1] without shift
+    qsub16      r7, r4, r5          ; [i1-i2 | i0-i3] = [c1 | d1] without shift
+
+    add         r0, r0, r2          ; update input pointer
+
+    qadd16      r7, r7, r7          ; 2*[c1|d1] --> we can use smlad and smlsd
+                                    ; with 2217*4 and 5352*4 without losing the
+                                    ; sign bit (overflow)
+
+    smuad       r4, r6, lr          ; o0 = (i1+i2)*8 + (i0+i3)*8
+    smusd       r5, r6, lr          ; o2 = (i1+i2)*8 - (i0+i3)*8
+
+    smlad       r6, r7, r12, r11    ; o1 = (c1 * 2217 + d1 * 5352 +  14500)
+    smlsdx      r7, r7, r12, r10    ; o3 = (d1 * 2217 - c1 * 5352 +   7500)
+
+    ldrd        r8, r9, [r0]        ; [i5 | i4] [i7 | i6]
+
+    pkhbt       r3, r4, r6, lsl #4  ; [o1 | o0], keep in register for PART 2
+    pkhbt       r6, r5, r7, lsl #4  ; [o3 | o2]
+
+    str         r6, [r1, #4]
+
+    ; coeffs 4-7
+    ror         r9, r9, #16         ; [i6 | i7]
+
+    qadd16      r6, r8, r9          ; [i5+i6 | i4+i7] = [b1 | a1] without shift
+    qsub16      r7, r8, r9          ; [i5-i6 | i4-i7] = [c1 | d1] without shift
+
+    add         r0, r0, r2          ; update input pointer
+
+    qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd
+                                    ; with 2217*4 and 5352*4 without losing the
+                                    ; sign bit (overflow)
+
+    smuad       r9, r6, lr          ; o4 = (i5+i6)*8 + (i4+i7)*8
+    smusd       r8, r6, lr          ; o6 = (i5+i6)*8 - (i4+i7)*8
+
+    smlad       r6, r7, r12, r11    ; o5 = (c1 * 2217 + d1 * 5352 +  14500)
+    smlsdx      r7, r7, r12, r10    ; o7 = (d1 * 2217 - c1 * 5352 +   7500)
+
+    ldrd        r4, r5, [r0]        ; [i9 | i8] [i11 | i10]
+
+    pkhbt       r9, r9, r6, lsl #4  ; [o5 | o4], keep in register for PART 2
+    pkhbt       r6, r8, r7, lsl #4  ; [o7 | o6]
+
+    str         r6, [r1, #12]
+
+    ; coeffs 8-11
+    ror         r5, r5, #16         ; [i10 | i11]
+
+    qadd16      r6, r4, r5          ; [i9+i10 | i8+i11]=[b1 | a1] without shift
+    qsub16      r7, r4, r5          ; [i9-i10 | i8-i11]=[c1 | d1] without shift
+
+    add         r0, r0, r2          ; update input pointer
+
+    qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd
+                                    ; with 2217*4 and 5352*4 without losing the
+                                    ; sign bit (overflow)
+
+    smuad       r2, r6, lr          ; o8 = (i9+i10)*8 + (i8+i11)*8
+    smusd       r8, r6, lr          ; o10 = (i9+i10)*8 - (i8+i11)*8
+
+    smlad       r6, r7, r12, r11    ; o9 = (c1 * 2217 + d1 * 5352 +  14500)
+    smlsdx      r7, r7, r12, r10    ; o11 = (d1 * 2217 - c1 * 5352 +   7500)
+
+    ldrd        r4, r5, [r0]        ; [i13 | i12] [i15 | i14]
+
+    pkhbt       r2, r2, r6, lsl #4  ; [o9 | o8], keep in register for PART 2
+    pkhbt       r6, r8, r7, lsl #4  ; [o11 | o10]
+
+    str         r6, [r1, #20]
+
+    ; coeffs 12-15
+    ror         r5, r5, #16         ; [i14 | i15]
+
+    qadd16      r6, r4, r5          ; [i13+i14 | i12+i15]=[b1|a1] without shift
+    qsub16      r7, r4, r5          ; [i13-i14 | i12-i15]=[c1|d1] without shift
+
+    qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd
+                                    ; with 2217*4 and 5352*4 without losing the
+                                    ; sign bit (overflow)
+
+    smuad       r4, r6, lr          ; o12 = (i13+i14)*8 + (i12+i15)*8
+    smusd       r5, r6, lr          ; o14 = (i13+i14)*8 - (i12+i15)*8
+
+    smlad       r6, r7, r12, r11    ; o13 = (c1 * 2217 + d1 * 5352 +  14500)
+    smlsdx      r7, r7, r12, r10    ; o15 = (d1 * 2217 - c1 * 5352 +   7500)
+
+    pkhbt       r0, r4, r6, lsl #4  ; [o13 | o12], keep in register for PART 2
+    pkhbt       r6, r5, r7, lsl #4  ; [o15 | o14]
+
+    str         r6, [r1, #28]
+
+
+    ; PART 2 -------------------------------------------------
+    ldr         r11, c12000
+    ldr         r10, c51000
+    ldr         lr, c0x00070007
+
+    qadd16      r4, r3, r0          ; a1 = [i1+i13 | i0+i12]
+    qadd16      r5, r9, r2          ; b1 = [i5+i9  |  i4+i8]
+    qsub16      r6, r9, r2          ; c1 = [i5-i9  |  i4-i8]
+    qsub16      r7, r3, r0          ; d1 = [i1-i13 | i0-i12]
+
+    qadd16      r4, r4, lr          ; a1 + 7
+
+    add         r0, r11, #0x10000   ; add (d!=0)
+
+    qadd16      r2, r4, r5          ; a1 + b1 + 7
+    qsub16      r3, r4, r5          ; a1 - b1 + 7
+
+    ldr         r12, c0x08a914e8    ; [2217 | 5352]
+
+    lsl         r8, r2, #16         ; prepare bottom halfword for scaling
+    asr         r2, r2, #4          ; scale top halfword
+    lsl         r9, r3, #16         ; prepare bottom halfword for scaling
+    asr         r3, r3, #4          ; scale top halfword
+    pkhtb       r4, r2, r8, asr #20 ; pack and scale bottom halfword
+    pkhtb       r5, r3, r9, asr #20 ; pack and scale bottom halfword
+
+    smulbt      r2, r6, r12         ; [ ------ | c1*2217]
+    str         r4, [r1, #0]        ; [     o1 |      o0]
+    smultt      r3, r6, r12         ; [c1*2217 | ------ ]
+    str         r5, [r1, #16]       ; [     o9 |      o8]
+
+    smlabb      r8, r7, r12, r2     ; [ ------ | d1*5352]
+    smlatb      r9, r7, r12, r3     ; [d1*5352 | ------ ]
+
+    smulbb      r2, r6, r12         ; [ ------ | c1*5352]
+    smultb      r3, r6, r12         ; [c1*5352 | ------ ]
+
+    lsls        r6, r7, #16         ; d1 != 0 ?
+    addeq       r8, r8, r11         ; c1_b*2217+d1_b*5352+12000 + (d==0)
+    addne       r8, r8, r0          ; c1_b*2217+d1_b*5352+12000 + (d!=0)
+    asrs        r6, r7, #16
+    addeq       r9, r9, r11         ; c1_t*2217+d1_t*5352+12000 + (d==0)
+    addne       r9, r9, r0          ; c1_t*2217+d1_t*5352+12000 + (d!=0)
+
+    smlabt      r4, r7, r12, r10    ; [ ------ | d1*2217] + 51000
+    smlatt      r5, r7, r12, r10    ; [d1*2217 | ------ ] + 51000
+
+    pkhtb       r9, r9, r8, asr #16
+
+    sub         r4, r4, r2
+    sub         r5, r5, r3
+
+    ldr         r3, [r1, #4]        ; [i3 | i2]
+
+    pkhtb       r5, r5, r4, asr #16 ; [o13|o12]
+
+    str         r9, [r1, #8]        ; [o5 | o4]
+
+    ldr         r9, [r1, #12]       ; [i7 | i6]
+    ldr         r8, [r1, #28]       ; [i15|i14]
+    ldr         r2, [r1, #20]       ; [i11|i10]
+    str         r5, [r1, #24]       ; [o13|o12]
+
+    qadd16      r4, r3, r8          ; a1 = [i3+i15 | i2+i14]
+    qadd16      r5, r9, r2          ; b1 = [i7+i11 | i6+i10]
+
+    qadd16      r4, r4, lr          ; a1 + 7
+
+    qsub16      r6, r9, r2          ; c1 = [i7-i11 | i6-i10]
+    qadd16      r2, r4, r5          ; a1 + b1 + 7
+    qsub16      r7, r3, r8          ; d1 = [i3-i15 | i2-i14]
+    qsub16      r3, r4, r5          ; a1 - b1 + 7
+
+    lsl         r8, r2, #16         ; prepare bottom halfword for scaling
+    asr         r2, r2, #4          ; scale top halfword
+    lsl         r9, r3, #16         ; prepare bottom halfword for scaling
+    asr         r3, r3, #4          ; scale top halfword
+    pkhtb       r4, r2, r8, asr #20 ; pack and scale bottom halfword
+    pkhtb       r5, r3, r9, asr #20 ; pack and scale bottom halfword
+
+    smulbt      r2, r6, r12         ; [ ------ | c1*2217]
+    str         r4, [r1, #4]        ; [     o3 |      o2]
+    smultt      r3, r6, r12         ; [c1*2217 | ------ ]
+    str         r5, [r1, #20]       ; [    o11 |     o10]
+
+    smlabb      r8, r7, r12, r2     ; [ ------ | d1*5352]
+    smlatb      r9, r7, r12, r3     ; [d1*5352 | ------ ]
+
+    smulbb      r2, r6, r12         ; [ ------ | c1*5352]
+    smultb      r3, r6, r12         ; [c1*5352 | ------ ]
+
+    lsls        r6, r7, #16         ; d1 != 0 ?
+    addeq       r8, r8, r11         ; c1_b*2217+d1_b*5352+12000 + (d==0)
+    addne       r8, r8, r0          ; c1_b*2217+d1_b*5352+12000 + (d!=0)
+
+    asrs        r6, r7, #16
+    addeq       r9, r9, r11         ; c1_t*2217+d1_t*5352+12000 + (d==0)
+    addne       r9, r9, r0          ; c1_t*2217+d1_t*5352+12000 + (d!=0)
+
+    smlabt      r4, r7, r12, r10    ; [ ------ | d1*2217] + 51000
+    smlatt      r5, r7, r12, r10    ; [d1*2217 | ------ ] + 51000
+
+    pkhtb       r9, r9, r8, asr #16
+
+    sub         r4, r4, r2
+    sub         r5, r5, r3
+
+    str         r9, [r1, #12]       ; [o7 | o6]
+    pkhtb       r5, r5, r4, asr #16 ; [o15|o14]
+
+    str         r5, [r1, #28]       ; [o15|o14]
+
+    ldmfd       sp!, {r4 - r12, pc}
+
+    ENDP
+
+; Used constants
+c7500
+    DCD     7500
+c14500
+    DCD     14500
+c0x22a453a0
+    DCD     0x22a453a0
+c0x00080008
+    DCD     0x00080008
+c12000
+    DCD     12000
+c51000
+    DCD     51000
+c0x00070007
+    DCD     0x00070007
+c0x08a914e8
+    DCD     0x08a914e8
+
+    END
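
The constants above (2217, 5352, 14500, 7500 in the row pass; 12000, 51000 and the +7
rounding in the column pass) correspond to the scalar 4x4 forward transform this routine
vectorizes. A rough C sketch of that transform, following the arithmetic the comments
describe rather than being copied from libvpx (the name is illustrative):

    void fdct4x4_sketch(short *input, short *output, int pitch)
    {
        short *ip = input, *op = output;

        for (int i = 0; i < 4; i++) {            /* row (horizontal) pass */
            int a1 = (ip[0] + ip[3]) * 8;
            int b1 = (ip[1] + ip[2]) * 8;
            int c1 = (ip[1] - ip[2]) * 8;
            int d1 = (ip[0] - ip[3]) * 8;

            op[0] = a1 + b1;
            op[2] = a1 - b1;
            op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12;
            op[3] = (d1 * 2217 - c1 * 5352 +  7500) >> 12;

            ip += pitch / 2;                     /* pitch is in bytes, data is short */
            op += 4;
        }

        ip = output;
        op = output;
        for (int i = 0; i < 4; i++) {            /* column (vertical) pass */
            int a1 = ip[0] + ip[12];
            int b1 = ip[4] + ip[8];
            int c1 = ip[4] - ip[8];
            int d1 = ip[0] - ip[12];

            op[0]  = (a1 + b1 + 7) >> 4;
            op[8]  = (a1 - b1 + 7) >> 4;
            op[4]  = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0);
            op[12] = (d1 * 2217 - c1 * 5352 + 51000) >> 16;

            ip++;
            op++;
        }
    }
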
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm
@@ -0,0 +1,224 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_fast_quantize_b_armv6|
+
+    INCLUDE asm_enc_offsets.asm
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    BLOCK *b
+; r1    BLOCKD *d
+|vp8_fast_quantize_b_armv6| PROC
+    stmfd   sp!, {r1, r4-r11, lr}
+
+    ldr     r3, [r0, #vp8_block_coeff]      ; coeff
+    ldr     r4, [r0, #vp8_block_quant_fast] ; quant_fast
+    ldr     r5, [r0, #vp8_block_round]      ; round
+    ldr     r6, [r1, #vp8_blockd_qcoeff]    ; qcoeff
+    ldr     r7, [r1, #vp8_blockd_dqcoeff]   ; dqcoeff
+    ldr     r8, [r1, #vp8_blockd_dequant]   ; dequant
+
+    ldr     r2, loop_count          ; loop_count=0x1000000. 'lsls' instruction
+                                    ; is used to update the counter so that
+                                    ; it can be used to mark nonzero
+                                    ; quantized coefficient pairs.
+
+    mov     r1, #0                  ; flags for quantized coeffs
+
+    ; PART 1: quantization and dequantization loop
+loop
+    ldr     r9, [r3], #4            ; [z1 | z0]
+    ldr     r10, [r5], #4           ; [r1 | r0]
+    ldr     r11, [r4], #4           ; [q1 | q0]
+
+    ssat16  lr, #1, r9              ; [sz1 | sz0]
+    eor     r9, r9, lr              ; [z1 ^ sz1 | z0 ^ sz0]
+    ssub16  r9, r9, lr              ; x = (z ^ sz) - sz
+    sadd16  r9, r9, r10             ; [x1+r1 | x0+r0]
+
+    ldr     r12, [r3], #4           ; [z3 | z2]
+
+    smulbb  r0, r9, r11             ; [(x0+r0)*q0]
+    smultt  r9, r9, r11             ; [(x1+r1)*q1]
+
+    ldr     r10, [r5], #4           ; [r3 | r2]
+
+    ssat16  r11, #1, r12            ; [sz3 | sz2]
+    eor     r12, r12, r11           ; [z3 ^ sz3 | z2 ^ sz2]
+    pkhtb   r0, r9, r0, asr #16     ; [y1 | y0]
+    ldr     r9, [r4], #4            ; [q3 | q2]
+    ssub16  r12, r12, r11           ; x = (z ^ sz) - sz
+
+    sadd16  r12, r12, r10           ; [x3+r3 | x2+r2]
+
+    eor     r0, r0, lr              ; [(y1 ^ sz1) | (y0 ^ sz0)]
+
+    smulbb  r10, r12, r9            ; [(x2+r2)*q2]
+    smultt  r12, r12, r9            ; [(x3+r3)*q3]
+
+    ssub16  r0, r0, lr              ; x = (y ^ sz) - sz
+
+    cmp     r0, #0                  ; check if zero
+    orrne   r1, r1, r2, lsr #24     ; add flag for nonzero coeffs
+
+    str     r0, [r6], #4            ; *qcoeff++ = x
+    ldr     r9, [r8], #4            ; [dq1 | dq0]
+
+    pkhtb   r10, r12, r10, asr #16  ; [y3 | y2]
+    eor     r10, r10, r11           ; [(y3 ^ sz3) | (y2 ^ sz2)]
+    ssub16  r10, r10, r11           ; x = (y ^ sz) - sz
+
+    cmp     r10, #0                 ; check if zero
+    orrne   r1, r1, r2, lsr #23     ; add flag for nonzero coeffs
+
+    str     r10, [r6], #4           ; *qcoeff++ = x
+    ldr     r11, [r8], #4           ; [dq3 | dq2]
+
+    smulbb  r12, r0, r9             ; [x0*dq0]
+    smultt  r0, r0, r9              ; [x1*dq1]
+
+    smulbb  r9, r10, r11            ; [x2*dq2]
+    smultt  r10, r10, r11           ; [x3*dq3]
+
+    lsls    r2, r2, #2              ; update loop counter
+    strh    r12, [r7, #0]           ; dqcoeff[0] = [x0*dq0]
+    strh    r0, [r7, #2]            ; dqcoeff[1] = [x1*dq1]
+    strh    r9, [r7, #4]            ; dqcoeff[2] = [x2*dq2]
+    strh    r10, [r7, #6]           ; dqcoeff[3] = [x3*dq3]
+    add     r7, r7, #8              ; dqcoeff += 8
+    bne     loop
+
+    ; PART 2: check position for eob...
+    mov     lr, #0                  ; init eob
+    cmp     r1, #0                  ; any nonzero coeffs after quantization?
+    ldr     r11, [sp, #0]           ; restore BLOCKD pointer
+    beq     end                     ; skip eob calculations if all zero
+
+    ldr     r0, [r11, #vp8_blockd_qcoeff]
+
+    ; check shortcut for nonzero qcoeffs
+    tst    r1, #0x80
+    bne    quant_coeff_15_14
+    tst    r1, #0x20
+    bne    quant_coeff_13_11
+    tst    r1, #0x8
+    bne    quant_coeff_12_7
+    tst    r1, #0x40
+    bne    quant_coeff_10_9
+    tst    r1, #0x10
+    bne    quant_coeff_8_3
+    tst    r1, #0x2
+    bne    quant_coeff_6_5
+    tst    r1, #0x4
+    bne    quant_coeff_4_2
+    b      quant_coeff_1_0
+
+quant_coeff_15_14
+    ldrh    r2, [r0, #30]       ; rc=15, i=15
+    mov     lr, #16
+    cmp     r2, #0
+    bne     end
+
+    ldrh    r3, [r0, #28]       ; rc=14, i=14
+    mov     lr, #15
+    cmp     r3, #0
+    bne     end
+
+quant_coeff_13_11
+    ldrh    r2, [r0, #22]       ; rc=11, i=13
+    mov     lr, #14
+    cmp     r2, #0
+    bne     end
+
+quant_coeff_12_7
+    ldrh    r3, [r0, #14]       ; rc=7,  i=12
+    mov     lr, #13
+    cmp     r3, #0
+    bne     end
+
+    ldrh    r2, [r0, #20]       ; rc=10, i=11
+    mov     lr, #12
+    cmp     r2, #0
+    bne     end
+
+quant_coeff_10_9
+    ldrh    r3, [r0, #26]       ; rc=13, i=10
+    mov     lr, #11
+    cmp     r3, #0
+    bne     end
+
+    ldrh    r2, [r0, #24]       ; rc=12, i=9
+    mov     lr, #10
+    cmp     r2, #0
+    bne     end
+
+quant_coeff_8_3
+    ldrh    r3, [r0, #18]       ; rc=9,  i=8
+    mov     lr, #9
+    cmp     r3, #0
+    bne     end
+
+    ldrh    r2, [r0, #12]       ; rc=6,  i=7
+    mov     lr, #8
+    cmp     r2, #0
+    bne     end
+
+quant_coeff_6_5
+    ldrh    r3, [r0, #6]        ; rc=3,  i=6
+    mov     lr, #7
+    cmp     r3, #0
+    bne     end
+
+    ldrh    r2, [r0, #4]        ; rc=2,  i=5
+    mov     lr, #6
+    cmp     r2, #0
+    bne     end
+
+quant_coeff_4_2
+    ldrh    r3, [r0, #10]       ; rc=5,  i=4
+    mov     lr, #5
+    cmp     r3, #0
+    bne     end
+
+    ldrh    r2, [r0, #16]       ; rc=8,  i=3
+    mov     lr, #4
+    cmp     r2, #0
+    bne     end
+
+    ldrh    r3, [r0, #8]        ; rc=4,  i=2
+    mov     lr, #3
+    cmp     r3, #0
+    bne     end
+
+quant_coeff_1_0
+    ldrh    r2, [r0, #2]        ; rc=1,  i=1
+    mov     lr, #2
+    cmp     r2, #0
+    bne     end
+
+    mov     lr, #1              ; rc=0,  i=0
+
+end
+    str     lr, [r11, #vp8_blockd_eob]
+    ldmfd   sp!, {r1, r4-r11, pc}
+
+    ENDP
+
+loop_count
+    DCD     0x1000000
+
+    END
+
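
Per coefficient, the loop above computes abs(z), adds the rounding term, multiplies by the
quantizer and keeps the high 16 bits, restores the sign, and dequantizes; the eob is then
the scan-order position just past the last nonzero coefficient (the rc= comments give the
zig-zag mapping). A rough per-coefficient C sketch, with zigzag[] standing in for that scan
table and all names illustrative:

    /* Rough sketch of the per-coefficient quantize/dequantize step. */
    static void fast_quantize_sketch(const short *coeff, const short *round,
                                     const short *quant, const short *dequant,
                                     short *qcoeff, short *dqcoeff, int *eob,
                                     const int *zigzag /* scan order, assumed */)
    {
        *eob = 0;
        for (int i = 0; i < 16; i++) {
            int rc = zigzag[i];
            int z  = coeff[rc];
            int sz = z >> 31;                    /* 0 or -1: sign of z (arithmetic shift) */
            int x  = (z ^ sz) - sz;              /* abs(z)                                */
            int y  = ((x + round[rc]) * quant[rc]) >> 16;

            x = (y ^ sz) - sz;                   /* restore the sign                      */
            qcoeff[rc]  = (short)x;
            dqcoeff[rc] = (short)(x * dequant[rc]);
            if (x)
                *eob = i + 1;                    /* one past the last nonzero, scan order */
        }
    }
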
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm
@@ -0,0 +1,138 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_mse16x16_armv6|
+
+    ARM
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+;
+;note: Based on vp8_variance16x16_armv6. In this function the sum is never
+;      used, so that part of the calculation has been removed.
+
+|vp8_mse16x16_armv6| PROC
+
+    push    {r4-r9, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     r4, #0              ; initialize sse = 0
+
+loop
+    ; 1st 4 pixels
+    ldr     r5, [r0, #0x0]      ; load 4 src pixels
+    ldr     r6, [r2, #0x0]      ; load 4 ref pixels
+
+    mov     lr, #0              ; constant zero
+
+    usub8   r8, r5, r6          ; calculate difference
+    pld     [r0, r1, lsl #1]
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r5, r7, lr          ; calculate sum of positive differences
+    usad8   r6, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r7          ; differences of all 4 pixels
+
+    ldr     r5, [r0, #0x4]      ; load 4 src pixels
+
+    ; calculate sse
+    uxtb16  r6, r8              ; byte (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r6, [r2, #0x4]      ; load 4 ref pixels
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r5, r6          ; calculate difference
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r5, r7, lr          ; calculate sum of positive differences
+    usad8   r6, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r7          ; differences of all 4 pixels
+    ldr     r5, [r0, #0x8]      ; load 4 src pixels
+    ; calculate sse
+    uxtb16  r6, r8              ; byte (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r6, [r2, #0x8]      ; load 4 ref pixels
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r5, r6          ; calculate difference
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r5, r7, lr          ; calculate sum of positive differences
+    usad8   r6, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r7          ; differences of all 4 pixels
+
+    ldr     r5, [r0, #0xc]      ; load 4 src pixels
+
+    ; calculate sse
+    uxtb16  r6, r8              ; byte (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r6, [r2, #0xc]      ; load 4 ref pixels
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r5, r6          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set ref_ptr to next row
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r5, r7, lr          ; calculate sum of positive differences
+    usad8   r6, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r7          ; differences of all 4 pixels
+
+    subs    r12, r12, #1        ; next row
+
+    ; calculate sse
+    uxtb16  r6, r8              ; byte (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    bne     loop
+
+    ; return stuff
+    ldr     r1, [sp, #28]       ; get address of sse
+    mov     r0, r4              ; return sse
+    str     r4, [r1]            ; store sse
+
+    pop     {r4-r9, pc}
+
+    ENDP
+
+    END
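
Scalar form of what this routine computes: the sum of squared differences over the 16x16
block, returned and also stored through the sse pointer. A rough sketch (name illustrative):

    /* Sum of squared differences over a 16x16 block. */
    static unsigned int mse16x16_sketch(const unsigned char *src, int src_stride,
                                        const unsigned char *ref, int ref_stride,
                                        unsigned int *sse)
    {
        unsigned int acc = 0;
        for (int r = 0; r < 16; r++) {
            for (int c = 0; c < 16; c++) {
                int d = src[c] - ref[c];
                acc += (unsigned int)(d * d);
            }
            src += src_stride;
            ref += ref_stride;
        }
        *sse = acc;
        return acc;         /* the armv6 routine returns sse as well */
    }
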
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/armv6/vp8_sad16x16_armv6.asm
@@ -0,0 +1,96 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_sad16x16_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    const unsigned char *src_ptr
+; r1    int  src_stride
+; r2    const unsigned char *ref_ptr
+; r3    int  ref_stride
+; stack max_sad (not used)
+|vp8_sad16x16_armv6| PROC
+    stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+    pld     [r0, r1, lsl #1]
+    pld     [r2, r3, lsl #1]
+
+    mov     r4, #0              ; sad = 0;
+    mov     r5, #8              ; loop count
+
+loop
+    ; 1st row
+    ldr     r6, [r0, #0x0]      ; load 4 src pixels (1A)
+    ldr     r8, [r2, #0x0]      ; load 4 ref pixels (1A)
+    ldr     r7, [r0, #0x4]      ; load 4 src pixels (1A)
+    ldr     r9, [r2, #0x4]      ; load 4 ref pixels (1A)
+    ldr     r10, [r0, #0x8]     ; load 4 src pixels (1B)
+    ldr     r11, [r0, #0xC]     ; load 4 src pixels (1B)
+
+    usada8  r4, r8, r6, r4      ; calculate sad for 4 pixels
+    usad8   r8, r7, r9          ; calculate sad for 4 pixels
+
+    ldr     r12, [r2, #0x8]     ; load 4 ref pixels (1B)
+    ldr     lr, [r2, #0xC]      ; load 4 ref pixels (1B)
+
+    add     r0, r0, r1          ; set src pointer to next row
+    add     r2, r2, r3          ; set ref pointer to next row
+
+    pld     [r0, r1, lsl #1]
+    pld     [r2, r3, lsl #1]
+
+    usada8  r4, r10, r12, r4    ; calculate sad for 4 pixels
+    usada8  r8, r11, lr, r8     ; calculate sad for 4 pixels
+
+    ldr     r6, [r0, #0x0]      ; load 4 src pixels (2A)
+    ldr     r7, [r0, #0x4]      ; load 4 src pixels (2A)
+    add     r4, r4, r8          ; add partial sad values
+
+    ; 2nd row
+    ldr     r8, [r2, #0x0]      ; load 4 ref pixels (2A)
+    ldr     r9, [r2, #0x4]      ; load 4 ref pixels (2A)
+    ldr     r10, [r0, #0x8]     ; load 4 src pixels (2B)
+    ldr     r11, [r0, #0xC]     ; load 4 src pixels (2B)
+
+    usada8  r4, r6, r8, r4      ; calculate sad for 4 pixels
+    usad8   r8, r7, r9          ; calculate sad for 4 pixels
+
+    ldr     r12, [r2, #0x8]     ; load 4 ref pixels (2B)
+    ldr     lr, [r2, #0xC]      ; load 4 ref pixels (2B)
+
+    add     r0, r0, r1          ; set src pointer to next row
+    add     r2, r2, r3          ; set ref pointer to next row
+
+    usada8  r4, r10, r12, r4    ; calculate sad for 4 pixels
+    usada8  r8, r11, lr, r8     ; calculate sad for 4 pixels
+
+    pld     [r0, r1, lsl #1]
+    pld     [r2, r3, lsl #1]
+
+    subs    r5, r5, #1          ; decrement loop counter
+    add     r4, r4, r8          ; add partial sad values
+
+    bne     loop
+
+    mov     r0, r4              ; return sad
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+    END
+
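
Scalar form of the routine above: the sum of absolute differences over the 16x16 block
(the max_sad argument on the stack is ignored, as noted). A rough sketch (name illustrative):

    /* Sum of absolute differences over a 16x16 block. */
    static unsigned int sad16x16_sketch(const unsigned char *src, int src_stride,
                                        const unsigned char *ref, int ref_stride)
    {
        unsigned int sad = 0;
        for (int r = 0; r < 16; r++) {
            for (int c = 0; c < 16; c++) {
                int d = src[c] - ref[c];
                sad += (d < 0) ? -d : d;
            }
            src += src_stride;
            ref += ref_stride;
        }
        return sad;
    }
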
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm
@@ -0,0 +1,265 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_subtract_mby_armv6|
+    EXPORT  |vp8_subtract_mbuv_armv6|
+    EXPORT  |vp8_subtract_b_armv6|
+
+    INCLUDE asm_enc_offsets.asm
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    BLOCK *be
+; r1    BLOCKD *bd
+; r2    int pitch
+|vp8_subtract_b_armv6| PROC
+
+    stmfd   sp!, {r4-r9}
+
+    ldr     r4, [r0, #vp8_block_base_src]
+    ldr     r5, [r0, #vp8_block_src]
+    ldr     r6, [r0, #vp8_block_src_diff]
+
+    ldr     r3, [r4]
+    ldr     r7, [r0, #vp8_block_src_stride]
+    add     r3, r3, r5          ; src = *base_src + src
+    ldr     r8, [r1, #vp8_blockd_predictor]
+
+    mov     r9, #4              ; loop count
+
+loop_block
+
+    ldr     r0, [r3], r7        ; src
+    ldr     r1, [r8], r2        ; pred
+
+    uxtb16  r4, r0              ; [s2 | s0]
+    uxtb16  r5, r1              ; [p2 | p0]
+    uxtb16  r0, r0, ror #8      ; [s3 | s1]
+    uxtb16  r1, r1, ror #8      ; [p3 | p1]
+
+    usub16  r4, r4, r5          ; [d2 | d0]
+    usub16  r5, r0, r1          ; [d3 | d1]
+
+    subs    r9, r9, #1          ; decrement loop counter
+
+    pkhbt   r0, r4, r5, lsl #16 ; [d1 | d0]
+    pkhtb   r1, r5, r4, asr #16 ; [d3 | d2]
+
+    str     r0, [r6, #0]        ; diff
+    str     r1, [r6, #4]        ; diff
+
+    add     r6, r6, r2, lsl #1  ; update diff pointer
+    bne     loop_block
+
+    ldmfd   sp!, {r4-r9}
+    mov     pc, lr
+
+    ENDP
+
+
+; r0    short *diff
+; r1    unsigned char *usrc
+; r2    unsigned char *vsrc
+; r3    unsigned char *pred
+; stack int stride
+|vp8_subtract_mbuv_armv6| PROC
+
+    stmfd   sp!, {r4-r12, lr}
+
+    add     r0, r0, #512        ; set *diff to point to Cb
+    add     r3, r3, #256        ; set *pred to point to Cb
+
+    mov     r4, #8              ; loop count
+    ldr     r5, [sp, #40]       ; stride
+
+    ; Subtract U block
+loop_u
+    ldr     r6, [r1]            ; src       (A)
+    ldr     r7, [r3], #4        ; pred      (A)
+
+    uxtb16  r8, r6              ; [s2 | s0] (A)
+    uxtb16  r9, r7              ; [p2 | p0] (A)
+    uxtb16  r10, r6, ror #8     ; [s3 | s1] (A)
+    uxtb16  r11, r7, ror #8     ; [p3 | p1] (A)
+
+    usub16  r6, r8, r9          ; [d2 | d0] (A)
+    usub16  r7, r10, r11        ; [d3 | d1] (A)
+
+    ldr     r10, [r1, #4]       ; src       (B)
+    ldr     r11, [r3], #4       ; pred      (B)
+
+    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (A)
+    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (A)
+
+    str     r8, [r0], #4        ; diff      (A)
+    uxtb16  r8, r10             ; [s2 | s0] (B)
+    str     r9, [r0], #4        ; diff      (A)
+
+    uxtb16  r9, r11             ; [p2 | p0] (B)
+    uxtb16  r10, r10, ror #8    ; [s3 | s1] (B)
+    uxtb16  r11, r11, ror #8    ; [p3 | p1] (B)
+
+    usub16  r6, r8, r9          ; [d2 | d0] (B)
+    usub16  r7, r10, r11        ; [d3 | d1] (B)
+
+    add     r1, r1, r5          ; update usrc pointer
+
+    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (B)
+    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (B)
+
+    str     r8, [r0], #4        ; diff      (B)
+    subs    r4, r4, #1          ; update loop counter
+    str     r9, [r0], #4        ; diff      (B)
+
+    bne     loop_u
+
+    mov     r4, #8              ; loop count
+
+    ; Subtract V block
+loop_v
+    ldr     r6, [r2]            ; src       (A)
+    ldr     r7, [r3], #4        ; pred      (A)
+
+    uxtb16  r8, r6              ; [s2 | s0] (A)
+    uxtb16  r9, r7              ; [p2 | p0] (A)
+    uxtb16  r10, r6, ror #8     ; [s3 | s1] (A)
+    uxtb16  r11, r7, ror #8     ; [p3 | p1] (A)
+
+    usub16  r6, r8, r9          ; [d2 | d0] (A)
+    usub16  r7, r10, r11        ; [d3 | d1] (A)
+
+    ldr     r10, [r2, #4]       ; src       (B)
+    ldr     r11, [r3], #4       ; pred      (B)
+
+    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (A)
+    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (A)
+
+    str     r8, [r0], #4        ; diff      (A)
+    uxtb16  r8, r10             ; [s2 | s0] (B)
+    str     r9, [r0], #4        ; diff      (A)
+
+    uxtb16  r9, r11             ; [p2 | p0] (B)
+    uxtb16  r10, r10, ror #8    ; [s3 | s1] (B)
+    uxtb16  r11, r11, ror #8    ; [p3 | p1] (B)
+
+    usub16  r6, r8, r9          ; [d2 | d0] (B)
+    usub16  r7, r10, r11        ; [d3 | d1] (B)
+
+    add     r2, r2, r5          ; update vsrc pointer
+
+    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (B)
+    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (B)
+
+    str     r8, [r0], #4        ; diff      (B)
+    subs    r4, r4, #1          ; update loop counter
+    str     r9, [r0], #4        ; diff      (B)
+
+    bne     loop_v
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+
+; r0    short *diff
+; r1    unsigned char *src
+; r2    unsigned char *pred
+; r3    int stride
+|vp8_subtract_mby_armv6| PROC
+
+    stmfd   sp!, {r4-r11}
+
+    mov     r4, #16
+loop
+    ldr     r6, [r1]            ; src       (A)
+    ldr     r7, [r2], #4        ; pred      (A)
+
+    uxtb16  r8, r6              ; [s2 | s0] (A)
+    uxtb16  r9, r7              ; [p2 | p0] (A)
+    uxtb16  r10, r6, ror #8     ; [s3 | s1] (A)
+    uxtb16  r11, r7, ror #8     ; [p3 | p1] (A)
+
+    usub16  r6, r8, r9          ; [d2 | d0] (A)
+    usub16  r7, r10, r11        ; [d3 | d1] (A)
+
+    ldr     r10, [r1, #4]       ; src       (B)
+    ldr     r11, [r2], #4       ; pred      (B)
+
+    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (A)
+    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (A)
+
+    str     r8, [r0], #4        ; diff      (A)
+    uxtb16  r8, r10             ; [s2 | s0] (B)
+    str     r9, [r0], #4        ; diff      (A)
+
+    uxtb16  r9, r11             ; [p2 | p0] (B)
+    uxtb16  r10, r10, ror #8    ; [s3 | s1] (B)
+    uxtb16  r11, r11, ror #8    ; [p3 | p1] (B)
+
+    usub16  r6, r8, r9          ; [d2 | d0] (B)
+    usub16  r7, r10, r11        ; [d3 | d1] (B)
+
+    ldr     r10, [r1, #8]       ; src       (C)
+    ldr     r11, [r2], #4       ; pred      (C)
+
+    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (B)
+    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (B)
+
+    str     r8, [r0], #4        ; diff      (B)
+    uxtb16  r8, r10             ; [s2 | s0] (C)
+    str     r9, [r0], #4        ; diff      (B)
+
+    uxtb16  r9, r11             ; [p2 | p0] (C)
+    uxtb16  r10, r10, ror #8    ; [s3 | s1] (C)
+    uxtb16  r11, r11, ror #8    ; [p3 | p1] (C)
+
+    usub16  r6, r8, r9          ; [d2 | d0] (C)
+    usub16  r7, r10, r11        ; [d3 | d1] (C)
+
+    ldr     r10, [r1, #12]      ; src       (D)
+    ldr     r11, [r2], #4       ; pred      (D)
+
+    pkhbt   r8, r6, r7, lsl #16  ; [d1 | d0] (C)
+    pkhtb   r9, r7, r6, asr #16  ; [d3 | d2] (C)
+
+    str     r8, [r0], #4        ; diff      (C)
+    uxtb16  r8, r10             ; [s2 | s0] (D)
+    str     r9, [r0], #4        ; diff      (C)
+
+    uxtb16  r9, r11             ; [p2 | p0] (D)
+    uxtb16  r10, r10, ror #8    ; [s3 | s1] (D)
+    uxtb16  r11, r11, ror #8    ; [p3 | p1] (D)
+
+    usub16  r6, r8, r9          ; [d2 | d0] (D)
+    usub16  r7, r10, r11        ; [d3 | d1] (D)
+
+    add     r1, r1, r3          ; update src pointer
+
+    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (D)
+    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (D)
+
+    str     r8, [r0], #4        ; diff      (D)
+    subs    r4, r4, #1          ; update loop counter
+    str     r9, [r0], #4        ; diff      (D)
+
+    bne     loop
+
+    ldmfd   sp!, {r4-r11}
+    mov     pc, lr
+
+    ENDP
+
+    END
+
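
Scalar form of vp8_subtract_mby_armv6 above: a 16x16 luma residual, diff = src - pred, with
the source advancing by the stride and the predictor and diff buffers contiguous. A rough
sketch (name illustrative):

    /* 16x16 residual: diff = src - pred, one row per iteration. */
    static void subtract_mby_sketch(short *diff, const unsigned char *src,
                                    const unsigned char *pred, int stride)
    {
        for (int r = 0; r < 16; r++) {
            for (int c = 0; c < 16; c++)
                diff[c] = (short)(src[c] - pred[c]);
            diff += 16;
            pred += 16;         /* predictor buffer is contiguous 16x16 */
            src  += stride;
        }
    }
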
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
@@ -0,0 +1,154 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_variance16x16_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp8_variance16x16_armv6| PROC
+
+    stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r8, #0              ; initialize sum = 0
+    mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+
+loop
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0]        ; load 4 src pixels
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
+
+    mov     lr, #0              ; constant zero
+
+    usub8   r6, r4, r5          ; calculate difference
+    pld     [r0, r1, lsl #1]
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #4]        ; load 4 src pixels
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #8]        ; load 4 src pixels
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #12]       ; load 4 src pixels
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set ref_ptr to next row
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+
+    subs    r12, r12, #1
+
+    bne     loop
+
+    ; return stuff
+    ldr     r6, [sp, #40]       ; get address of sse
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+    END
+
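
Scalar form of the variance routines: accumulate the signed sum and the sum of squares of
the differences, then return sse - sum*sum/N. The 16x16 routine above shifts by 8 (256
pixels); the 8x8 routine that follows shifts by 6 (64 pixels). A rough parameterized sketch
(names illustrative), e.g. variance_sketch(src, ss, ref, rs, 16, 8, &sse) for the 16x16 case:

    /* variance = sse - sum*sum/N over a size x size block;
     * shift is 8 for 16x16 (N = 256) and 6 for 8x8 (N = 64). */
    static unsigned int variance_sketch(const unsigned char *src, int src_stride,
                                        const unsigned char *ref, int ref_stride,
                                        int size, int shift, unsigned int *sse)
    {
        int sum = 0;
        unsigned int acc = 0;

        for (int r = 0; r < size; r++) {
            for (int c = 0; c < size; c++) {
                int d = src[c] - ref[c];
                sum += d;
                acc += (unsigned int)(d * d);
            }
            src += src_stride;
            ref += ref_stride;
        }
        *sse = acc;
        return acc - (unsigned int)(((long long)sum * sum) >> shift);
    }
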
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/armv6/vp8_variance8x8_armv6.asm
@@ -0,0 +1,101 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_variance8x8_armv6|
+
+    ARM
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp8_variance8x8_armv6| PROC
+
+    push    {r4-r10, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r12, #8             ; set loop counter to 8 (=block height)
+    mov     r4, #0              ; initialize sum = 0
+    mov     r5, #0              ; initialize sse = 0
+
+loop
+    ; 1st 4 pixels
+    ldr     r6, [r0, #0x0]      ; load 4 src pixels
+    ldr     r7, [r2, #0x0]      ; load 4 ref pixels
+
+    mov     lr, #0              ; constant zero
+
+    usub8   r8, r6, r7          ; calculate difference
+    pld     [r0, r1, lsl #1]
+    sel     r10, r8, lr         ; select bytes with positive difference
+    usub8   r9, r7, r6          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r6, r10, lr         ; calculate sum of positive differences
+    usad8   r7, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r10         ; differences of all 4 pixels
+    ; calculate total sum
+    add     r4, r4, r6          ; add positive differences to sum
+    sub     r4, r4, r7          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r7, r8              ; byte (two pixels) to halfwords
+    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
+    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r6, [r0, #0x4]      ; load 4 src pixels
+    ldr     r7, [r2, #0x4]      ; load 4 ref pixels
+    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r6, r7          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r10, r8, lr         ; select bytes with positive difference
+    usub8   r9, r7, r6          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set dst_ptr to next row
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r6, r10, lr         ; calculate sum of positive differences
+    usad8   r7, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r10         ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r4, r4, r6          ; add positive differences to sum
+    sub     r4, r4, r7          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r7, r8              ; byte (two pixels) to halfwords
+    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
+    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
+    subs    r12, r12, #1        ; next row
+    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)
+
+    bne     loop
+
+    ; compute return value and store sse
+    ldr     r8, [sp, #32]       ; get address of sse
+    mul     r1, r4, r4          ; sum * sum
+    str     r5, [r8]            ; store sse
+    sub     r0, r5, r1, ASR #6  ; return (sse - ((sum * sum) >> 6))
+
+    pop     {r4-r10, pc}
+
+    ENDP
+
+    END
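The only difference from the 16x16 epilogue is the final shift, which encodes the division by the pixel count in the usual variance identity (my restatement of the arithmetic above):

\[
\mathrm{ret} \;=\; \mathrm{SSE} \;-\; \frac{\bigl(\sum_i d_i\bigr)^2}{N},
\qquad N = 64 \;\Rightarrow\; \texttt{asr \#6},
\qquad N = 256 \;\Rightarrow\; \texttt{asr \#8} .
\]

So both routines return N times the variance of the pixel differences rather than the normalized variance.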
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
@@ -0,0 +1,182 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_variance_halfpixvar16x16_h_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp8_variance_halfpixvar16x16_h_armv6| PROC
+
+    stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r8, #0              ; initialize sum = 0
+    ldr     r10, c80808080
+    mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     lr, #0              ; constant zero
+loop
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0]        ; load 4 src pixels
+    ldr     r6, [r0, #1]        ; load 4 src pixels with 1 byte offset
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    pld     [r0, r1, lsl #1]
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #4]        ; load 4 src pixels
+    ldr     r6, [r0, #5]        ; load 4 src pixels with 1 byte offset
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #8]        ; load 4 src pixels
+    ldr     r6, [r0, #9]        ; load 4 src pixels with 1 byte offset
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #12]       ; load 4 src pixels
+    ldr     r6, [r0, #13]       ; load 4 src pixels with 1 byte offset
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set dst_ptr to next row
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    subs    r12, r12, #1
+
+    bne     loop
+
+    ; compute return value and store sse
+    ldr     r6, [sp, #40]       ; get address of sse
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+c80808080
+    DCD     0x80808080
+
+    END
+
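The mvn / uhsub8 / eor #0x80808080 sequence above computes a rounded average of four pixel pairs in one go. A scalar sketch of the identity as I read it (assumes arithmetic right shift of negative ints, which matches what the SIMD halving subtract effectively does per byte):

#include <assert.h>

/* (a + b + 1) >> 1, evaluated the way the assembly does per byte lane:
 *   mvn        ->  ~b, i.e. 255 - b
 *   uhsub8     ->  (a - (255 - b)) >> 1  ==  (a + b - 255) >> 1
 *   eor #0x80  ->  adds the 128 offset back                        */
static unsigned char half_pel_avg(unsigned char a, unsigned char b)
{
    int d = (int)a - (255 - (int)b);          /* a + b - 255, in [-255, 255] */
    return (unsigned char)((d >> 1) ^ 0x80);
}

int main(void)
{
    int a, b;
    for (a = 0; a < 256; a++)
        for (b = 0; b < 256; b++)
            assert(half_pel_avg((unsigned char)a, (unsigned char)b)
                   == ((a + b + 1) >> 1));
    return 0;
}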
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
@@ -0,0 +1,222 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_variance_halfpixvar16x16_hv_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp8_variance_halfpixvar16x16_hv_armv6| PROC
+
+    stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r8, #0              ; initialize sum = 0
+    ldr     r10, c80808080
+    mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     lr, #0              ; constant zero
+loop
+    add     r9, r0, r1          ; pointer to pixels on the next row
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0]        ; load source pixels a, row N
+    ldr     r6, [r0, #1]        ; load source pixels b, row N
+    ldr     r5, [r9, #0]        ; load source pixels c, row N+1
+    ldr     r7, [r9, #1]        ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn     r7, r7
+    uhsub8  r5, r5, r7
+    eor     r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn     r5, r5
+    uhsub8  r4, r4, r5
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    pld     [r0, r1, lsl #1]
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #4]        ; load source pixels a, row N
+    ldr     r6, [r0, #5]        ; load source pixels b, row N
+    ldr     r5, [r9, #4]        ; load source pixels c, row N+1
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    ldr     r7, [r9, #5]        ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn     r7, r7
+    uhsub8  r5, r5, r7
+    eor     r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn     r5, r5
+    uhsub8  r4, r4, r5
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #8]        ; load source pixels a, row N
+    ldr     r6, [r0, #9]        ; load source pixels b, row N
+    ldr     r5, [r9, #8]        ; load source pixels c, row N+1
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    ldr     r7, [r9, #9]        ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn     r7, r7
+    uhsub8  r5, r5, r7
+    eor     r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn     r5, r5
+    uhsub8  r4, r4, r5
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #12]       ; load source pixels a, row N
+    ldr     r6, [r0, #13]       ; load source pixels b, row N
+    ldr     r5, [r9, #12]       ; load source pixels c, row N+1
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+    ldr     r7, [r9, #13]       ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn     r7, r7
+    uhsub8  r5, r5, r7
+    eor     r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn     r5, r5
+    uhsub8  r4, r4, r5
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set dst_ptr to next row
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    subs    r12, r12, #1
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    bne     loop
+
+    ; compute return value and store sse
+    ldr     r6, [sp, #40]       ; get address of sse
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+c80808080
+    DCD     0x80808080
+
+    END
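Per pixel, the half-pixel sample that this routine compares against the reference is built in two rounding stages, exactly as the x/y/z comments describe. A scalar sketch (note the result can differ by one from a single (a + b + c + d + 2) >> 2 average, because each stage rounds):

/* a, b are adjacent pixels on row N; c, d are the two pixels below them. */
static unsigned char half_pel_hv(unsigned char a, unsigned char b,
                                 unsigned char c, unsigned char d)
{
    unsigned int x = (a + b + 1) >> 1;        /* horizontal average, row N   */
    unsigned int y = (c + d + 1) >> 1;        /* horizontal average, row N+1 */
    return (unsigned char)((x + y + 1) >> 1); /* vertical average of x and y */
}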
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
@@ -0,0 +1,184 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_variance_halfpixvar16x16_v_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp8_variance_halfpixvar16x16_v_armv6| PROC
+
+    stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r8, #0              ; initialize sum = 0
+    ldr     r10, c80808080
+    mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     lr, #0              ; constant zero
+loop
+    add     r9, r0, r1          ; set src pointer to next row
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0]        ; load 4 src pixels
+    ldr     r6, [r9, #0]        ; load 4 src pixels from next row
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    pld     [r0, r1, lsl #1]
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #4]        ; load 4 src pixels
+    ldr     r6, [r9, #4]        ; load 4 src pixels from next row
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #8]        ; load 4 src pixels
+    ldr     r6, [r9, #8]        ; load 4 src pixels from next row
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #12]       ; load 4 src pixels
+    ldr     r6, [r9, #12]       ; load 4 src pixels from next row
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set dst_ptr to next row
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+
+    subs    r12, r12, #1
+
+    bne     loop
+
+    ; compute return value and store sse
+    ldr     r6, [sp, #40]       ; get address of sse
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+c80808080
+    DCD     0x80808080
+
+    END
+
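The _h, _v and _hv kernels cover motion vectors that land exactly half a pixel off in one or both directions. A hedged sketch of the kind of selection the encoder's sub-pixel variance wrapper performs (the wrapper itself lives in the C glue elsewhere in this patch; the 1/8-pel offset convention and the fallback comment are my understanding, not quoted from it):

extern unsigned int vp8_variance_halfpixvar16x16_h_armv6(unsigned char *src, int src_stride,
                                                         unsigned char *ref, int ref_stride,
                                                         unsigned int *sse);
extern unsigned int vp8_variance_halfpixvar16x16_v_armv6(unsigned char *src, int src_stride,
                                                         unsigned char *ref, int ref_stride,
                                                         unsigned int *sse);
extern unsigned int vp8_variance_halfpixvar16x16_hv_armv6(unsigned char *src, int src_stride,
                                                          unsigned char *ref, int ref_stride,
                                                          unsigned int *sse);

/* xoffset/yoffset in 1/8-pel units: 0 = integer position, 4 = exactly half a pel */
unsigned int subpel_variance16x16_sketch(unsigned char *src, int src_stride,
                                         int xoffset, int yoffset,
                                         unsigned char *ref, int ref_stride,
                                         unsigned int *sse)
{
    if (xoffset == 4 && yoffset == 0)
        return vp8_variance_halfpixvar16x16_h_armv6(src, src_stride, ref, ref_stride, sse);
    if (xoffset == 0 && yoffset == 4)
        return vp8_variance_halfpixvar16x16_v_armv6(src, src_stride, ref, ref_stride, sse);
    if (xoffset == 4 && yoffset == 4)
        return vp8_variance_halfpixvar16x16_hv_armv6(src, src_stride, ref, ref_stride, sse);
    return 0; /* integer and other fractional offsets go through the general
                 variance / bilinear-filter paths (not sketched here) */
}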
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/armv6/walsh_v6.asm
@@ -0,0 +1,145 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT |vp8_short_walsh4x4_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+
+;short vp8_short_walsh4x4_armv6(short *input, short *output, int pitch)
+|vp8_short_walsh4x4_armv6| PROC
+
+    stmdb       sp!, {r4 - r11, lr}
+
+    mov         r12, r2              ; move pitch to r12 so r2 can be reused for data
+    ldr         r2, [r0]             ; [1  |  0]
+    ldr         r3, [r0, #4]         ; [3  |  2]
+    ldr         r4, [r0, r12]!       ; [5  |  4]
+    ldr         r5, [r0, #4]         ; [7  |  6]
+    ldr         r6, [r0, r12]!       ; [9  |  8]
+    ldr         r7, [r0, #4]         ; [11 | 10]
+    ldr         r8, [r0, r12]!       ; [13 | 12]
+    ldr         r9, [r0, #4]         ; [15 | 14]
+
+    qsubaddx    r10, r2, r3          ; [c1|a1] [1-2   |   0+3]
+    qaddsubx    r11, r2, r3          ; [b1|d1] [1+2   |   0-3]
+    qsubaddx    r12, r4, r5          ; [c1|a1] [5-6   |   4+7]
+    qaddsubx    lr, r4, r5           ; [b1|d1] [5+6   |   4-7]
+
+    qaddsubx    r2, r10, r11         ; [1 | 2] [c1+d1 | a1-b1]
+    qaddsubx    r3, r11, r10         ; [0 | 3] [b1+a1 | d1-c1]
+    qaddsubx    r4, r12, lr          ; [5 | 6] [c1+d1 | a1-b1]
+    qaddsubx    r5, lr, r12          ; [4 | 7] [b1+a1 | d1-c1]
+
+    qsubaddx    r10, r6, r7          ; [c1|a1] [9-10  |  8+11]
+    qaddsubx    r11, r6, r7          ; [b1|d1] [9+10  |  8-11]
+    qsubaddx    r12, r8, r9          ; [c1|a1] [13-14 | 12+15]
+    qaddsubx    lr, r8, r9           ; [b1|d1] [13+14 | 12-15]
+
+    qaddsubx    r6, r10, r11         ; [9 |10] [c1+d1 | a1-b1]
+    qaddsubx    r7, r11, r10         ; [8 |11] [b1+a1 | d1-c1]
+    qaddsubx    r8, r12, lr          ; [13|14] [c1+d1 | a1-b1]
+    qaddsubx    r9, lr, r12          ; [12|15] [b1+a1 | d1-c1]
+
+    ; first transform complete
+
+    qadd16      r10, r3, r9          ; a1 [0+12  |  3+15]
+    qadd16      r11, r5, r7          ; b1 [4+8   |  7+11]
+    qsub16      r12, r5, r7          ; c1 [4-8   |  7-11]
+    qsub16      lr, r3, r9           ; d1 [0-12  |  3-15]
+
+    qadd16      r3, r10, r11         ; a2 [a1+b1] [0 | 3]
+    qadd16      r5, r12, lr          ; b2 [c1+d1] [4 | 7]
+    qsub16      r7, r10, r11         ; c2 [a1-b1] [8 |11]
+    qsub16      r9, lr, r12          ; d2 [d1-c1] [12|15]
+
+    qadd16      r10, r2, r8          ; a1 [1+13  |  2+14]
+    qadd16      r11, r4, r6          ; b1 [5+9   |  6+10]
+    qsub16      r12, r4, r6          ; c1 [5-9   |  6-10]
+    qsub16      lr, r2, r8           ; d1 [1-13  |  2-14]
+
+    qadd16      r2, r10, r11         ; a2 [a1+b1] [1 | 2]
+    qadd16      r4, r12, lr          ; b2 [c1+d1] [5 | 6]
+    qsub16      r6, r10, r11         ; c2 [a1-b1] [9 |10]
+    qsub16      r8, lr, r12          ; d2 [d1-c1] [13|14]
+
+    ; [a-d]2 += ([a-d]2 > 0)
+
+    asrs        r10, r3, #16
+    addpl       r10, r10, #1         ; [~0]
+    asrs        r11, r2, #16
+    addpl       r11, r11, #1         ; [~1]
+    lsl         r11, r11, #15        ; [1  |  x]
+    pkhtb       r10, r11, r10, asr #1; [1  |  0]
+    str         r10, [r1], #4
+
+    lsls        r11, r2, #16
+    addpl       r11, r11, #0x10000   ; [~2]
+    lsls        r12, r3, #16
+    addpl       r12, r12, #0x10000   ; [~3]
+    asr         r12, r12, #1         ; [3  |  x]
+    pkhtb       r11, r12, r11, asr #17; [3  |  2]
+    str         r11, [r1], #4
+
+    asrs        r2, r5, #16
+    addpl       r2, r2, #1           ; [~4]
+    asrs        r3, r4, #16
+    addpl       r3, r3, #1           ; [~5]
+    lsl         r3, r3, #15          ; [5  |  x]
+    pkhtb       r2, r3, r2, asr #1   ; [5  |  4]
+    str         r2, [r1], #4
+
+    lsls        r2, r4, #16
+    addpl       r2, r2, #0x10000     ; [~6]
+    lsls        r3, r5, #16
+    addpl       r3, r3, #0x10000     ; [~7]
+    asr         r3, r3, #1           ; [7  |  x]
+    pkhtb       r2, r3, r2, asr #17  ; [7  |  6]
+    str         r2, [r1], #4
+
+    asrs        r2, r7, #16
+    addpl       r2, r2, #1           ; [~8]
+    asrs        r3, r6, #16
+    addpl       r3, r3, #1           ; [~9]
+    lsl         r3, r3, #15          ; [9  |  x]
+    pkhtb       r2, r3, r2, asr #1   ; [9  |  8]
+    str         r2, [r1], #4
+
+    lsls        r2, r6, #16
+    addpl       r2, r2, #0x10000     ; [~10]
+    lsls        r3, r7, #16
+    addpl       r3, r3, #0x10000     ; [~11]
+    asr         r3, r3, #1           ; [11 |  x]
+    pkhtb       r2, r3, r2, asr #17  ; [11 | 10]
+    str         r2, [r1], #4
+
+    asrs        r2, r9, #16
+    addpl       r2, r2, #1           ; [~12]
+    asrs        r3, r8, #16
+    addpl       r3, r3, #1           ; [~13]
+    lsl         r3, r3, #15          ; [13 |  x]
+    pkhtb       r2, r3, r2, asr #1   ; [13 | 12]
+    str         r2, [r1], #4
+
+    lsls        r2, r8, #16
+    addpl       r2, r2, #0x10000     ; [~14]
+    lsls        r3, r9, #16
+    addpl       r3, r3, #0x10000     ; [~15]
+    asr         r3, r3, #1           ; [15 |  x]
+    pkhtb       r2, r3, r2, asr #17  ; [15 | 14]
+    str         r2, [r1]
+
+    ldmia       sp!, {r4 - r11, pc}
+    ENDP        ; |vp8_short_walsh4x4_armv6|
+
+    END
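A scalar sketch of the second (column) pass above, assembled from the register comments; the a/b/c/d naming follows those comments and the (x + (x > 0)) >> 1 step is the addpl-plus-shift rounding, but this is my reading of the code, not the reference C implementation:

/* ip[]: the 16 first-pass results in raster order; op[]: the final output. */
static void walsh4x4_columns_sketch(const short *ip, short *op)
{
    int col;
    for (col = 0; col < 4; col++) {
        int a1 = ip[col + 0] + ip[col + 12];
        int b1 = ip[col + 4] + ip[col + 8];
        int c1 = ip[col + 4] - ip[col + 8];
        int d1 = ip[col + 0] - ip[col + 12];

        int a2 = a1 + b1;
        int b2 = c1 + d1;
        int c2 = a1 - b1;
        int d2 = d1 - c1;

        op[col + 0]  = (short)((a2 + (a2 > 0)) >> 1);
        op[col + 4]  = (short)((b2 + (b2 > 0)) >> 1);
        op[col + 8]  = (short)((c2 + (c2 > 0)) >> 1);
        op[col + 12] = (short)((d2 + (d2 > 0)) >> 1);
    }
}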
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/dct_arm.c
@@ -0,0 +1,24 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8/encoder/dct.h"
+
+#if HAVE_ARMV6
+
+void vp8_fast_fdct8x4_armv6(short *input, short *output, int pitch)
+{
+    vp8_fast_fdct4x4_armv6(input,     output,      pitch); /* left 4x4 block */
+    vp8_fast_fdct4x4_armv6(input + 4, output + 16, pitch); /* right 4x4 block: next 4 input columns, next 16 output coeffs */
+}
+
+#endif /* HAVE_ARMV6 */
+
+
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/dct_arm.h
@@ -0,0 +1,59 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef DCT_ARM_H
+#define DCT_ARM_H
+
+#if HAVE_ARMV6
+extern prototype_fdct(vp8_short_walsh4x4_armv6);
+extern prototype_fdct(vp8_fast_fdct4x4_armv6);
+extern prototype_fdct(vp8_fast_fdct8x4_armv6);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp8_fdct_walsh_short4x4
+#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_armv6
+
+#undef  vp8_fdct_fast4x4
+#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_armv6
+
+#undef  vp8_fdct_fast8x4
+#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_armv6
+#endif
+
+#endif /* HAVE_ARMV6 */
+
+#if HAVE_ARMV7
+extern prototype_fdct(vp8_short_fdct4x4_neon);
+extern prototype_fdct(vp8_short_fdct8x4_neon);
+extern prototype_fdct(vp8_fast_fdct4x4_neon);
+extern prototype_fdct(vp8_fast_fdct8x4_neon);
+extern prototype_fdct(vp8_short_walsh4x4_neon);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp8_fdct_short4x4
+#define vp8_fdct_short4x4 vp8_short_fdct4x4_neon
+
+#undef  vp8_fdct_short8x4
+#define vp8_fdct_short8x4 vp8_short_fdct8x4_neon
+
+#undef  vp8_fdct_fast4x4
+#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_neon
+
+#undef  vp8_fdct_fast8x4
+#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_neon
+
+#undef  vp8_fdct_walsh_short4x4
+#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_neon
+#endif
+
+#endif /* HAVE_ARMV7 */
+
+#endif /* DCT_ARM_H */
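These #undef/#define pairs are the compile-time half of the encoder's CPU dispatch: when runtime detection is off, the generic FDCT names resolve directly to the ARM symbols. A hedged sketch of how the pieces fit (the prototype_fdct definition shown is my assumption of its shape in vp8/encoder/dct.h, and the call below is illustrative; the real encoder reaches these functions through its RTCD invoke macros or function tables):

/* assumed: prototype_fdct(sym) expands to a declaration with the common
 * FDCT signature, so "extern prototype_fdct(vp8_fast_fdct4x4_armv6);"
 * declares the assembly routine */
#define prototype_fdct(sym) void (sym)(short *input, short *output, int pitch)

extern prototype_fdct(vp8_fast_fdct4x4_armv6);

#if !CONFIG_RUNTIME_CPU_DETECT
#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_armv6  /* compile-time binding */
#endif

static void fdct_one_block_sketch(short *src_diff, short *coeff, int pitch)
{
    /* with the override in effect this is a direct call into the .asm file;
     * with runtime detection the encoder instead fills a per-CPU function
     * table at init and calls through it */
    vp8_fdct_fast4x4(src_diff, coeff, pitch);
}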
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/encodemb_arm.h
@@ -0,0 +1,64 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef ENCODEMB_ARM_H
+#define ENCODEMB_ARM_H
+
+#if HAVE_ARMV6
+extern prototype_subb(vp8_subtract_b_armv6);
+extern prototype_submby(vp8_subtract_mby_armv6);
+extern prototype_submbuv(vp8_subtract_mbuv_armv6);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp8_encodemb_subb
+#define vp8_encodemb_subb vp8_subtract_b_armv6
+
+#undef  vp8_encodemb_submby
+#define vp8_encodemb_submby vp8_subtract_mby_armv6
+
+#undef  vp8_encodemb_submbuv
+#define vp8_encodemb_submbuv vp8_subtract_mbuv_armv6
+#endif
+
+#endif /* HAVE_ARMV6 */
+
+#if HAVE_ARMV7
+//extern prototype_berr(vp8_block_error_c);
+//extern prototype_mberr(vp8_mbblock_error_c);
+//extern prototype_mbuverr(vp8_mbuverror_c);
+
+extern prototype_subb(vp8_subtract_b_neon);
+extern prototype_submby(vp8_subtract_mby_neon);
+extern prototype_submbuv(vp8_subtract_mbuv_neon);
+
+//#undef  vp8_encodemb_berr
+//#define vp8_encodemb_berr vp8_block_error_c
+
+//#undef  vp8_encodemb_mberr
+//#define vp8_encodemb_mberr vp8_mbblock_error_c
+
+//#undef  vp8_encodemb_mbuverr
+//#define vp8_encodemb_mbuverr vp8_mbuverror_c
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp8_encodemb_subb
+#define vp8_encodemb_subb vp8_subtract_b_neon
+
+#undef  vp8_encodemb_submby
+#define vp8_encodemb_submby vp8_subtract_mby_neon
+
+#undef  vp8_encodemb_submbuv
+#define vp8_encodemb_submbuv vp8_subtract_mbuv_neon
+#endif
+
+#endif /* HAVE_ARMV7 */
+
+#endif /* ENCODEMB_ARM_H */
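The subb/submby/submbuv hooks above produce the residual that the FDCT code later in this patch transforms. A minimal scalar sketch of what a single block subtract does (argument names and the exact signature are illustrative; the real prototypes come from the encoder's prototype_subb family of macros):

/* source minus prediction for one 4x4 block; both inputs are unsigned
 * char, so every output value lies in [-255, 255], the range the fast
 * FDCT below assumes */
static void subtract_4x4_sketch(const unsigned char *src, int src_stride,
                                const unsigned char *pred, int pred_stride,
                                short *diff, int diff_stride)
{
    int r, c;
    for (r = 0; r < 4; r++) {
        for (c = 0; c < 4; c++)
            diff[c] = (short)(src[c] - pred[c]);
        src  += src_stride;
        pred += pred_stride;
        diff += diff_stride;
    }
}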
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/neon/fastfdct4x4_neon.asm
@@ -0,0 +1,124 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_fast_fdct4x4_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;void vp8_fast_fdct4x4_c(short *input, short *output, int pitch);
+;NOTE:
+;The input is *src_diff, which is calculated as:
+;diff_ptr[c] = src_ptr[c] - pred_ptr[c]; (in the Subtract* functions)
+;Both *src_ptr and *pred_ptr are unsigned char.
+;Therefore, *src_diff is in the range [-255, 255] (see the range note after this file).
+;CAUTION:
+;The input values of the 25th (second-order) block are set in vp8_build_dcblock and fall outside [-255, 255].
+;However, the VP8 encoder only uses vp8_short_fdct4x4_c for the 25th block, never vp8_fast_fdct4x4_c, so
+;assuming *input is in [-255, 255] is valid in vp8_fast_fdct4x4_c but not in vp8_short_fdct4x4_c.
+
+|vp8_fast_fdct4x4_neon| PROC
+    vld1.16         {d2}, [r0], r2              ;load input
+    ldr             r12, _ffdct_coeff_
+    vld1.16         {d3}, [r0], r2
+    vld1.16         {d4}, [r0], r2
+    vld1.16         {d0}, [r12]
+    vld1.16         {d5}, [r0], r2
+
+    ;First for-loop
+    ;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[1], d4=ip[2], d5=ip[3]
+    vtrn.32         d2, d4
+    vtrn.32         d3, d5
+    vtrn.16         d2, d3
+    vtrn.16         d4, d5
+
+    vadd.s16        d6, d2, d5              ;ip[0]+ip[3]
+    vadd.s16        d7, d3, d4              ;ip[1]+ip[2]
+    vsub.s16        d8, d3, d4              ;ip[1]-ip[2]
+    vsub.s16        d9, d2, d5              ;ip[0]-ip[3]
+    vshl.i16        q3, q3, #1              ; a1, b1
+    vshl.i16        q4, q4, #1              ; c1, d1
+
+    vadd.s16        d10, d6, d7             ;temp1 = a1 + b1
+    vsub.s16        d11, d6, d7             ;temp2 = a1 - b1
+
+    vqdmulh.s16     q6, q5, d0[1]
+    vqdmulh.s16     q8, q4, d0[0]
+    vqdmulh.s16     q7, q4, d0[2]
+
+    vshr.s16        q6, q6, #1
+    vshr.s16        q8, q8, #1
+    vshr.s16        q7, q7, #1              ;d14:temp1 = ( c1 * x_c3)>>16;  d15:temp1 =  (d1 * x_c3)>>16
+    vadd.s16        q8, q4, q8              ;d16:temp2 = ((c1 * x_c1)>>16) + c1;  d17:temp2 = ((d1 * x_c1)>>16) + d1
+
+    vadd.s16        d2, d10, d12            ;op[0] = ((temp1 * x_c2 )>>16) + temp1
+    vadd.s16        d4, d11, d13            ;op[2] = ((temp2 * x_c2 )>>16) + temp2
+    vadd.s16        d3, d14, d17            ;op[1] = temp1 + temp2  -- q is not necessary, just for protection
+    vsub.s16        d5, d15, d16            ;op[3] = temp1 - temp2
+
+    ;Second for-loop
+    ;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[4], d4=ip[8], d5=ip[12]
+    vtrn.32         d2, d4
+    vtrn.32         d3, d5
+    vtrn.16         d2, d3
+    vtrn.16         d4, d5
+
+    vadd.s16        d6, d2, d5              ;a1 = ip[0]+ip[12]
+    vadd.s16        d7, d3, d4              ;b1 = ip[4]+ip[8]
+    vsub.s16        d8, d3, d4              ;c1 = ip[4]-ip[8]
+    vsub.s16        d9, d2, d5              ;d1 = ip[0]-ip[12]
+
+    vadd.s16        d10, d6, d7             ;temp1 = a1 + b1
+    vsub.s16        d11, d6, d7             ;temp2 = a1 - b1
+
+
+    vqdmulh.s16     q6, q5, d0[1]
+    vqdmulh.s16     q8, q4, d0[0]
+    vqdmulh.s16     q7, q4, d0[2]
+
+    vshr.s16        q6, q6, #1
+    vshr.s16        q8, q8, #1
+    vshr.s16        q7, q7, #1              ;d14:temp1 = ( c1 * x_c3)>>16;  d15:temp1 =  (d1 * x_c3)>>16
+    vadd.s16        q8, q4, q8              ;d16:temp2 = ((c1 * x_c1)>>16) + c1;  d17:temp2 = ((d1 * x_c1)>>16) + d1
+
+    vadd.s16        d2, d10, d12            ;a2 = ((temp1 * x_c2 )>>16) + temp1
+    vadd.s16        d4, d11, d13            ;c2 = ((temp2 * x_c2 )>>16) + temp2
+    vadd.s16        d3, d14, d17            ;b2 = temp1 + temp2  -- q is not necessary, just for protection
+    vsub.s16        d5, d15, d16            ;d2 = temp1 - temp2
+
+    vclt.s16        q3, q1, #0
+    vclt.s16        q4, q2, #0
+
+    vsub.s16        q1, q1, q3
+    vsub.s16        q2, q2, q4
+
+    vshr.s16        q1, q1, #1
+    vshr.s16        q2, q2, #1
+
+    vst1.16         {q1, q2}, [r1]
+
+    bx              lr
+
+    ENDP
+
+;-----------------
+
+_ffdct_coeff_
+    DCD     ffdct_coeff
+ffdct_coeff
+; 60547 =  0xEC83
+; 46341 =  0xB505
+; 25080 =  0x61F8
+    DCD     0xB505EC83, 0x000061F8
+
+    END
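A quick bound (my own arithmetic, not taken from the source) on why the first pass above stays well inside the 16-bit lanes given the [-255, 255] input range stated in the NOTE: each input sum or difference is at most 510 in magnitude, the vshl #1 doubles that, and each temp value adds two such terms:

\[
|a_1|, |b_1|, |c_1|, |d_1| \le 2 \cdot (255 + 255) = 1020,
\qquad
|\mathrm{temp}_1|, |\mathrm{temp}_2| \le |a_1| + |b_1| \le 2040 \ll 32767 .
\]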
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/neon/fastfdct8x4_neon.asm
@@ -0,0 +1,177 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_fast_fdct8x4_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;void vp8_fast_fdct8x4_c(short *input, short *output, int pitch);
+;NOTE:
+;The input is *src_diff, which is calculated as:
+;diff_ptr[c] = src_ptr[c] - pred_ptr[c]; (in the Subtract* functions)
+;Both *src_ptr and *pred_ptr are unsigned char.
+;Therefore, *src_diff is in the range [-255, 255].
+;CAUTION:
+;The input values of the 25th (second-order) block are set in vp8_build_dcblock and fall outside [-255, 255].
+;However, the VP8 encoder only uses vp8_short_fdct4x4_c for the 25th block, never vp8_fast_fdct4x4_c, so
+;assuming *input is in [-255, 255] is valid in vp8_fast_fdct4x4_c but not in vp8_short_fdct4x4_c.
+
+|vp8_fast_fdct8x4_neon| PROC
+    vld1.16         {q1}, [r0], r2              ;load input
+    ldr             r12, _ffdct8_coeff_
+    vld1.16         {q2}, [r0], r2
+    vld1.16         {q3}, [r0], r2
+    vld1.16         {d0}, [r12]
+    vld1.16         {q4}, [r0], r2
+
+    ;First for-loop
+    ;transpose d2, d4, d6, d8. Then, d2=ip[0], d4=ip[1], d6=ip[2], d8=ip[3]
+    ;transpose d3, d5, d7, d9. Then, d3=ip[0], d5=ip[1], d7=ip[2], d9=ip[3]
+    vtrn.32         d2, d6
+    vtrn.32         d3, d7
+    vtrn.32         d4, d8
+    vtrn.32         d5, d9
+    vtrn.16         d2, d4
+    vtrn.16         d3, d5
+    vtrn.16         d6, d8
+    vtrn.16         d7, d9
+
+    vadd.s16        d10, d2, d8             ;ip[0]+ip[3]
+    vadd.s16        d11, d4, d6             ;ip[1]+ip[2]
+    vsub.s16        d12, d4, d6             ;ip[1]-ip[2]
+    vsub.s16        d13, d2, d8             ;ip[0]-ip[3]
+    vadd.s16        d22, d3, d9
+    vadd.s16        d23, d5, d7
+    vsub.s16        d24, d5, d7
+    vsub.s16        d25, d3, d9
+
+    vshl.i16        q5, q5, #1              ; a1, b1
+    vshl.i16        q6, q6, #1              ; c1, d1
+    vshl.i16        q1, q11, #1
+    vshl.i16        q2, q12, #1
+
+    vadd.s16        d14, d10, d11           ;temp1 = a1 + b1
+    vsub.s16        d15, d10, d11           ;temp2 = a1 - b1
+    vadd.s16        d24, d2, d3
+    vsub.s16        d25, d2, d3
+
+    vqdmulh.s16     q8, q7, d0[1]
+    vqdmulh.s16     q13, q12, d0[1]
+    vqdmulh.s16     q10, q6, d0[0]
+    vqdmulh.s16     q15, q2, d0[0]
+    vqdmulh.s16     q9, q6, d0[2]
+    vqdmulh.s16     q14, q2, d0[2]
+
+    vshr.s16        q8, q8, #1
+    vshr.s16        q13, q13, #1
+    vshr.s16        q10, q10, #1
+    vshr.s16        q15, q15, #1
+    vshr.s16        q9, q9, #1              ;d18:temp1 = ( c1 * x_c3)>>16;  d19:temp1 =  (d1 * x_c3)>>16
+    vshr.s16        q14, q14, #1            ;d28:temp1 = ( c1 * x_c3)>>16;  d29:temp1 =  (d1 * x_c3)>>16
+    vadd.s16        q10, q6, q10            ;d20:temp2 = ((c1 * x_c1)>>16) + c1;  d21:temp2 = ((d1 * x_c1)>>16) + d1
+    vadd.s16        q15, q2, q15            ;d30:temp2 = ((c1 * x_c1)>>16) + c1;  d31:temp2 = ((d1 * x_c1)>>16) + d1
+
+    vadd.s16        d2, d14, d16            ;op[0] = ((temp1 * x_c2 )>>16) + temp1
+    vadd.s16        d3, d24, d26            ;op[0] = ((temp1 * x_c2 )>>16) + temp1
+    vadd.s16        d6, d15, d17            ;op[2] = ((temp2 * x_c2 )>>16) + temp2
+    vadd.s16        d7, d25, d27            ;op[2] = ((temp2 * x_c2 )>>16) + temp2
+    vadd.s16        d4, d18, d21            ;op[1] = temp1 + temp2  -- q is not necessary, just for protection
+    vadd.s16        d5, d28, d31            ;op[1] = temp1 + temp2  -- q is not necessary, just for protection
+    vsub.s16        d8, d19, d20            ;op[3] = temp1 - temp2
+    vsub.s16        d9, d29, d30            ;op[3] = temp1 - temp2
+
+    ;Second for-loop
+    ;transpose d2, d4, d6, d8. Then, d2=ip[0], d4=ip[4], d6=ip[8], d8=ip[12]
+    ;transpose d3, d5, d7, d9. Then, d3=ip[0], d5=ip[4], d7=ip[8], d9=ip[12]
+    vtrn.32         d2, d6
+    vtrn.32         d3, d7
+    vtrn.32         d4, d8
+    vtrn.32         d5, d9
+    vtrn.16         d2, d4
+    vtrn.16         d3, d5
+    vtrn.16         d6, d8
+    vtrn.16         d7, d9
+
+    vadd.s16        d10, d2, d8             ;a1 = ip[0]+ip[12]
+    vadd.s16        d11, d4, d6             ;b1 = ip[4]+ip[8]
+    vsub.s16        d12, d4, d6             ;c1 = ip[4]-ip[8]
+    vsub.s16        d13, d2, d8             ;d1 = ip[0]-ip[12]
+    vadd.s16        d2, d3, d9
+    vadd.s16        d4, d5, d7
+    vsub.s16        d24, d5, d7
+    vsub.s16        d25, d3, d9
+
+    vadd.s16        d14, d10, d11           ;temp1 = a1 + b1
+    vsub.s16        d15, d10, d11           ;temp2 = a1 - b1
+    vadd.s16        d22, d2, d4
+    vsub.s16        d23, d2, d4
+
+    vqdmulh.s16     q8, q7, d0[1]
+    vqdmulh.s16     q13, q11, d0[1]
+    vqdmulh.s16     q10, q6, d0[0]
+    vqdmulh.s16     q15, q12, d0[0]
+    vqdmulh.s16     q9, q6, d0[2]
+    vqdmulh.s16     q14, q12, d0[2]
+
+    vshr.s16        q8, q8, #1
+    vshr.s16        q13, q13, #1
+    vshr.s16        q10, q10, #1
+    vshr.s16        q15, q15, #1
+    vshr.s16        q9, q9, #1              ;d18:temp1 = ( c1 * x_c3)>>16;  d19:temp1 =  (d1 * x_c3)>>16
+    vshr.s16        q14, q14, #1            ;d28:temp1 = ( c1 * x_c3)>>16;  d29:temp1 =  (d1 * x_c3)>>16
+    vadd.s16        q10, q6, q10            ;d20:temp2 = ((c1 * x_c1)>>16) + c1;  d21:temp2 = ((d1 * x_c1)>>16) + d1
+    vadd.s16        q15, q12, q15           ;d30:temp2 = ((c1 * x_c1)>>16) + c1;  d31:temp2 = ((d1 * x_c1)>>16) + d1
+
+    vadd.s16        d2, d14, d16            ;a2 = ((temp1 * x_c2 )>>16) + temp1
+    vadd.s16        d6, d22, d26            ;a2 = ((temp1 * x_c2 )>>16) + temp1
+    vadd.s16        d4, d15, d17            ;c2 = ((temp2 * x_c2 )>>16) + temp2
+    vadd.s16        d8, d23, d27            ;c2 = ((temp2 * x_c2 )>>16) + temp2
+    vadd.s16        d3, d18, d21            ;b2 = temp1 + temp2  -- q is not necessary, just for protection
+    vadd.s16        d7, d28, d31            ;b2 = temp1 + temp2  -- q is not necessary, just for protection
+    vsub.s16        d5, d19, d20            ;d2 = temp1 - temp2
+    vsub.s16        d9, d29, d30            ;d2 = temp1 - temp2
+
+    vclt.s16        q5, q1, #0
+    vclt.s16        q6, q2, #0
+    vclt.s16        q7, q3, #0
+    vclt.s16        q8, q4, #0
+
+    vsub.s16        q1, q1, q5
+    vsub.s16        q2, q2, q6
+    vsub.s16        q3, q3, q7
+    vsub.s16        q4, q4, q8
+
+    vshr.s16        q1, q1, #1
+    vshr.s16        q2, q2, #1
+    vshr.s16        q3, q3, #1
+    vshr.s16        q4, q4, #1
+
+    vst1.16         {q1, q2}, [r1]!
+    vst1.16         {q3, q4}, [r1]
+
+    bx              lr
+
+    ENDP
+
+;-----------------
+
+_ffdct8_coeff_
+    DCD     ffdct8_coeff
+ffdct8_coeff
+; 60547 =  0xEC83
+; 46341 =  0xB505
+; 25080 =  0x61F8
+    DCD     0xB505EC83, 0x000061F8
+
+    END
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.asm
@@ -0,0 +1,261 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_fast_quantize_b_neon|
+    EXPORT  |vp8_fast_quantize_b_pair_neon|
+
+    INCLUDE asm_enc_offsets.asm
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=4
+
+;vp8_fast_quantize_b_pair_neon(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2);
+|vp8_fast_quantize_b_pair_neon| PROC
+
+    stmfd           sp!, {r4-r9}
+    vstmdb          sp!, {q4-q7}
+
+    ldr             r4, [r0, #vp8_block_coeff]
+    ldr             r5, [r0, #vp8_block_quant_fast]
+    ldr             r6, [r0, #vp8_block_round]
+
+    vld1.16         {q0, q1}, [r4@128]  ; load z
+
+    ldr             r7, [r2, #vp8_blockd_qcoeff]
+
+    vabs.s16        q4, q0              ; calculate x = abs(z)
+    vabs.s16        q5, q1
+
+    ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative
+    vshr.s16        q2, q0, #15         ; sz
+    vshr.s16        q3, q1, #15
+
+    vld1.s16        {q6, q7}, [r6@128]  ; load round_ptr [0-15]
+    vld1.s16        {q8, q9}, [r5@128]  ; load quant_ptr [0-15]
+
+    ldr             r4, [r1, #vp8_block_coeff]
+
+    vadd.s16        q4, q6              ; x + Round
+    vadd.s16        q5, q7
+
+    vld1.16         {q0, q1}, [r4@128]  ; load z2
+
+    vqdmulh.s16     q4, q8              ; y = ((Round+abs(z)) * Quant) >> 16
+    vqdmulh.s16     q5, q9
+
+    vabs.s16        q10, q0             ; calculate x2 = abs(z_2)
+    vabs.s16        q11, q1
+    vshr.s16        q12, q0, #15        ; sz2
+    vshr.s16        q13, q1, #15
+
+    ;modify data to have its original sign
+    veor.s16        q4, q2              ; y^sz
+    veor.s16        q5, q3
+
+    vadd.s16        q10, q6             ; x2 + Round
+    vadd.s16        q11, q7
+
+    ldr             r8, [r2, #vp8_blockd_dequant]
+
+    vqdmulh.s16     q10, q8             ; y2 = ((Round+abs(z2)) * Quant) >> 16
+    vqdmulh.s16     q11, q9
+
+    vshr.s16        q4, #1              ; right shift 1 after vqdmulh
+    vshr.s16        q5, #1
+
+    vld1.s16        {q6, q7}, [r8@128]  ;load dequant_ptr[i]
+
+    vsub.s16        q4, q2              ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
+    vsub.s16        q5, q3
+
+    vshr.s16        q10, #1             ; right shift 1 after vqdmulh
+    vshr.s16        q11, #1
+
+    ldr             r9, [r2, #vp8_blockd_dqcoeff]
+
+    veor.s16        q10, q12            ; y2^sz2
+    veor.s16        q11, q13
+
+    vst1.s16        {q4, q5}, [r7]      ; store: qcoeff = x1
+
+
+    vsub.s16        q10, q12            ; x2=(y2^sz2)-sz2 = (y2^sz2)-(-1) (2's complement)
+    vsub.s16        q11, q13
+
+    ldr             r6, [r3, #vp8_blockd_qcoeff]
+
+    vmul.s16        q2, q6, q4          ; x * Dequant
+    vmul.s16        q3, q7, q5
+
+    ldr             r0, _inv_zig_zag_   ; load ptr of inverse zigzag table
+
+    vceq.s16        q8, q8              ; set q8 to all 1
+
+    vst1.s16        {q10, q11}, [r6]    ; store: qcoeff = x2
+
+    vmul.s16        q12, q6, q10        ; x2 * Dequant
+    vmul.s16        q13, q7, q11
+
+    vld1.16         {q6, q7}, [r0@128]  ; load inverse scan order
+
+    vtst.16         q14, q4, q8         ; now find eob
+    vtst.16         q15, q5, q8         ; non-zero element is set to all 1
+
+    vst1.s16        {q2, q3}, [r9]      ; store dqcoeff = x * Dequant
+
+    ldr             r7, [r3, #vp8_blockd_dqcoeff]
+
+    vand            q0, q6, q14         ; get all valid numbers from scan array
+    vand            q1, q7, q15
+
+    vst1.s16        {q12, q13}, [r7]    ; store dqcoeff = x * Dequant
+
+    vtst.16         q2, q10, q8         ; now find eob
+    vtst.16         q3, q11, q8         ; non-zero element is set to all 1
+
+    vmax.u16        q0, q0, q1          ; find maximum value in q0, q1
+
+    vand            q10, q6, q2         ; get all valid numbers from scan array
+    vand            q11, q7, q3
+    vmax.u16        q10, q10, q11       ; find maximum value in q10, q11
+
+    vmax.u16        d0, d0, d1
+    vmax.u16        d20, d20, d21
+    vmovl.u16       q0, d0
+    vmovl.u16       q10, d20
+
+
+    vmax.u32        d0, d0, d1
+    vmax.u32        d20, d20, d21
+    vpmax.u32       d0, d0, d0
+    vpmax.u32       d20, d20, d20
+
+    add             r4, r2, #vp8_blockd_eob
+    add             r5, r3, #vp8_blockd_eob
+
+    vst1.32         {d0[0]}, [r4@32]
+    vst1.32         {d20[0]}, [r5@32]
+
+    vldmia          sp!, {q4-q7}
+    ldmfd           sp!, {r4-r9}
+    bx              lr
+
+    ENDP
+
+;void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
+|vp8_fast_quantize_b_neon| PROC
+
+    stmfd           sp!, {r4-r7}
+
+    ldr             r3, [r0, #vp8_block_coeff]
+    ldr             r4, [r0, #vp8_block_quant_fast]
+    ldr             r5, [r0, #vp8_block_round]
+
+    vld1.16         {q0, q1}, [r3@128]  ; load z
+    vorr.s16        q14, q0, q1         ; check if all zero (step 1)
+    ldr             r6, [r1, #vp8_blockd_qcoeff]
+    ldr             r7, [r1, #vp8_blockd_dqcoeff]
+    vorr.s16        d28, d28, d29       ; check if all zero (step 2)
+
+    vabs.s16        q12, q0             ; calculate x = abs(z)
+    vabs.s16        q13, q1
+
+    ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative
+    vshr.s16        q2, q0, #15         ; sz
+    vmov            r2, r3, d28         ; check if all zero (step 3)
+    vshr.s16        q3, q1, #15
+
+    vld1.s16        {q14, q15}, [r5@128]; load round_ptr [0-15]
+    vld1.s16        {q8, q9}, [r4@128]  ; load quant_ptr [0-15]
+
+    vadd.s16        q12, q14            ; x + Round
+    vadd.s16        q13, q15
+
+    ldr             r0, _inv_zig_zag_   ; load ptr of inverse zigzag table
+
+    vqdmulh.s16     q12, q8             ; y = ((Round+abs(z)) * Quant) >> 16
+    vqdmulh.s16     q13, q9
+
+    vld1.16         {q10, q11}, [r0@128]; load inverse scan order
+
+    vceq.s16        q8, q8              ; set q8 to all 1
+
+    ldr             r4, [r1, #vp8_blockd_dequant]
+
+    vshr.s16        q12, #1             ; right shift 1 after vqdmulh
+    vshr.s16        q13, #1
+
+    orr             r2, r2, r3          ; check if all zero (step 4)
+    cmp             r2, #0              ; check if all zero (step 5)
+    beq             zero_output         ; check if all zero (step 6)
+
+    ;modify data to have its original sign
+    veor.s16        q12, q2             ; y^sz
+    veor.s16        q13, q3
+
+    vsub.s16        q12, q2             ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
+    vsub.s16        q13, q3
+
+    vld1.s16        {q2, q3}, [r4@128]  ; load dequant_ptr[i]
+
+    vtst.16         q14, q12, q8        ; now find eob
+    vtst.16         q15, q13, q8        ; non-zero element is set to all 1
+
+    vst1.s16        {q12, q13}, [r6@128]; store: qcoeff = x1
+
+    vand            q10, q10, q14       ; get all valid numbers from scan array
+    vand            q11, q11, q15
+
+
+    vmax.u16        q0, q10, q11        ; find maximum value in q0, q1
+    vmax.u16        d0, d0, d1
+    vmovl.u16       q0, d0
+
+    vmul.s16        q2, q12             ; x * Dequant
+    vmul.s16        q3, q13
+
+    vmax.u32        d0, d0, d1
+    vpmax.u32       d0, d0, d0
+
+    vst1.s16        {q2, q3}, [r7@128]  ; store dqcoeff = x * Dequant
+
+    add             r4, r1, #vp8_blockd_eob
+    vst1.32         {d0[0]}, [r4@32]
+
+    ldmfd           sp!, {r4-r7}
+    bx              lr
+
+zero_output
+    str             r2, [r1, #vp8_blockd_eob]
+    vst1.s16        {q0, q1}, [r6@128]  ; qcoeff = 0
+    vst1.s16        {q0, q1}, [r7@128]  ; dqcoeff = 0
+
+    ldmfd           sp!, {r4-r7}
+    bx              lr
+
+    ENDP
+
+; default inverse zigzag table is defined in vp8/common/entropy.c
+_inv_zig_zag_
+    DCD inv_zig_zag
+
+    ALIGN 16    ; enable use of @128 bit aligned loads
+inv_zig_zag
+    DCW 0x0001, 0x0002, 0x0006, 0x0007
+    DCW 0x0003, 0x0005, 0x0008, 0x000d
+    DCW 0x0004, 0x0009, 0x000c, 0x000e
+    DCW 0x000a, 0x000b, 0x000f, 0x0010
+
+    END
+
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/neon/sad16_neon.asm
@@ -0,0 +1,207 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_sad16x16_neon|
+    EXPORT  |vp8_sad16x8_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int  src_stride
+; r2    unsigned char *ref_ptr
+; r3    int  ref_stride
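+;note: the 16-wide SADs below keep two widened accumulators, q12 for the
+;low 8 bytes of each row and q13 for the high 8 bytes (vabdl starts them,
+;vabal accumulates), then reduce with vadd/vpaddl to a single 32-bit sum
+;returned in r0.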
+|vp8_sad16x16_neon| PROC
+;;
+    vld1.8          {q0}, [r0], r1
+    vld1.8          {q4}, [r2], r3
+
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q5}, [r2], r3
+
+    vabdl.u8        q12, d0, d8
+    vabdl.u8        q13, d1, d9
+
+    vld1.8          {q2}, [r0], r1
+    vld1.8          {q6}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+    vabal.u8        q13, d3, d11
+
+    vld1.8          {q3}, [r0], r1
+    vld1.8          {q7}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+    vabal.u8        q13, d5, d13
+
+;;
+    vld1.8          {q0}, [r0], r1
+    vld1.8          {q4}, [r2], r3
+
+    vabal.u8        q12, d6, d14
+    vabal.u8        q13, d7, d15
+
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q5}, [r2], r3
+
+    vabal.u8        q12, d0, d8
+    vabal.u8        q13, d1, d9
+
+    vld1.8          {q2}, [r0], r1
+    vld1.8          {q6}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+    vabal.u8        q13, d3, d11
+
+    vld1.8          {q3}, [r0], r1
+    vld1.8          {q7}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+    vabal.u8        q13, d5, d13
+
+;;
+    vld1.8          {q0}, [r0], r1
+    vld1.8          {q4}, [r2], r3
+
+    vabal.u8        q12, d6, d14
+    vabal.u8        q13, d7, d15
+
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q5}, [r2], r3
+
+    vabal.u8        q12, d0, d8
+    vabal.u8        q13, d1, d9
+
+    vld1.8          {q2}, [r0], r1
+    vld1.8          {q6}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+    vabal.u8        q13, d3, d11
+
+    vld1.8          {q3}, [r0], r1
+    vld1.8          {q7}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+    vabal.u8        q13, d5, d13
+
+;;
+    vld1.8          {q0}, [r0], r1
+    vld1.8          {q4}, [r2], r3
+
+    vabal.u8        q12, d6, d14
+    vabal.u8        q13, d7, d15
+
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q5}, [r2], r3
+
+    vabal.u8        q12, d0, d8
+    vabal.u8        q13, d1, d9
+
+    vld1.8          {q2}, [r0], r1
+    vld1.8          {q6}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+    vabal.u8        q13, d3, d11
+
+    vld1.8          {q3}, [r0]
+    vld1.8          {q7}, [r2]
+
+    vabal.u8        q12, d4, d12
+    vabal.u8        q13, d5, d13
+
+    vabal.u8        q12, d6, d14
+    vabal.u8        q13, d7, d15
+
+    vadd.u16        q0, q12, q13
+
+    vpaddl.u16      q1, q0
+    vpaddl.u32      q0, q1
+
+    vadd.u32        d0, d0, d1
+
+    vmov.32         r0, d0[0]
+
+    bx              lr
+
+    ENDP
+
+;==============================
+;unsigned int vp8_sad16x8_c(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+|vp8_sad16x8_neon| PROC
+    vld1.8          {q0}, [r0], r1
+    vld1.8          {q4}, [r2], r3
+
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q5}, [r2], r3
+
+    vabdl.u8        q12, d0, d8
+    vabdl.u8        q13, d1, d9
+
+    vld1.8          {q2}, [r0], r1
+    vld1.8          {q6}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+    vabal.u8        q13, d3, d11
+
+    vld1.8          {q3}, [r0], r1
+    vld1.8          {q7}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+    vabal.u8        q13, d5, d13
+
+    vld1.8          {q0}, [r0], r1
+    vld1.8          {q4}, [r2], r3
+
+    vabal.u8        q12, d6, d14
+    vabal.u8        q13, d7, d15
+
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q5}, [r2], r3
+
+    vabal.u8        q12, d0, d8
+    vabal.u8        q13, d1, d9
+
+    vld1.8          {q2}, [r0], r1
+    vld1.8          {q6}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+    vabal.u8        q13, d3, d11
+
+    vld1.8          {q3}, [r0], r1
+    vld1.8          {q7}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+    vabal.u8        q13, d5, d13
+
+    vabal.u8        q12, d6, d14
+    vabal.u8        q13, d7, d15
+
+    vadd.u16        q0, q12, q13
+
+    vpaddl.u16      q1, q0
+    vpaddl.u32      q0, q1
+
+    vadd.u32        d0, d0, d1
+
+    vmov.32         r0, d0[0]
+
+    bx              lr
+
+    ENDP
+
+    END
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/neon/sad8_neon.asm
@@ -0,0 +1,209 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_sad8x8_neon|
+    EXPORT  |vp8_sad8x16_neon|
+    EXPORT  |vp8_sad4x4_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+; unsigned int vp8_sad8x8_c(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+
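+;note: the 8-wide SADs accumulate one widened absolute difference per row
+;into a single q12 register (vabdl/vabal) and reduce with vpaddl at the end.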
+|vp8_sad8x8_neon| PROC
+    vld1.8          {d0}, [r0], r1
+    vld1.8          {d8}, [r2], r3
+
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d10}, [r2], r3
+
+    vabdl.u8        q12, d0, d8
+
+    vld1.8          {d4}, [r0], r1
+    vld1.8          {d12}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+
+    vld1.8          {d6}, [r0], r1
+    vld1.8          {d14}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+
+    vld1.8          {d0}, [r0], r1
+    vld1.8          {d8}, [r2], r3
+
+    vabal.u8        q12, d6, d14
+
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d10}, [r2], r3
+
+    vabal.u8        q12, d0, d8
+
+    vld1.8          {d4}, [r0], r1
+    vld1.8          {d12}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+
+    vld1.8          {d6}, [r0], r1
+    vld1.8          {d14}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+    vabal.u8        q12, d6, d14
+
+    vpaddl.u16      q1, q12
+    vpaddl.u32      q0, q1
+    vadd.u32        d0, d0, d1
+
+    vmov.32         r0, d0[0]
+
+    bx              lr
+
+    ENDP
+
+;============================
+;unsigned int vp8_sad8x16_c(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+
+|vp8_sad8x16_neon| PROC
+    vld1.8          {d0}, [r0], r1
+    vld1.8          {d8}, [r2], r3
+
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d10}, [r2], r3
+
+    vabdl.u8        q12, d0, d8
+
+    vld1.8          {d4}, [r0], r1
+    vld1.8          {d12}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+
+    vld1.8          {d6}, [r0], r1
+    vld1.8          {d14}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+
+    vld1.8          {d0}, [r0], r1
+    vld1.8          {d8}, [r2], r3
+
+    vabal.u8        q12, d6, d14
+
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d10}, [r2], r3
+
+    vabal.u8        q12, d0, d8
+
+    vld1.8          {d4}, [r0], r1
+    vld1.8          {d12}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+
+    vld1.8          {d6}, [r0], r1
+    vld1.8          {d14}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+
+    vld1.8          {d0}, [r0], r1
+    vld1.8          {d8}, [r2], r3
+
+    vabal.u8        q12, d6, d14
+
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d10}, [r2], r3
+
+    vabal.u8        q12, d0, d8
+
+    vld1.8          {d4}, [r0], r1
+    vld1.8          {d12}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+
+    vld1.8          {d6}, [r0], r1
+    vld1.8          {d14}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+
+    vld1.8          {d0}, [r0], r1
+    vld1.8          {d8}, [r2], r3
+
+    vabal.u8        q12, d6, d14
+
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d10}, [r2], r3
+
+    vabal.u8        q12, d0, d8
+
+    vld1.8          {d4}, [r0], r1
+    vld1.8          {d12}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+
+    vld1.8          {d6}, [r0], r1
+    vld1.8          {d14}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+    vabal.u8        q12, d6, d14
+
+    vpaddl.u16      q1, q12
+    vpaddl.u32      q0, q1
+    vadd.u32        d0, d0, d1
+
+    vmov.32         r0, d0[0]
+
+    bx              lr
+
+    ENDP
+
+;===========================
+;unsigned int vp8_sad4x4_c(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+
+|vp8_sad4x4_neon| PROC
+    vld1.8          {d0}, [r0], r1
+    vld1.8          {d8}, [r2], r3
+
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d10}, [r2], r3
+
+    vabdl.u8        q12, d0, d8
+
+    vld1.8          {d4}, [r0], r1
+    vld1.8          {d12}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+
+    vld1.8          {d6}, [r0], r1
+    vld1.8          {d14}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+    vabal.u8        q12, d6, d14
+
+    vpaddl.u16      d1, d24
+    vpaddl.u32      d0, d1
+    vmov.32         r0, d0[0]
+
+    bx              lr
+
+    ENDP
+
+    END
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/neon/shortfdct_neon.asm
@@ -0,0 +1,144 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_short_fdct4x4_neon|
+    EXPORT  |vp8_short_fdct8x4_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    short *input
+; r1    short *output
+; r2    int pitch
+; Input has a pitch, output is contiguous
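+;note: the transform is done as two matrix-multiply stages against the
+;packed 16-bit coefficients of dct_matrix (held in q2/q3): the first stage
+;rounds with vrshrn #14, the second with vrshr #16 before narrowing back
+;to 16 bits and storing the 4x4 result contiguously.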
+|vp8_short_fdct4x4_neon| PROC
+    ldr             r12, _dct_matrix_
+    vld1.16         d0, [r0], r2
+    vld1.16         d1, [r0], r2
+    vld1.16         d2, [r0], r2
+    vld1.16         d3, [r0]
+    vld1.16         {q2, q3}, [r12]
+
+;first stage
+    vmull.s16       q11, d4, d0[0]              ;i=0
+    vmull.s16       q12, d4, d1[0]              ;i=1
+    vmull.s16       q13, d4, d2[0]              ;i=2
+    vmull.s16       q14, d4, d3[0]              ;i=3
+
+    vmlal.s16       q11, d5, d0[1]
+    vmlal.s16       q12, d5, d1[1]
+    vmlal.s16       q13, d5, d2[1]
+    vmlal.s16       q14, d5, d3[1]
+
+    vmlal.s16       q11, d6, d0[2]
+    vmlal.s16       q12, d6, d1[2]
+    vmlal.s16       q13, d6, d2[2]
+    vmlal.s16       q14, d6, d3[2]
+
+    vmlal.s16       q11, d7, d0[3]              ;sumtemp for i=0
+    vmlal.s16       q12, d7, d1[3]              ;sumtemp for i=1
+    vmlal.s16       q13, d7, d2[3]              ;sumtemp for i=2
+    vmlal.s16       q14, d7, d3[3]              ;sumtemp for i=3
+
+    ; rounding
+    vrshrn.i32      d22, q11, #14
+    vrshrn.i32      d24, q12, #14
+    vrshrn.i32      d26, q13, #14
+    vrshrn.i32      d28, q14, #14
+
+;second stage
+    vmull.s16       q4, d22, d4[0]              ;i=0
+    vmull.s16       q5, d22, d4[1]              ;i=1
+    vmull.s16       q6, d22, d4[2]              ;i=2
+    vmull.s16       q7, d22, d4[3]              ;i=3
+
+    vmlal.s16       q4, d24, d5[0]
+    vmlal.s16       q5, d24, d5[1]
+    vmlal.s16       q6, d24, d5[2]
+    vmlal.s16       q7, d24, d5[3]
+
+    vmlal.s16       q4, d26, d6[0]
+    vmlal.s16       q5, d26, d6[1]
+    vmlal.s16       q6, d26, d6[2]
+    vmlal.s16       q7, d26, d6[3]
+
+    vmlal.s16       q4, d28, d7[0]              ;sumtemp for i=0
+    vmlal.s16       q5, d28, d7[1]              ;sumtemp for i=1
+    vmlal.s16       q6, d28, d7[2]              ;sumtemp for i=2
+    vmlal.s16       q7, d28, d7[3]              ;sumtemp for i=3
+
+    vrshr.s32       q0, q4, #16
+    vrshr.s32       q1, q5, #16
+    vrshr.s32       q2, q6, #16
+    vrshr.s32       q3, q7, #16
+
+    vmovn.i32       d0, q0
+    vmovn.i32       d1, q1
+    vmovn.i32       d2, q2
+    vmovn.i32       d3, q3
+
+    vst1.16         {q0, q1}, [r1]
+
+    bx              lr
+
+    ENDP
+
+; r0    short *input
+; r1    short *output
+; r2    int pitch
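+;note: the 8x4 fdct is done as two 4x4 fdcts on adjacent 4-wide blocks:
+;the first via bl, the second as a tail call after advancing the input by
+;8 bytes (4 shorts) and the output by 32 bytes (16 shorts).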
+|vp8_short_fdct8x4_neon| PROC
+    ; Store link register and input before calling
+    ;  first 4x4 fdct.  Do not need to worry about
+    ;  output or pitch because those pointers are not
+    ;  touched in the 4x4 fdct function
+    stmdb           sp!, {r0, lr}
+
+    bl              vp8_short_fdct4x4_neon
+
+    ldmia           sp!, {r0, lr}
+
+    ; Move to the next block of data.
+    add             r0, r0, #8
+    add             r1, r1, #32
+
+    ; Second time through do not store off the
+    ;  link register, just return from the 4x4 fdct
+    b               vp8_short_fdct4x4_neon
+
+    ; Should never get to this.
+    bx              lr
+
+    ENDP
+
+;-----------------
+
+_dct_matrix_
+    DCD     dct_matrix
+dct_matrix
+;   DCW     23170,  30274,  23170, 12540
+;   DCW     23170,  12540, -23170,-30274
+;   DCW     23170, -12540, -23170, 30274
+;   DCW     23170, -30274,  23170,-12540
+; 23170 =  0x5a82
+; -23170 =  0xa57e
+; 30274 =  0x7642
+; -30274 =  0x89be
+; 12540 =  0x30fc
+; -12540 = 0xcf04
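+; Each DCD word below packs two of the 16-bit coefficients listed above,
+; low halfword first, so the four pairs of words are the four DCW rows.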
+    DCD     0x76425a82, 0x30fc5a82
+    DCD     0x30fc5a82, 0x89bea57e
+    DCD     0xcf045a82, 0x7642a57e
+    DCD     0x89be5a82, 0xcf045a82
+
+    END
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/neon/subtract_neon.asm
@@ -0,0 +1,185 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT |vp8_subtract_b_neon|
+    EXPORT |vp8_subtract_mby_neon|
+    EXPORT |vp8_subtract_mbuv_neon|
+
+    INCLUDE asm_enc_offsets.asm
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vp8_subtract_b_neon(BLOCK *be, BLOCKD *bd, int pitch)
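+;note: the source pointer is *base_src + src and the predictor comes from
+;the BLOCKD; each of the four rows is subtracted with a widening vsubl.u8,
+;and the diff rows are stored with the pitch doubled since diff is short.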
+|vp8_subtract_b_neon| PROC
+
+    stmfd   sp!, {r4-r7}
+
+    ldr     r3, [r0, #vp8_block_base_src]
+    ldr     r4, [r0, #vp8_block_src]
+    ldr     r5, [r0, #vp8_block_src_diff]
+    ldr     r3, [r3]
+    ldr     r6, [r0, #vp8_block_src_stride]
+    add     r3, r3, r4                      ; src = *base_src + src
+    ldr     r7, [r1, #vp8_blockd_predictor]
+
+    vld1.8          {d0}, [r3], r6          ;load src
+    vld1.8          {d1}, [r7], r2          ;load pred
+    vld1.8          {d2}, [r3], r6
+    vld1.8          {d3}, [r7], r2
+    vld1.8          {d4}, [r3], r6
+    vld1.8          {d5}, [r7], r2
+    vld1.8          {d6}, [r3], r6
+    vld1.8          {d7}, [r7], r2
+
+    vsubl.u8        q10, d0, d1
+    vsubl.u8        q11, d2, d3
+    vsubl.u8        q12, d4, d5
+    vsubl.u8        q13, d6, d7
+
+    mov             r2, r2, lsl #1
+
+    vst1.16         {d20}, [r5], r2         ;store diff
+    vst1.16         {d22}, [r5], r2
+    vst1.16         {d24}, [r5], r2
+    vst1.16         {d26}, [r5], r2
+
+    ldmfd   sp!, {r4-r7}
+    bx              lr
+
+    ENDP
+
+
+;==========================================
+;void vp8_subtract_mby_neon(short *diff, unsigned char *src, unsigned char *pred, int stride)
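+;note: four loop iterations, each subtracting four rows of 16 pixels; the
+;predictor is read contiguously and the 16-bit diffs are stored contiguously.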
+|vp8_subtract_mby_neon| PROC
+    mov             r12, #4
+
+subtract_mby_loop
+    vld1.8          {q0}, [r1], r3          ;load src
+    vld1.8          {q1}, [r2]!             ;load pred
+    vld1.8          {q2}, [r1], r3
+    vld1.8          {q3}, [r2]!
+    vld1.8          {q4}, [r1], r3
+    vld1.8          {q5}, [r2]!
+    vld1.8          {q6}, [r1], r3
+    vld1.8          {q7}, [r2]!
+
+    vsubl.u8        q8, d0, d2
+    vsubl.u8        q9, d1, d3
+    vsubl.u8        q10, d4, d6
+    vsubl.u8        q11, d5, d7
+    vsubl.u8        q12, d8, d10
+    vsubl.u8        q13, d9, d11
+    vsubl.u8        q14, d12, d14
+    vsubl.u8        q15, d13, d15
+
+    vst1.16         {q8}, [r0]!             ;store diff
+    vst1.16         {q9}, [r0]!
+    vst1.16         {q10}, [r0]!
+    vst1.16         {q11}, [r0]!
+    vst1.16         {q12}, [r0]!
+    vst1.16         {q13}, [r0]!
+    vst1.16         {q14}, [r0]!
+    vst1.16         {q15}, [r0]!
+
+    subs            r12, r12, #1
+    bne             subtract_mby_loop
+
+    bx              lr
+    ENDP
+
+;=================================
+;void vp8_subtract_mbuv_neon(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
+|vp8_subtract_mbuv_neon| PROC
+    ldr             r12, [sp]
+
+;u
+    add             r0, r0, #512        ;   short *udiff = diff + 256;
+    add             r3, r3, #256        ;   unsigned char *upred = pred + 256;
+
+    vld1.8          {d0}, [r1], r12         ;load src
+    vld1.8          {d1}, [r3]!             ;load pred
+    vld1.8          {d2}, [r1], r12
+    vld1.8          {d3}, [r3]!
+    vld1.8          {d4}, [r1], r12
+    vld1.8          {d5}, [r3]!
+    vld1.8          {d6}, [r1], r12
+    vld1.8          {d7}, [r3]!
+    vld1.8          {d8}, [r1], r12
+    vld1.8          {d9}, [r3]!
+    vld1.8          {d10}, [r1], r12
+    vld1.8          {d11}, [r3]!
+    vld1.8          {d12}, [r1], r12
+    vld1.8          {d13}, [r3]!
+    vld1.8          {d14}, [r1], r12
+    vld1.8          {d15}, [r3]!
+
+    vsubl.u8        q8, d0, d1
+    vsubl.u8        q9, d2, d3
+    vsubl.u8        q10, d4, d5
+    vsubl.u8        q11, d6, d7
+    vsubl.u8        q12, d8, d9
+    vsubl.u8        q13, d10, d11
+    vsubl.u8        q14, d12, d13
+    vsubl.u8        q15, d14, d15
+
+    vst1.16         {q8}, [r0]!             ;store diff
+    vst1.16         {q9}, [r0]!
+    vst1.16         {q10}, [r0]!
+    vst1.16         {q11}, [r0]!
+    vst1.16         {q12}, [r0]!
+    vst1.16         {q13}, [r0]!
+    vst1.16         {q14}, [r0]!
+    vst1.16         {q15}, [r0]!
+
+;v
+    vld1.8          {d0}, [r2], r12         ;load src
+    vld1.8          {d1}, [r3]!             ;load pred
+    vld1.8          {d2}, [r2], r12
+    vld1.8          {d3}, [r3]!
+    vld1.8          {d4}, [r2], r12
+    vld1.8          {d5}, [r3]!
+    vld1.8          {d6}, [r2], r12
+    vld1.8          {d7}, [r3]!
+    vld1.8          {d8}, [r2], r12
+    vld1.8          {d9}, [r3]!
+    vld1.8          {d10}, [r2], r12
+    vld1.8          {d11}, [r3]!
+    vld1.8          {d12}, [r2], r12
+    vld1.8          {d13}, [r3]!
+    vld1.8          {d14}, [r2], r12
+    vld1.8          {d15}, [r3]!
+
+    vsubl.u8        q8, d0, d1
+    vsubl.u8        q9, d2, d3
+    vsubl.u8        q10, d4, d5
+    vsubl.u8        q11, d6, d7
+    vsubl.u8        q12, d8, d9
+    vsubl.u8        q13, d10, d11
+    vsubl.u8        q14, d12, d13
+    vsubl.u8        q15, d14, d15
+
+    vst1.16         {q8}, [r0]!             ;store diff
+    vst1.16         {q9}, [r0]!
+    vst1.16         {q10}, [r0]!
+    vst1.16         {q11}, [r0]!
+    vst1.16         {q12}, [r0]!
+    vst1.16         {q13}, [r0]!
+    vst1.16         {q14}, [r0]!
+    vst1.16         {q15}, [r0]!
+
+    bx              lr
+    ENDP
+
+    END
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/neon/variance_neon.asm
@@ -0,0 +1,276 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_variance16x16_neon|
+    EXPORT  |vp8_variance16x8_neon|
+    EXPORT  |vp8_variance8x16_neon|
+    EXPORT  |vp8_variance8x8_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
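+;note: q8 accumulates the sum of differences and q9/q10 the sums of
+;squares; the result is variance = sse - (sum * sum >> 8), i.e. sse minus
+;sum^2/256, with sse also stored through the pointer taken from the stack.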
+|vp8_variance16x16_neon| PROC
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+    mov             r12, #8
+
+variance16x16_neon_loop
+    vld1.8          {q0}, [r0], r1              ;Load up source and reference
+    vld1.8          {q2}, [r2], r3
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q3}, [r2], r3
+
+    vsubl.u8        q11, d0, d4                 ;calculate diff
+    vsubl.u8        q12, d1, d5
+    vsubl.u8        q13, d2, d6
+    vsubl.u8        q14, d3, d7
+
+    ;VPADAL adds adjacent pairs of elements of the source vector and
+    ;accumulates the results into the elements of the destination vector.
+    ;The explanation in the ARM guide is wrong.
+    vpadal.s16      q8, q11                     ;calculate sum
+    vmlal.s16       q9, d22, d22                ;calculate sse
+    vmlal.s16       q10, d23, d23
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q8, q12
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+    vpadal.s16      q8, q13
+    vmlal.s16       q9, d26, d26
+    vmlal.s16       q10, d27, d27
+    vpadal.s16      q8, q14
+    vmlal.s16       q9, d28, d28
+    vmlal.s16       q10, d29, d29
+
+    bne             variance16x16_neon_loop
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    ldr             r12, [sp]                   ;load *sse from stack
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    ;vmov.32        r0, d0[0]                   ;this instruction costs a lot
+    ;vmov.32        r1, d1[0]
+    ;mul            r0, r0, r0
+    ;str            r1, [r12]
+    ;sub            r0, r1, r0, asr #8
+
+    ;sum is in [-255x256, 255x256]. sum*sum fits in 32 bits. The right shift needs
+    ;sign-bit extension, which is vshr.s. Have to use s32 to make it right.
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [r12]              ;store sse
+    vshr.s32        d10, d10, #8
+    vsub.s32        d0, d1, d10
+
+    vmov.32         r0, d0[0]                   ;return
+    bx              lr
+
+    ENDP
+
+;================================
+;unsigned int vp8_variance16x8_c(
+;    unsigned char *src_ptr,
+;    int  source_stride,
+;    unsigned char *ref_ptr,
+;    int  recon_stride,
+;   unsigned int *sse)
+|vp8_variance16x8_neon| PROC
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+    mov             r12, #4
+
+variance16x8_neon_loop
+    vld1.8          {q0}, [r0], r1              ;Load up source and reference
+    vld1.8          {q2}, [r2], r3
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q3}, [r2], r3
+
+    vsubl.u8        q11, d0, d4                 ;calculate diff
+    vsubl.u8        q12, d1, d5
+    vsubl.u8        q13, d2, d6
+    vsubl.u8        q14, d3, d7
+
+    vpadal.s16      q8, q11                     ;calculate sum
+    vmlal.s16       q9, d22, d22                ;calculate sse
+    vmlal.s16       q10, d23, d23
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q8, q12
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+    vpadal.s16      q8, q13
+    vmlal.s16       q9, d26, d26
+    vmlal.s16       q10, d27, d27
+    vpadal.s16      q8, q14
+    vmlal.s16       q9, d28, d28
+    vmlal.s16       q10, d29, d29
+
+    bne             variance16x8_neon_loop
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    ldr             r12, [sp]                   ;load *sse from stack
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [r12]              ;store sse
+    vshr.s32        d10, d10, #7
+    vsub.s32        d0, d1, d10
+
+    vmov.32         r0, d0[0]                   ;return
+    bx              lr
+
+    ENDP
+
+;=================================
+;unsigned int vp8_variance8x16_c(
+;    unsigned char *src_ptr,
+;    int  source_stride,
+;    unsigned char *ref_ptr,
+;    int  recon_stride,
+;   unsigned int *sse)
+
+|vp8_variance8x16_neon| PROC
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+    mov             r12, #8
+
+variance8x16_neon_loop
+    vld1.8          {d0}, [r0], r1              ;Load up source and reference
+    vld1.8          {d4}, [r2], r3
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d6}, [r2], r3
+
+    vsubl.u8        q11, d0, d4                 ;calculate diff
+    vsubl.u8        q12, d2, d6
+
+    vpadal.s16      q8, q11                     ;calculate sum
+    vmlal.s16       q9, d22, d22                ;calculate sse
+    vmlal.s16       q10, d23, d23
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q8, q12
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+
+    bne             variance8x16_neon_loop
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    ldr             r12, [sp]                   ;load *sse from stack
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [r12]              ;store sse
+    vshr.s32        d10, d10, #7
+    vsub.s32        d0, d1, d10
+
+    vmov.32         r0, d0[0]                   ;return
+    bx              lr
+
+    ENDP
+
+;==================================
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp8_variance8x8_neon| PROC
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+    mov             r12, #2
+
+variance8x8_neon_loop
+    vld1.8          {d0}, [r0], r1              ;Load up source and reference
+    vld1.8          {d4}, [r2], r3
+    vld1.8          {d1}, [r0], r1
+    vld1.8          {d5}, [r2], r3
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d6}, [r2], r3
+    vld1.8          {d3}, [r0], r1
+    vld1.8          {d7}, [r2], r3
+
+    vsubl.u8        q11, d0, d4                 ;calculate diff
+    vsubl.u8        q12, d1, d5
+    vsubl.u8        q13, d2, d6
+    vsubl.u8        q14, d3, d7
+
+    vpadal.s16      q8, q11                     ;calculate sum
+    vmlal.s16       q9, d22, d22                ;calculate sse
+    vmlal.s16       q10, d23, d23
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q8, q12
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+    vpadal.s16      q8, q13
+    vmlal.s16       q9, d26, d26
+    vmlal.s16       q10, d27, d27
+    vpadal.s16      q8, q14
+    vmlal.s16       q9, d28, d28
+    vmlal.s16       q10, d29, d29
+
+    bne             variance8x8_neon_loop
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    ldr             r12, [sp]                   ;load *sse from stack
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [r12]              ;store sse
+    vshr.s32        d10, d10, #6
+    vsub.s32        d0, d1, d10
+
+    vmov.32         r0, d0[0]                   ;return
+    bx              lr
+
+    ENDP
+
+    END
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
@@ -0,0 +1,68 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT |vp8_memcpy_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;=========================================
+;void vp8_memcpy_neon(unsigned char *dst_ptr, unsigned char *src_ptr, int sz);
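+;note: copies 256 bytes per main-loop iteration (r2 >> 8 iterations) with
+;32-byte NEON loads/stores, then copies the remainder 16 bytes at a time;
+;the tail loop appears to assume sz is a multiple of 16.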
+|vp8_memcpy_neon| PROC
+    ;pld                [r1]                        ;preload pred data
+    ;pld                [r1, #128]
+    ;pld                [r1, #256]
+    ;pld                [r1, #384]
+
+    mov             r12, r2, lsr #8                 ;copy 256 bytes data at one time
+
+memcpy_neon_loop
+    vld1.8          {q0, q1}, [r1]!                 ;load src data
+    subs            r12, r12, #1
+    vld1.8          {q2, q3}, [r1]!
+    vst1.8          {q0, q1}, [r0]!                 ;copy to dst_ptr
+    vld1.8          {q4, q5}, [r1]!
+    vst1.8          {q2, q3}, [r0]!
+    vld1.8          {q6, q7}, [r1]!
+    vst1.8          {q4, q5}, [r0]!
+    vld1.8          {q8, q9}, [r1]!
+    vst1.8          {q6, q7}, [r0]!
+    vld1.8          {q10, q11}, [r1]!
+    vst1.8          {q8, q9}, [r0]!
+    vld1.8          {q12, q13}, [r1]!
+    vst1.8          {q10, q11}, [r0]!
+    vld1.8          {q14, q15}, [r1]!
+    vst1.8          {q12, q13}, [r0]!
+    vst1.8          {q14, q15}, [r0]!
+
+    ;pld                [r1]                        ;preload pred data -- need to adjust for real device
+    ;pld                [r1, #128]
+    ;pld                [r1, #256]
+    ;pld                [r1, #384]
+
+    bne             memcpy_neon_loop
+
+    ands            r3, r2, #0xff                   ;extra copy
+    beq             done_copy_neon_loop
+
+extra_copy_neon_loop
+    vld1.8          {q0}, [r1]!                 ;load src data
+    subs            r3, r3, #16
+    vst1.8          {q0}, [r0]!
+    bne             extra_copy_neon_loop
+
+done_copy_neon_loop
+    bx              lr
+    ENDP
+
+    END
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm
@@ -0,0 +1,116 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_mse16x16_neon|
+    EXPORT  |vp8_get4x4sse_cs_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;============================
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+;note: this function never uses the sum, so that part of the vp8_variance()
+;calculation is omitted here.
+
+|vp8_mse16x16_neon| PROC
+    vmov.i8         q7, #0                      ;q7, q8, q9, q10 - sse
+    vmov.i8         q8, #0
+    vmov.i8         q9, #0
+    vmov.i8         q10, #0
+
+    mov             r12, #8
+
+mse16x16_neon_loop
+    vld1.8          {q0}, [r0], r1              ;Load up source and reference
+    vld1.8          {q2}, [r2], r3
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q3}, [r2], r3
+
+    vsubl.u8        q11, d0, d4
+    vsubl.u8        q12, d1, d5
+    vsubl.u8        q13, d2, d6
+    vsubl.u8        q14, d3, d7
+
+    vmlal.s16       q7, d22, d22
+    vmlal.s16       q8, d23, d23
+
+    subs            r12, r12, #1
+
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+    vmlal.s16       q7, d26, d26
+    vmlal.s16       q8, d27, d27
+    vmlal.s16       q9, d28, d28
+    vmlal.s16       q10, d29, d29
+
+    bne             mse16x16_neon_loop
+
+    vadd.u32        q7, q7, q8
+    vadd.u32        q9, q9, q10
+
+    ldr             r12, [sp]               ;load *sse from stack
+
+    vadd.u32        q10, q7, q9
+    vpaddl.u32      q1, q10
+    vadd.u64        d0, d2, d3
+
+    vst1.32         {d0[0]}, [r12]
+    vmov.32         r0, d0[0]
+
+    bx              lr
+
+    ENDP
+
+
+;=============================
+; r0    unsigned char *src_ptr,
+; r1    int  source_stride,
+; r2    unsigned char *ref_ptr,
+; r3    int  recon_stride
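+;note: 8 bytes are loaded per row but only the first four differences (the
+;low half of each widened q register) are squared and summed; the result
+;is returned in r0 (this variant takes no *sse pointer).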
+|vp8_get4x4sse_cs_neon| PROC
+    vld1.8          {d0}, [r0], r1              ;Load up source and reference
+    vld1.8          {d4}, [r2], r3
+    vld1.8          {d1}, [r0], r1
+    vld1.8          {d5}, [r2], r3
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d6}, [r2], r3
+    vld1.8          {d3}, [r0], r1
+    vld1.8          {d7}, [r2], r3
+
+    vsubl.u8        q11, d0, d4
+    vsubl.u8        q12, d1, d5
+    vsubl.u8        q13, d2, d6
+    vsubl.u8        q14, d3, d7
+
+    vmull.s16       q7, d22, d22
+    vmull.s16       q8, d24, d24
+    vmull.s16       q9, d26, d26
+    vmull.s16       q10, d28, d28
+
+    vadd.u32        q7, q7, q8
+    vadd.u32        q9, q9, q10
+    vadd.u32        q9, q7, q9
+
+    vpaddl.u32      q1, q9
+    vadd.u64        d0, d2, d3
+
+    vmov.32         r0, d0[0]
+    bx              lr
+
+    ENDP
+
+    END
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm
@@ -0,0 +1,76 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_short_walsh4x4_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;void vp8_short_walsh4x4_c(short *input, short *output, int pitch)
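+;note: each pass below transposes the 4x4 block so the add/subtract
+;butterflies run on all four rows at once.  The final vcgt/vsub pair adds
+;1 to each strictly positive result before the arithmetic >>1, so odd
+;values are halved away from zero.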
+
+|vp8_short_walsh4x4_neon| PROC
+    vld1.16         {d2}, [r0], r2              ;load input
+    vld1.16         {d3}, [r0], r2
+    vld1.16         {d4}, [r0], r2
+    vld1.16         {d5}, [r0], r2
+
+    ;First for-loop
+    ;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[1], d4=ip[2], d5=ip[3]
+    vtrn.32         d2, d4
+    vtrn.32         d3, d5
+    vtrn.16         d2, d3
+    vtrn.16         d4, d5
+
+    vadd.s16        d6, d2, d5              ;a1 = ip[0]+ip[3]
+    vadd.s16        d7, d3, d4              ;b1 = ip[1]+ip[2]
+    vsub.s16        d8, d3, d4              ;c1 = ip[1]-ip[2]
+    vsub.s16        d9, d2, d5              ;d1 = ip[0]-ip[3]
+
+    vadd.s16        d2, d6, d7             ;op[0] = a1 + b1
+    vsub.s16        d4, d6, d7             ;op[2] = a1 - b1
+    vadd.s16        d3, d8, d9             ;op[1] = c1 + d1
+    vsub.s16        d5, d9, d8             ;op[3] = d1 - c1
+
+    ;Second for-loop
+    ;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[4], d4=ip[8], d5=ip[12]
+    vtrn.32         d2, d4
+    vtrn.32         d3, d5
+    vtrn.16         d2, d3
+    vtrn.16         d4, d5
+
+    vadd.s16        d6, d2, d5              ;a1 = ip[0]+ip[12]
+    vadd.s16        d7, d3, d4              ;b1 = ip[4]+ip[8]
+    vsub.s16        d8, d3, d4              ;c1 = ip[4]-ip[8]
+    vsub.s16        d9, d2, d5              ;d1 = ip[0]-ip[12]
+
+    vadd.s16        d2, d6, d7              ;a2 = a1 + b1;
+    vsub.s16        d4, d6, d7              ;c2 = a1 - b1;
+    vadd.s16        d3, d8, d9              ;b2 = c1 + d1;
+    vsub.s16        d5, d9, d8              ;d2 = d1 - c1;
+
+    vcgt.s16        q3, q1, #0
+    vcgt.s16        q4, q2, #0
+
+    vsub.s16        q1, q1, q3
+    vsub.s16        q2, q2, q4
+
+    vshr.s16        q1, q1, #1
+    vshr.s16        q2, q2, #1
+
+    vst1.16         {q1, q2}, [r1]
+
+    bx              lr
+
+    ENDP
+
+    END
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm
@@ -0,0 +1,425 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_sub_pixel_variance16x16_neon_func|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0    unsigned char  *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; stack(r4) unsigned char *dst_ptr,
+; stack(r5) int dst_pixels_per_line,
+; stack(r6) unsigned int *sse
+;note: most of the code is copied from bilinear_predict16x16_neon and vp8_variance16x16_neon.
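+;note: the first pass applies the horizontal bilinear filter to 17 rows
+;(one extra row of context for the vertical pass) into a 272-byte stack
+;buffer, the second pass applies the vertical filter into a further
+;256-byte buffer, and the result is then compared against dst exactly as
+;in vp8_variance16x16_neon.  The *_only paths skip a pass when xoffset or
+;yoffset is zero.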
+
+|vp8_sub_pixel_variance16x16_neon_func| PROC
+    push            {r4-r6, lr}
+
+    ldr             r12, _BilinearTaps_coeff_
+    ldr             r4, [sp, #16]           ;load *dst_ptr from stack
+    ldr             r5, [sp, #20]           ;load dst_pixels_per_line from stack
+    ldr             r6, [sp, #24]           ;load *sse from stack
+
+    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
+    beq             secondpass_bfilter16x16_only
+
+    add             r2, r12, r2, lsl #3     ;calculate filter location
+
+    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
+
+    vld1.s32        {d31}, [r2]             ;load first_pass filter
+
+    beq             firstpass_bfilter16x16_only
+
+    sub             sp, sp, #272            ;reserve space on stack for temporary storage
+    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data
+    mov             lr, sp
+    vld1.u8         {d5, d6, d7}, [r0], r1
+
+    mov             r2, #3                  ;loop counter
+    vld1.u8         {d8, d9, d10}, [r0], r1
+
+    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1)
+    vld1.u8         {d11, d12, d13}, [r0], r1
+
+    vdup.8          d1, d31[4]
+
+;First Pass: output_height lines x output_width columns (17x16)
+vp8e_filt_blk2d_fp16x16_loop_neon
+    pld             [r0]
+    pld             [r0, r1]
+    pld             [r0, r1, lsl #1]
+
+    vmull.u8        q7, d2, d0              ;(src_ptr[0] * Filter[0])
+    vmull.u8        q8, d3, d0
+    vmull.u8        q9, d5, d0
+    vmull.u8        q10, d6, d0
+    vmull.u8        q11, d8, d0
+    vmull.u8        q12, d9, d0
+    vmull.u8        q13, d11, d0
+    vmull.u8        q14, d12, d0
+
+    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
+    vext.8          d5, d5, d6, #1
+    vext.8          d8, d8, d9, #1
+    vext.8          d11, d11, d12, #1
+
+    vmlal.u8        q7, d2, d1              ;(src_ptr[0] * Filter[1])
+    vmlal.u8        q9, d5, d1
+    vmlal.u8        q11, d8, d1
+    vmlal.u8        q13, d11, d1
+
+    vext.8          d3, d3, d4, #1
+    vext.8          d6, d6, d7, #1
+    vext.8          d9, d9, d10, #1
+    vext.8          d12, d12, d13, #1
+
+    vmlal.u8        q8, d3, d1              ;(src_ptr[0] * Filter[1])
+    vmlal.u8        q10, d6, d1
+    vmlal.u8        q12, d9, d1
+    vmlal.u8        q14, d12, d1
+
+    subs            r2, r2, #1
+
+    vqrshrn.u16    d14, q7, #7              ;shift/round/saturate to u8
+    vqrshrn.u16    d15, q8, #7
+    vqrshrn.u16    d16, q9, #7
+    vqrshrn.u16    d17, q10, #7
+    vqrshrn.u16    d18, q11, #7
+    vqrshrn.u16    d19, q12, #7
+    vqrshrn.u16    d20, q13, #7
+
+    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data
+    vqrshrn.u16    d21, q14, #7
+    vld1.u8         {d5, d6, d7}, [r0], r1
+
+    vst1.u8         {d14, d15, d16, d17}, [lr]!     ;store result
+    vld1.u8         {d8, d9, d10}, [r0], r1
+    vst1.u8         {d18, d19, d20, d21}, [lr]!
+    vld1.u8         {d11, d12, d13}, [r0], r1
+
+    bne             vp8e_filt_blk2d_fp16x16_loop_neon
+
+;First-pass filtering for rest 5 lines
+    vld1.u8         {d14, d15, d16}, [r0], r1
+
+    vmull.u8        q9, d2, d0              ;(src_ptr[0] * Filter[0])
+    vmull.u8        q10, d3, d0
+    vmull.u8        q11, d5, d0
+    vmull.u8        q12, d6, d0
+    vmull.u8        q13, d8, d0
+    vmull.u8        q14, d9, d0
+
+    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
+    vext.8          d5, d5, d6, #1
+    vext.8          d8, d8, d9, #1
+
+    vmlal.u8        q9, d2, d1              ;(src_ptr[0] * Filter[1])
+    vmlal.u8        q11, d5, d1
+    vmlal.u8        q13, d8, d1
+
+    vext.8          d3, d3, d4, #1
+    vext.8          d6, d6, d7, #1
+    vext.8          d9, d9, d10, #1
+
+    vmlal.u8        q10, d3, d1             ;(src_ptr[0] * Filter[1])
+    vmlal.u8        q12, d6, d1
+    vmlal.u8        q14, d9, d1
+
+    vmull.u8        q1, d11, d0
+    vmull.u8        q2, d12, d0
+    vmull.u8        q3, d14, d0
+    vmull.u8        q4, d15, d0
+
+    vext.8          d11, d11, d12, #1       ;construct src_ptr[1]
+    vext.8          d14, d14, d15, #1
+
+    vmlal.u8        q1, d11, d1             ;(src_ptr[0] * Filter[1])
+    vmlal.u8        q3, d14, d1
+
+    vext.8          d12, d12, d13, #1
+    vext.8          d15, d15, d16, #1
+
+    vmlal.u8        q2, d12, d1             ;(src_ptr[0] * Filter[1])
+    vmlal.u8        q4, d15, d1
+
+    vqrshrn.u16    d10, q9, #7              ;shift/round/saturate to u8
+    vqrshrn.u16    d11, q10, #7
+    vqrshrn.u16    d12, q11, #7
+    vqrshrn.u16    d13, q12, #7
+    vqrshrn.u16    d14, q13, #7
+    vqrshrn.u16    d15, q14, #7
+    vqrshrn.u16    d16, q1, #7
+    vqrshrn.u16    d17, q2, #7
+    vqrshrn.u16    d18, q3, #7
+    vqrshrn.u16    d19, q4, #7
+
+    vst1.u8         {d10, d11, d12, d13}, [lr]!         ;store result
+    vst1.u8         {d14, d15, d16, d17}, [lr]!
+    vst1.u8         {d18, d19}, [lr]!
+
+;Second pass: 16x16
+;secondpass_filter
+    add             r3, r12, r3, lsl #3
+    sub             lr, lr, #272
+
+    vld1.u32        {d31}, [r3]             ;load second_pass filter
+
+    sub             sp, sp, #256
+    mov             r3, sp
+
+    vld1.u8         {d22, d23}, [lr]!       ;load src data
+
+    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
+    vdup.8          d1, d31[4]
+    mov             r12, #4                 ;loop counter
+
+vp8e_filt_blk2d_sp16x16_loop_neon
+    vld1.u8         {d24, d25}, [lr]!
+    vmull.u8        q1, d22, d0             ;(src_ptr[0] * Filter[0])
+    vld1.u8         {d26, d27}, [lr]!
+    vmull.u8        q2, d23, d0
+    vld1.u8         {d28, d29}, [lr]!
+    vmull.u8        q3, d24, d0
+    vld1.u8         {d30, d31}, [lr]!
+
+    vmull.u8        q4, d25, d0
+    vmull.u8        q5, d26, d0
+    vmull.u8        q6, d27, d0
+    vmull.u8        q7, d28, d0
+    vmull.u8        q8, d29, d0
+
+    vmlal.u8        q1, d24, d1             ;(src_ptr[pixel_step] * Filter[1])
+    vmlal.u8        q2, d25, d1
+    vmlal.u8        q3, d26, d1
+    vmlal.u8        q4, d27, d1
+    vmlal.u8        q5, d28, d1
+    vmlal.u8        q6, d29, d1
+    vmlal.u8        q7, d30, d1
+    vmlal.u8        q8, d31, d1
+
+    subs            r12, r12, #1
+
+    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
+    vqrshrn.u16    d3, q2, #7
+    vqrshrn.u16    d4, q3, #7
+    vqrshrn.u16    d5, q4, #7
+    vqrshrn.u16    d6, q5, #7
+    vqrshrn.u16    d7, q6, #7
+    vqrshrn.u16    d8, q7, #7
+    vqrshrn.u16    d9, q8, #7
+
+    vst1.u8         {d2, d3}, [r3]!         ;store result
+    vst1.u8         {d4, d5}, [r3]!
+    vst1.u8         {d6, d7}, [r3]!
+    vmov            q11, q15
+    vst1.u8         {d8, d9}, [r3]!
+
+    bne             vp8e_filt_blk2d_sp16x16_loop_neon
+
+    b               sub_pixel_variance16x16_neon
+
+;--------------------
+firstpass_bfilter16x16_only
+    mov             r2, #4                      ;loop counter
+    sub             sp, sp, #528            ;reserve space on stack for temporary storage
+    vdup.8          d0, d31[0]                  ;first_pass filter (d0 d1)
+    vdup.8          d1, d31[4]
+    mov             r3, sp
+
+;First Pass: output_height lines x output_width columns (16x16)
+vp8e_filt_blk2d_fpo16x16_loop_neon
+    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data
+    vld1.u8         {d5, d6, d7}, [r0], r1
+    vld1.u8         {d8, d9, d10}, [r0], r1
+    vld1.u8         {d11, d12, d13}, [r0], r1
+
+    pld             [r0]
+    pld             [r0, r1]
+    pld             [r0, r1, lsl #1]
+
+    vmull.u8        q7, d2, d0              ;(src_ptr[0] * Filter[0])
+    vmull.u8        q8, d3, d0
+    vmull.u8        q9, d5, d0
+    vmull.u8        q10, d6, d0
+    vmull.u8        q11, d8, d0
+    vmull.u8        q12, d9, d0
+    vmull.u8        q13, d11, d0
+    vmull.u8        q14, d12, d0
+
+    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
+    vext.8          d5, d5, d6, #1
+    vext.8          d8, d8, d9, #1
+    vext.8          d11, d11, d12, #1
+
+    vmlal.u8        q7, d2, d1              ;(src_ptr[0] * Filter[1])
+    vmlal.u8        q9, d5, d1
+    vmlal.u8        q11, d8, d1
+    vmlal.u8        q13, d11, d1
+
+    vext.8          d3, d3, d4, #1
+    vext.8          d6, d6, d7, #1
+    vext.8          d9, d9, d10, #1
+    vext.8          d12, d12, d13, #1
+
+    vmlal.u8        q8, d3, d1              ;(src_ptr[0] * Filter[1])
+    vmlal.u8        q10, d6, d1
+    vmlal.u8        q12, d9, d1
+    vmlal.u8        q14, d12, d1
+
+    subs            r2, r2, #1
+
+    vqrshrn.u16    d14, q7, #7              ;shift/round/saturate to u8
+    vqrshrn.u16    d15, q8, #7
+    vqrshrn.u16    d16, q9, #7
+    vqrshrn.u16    d17, q10, #7
+    vqrshrn.u16    d18, q11, #7
+    vqrshrn.u16    d19, q12, #7
+    vqrshrn.u16    d20, q13, #7
+    vst1.u8         {d14, d15}, [r3]!       ;store result
+    vqrshrn.u16    d21, q14, #7
+
+    vst1.u8         {d16, d17}, [r3]!
+    vst1.u8         {d18, d19}, [r3]!
+    vst1.u8         {d20, d21}, [r3]!
+
+    bne             vp8e_filt_blk2d_fpo16x16_loop_neon
+
+    b               sub_pixel_variance16x16_neon
+
+;---------------------
+secondpass_bfilter16x16_only
+;Second pass: 16x16
+;secondpass_filter
+    sub             sp, sp, #528            ;reserve space on stack for temporary storage
+    add             r3, r12, r3, lsl #3
+    mov             r12, #4                     ;loop counter
+    vld1.u32        {d31}, [r3]                 ;load second_pass filter
+    vld1.u8         {d22, d23}, [r0], r1        ;load src data
+    mov             r3, sp
+
+    vdup.8          d0, d31[0]                  ;second_pass filter parameters (d0 d1)
+    vdup.8          d1, d31[4]
+
+vp8e_filt_blk2d_spo16x16_loop_neon
+    vld1.u8         {d24, d25}, [r0], r1
+    vmull.u8        q1, d22, d0             ;(src_ptr[0] * Filter[0])
+    vld1.u8         {d26, d27}, [r0], r1
+    vmull.u8        q2, d23, d0
+    vld1.u8         {d28, d29}, [r0], r1
+    vmull.u8        q3, d24, d0
+    vld1.u8         {d30, d31}, [r0], r1
+
+    vmull.u8        q4, d25, d0
+    vmull.u8        q5, d26, d0
+    vmull.u8        q6, d27, d0
+    vmull.u8        q7, d28, d0
+    vmull.u8        q8, d29, d0
+
+    vmlal.u8        q1, d24, d1             ;(src_ptr[pixel_step] * Filter[1])
+    vmlal.u8        q2, d25, d1
+    vmlal.u8        q3, d26, d1
+    vmlal.u8        q4, d27, d1
+    vmlal.u8        q5, d28, d1
+    vmlal.u8        q6, d29, d1
+    vmlal.u8        q7, d30, d1
+    vmlal.u8        q8, d31, d1
+
+    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
+    vqrshrn.u16    d3, q2, #7
+    vqrshrn.u16    d4, q3, #7
+    vqrshrn.u16    d5, q4, #7
+    vqrshrn.u16    d6, q5, #7
+    vqrshrn.u16    d7, q6, #7
+    vqrshrn.u16    d8, q7, #7
+    vqrshrn.u16    d9, q8, #7
+
+    vst1.u8         {d2, d3}, [r3]!         ;store result
+    subs            r12, r12, #1
+    vst1.u8         {d4, d5}, [r3]!
+    vmov            q11, q15
+    vst1.u8         {d6, d7}, [r3]!
+    vst1.u8         {d8, d9}, [r3]!
+
+    bne             vp8e_filt_blk2d_spo16x16_loop_neon
+
+    b               sub_pixel_variance16x16_neon
+
+;----------------------------
+;variance16x16
+sub_pixel_variance16x16_neon
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+    sub             r3, r3, #256
+    mov             r12, #8
+
+sub_pixel_variance16x16_neon_loop
+    vld1.8          {q0}, [r3]!                 ;Load up source and reference
+    vld1.8          {q2}, [r4], r5
+    vld1.8          {q1}, [r3]!
+    vld1.8          {q3}, [r4], r5
+
+    vsubl.u8        q11, d0, d4                 ;diff
+    vsubl.u8        q12, d1, d5
+    vsubl.u8        q13, d2, d6
+    vsubl.u8        q14, d3, d7
+
+    vpadal.s16      q8, q11                     ;sum
+    vmlal.s16       q9, d22, d22                ;sse
+    vmlal.s16       q10, d23, d23
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q8, q12
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+    vpadal.s16      q8, q13
+    vmlal.s16       q9, d26, d26
+    vmlal.s16       q10, d27, d27
+    vpadal.s16      q8, q14
+    vmlal.s16       q9, d28, d28
+    vmlal.s16       q10, d29, d29
+
+    bne             sub_pixel_variance16x16_neon_loop
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [r6]               ;store sse
+    vshr.s32        d10, d10, #8
+    vsub.s32        d0, d1, d10
+
+    add             sp, sp, #528
+    vmov.32         r0, d0[0]                   ;return
+
+    pop             {r4-r6,pc}
+
+    ENDP
+
+;-----------------
+
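+; Each 8-byte entry below holds the two bilinear taps (128 - 16*offset,
+; 16*offset) for offsets 0..7 as 32-bit words; the filter location is
+; offset << 3 and the low byte of each word is broadcast with vdup.8.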
+_BilinearTaps_coeff_
+    DCD     bilinear_taps_coeff
+bilinear_taps_coeff
+    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+    END
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm
@@ -0,0 +1,572 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_variance_halfpixvar16x16_h_neon|
+    EXPORT  |vp8_variance_halfpixvar16x16_v_neon|
+    EXPORT  |vp8_variance_halfpixvar16x16_hv_neon|
+    EXPORT  |vp8_sub_pixel_variance16x16s_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;================================================
+;unsigned int vp8_variance_halfpixvar16x16_h_neon
+;(
+;    unsigned char  *src_ptr, r0
+;    int  src_pixels_per_line,  r1
+;    unsigned char *dst_ptr,  r2
+;    int dst_pixels_per_line,   r3
+;    unsigned int *sse
+;);
+;================================================
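+;note: for half-pixel offsets the bilinear filter reduces to a rounded
+;average, so these variants use vrhadd.u8 on horizontally (_h), vertically
+;(_v) or both (_hv) adjacent pixels and then accumulate sum/sse the same
+;way as vp8_variance16x16_neon.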
+|vp8_variance_halfpixvar16x16_h_neon| PROC
+    push            {lr}
+
+    mov             r12, #4                  ;loop counter
+    ldr             lr, [sp, #4]           ;load *sse from stack
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+;First Pass: output_height lines x output_width columns (16x16)
+vp8_filt_fpo16x16s_4_0_loop_neon
+    vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data
+    vld1.8          {q11}, [r2], r3
+    vld1.u8         {d4, d5, d6, d7}, [r0], r1
+    vld1.8          {q12}, [r2], r3
+    vld1.u8         {d8, d9, d10, d11}, [r0], r1
+    vld1.8          {q13}, [r2], r3
+    vld1.u8         {d12, d13, d14, d15}, [r0], r1
+
+    ;pld                [r0]
+    ;pld                [r0, r1]
+    ;pld                [r0, r1, lsl #1]
+
+    vext.8          q1, q0, q1, #1          ;construct src_ptr[1]
+    vext.8          q3, q2, q3, #1
+    vext.8          q5, q4, q5, #1
+    vext.8          q7, q6, q7, #1
+
+    vrhadd.u8       q0, q0, q1              ;(src_ptr[0]+src_ptr[1]+1) >> 1, i.e. rounded average
+    vld1.8          {q14}, [r2], r3
+    vrhadd.u8       q1, q2, q3
+    vrhadd.u8       q2, q4, q5
+    vrhadd.u8       q3, q6, q7
+
+    vsubl.u8        q4, d0, d22                 ;diff
+    vsubl.u8        q5, d1, d23
+    vsubl.u8        q6, d2, d24
+    vsubl.u8        q7, d3, d25
+    vsubl.u8        q0, d4, d26
+    vsubl.u8        q1, d5, d27
+    vsubl.u8        q2, d6, d28
+    vsubl.u8        q3, d7, d29
+
+    vpadal.s16      q8, q4                     ;sum
+    vmlal.s16       q9, d8, d8                ;sse
+    vmlal.s16       q10, d9, d9
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q8, q5
+    vmlal.s16       q9, d10, d10
+    vmlal.s16       q10, d11, d11
+    vpadal.s16      q8, q6
+    vmlal.s16       q9, d12, d12
+    vmlal.s16       q10, d13, d13
+    vpadal.s16      q8, q7
+    vmlal.s16       q9, d14, d14
+    vmlal.s16       q10, d15, d15
+
+    vpadal.s16      q8, q0                     ;sum
+    vmlal.s16       q9, d0, d0                ;sse
+    vmlal.s16       q10, d1, d1
+    vpadal.s16      q8, q1
+    vmlal.s16       q9, d2, d2
+    vmlal.s16       q10, d3, d3
+    vpadal.s16      q8, q2
+    vmlal.s16       q9, d4, d4
+    vmlal.s16       q10, d5, d5
+    vpadal.s16      q8, q3
+    vmlal.s16       q9, d6, d6
+    vmlal.s16       q10, d7, d7
+
+    bne             vp8_filt_fpo16x16s_4_0_loop_neon
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [lr]               ;store sse
+    vshr.s32        d10, d10, #8
+    vsub.s32        d0, d1, d10
+
+    vmov.32         r0, d0[0]                   ;return
+    pop             {pc}
+    ENDP
+
+;================================================
+;unsigned int vp8_variance_halfpixvar16x16_v_neon
+;(
+;    unsigned char  *src_ptr, r0
+;    int  src_pixels_per_line,  r1
+;    unsigned char *dst_ptr,  r2
+;    int dst_pixels_per_line,   r3
+;    unsigned int *sse
+;);
+;================================================
+|vp8_variance_halfpixvar16x16_v_neon| PROC
+    push            {lr}
+
+    mov             r12, #4                     ;loop counter
+
+    vld1.u8         {q0}, [r0], r1              ;load src data
+    ldr             lr, [sp, #4]                ;load *sse from stack
+
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+vp8_filt_spo16x16s_0_4_loop_neon
+    vld1.u8         {q2}, [r0], r1
+    vld1.8          {q1}, [r2], r3
+    vld1.u8         {q4}, [r0], r1
+    vld1.8          {q3}, [r2], r3
+    vld1.u8         {q6}, [r0], r1
+    vld1.8          {q5}, [r2], r3
+    vld1.u8         {q15}, [r0], r1
+
+    vrhadd.u8       q0, q0, q2
+    vld1.8          {q7}, [r2], r3
+    vrhadd.u8       q2, q2, q4
+    vrhadd.u8       q4, q4, q6
+    vrhadd.u8       q6, q6, q15
+
+    vsubl.u8        q11, d0, d2                 ;diff
+    vsubl.u8        q12, d1, d3
+    vsubl.u8        q13, d4, d6
+    vsubl.u8        q14, d5, d7
+    vsubl.u8        q0, d8, d10
+    vsubl.u8        q1, d9, d11
+    vsubl.u8        q2, d12, d14
+    vsubl.u8        q3, d13, d15
+
+    vpadal.s16      q8, q11                     ;sum
+    vmlal.s16       q9, d22, d22                ;sse
+    vmlal.s16       q10, d23, d23
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q8, q12
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+    vpadal.s16      q8, q13
+    vmlal.s16       q9, d26, d26
+    vmlal.s16       q10, d27, d27
+    vpadal.s16      q8, q14
+    vmlal.s16       q9, d28, d28
+    vmlal.s16       q10, d29, d29
+
+    vpadal.s16      q8, q0                     ;sum
+    vmlal.s16       q9, d0, d0                 ;sse
+    vmlal.s16       q10, d1, d1
+    vpadal.s16      q8, q1
+    vmlal.s16       q9, d2, d2
+    vmlal.s16       q10, d3, d3
+    vpadal.s16      q8, q2
+    vmlal.s16       q9, d4, d4
+    vmlal.s16       q10, d5, d5
+
+    vmov            q0, q15
+
+    vpadal.s16      q8, q3
+    vmlal.s16       q9, d6, d6
+    vmlal.s16       q10, d7, d7
+
+    bne             vp8_filt_spo16x16s_0_4_loop_neon
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [lr]               ;store sse
+    vshr.s32        d10, d10, #8
+    vsub.s32        d0, d1, d10
+
+    vmov.32         r0, d0[0]                   ;return
+    pop             {pc}
+    ENDP
+
+;================================================
+;unsigned int vp8_variance_halfpixvar16x16_hv_neon
+;(
+;    unsigned char  *src_ptr, r0
+;    int  src_pixels_per_line,  r1
+;    unsigned char *dst_ptr,  r2
+;    int dst_pixels_per_line,   r3
+;    unsigned int *sse
+;);
+;================================================
+|vp8_variance_halfpixvar16x16_hv_neon| PROC
+    push            {lr}
+
+    vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data
+
+    ldr             lr, [sp, #4]           ;load *sse from stack
+    vmov.i8         q13, #0                      ;q13 - sum
+    vext.8          q1, q0, q1, #1          ;construct src_ptr[1]
+
+    vmov.i8         q14, #0                      ;q14, q15 - sse
+    vmov.i8         q15, #0
+
+    mov             r12, #4                  ;loop counter
+    vrhadd.u8       q0, q0, q1              ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+
+;Combined horizontal/vertical half-pixel filter with variance accumulation (16x16 output, 17 source rows read)
+vp8_filt16x16s_4_4_loop_neon
+    vld1.u8         {d4, d5, d6, d7}, [r0], r1
+    vld1.u8         {d8, d9, d10, d11}, [r0], r1
+    vld1.u8         {d12, d13, d14, d15}, [r0], r1
+    vld1.u8         {d16, d17, d18, d19}, [r0], r1
+
+    ;pld                [r0]
+    ;pld                [r0, r1]
+    ;pld                [r0, r1, lsl #1]
+
+    vext.8          q3, q2, q3, #1          ;construct src_ptr[1]
+    vext.8          q5, q4, q5, #1
+    vext.8          q7, q6, q7, #1
+    vext.8          q9, q8, q9, #1
+
+    vrhadd.u8       q1, q2, q3              ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+    vrhadd.u8       q2, q4, q5
+    vrhadd.u8       q3, q6, q7
+    vrhadd.u8       q4, q8, q9
+
+    vld1.8          {q5}, [r2], r3
+    vrhadd.u8       q0, q0, q1
+    vld1.8          {q6}, [r2], r3
+    vrhadd.u8       q1, q1, q2
+    vld1.8          {q7}, [r2], r3
+    vrhadd.u8       q2, q2, q3
+    vld1.8          {q8}, [r2], r3
+    vrhadd.u8       q3, q3, q4
+
+    vsubl.u8        q9, d0, d10                 ;diff
+    vsubl.u8        q10, d1, d11
+    vsubl.u8        q11, d2, d12
+    vsubl.u8        q12, d3, d13
+
+    vsubl.u8        q0, d4, d14                 ;diff
+    vsubl.u8        q1, d5, d15
+    vsubl.u8        q5, d6, d16
+    vsubl.u8        q6, d7, d17
+
+    vpadal.s16      q13, q9                     ;sum
+    vmlal.s16       q14, d18, d18                ;sse
+    vmlal.s16       q15, d19, d19
+
+    vpadal.s16      q13, q10                     ;sum
+    vmlal.s16       q14, d20, d20                ;sse
+    vmlal.s16       q15, d21, d21
+
+    vpadal.s16      q13, q11                     ;sum
+    vmlal.s16       q14, d22, d22                ;sse
+    vmlal.s16       q15, d23, d23
+
+    vpadal.s16      q13, q12                     ;sum
+    vmlal.s16       q14, d24, d24                ;sse
+    vmlal.s16       q15, d25, d25
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q13, q0                     ;sum
+    vmlal.s16       q14, d0, d0                ;sse
+    vmlal.s16       q15, d1, d1
+
+    vpadal.s16      q13, q1                     ;sum
+    vmlal.s16       q14, d2, d2                ;sse
+    vmlal.s16       q15, d3, d3
+
+    vpadal.s16      q13, q5                     ;sum
+    vmlal.s16       q14, d10, d10                ;sse
+    vmlal.s16       q15, d11, d11
+
+    vmov            q0, q4
+
+    vpadal.s16      q13, q6                     ;sum
+    vmlal.s16       q14, d12, d12                ;sse
+    vmlal.s16       q15, d13, d13
+
+    bne             vp8_filt16x16s_4_4_loop_neon
+
+    vadd.u32        q15, q14, q15                ;accumulate sse
+    vpaddl.s32      q0, q13                      ;accumulate sum
+
+    vpaddl.u32      q1, q15
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [lr]               ;store sse
+    vshr.s32        d10, d10, #8
+    vsub.s32        d0, d1, d10
+
+    vmov.32         r0, d0[0]                   ;return
+    pop             {pc}
+    ENDP
+
+;==============================
+; r0    unsigned char  *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; stack unsigned char *dst_ptr,
+; stack int dst_pixels_per_line,
+; stack unsigned int *sse
+;note: this routine is used by vp8_find_best_half_pixel_step() (called when 8 < Speed < 15) and by the first call of
+;vp8_find_best_sub_pixel_step() (called when Speed <= 8). In those cases xoffset/yoffset can only be 4 or 0, which means
+;the filter is either bypassed or its coefficients are {64, 64}. This simplified routine only handles that situation.
+;note: the case where both xoffset and yoffset are zero also occurs; it can be handled in C code instead.
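+;note: for reference, a rough C-equivalent of the half-pixel path handled here (horizontal case shown):
+;          avg  = (src[0] + src[1] + 1) >> 1;            (vrhadd.u8)
+;          diff = avg - dst[0];  sum += diff;  sse += diff * diff;
+;          return sse - ((sum * sum) >> 8);              (16x16 block: sum^2 / 256)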
+
+|vp8_sub_pixel_variance16x16s_neon| PROC
+    push            {r4, lr}
+
+    ldr             r4, [sp, #8]            ;load *dst_ptr from stack
+    ldr             r12, [sp, #12]          ;load dst_pixels_per_line from stack
+    ldr             lr, [sp, #16]           ;load *sse from stack
+
+    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
+    beq             secondpass_bfilter16x16s_only
+
+    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
+    beq             firstpass_bfilter16x16s_only
+
+    vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data
+    sub             sp, sp, #256            ;reserve space on stack for temporary storage
+    vext.8          q1, q0, q1, #1          ;construct src_ptr[1]
+    mov             r3, sp
+    mov             r2, #4                  ;loop counter
+    vrhadd.u8       q0, q0, q1              ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+
+;First Pass: output_height lines x output_width columns (17x16)
+vp8e_filt_blk2d_fp16x16s_loop_neon
+    vld1.u8         {d4, d5, d6, d7}, [r0], r1
+    vld1.u8         {d8, d9, d10, d11}, [r0], r1
+    vld1.u8         {d12, d13, d14, d15}, [r0], r1
+    vld1.u8         {d16, d17, d18, d19}, [r0], r1
+
+    ;pld                [r0]
+    ;pld                [r0, r1]
+    ;pld                [r0, r1, lsl #1]
+
+    vext.8          q3, q2, q3, #1          ;construct src_ptr[1]
+    vext.8          q5, q4, q5, #1
+    vext.8          q7, q6, q7, #1
+    vext.8          q9, q8, q9, #1
+
+    vrhadd.u8       q1, q2, q3              ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+    vrhadd.u8       q2, q4, q5
+    vrhadd.u8       q3, q6, q7
+    vrhadd.u8       q4, q8, q9
+
+    vrhadd.u8       q0, q0, q1
+    vrhadd.u8       q1, q1, q2
+    vrhadd.u8       q2, q2, q3
+    vrhadd.u8       q3, q3, q4
+
+    subs            r2, r2, #1
+    vst1.u8         {d0, d1, d2, d3}, [r3]!         ;store result
+    vmov            q0, q4
+    vst1.u8         {d4, d5, d6, d7}, [r3]!
+
+    bne             vp8e_filt_blk2d_fp16x16s_loop_neon
+
+    b               sub_pixel_variance16x16s_neon
+
+;--------------------
+firstpass_bfilter16x16s_only
+    mov             r2, #2                  ;loop counter
+    sub             sp, sp, #256            ;reserve space on stack for temporary storage
+    mov             r3, sp
+
+;First Pass: output_height lines x output_width columns (16x16)
+vp8e_filt_blk2d_fpo16x16s_loop_neon
+    vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data
+    vld1.u8         {d4, d5, d6, d7}, [r0], r1
+    vld1.u8         {d8, d9, d10, d11}, [r0], r1
+    vld1.u8         {d12, d13, d14, d15}, [r0], r1
+
+    ;pld                [r0]
+    ;pld                [r0, r1]
+    ;pld                [r0, r1, lsl #1]
+
+    vext.8          q1, q0, q1, #1          ;construct src_ptr[1]
+    vld1.u8         {d16, d17, d18, d19}, [r0], r1
+    vext.8          q3, q2, q3, #1
+    vld1.u8         {d20, d21, d22, d23}, [r0], r1
+    vext.8          q5, q4, q5, #1
+    vld1.u8         {d24, d25, d26, d27}, [r0], r1
+    vext.8          q7, q6, q7, #1
+    vld1.u8         {d28, d29, d30, d31}, [r0], r1
+    vext.8          q9, q8, q9, #1
+    vext.8          q11, q10, q11, #1
+    vext.8          q13, q12, q13, #1
+    vext.8          q15, q14, q15, #1
+
+    vrhadd.u8       q0, q0, q1              ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+    vrhadd.u8       q1, q2, q3
+    vrhadd.u8       q2, q4, q5
+    vrhadd.u8       q3, q6, q7
+    vrhadd.u8       q4, q8, q9
+    vrhadd.u8       q5, q10, q11
+    vrhadd.u8       q6, q12, q13
+    vrhadd.u8       q7, q14, q15
+
+    subs            r2, r2, #1
+
+    vst1.u8         {d0, d1, d2, d3}, [r3]!         ;store result
+    vst1.u8         {d4, d5, d6, d7}, [r3]!
+    vst1.u8         {d8, d9, d10, d11}, [r3]!
+    vst1.u8         {d12, d13, d14, d15}, [r3]!
+
+    bne             vp8e_filt_blk2d_fpo16x16s_loop_neon
+
+    b               sub_pixel_variance16x16s_neon
+
+;---------------------
+secondpass_bfilter16x16s_only
+    sub             sp, sp, #256            ;reserve space on stack for temporary storage
+
+    mov             r2, #2                  ;loop counter
+    vld1.u8         {d0, d1}, [r0], r1      ;load src data
+    mov             r3, sp
+
+vp8e_filt_blk2d_spo16x16s_loop_neon
+    vld1.u8         {d2, d3}, [r0], r1
+    vld1.u8         {d4, d5}, [r0], r1
+    vld1.u8         {d6, d7}, [r0], r1
+    vld1.u8         {d8, d9}, [r0], r1
+
+    vrhadd.u8       q0, q0, q1
+    vld1.u8         {d10, d11}, [r0], r1
+    vrhadd.u8       q1, q1, q2
+    vld1.u8         {d12, d13}, [r0], r1
+    vrhadd.u8       q2, q2, q3
+    vld1.u8         {d14, d15}, [r0], r1
+    vrhadd.u8       q3, q3, q4
+    vld1.u8         {d16, d17}, [r0], r1
+    vrhadd.u8       q4, q4, q5
+    vrhadd.u8       q5, q5, q6
+    vrhadd.u8       q6, q6, q7
+    vrhadd.u8       q7, q7, q8
+
+    subs            r2, r2, #1
+
+    vst1.u8         {d0, d1, d2, d3}, [r3]!         ;store result
+    vmov            q0, q8
+    vst1.u8         {d4, d5, d6, d7}, [r3]!
+    vst1.u8         {d8, d9, d10, d11}, [r3]!           ;store result
+    vst1.u8         {d12, d13, d14, d15}, [r3]!
+
+    bne             vp8e_filt_blk2d_spo16x16s_loop_neon
+
+    b               sub_pixel_variance16x16s_neon
+
+;----------------------------
+;variance16x16
+sub_pixel_variance16x16s_neon
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+    sub             r3, r3, #256
+    mov             r2, #4
+
+sub_pixel_variance16x16s_neon_loop
+    vld1.8          {q0}, [r3]!                 ;Load up source and reference
+    vld1.8          {q1}, [r4], r12
+    vld1.8          {q2}, [r3]!
+    vld1.8          {q3}, [r4], r12
+    vld1.8          {q4}, [r3]!
+    vld1.8          {q5}, [r4], r12
+    vld1.8          {q6}, [r3]!
+    vld1.8          {q7}, [r4], r12
+
+    vsubl.u8        q11, d0, d2                 ;diff
+    vsubl.u8        q12, d1, d3
+    vsubl.u8        q13, d4, d6
+    vsubl.u8        q14, d5, d7
+    vsubl.u8        q0, d8, d10
+    vsubl.u8        q1, d9, d11
+    vsubl.u8        q2, d12, d14
+    vsubl.u8        q3, d13, d15
+
+    vpadal.s16      q8, q11                     ;sum
+    vmlal.s16       q9, d22, d22                ;sse
+    vmlal.s16       q10, d23, d23
+
+    subs            r2, r2, #1
+
+    vpadal.s16      q8, q12
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+    vpadal.s16      q8, q13
+    vmlal.s16       q9, d26, d26
+    vmlal.s16       q10, d27, d27
+    vpadal.s16      q8, q14
+    vmlal.s16       q9, d28, d28
+    vmlal.s16       q10, d29, d29
+
+    vpadal.s16      q8, q0                     ;sum
+    vmlal.s16       q9, d0, d0                ;sse
+    vmlal.s16       q10, d1, d1
+    vpadal.s16      q8, q1
+    vmlal.s16       q9, d2, d2
+    vmlal.s16       q10, d3, d3
+    vpadal.s16      q8, q2
+    vmlal.s16       q9, d4, d4
+    vmlal.s16       q10, d5, d5
+    vpadal.s16      q8, q3
+    vmlal.s16       q9, d6, d6
+    vmlal.s16       q10, d7, d7
+
+    bne             sub_pixel_variance16x16s_neon_loop
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [lr]               ;store sse
+    vshr.s32        d10, d10, #8
+    vsub.s32        d0, d1, d10
+
+    add             sp, sp, #256
+    vmov.32         r0, d0[0]                   ;return
+
+    pop             {r4, pc}
+    ENDP
+
+    END
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/neon/vp8_subpixelvariance8x8_neon.asm
@@ -0,0 +1,224 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_sub_pixel_variance8x8_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0    unsigned char  *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; stack(r4) unsigned char *dst_ptr,
+; stack(r5) int dst_pixels_per_line,
+; stack(r6) unsigned int *sse
+;note: most of the code is copied from bilinear_predict8x8_neon and vp8_variance8x8_neon.
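+;note: for reference, a rough sketch of the flow, assuming the usual two-pass bilinear scheme:
+;      first pass  - 9 rows are filtered horizontally with the {Filter[0], Filter[1]} pair selected
+;                    by xoffset from the bilinear taps table, results held in d22-d30
+;      second pass - 8 rows are filtered vertically with the yoffset pair, then the 8x8 variance
+;                    against dst_ptr is formed as sse - ((sum * sum) >> 6)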
+
+|vp8_sub_pixel_variance8x8_neon| PROC
+    push            {r4-r5, lr}
+
+    ldr             r12, _BilinearTaps_coeff_
+    ldr             r4, [sp, #12]           ;load *dst_ptr from stack
+    ldr             r5, [sp, #16]           ;load dst_pixels_per_line from stack
+    ldr             lr, [sp, #20]           ;load *sse from stack
+
+    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
+    beq             skip_firstpass_filter
+
+;First pass: output_height lines x output_width columns (9x8)
+    add             r2, r12, r2, lsl #3     ;calculate filter location
+
+    vld1.u8         {q1}, [r0], r1          ;load src data
+    vld1.u32        {d31}, [r2]             ;load first_pass filter
+    vld1.u8         {q2}, [r0], r1
+    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1)
+    vld1.u8         {q3}, [r0], r1
+    vdup.8          d1, d31[4]
+    vld1.u8         {q4}, [r0], r1
+
+    vmull.u8        q6, d2, d0              ;(src_ptr[0] * Filter[0])
+    vmull.u8        q7, d4, d0
+    vmull.u8        q8, d6, d0
+    vmull.u8        q9, d8, d0
+
+    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
+    vext.8          d5, d4, d5, #1
+    vext.8          d7, d6, d7, #1
+    vext.8          d9, d8, d9, #1
+
+    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * Filter[1])
+    vmlal.u8        q7, d5, d1
+    vmlal.u8        q8, d7, d1
+    vmlal.u8        q9, d9, d1
+
+    vld1.u8         {q1}, [r0], r1          ;load src data
+    vqrshrn.u16    d22, q6, #7              ;shift/round/saturate to u8
+    vld1.u8         {q2}, [r0], r1
+    vqrshrn.u16    d23, q7, #7
+    vld1.u8         {q3}, [r0], r1
+    vqrshrn.u16    d24, q8, #7
+    vld1.u8         {q4}, [r0], r1
+    vqrshrn.u16    d25, q9, #7
+
+    ;first_pass filtering on the remaining 5 lines of data
+    vld1.u8         {q5}, [r0], r1
+
+    vmull.u8        q6, d2, d0              ;(src_ptr[0] * Filter[0])
+    vmull.u8        q7, d4, d0
+    vmull.u8        q8, d6, d0
+    vmull.u8        q9, d8, d0
+    vmull.u8        q10, d10, d0
+
+    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
+    vext.8          d5, d4, d5, #1
+    vext.8          d7, d6, d7, #1
+    vext.8          d9, d8, d9, #1
+    vext.8          d11, d10, d11, #1
+
+    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * Filter[1])
+    vmlal.u8        q7, d5, d1
+    vmlal.u8        q8, d7, d1
+    vmlal.u8        q9, d9, d1
+    vmlal.u8        q10, d11, d1
+
+    vqrshrn.u16    d26, q6, #7              ;shift/round/saturate to u8
+    vqrshrn.u16    d27, q7, #7
+    vqrshrn.u16    d28, q8, #7
+    vqrshrn.u16    d29, q9, #7
+    vqrshrn.u16    d30, q10, #7
+
+;Second pass: 8x8
+secondpass_filter
+    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
+    ;if yoffset=0, skip the second-pass filter and go straight to the variance calculation
+    beq             sub_pixel_variance8x8_neon
+
+    add             r3, r12, r3, lsl #3
+
+    vld1.u32        {d31}, [r3]             ;load second_pass filter
+
+    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
+    vdup.8          d1, d31[4]
+
+    vmull.u8        q1, d22, d0             ;(src_ptr[0] * Filter[0])
+    vmull.u8        q2, d23, d0
+    vmull.u8        q3, d24, d0
+    vmull.u8        q4, d25, d0
+    vmull.u8        q5, d26, d0
+    vmull.u8        q6, d27, d0
+    vmull.u8        q7, d28, d0
+    vmull.u8        q8, d29, d0
+
+    vmlal.u8        q1, d23, d1             ;(src_ptr[pixel_step] * Filter[1])
+    vmlal.u8        q2, d24, d1
+    vmlal.u8        q3, d25, d1
+    vmlal.u8        q4, d26, d1
+    vmlal.u8        q5, d27, d1
+    vmlal.u8        q6, d28, d1
+    vmlal.u8        q7, d29, d1
+    vmlal.u8        q8, d30, d1
+
+    vqrshrn.u16    d22, q1, #7              ;shift/round/saturate to u8
+    vqrshrn.u16    d23, q2, #7
+    vqrshrn.u16    d24, q3, #7
+    vqrshrn.u16    d25, q4, #7
+    vqrshrn.u16    d26, q5, #7
+    vqrshrn.u16    d27, q6, #7
+    vqrshrn.u16    d28, q7, #7
+    vqrshrn.u16    d29, q8, #7
+
+    b               sub_pixel_variance8x8_neon
+
+;--------------------
+skip_firstpass_filter
+    vld1.u8         {d22}, [r0], r1         ;load src data
+    vld1.u8         {d23}, [r0], r1
+    vld1.u8         {d24}, [r0], r1
+    vld1.u8         {d25}, [r0], r1
+    vld1.u8         {d26}, [r0], r1
+    vld1.u8         {d27}, [r0], r1
+    vld1.u8         {d28}, [r0], r1
+    vld1.u8         {d29}, [r0], r1
+    vld1.u8         {d30}, [r0], r1
+
+    b               secondpass_filter
+
+;----------------------
+;vp8_variance8x8_neon
+sub_pixel_variance8x8_neon
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+    mov             r12, #2
+
+sub_pixel_variance8x8_neon_loop
+    vld1.8          {d0}, [r4], r5              ;load dst data
+    subs            r12, r12, #1
+    vld1.8          {d1}, [r4], r5
+    vld1.8          {d2}, [r4], r5
+    vsubl.u8        q4, d22, d0                 ;calculate diff
+    vld1.8          {d3}, [r4], r5
+
+    vsubl.u8        q5, d23, d1
+    vsubl.u8        q6, d24, d2
+
+    vpadal.s16      q8, q4                      ;sum
+    vmlal.s16       q9, d8, d8                  ;sse
+    vmlal.s16       q10, d9, d9
+
+    vsubl.u8        q7, d25, d3
+
+    vpadal.s16      q8, q5
+    vmlal.s16       q9, d10, d10
+    vmlal.s16       q10, d11, d11
+
+    vmov            q11, q13
+
+    vpadal.s16      q8, q6
+    vmlal.s16       q9, d12, d12
+    vmlal.s16       q10, d13, d13
+
+    vmov            q12, q14
+
+    vpadal.s16      q8, q7
+    vmlal.s16       q9, d14, d14
+    vmlal.s16       q10, d15, d15
+
+    bne             sub_pixel_variance8x8_neon_loop
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [lr]               ;store sse
+    vshr.s32        d10, d10, #6
+    vsub.s32        d0, d1, d10
+
+    vmov.32         r0, d0[0]                   ;return
+    pop             {r4-r5, pc}
+
+    ENDP
+
+;-----------------
+
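+;note: each entry below is a {Filter[0], Filter[1]} pair stored as two 32-bit words (8 bytes), which is
+;why the filter location is computed as table base + offset * 8 (lsl #3); offset 0 means copy ({128, 0})
+;and offset 4 is the half-pixel filter ({64, 64}).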
+_BilinearTaps_coeff_
+    DCD     bilinear_taps_coeff
+bilinear_taps_coeff
+    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+    END
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/picklpf_arm.c
@@ -0,0 +1,50 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/common/onyxc_int.h"
+#include "vp8/encoder/onyx_int.h"
+#include "vp8/encoder/quantize.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_scale/yv12extend.h"
+#include "vpx_scale/vpxscale.h"
+#include "vp8/common/alloccommon.h"
+
+extern void vp8_memcpy_neon(unsigned char *dst_ptr, unsigned char *src_ptr, int sz);
+
+
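+/* Rough summary (assuming the standard YV12 layout): copies a horizontal band
+ * of the luma plane, roughly centred on the middle of the frame, from src_ybc
+ * to dst_ybc.  The band height scales with yheight / 2^(Fraction + 4) in
+ * 16-row units (at least one unit); the chroma planes are not touched.
+ */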
+void
+vpxyv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction)
+{
+    unsigned char *src_y, *dst_y;
+    int yheight;
+    int ystride;
+    int border;
+    int yoffset;
+    int linestocopy;
+
+    border   = src_ybc->border;
+    yheight  = src_ybc->y_height;
+    ystride  = src_ybc->y_stride;
+
+    linestocopy = (yheight >> (Fraction + 4));
+
+    if (linestocopy < 1)
+        linestocopy = 1;
+
+    linestocopy <<= 4;
+
+    yoffset  = ystride * ((yheight >> 5) * 16 - 8);
+    src_y = src_ybc->y_buffer + yoffset;
+    dst_y = dst_ybc->y_buffer + yoffset;
+
+    //vpx_memcpy (dst_y, src_y, ystride * (linestocopy +16));
+    vp8_memcpy_neon((unsigned char *)dst_y, (unsigned char *)src_y, (int)(ystride *(linestocopy + 16)));
+}
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/quantize_arm.h
@@ -0,0 +1,52 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef QUANTIZE_ARM_H
+#define QUANTIZE_ARM_H
+
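+/* When runtime CPU detection is disabled, the generic vp8_quantize_* names are
+ * redirected at compile time to the ARM-specific implementations declared
+ * below; with runtime detection enabled they are instead installed through the
+ * ARM csystemdependent function-pointer setup.
+ */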
+#if HAVE_ARMV6
+
+extern prototype_quantize_block(vp8_fast_quantize_b_armv6);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp8_quantize_fastquantb
+#define vp8_quantize_fastquantb vp8_fast_quantize_b_armv6
+#endif
+
+#endif /* HAVE_ARMV6 */
+
+
+#if HAVE_ARMV7
+
+extern prototype_quantize_block(vp8_fast_quantize_b_neon);
+extern prototype_quantize_block_pair(vp8_fast_quantize_b_pair_neon);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp8_quantize_fastquantb
+#define vp8_quantize_fastquantb vp8_fast_quantize_b_neon
+
+#undef  vp8_quantize_fastquantb_pair
+#define vp8_quantize_fastquantb_pair vp8_fast_quantize_b_pair_neon
+
+#undef vp8_quantize_mb
+#define vp8_quantize_mb vp8_quantize_mb_neon
+
+#undef vp8_quantize_mbuv
+#define vp8_quantize_mbuv vp8_quantize_mbuv_neon
+
+#undef vp8_quantize_mby
+#define vp8_quantize_mby vp8_quantize_mby_neon
+#endif
+
+#endif /* HAVE_ARMV7 */
+
+#endif
+
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/variance_arm.c
@@ -0,0 +1,120 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8/encoder/variance.h"
+#include "vp8/common/filter.h"
+#include "vp8/common/arm/bilinearfilter_arm.h"
+
+#if HAVE_ARMV6
+
+unsigned int vp8_sub_pixel_variance8x8_armv6
+(
+    const unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    const unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+)
+{
+    unsigned short first_pass[10*8];
+    unsigned char  second_pass[8*8];
+    const short *HFilter, *VFilter;
+
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
+
+    vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
+                                            src_pixels_per_line,
+                                            9, 8, HFilter);
+    vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
+                                             8, 8, 8, VFilter);
+
+    return vp8_variance8x8_armv6(second_pass, 8, dst_ptr,
+                                   dst_pixels_per_line, sse);
+}
+
+unsigned int vp8_sub_pixel_variance16x16_armv6
+(
+    const unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    const unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+)
+{
+    unsigned short first_pass[36*16];
+    unsigned char  second_pass[20*16];
+    const short *HFilter, *VFilter;
+    unsigned int var;
+
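+    /* An offset of 4 is exactly the half-pixel position, so those cases are
+     * served by the dedicated half-pixel variance kernels instead of the
+     * generic two-pass bilinear filter below. */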
+    if (xoffset == 4 && yoffset == 0)
+    {
+        var = vp8_variance_halfpixvar16x16_h_armv6(src_ptr, src_pixels_per_line,
+                                                   dst_ptr, dst_pixels_per_line, sse);
+    }
+    else if (xoffset == 0 && yoffset == 4)
+    {
+        var = vp8_variance_halfpixvar16x16_v_armv6(src_ptr, src_pixels_per_line,
+                                                   dst_ptr, dst_pixels_per_line, sse);
+    }
+    else if (xoffset == 4 && yoffset == 4)
+    {
+        var = vp8_variance_halfpixvar16x16_hv_armv6(src_ptr, src_pixels_per_line,
+                                                   dst_ptr, dst_pixels_per_line, sse);
+    }
+    else
+    {
+        HFilter = vp8_bilinear_filters[xoffset];
+        VFilter = vp8_bilinear_filters[yoffset];
+
+        vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
+                                                src_pixels_per_line,
+                                                17, 16, HFilter);
+        vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
+                                                 16, 16, 16, VFilter);
+
+        var = vp8_variance16x16_armv6(second_pass, 16, dst_ptr,
+                                       dst_pixels_per_line, sse);
+    }
+    return var;
+}
+
+#endif /* HAVE_ARMV6 */
+
+
+#if HAVE_ARMV7
+
+unsigned int vp8_sub_pixel_variance16x16_neon
+(
+    const unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    const unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+)
+{
+  if (xoffset == 4 && yoffset == 0)
+    return vp8_variance_halfpixvar16x16_h_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
+  else if (xoffset == 0 && yoffset == 4)
+    return vp8_variance_halfpixvar16x16_v_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
+  else if (xoffset == 4 && yoffset == 4)
+    return vp8_variance_halfpixvar16x16_hv_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
+  else
+    return vp8_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
+}
+
+#endif
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/encoder/arm/variance_arm.h
@@ -0,0 +1,155 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VARIANCE_ARM_H
+#define VARIANCE_ARM_H
+
+#if HAVE_ARMV6
+
+extern prototype_sad(vp8_sad16x16_armv6);
+extern prototype_variance(vp8_variance16x16_armv6);
+extern prototype_variance(vp8_variance8x8_armv6);
+extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_armv6);
+extern prototype_subpixvariance(vp8_sub_pixel_variance8x8_armv6);
+extern prototype_variance(vp8_variance_halfpixvar16x16_h_armv6);
+extern prototype_variance(vp8_variance_halfpixvar16x16_v_armv6);
+extern prototype_variance(vp8_variance_halfpixvar16x16_hv_armv6);
+extern prototype_variance(vp8_mse16x16_armv6);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef  vp8_variance_sad16x16
+#define vp8_variance_sad16x16 vp8_sad16x16_armv6
+
+#undef  vp8_variance_subpixvar16x16
+#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_armv6
+
+#undef  vp8_variance_subpixvar8x8
+#define vp8_variance_subpixvar8x8 vp8_sub_pixel_variance8x8_armv6
+
+#undef  vp8_variance_var16x16
+#define vp8_variance_var16x16 vp8_variance16x16_armv6
+
+#undef  vp8_variance_mse16x16
+#define vp8_variance_mse16x16 vp8_mse16x16_armv6
+
+#undef  vp8_variance_var8x8
+#define vp8_variance_var8x8 vp8_variance8x8_armv6
+
+#undef  vp8_variance_halfpixvar16x16_h
+#define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_armv6
+
+#undef  vp8_variance_halfpixvar16x16_v
+#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_armv6
+
+#undef  vp8_variance_halfpixvar16x16_hv
+#define vp8_variance_halfpixvar16x16_hv vp8_variance_halfpixvar16x16_hv_armv6
+
+#endif /* !CONFIG_RUNTIME_CPU_DETECT */
+
+#endif /* HAVE_ARMV6 */
+
+
+#if HAVE_ARMV7
+extern prototype_sad(vp8_sad4x4_neon);
+extern prototype_sad(vp8_sad8x8_neon);
+extern prototype_sad(vp8_sad8x16_neon);
+extern prototype_sad(vp8_sad16x8_neon);
+extern prototype_sad(vp8_sad16x16_neon);
+
+//extern prototype_variance(vp8_variance4x4_c);
+extern prototype_variance(vp8_variance8x8_neon);
+extern prototype_variance(vp8_variance8x16_neon);
+extern prototype_variance(vp8_variance16x8_neon);
+extern prototype_variance(vp8_variance16x16_neon);
+
+//extern prototype_subpixvariance(vp8_sub_pixel_variance4x4_c);
+extern prototype_subpixvariance(vp8_sub_pixel_variance8x8_neon);
+//extern prototype_subpixvariance(vp8_sub_pixel_variance8x16_c);
+//extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_c);
+extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_neon);
+extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_neon_func);
+extern prototype_variance(vp8_variance_halfpixvar16x16_h_neon);
+extern prototype_variance(vp8_variance_halfpixvar16x16_v_neon);
+extern prototype_variance(vp8_variance_halfpixvar16x16_hv_neon);
+
+//extern prototype_getmbss(vp8_get_mb_ss_c);
+extern prototype_variance(vp8_mse16x16_neon);
+extern prototype_get16x16prederror(vp8_get4x4sse_cs_neon);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp8_variance_sad4x4
+#define vp8_variance_sad4x4 vp8_sad4x4_neon
+
+#undef  vp8_variance_sad8x8
+#define vp8_variance_sad8x8 vp8_sad8x8_neon
+
+#undef  vp8_variance_sad8x16
+#define vp8_variance_sad8x16 vp8_sad8x16_neon
+
+#undef  vp8_variance_sad16x8
+#define vp8_variance_sad16x8 vp8_sad16x8_neon
+
+#undef  vp8_variance_sad16x16
+#define vp8_variance_sad16x16 vp8_sad16x16_neon
+
+//#undef  vp8_variance_var4x4
+//#define vp8_variance_var4x4 vp8_variance4x4_c
+
+#undef  vp8_variance_var8x8
+#define vp8_variance_var8x8 vp8_variance8x8_neon
+
+#undef  vp8_variance_var8x16
+#define vp8_variance_var8x16 vp8_variance8x16_neon
+
+#undef  vp8_variance_var16x8
+#define vp8_variance_var16x8 vp8_variance16x8_neon
+
+#undef  vp8_variance_var16x16
+#define vp8_variance_var16x16 vp8_variance16x16_neon
+
+//#undef  vp8_variance_subpixvar4x4
+//#define vp8_variance_subpixvar4x4 vp8_sub_pixel_variance4x4_c
+
+#undef  vp8_variance_subpixvar8x8
+#define vp8_variance_subpixvar8x8 vp8_sub_pixel_variance8x8_neon
+
+//#undef  vp8_variance_subpixvar8x16
+//#define vp8_variance_subpixvar8x16 vp8_sub_pixel_variance8x16_c
+
+//#undef  vp8_variance_subpixvar16x8
+//#define vp8_variance_subpixvar16x8 vp8_sub_pixel_variance16x8_c
+
+#undef  vp8_variance_subpixvar16x16
+#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_neon
+
+#undef  vp8_variance_halfpixvar16x16_h
+#define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_neon
+
+#undef  vp8_variance_halfpixvar16x16_v
+#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_neon
+
+#undef  vp8_variance_halfpixvar16x16_hv
+#define vp8_variance_halfpixvar16x16_hv vp8_variance_halfpixvar16x16_hv_neon
+
+//#undef  vp8_variance_getmbss
+//#define vp8_variance_getmbss vp8_get_mb_ss_c
+
+#undef  vp8_variance_mse16x16
+#define vp8_variance_mse16x16 vp8_mse16x16_neon
+
+#undef  vp8_variance_get4x4sse_cs
+#define vp8_variance_get4x4sse_cs vp8_get4x4sse_cs_neon
+#endif
+
+#endif
+
+#endif
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/encoder/asm_enc_offsets.c
@@ -0,0 +1,91 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/asm_offsets.h"
+#include "vpx_config.h"
+#include "block.h"
+#include "vp8/common/blockd.h"
+#include "onyx_int.h"
+#include "treewriter.h"
+#include "tokenize.h"
+
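+/* The DEFINE() entries below are emitted as assembler-visible constants by the
+ * asm_offsets machinery, so the hand-written ARM code can address these struct
+ * members without duplicating the C structure layouts. */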
+BEGIN
+
+/* regular quantize */
+DEFINE(vp8_block_coeff,                         offsetof(BLOCK, coeff));
+DEFINE(vp8_block_zbin,                          offsetof(BLOCK, zbin));
+DEFINE(vp8_block_round,                         offsetof(BLOCK, round));
+DEFINE(vp8_block_quant,                         offsetof(BLOCK, quant));
+DEFINE(vp8_block_quant_fast,                    offsetof(BLOCK, quant_fast));
+DEFINE(vp8_block_zbin_extra,                    offsetof(BLOCK, zbin_extra));
+DEFINE(vp8_block_zrun_zbin_boost,               offsetof(BLOCK, zrun_zbin_boost));
+DEFINE(vp8_block_quant_shift,                   offsetof(BLOCK, quant_shift));
+
+DEFINE(vp8_blockd_qcoeff,                       offsetof(BLOCKD, qcoeff));
+DEFINE(vp8_blockd_dequant,                      offsetof(BLOCKD, dequant));
+DEFINE(vp8_blockd_dqcoeff,                      offsetof(BLOCKD, dqcoeff));
+DEFINE(vp8_blockd_eob,                          offsetof(BLOCKD, eob));
+
+/* subtract */
+DEFINE(vp8_block_base_src,                      offsetof(BLOCK, base_src));
+DEFINE(vp8_block_src,                           offsetof(BLOCK, src));
+DEFINE(vp8_block_src_diff,                      offsetof(BLOCK, src_diff));
+DEFINE(vp8_block_src_stride,                    offsetof(BLOCK, src_stride));
+
+DEFINE(vp8_blockd_predictor,                    offsetof(BLOCKD, predictor));
+
+/* pack tokens */
+DEFINE(vp8_writer_lowvalue,                     offsetof(vp8_writer, lowvalue));
+DEFINE(vp8_writer_range,                        offsetof(vp8_writer, range));
+DEFINE(vp8_writer_value,                        offsetof(vp8_writer, value));
+DEFINE(vp8_writer_count,                        offsetof(vp8_writer, count));
+DEFINE(vp8_writer_pos,                          offsetof(vp8_writer, pos));
+DEFINE(vp8_writer_buffer,                       offsetof(vp8_writer, buffer));
+
+DEFINE(tokenextra_token,                        offsetof(TOKENEXTRA, Token));
+DEFINE(tokenextra_extra,                        offsetof(TOKENEXTRA, Extra));
+DEFINE(tokenextra_context_tree,                 offsetof(TOKENEXTRA, context_tree));
+DEFINE(tokenextra_skip_eob_node,                offsetof(TOKENEXTRA, skip_eob_node));
+DEFINE(TOKENEXTRA_SZ,                           sizeof(TOKENEXTRA));
+
+DEFINE(vp8_extra_bit_struct_sz,                 sizeof(vp8_extra_bit_struct));
+
+DEFINE(vp8_token_value,                         offsetof(vp8_token, value));
+DEFINE(vp8_token_len,                           offsetof(vp8_token, Len));
+
+DEFINE(vp8_extra_bit_struct_tree,               offsetof(vp8_extra_bit_struct, tree));
+DEFINE(vp8_extra_bit_struct_prob,               offsetof(vp8_extra_bit_struct, prob));
+DEFINE(vp8_extra_bit_struct_len,                offsetof(vp8_extra_bit_struct, Len));
+DEFINE(vp8_extra_bit_struct_base_val,           offsetof(vp8_extra_bit_struct, base_val));
+
+DEFINE(vp8_comp_tplist,                         offsetof(VP8_COMP, tplist));
+DEFINE(vp8_comp_common,                         offsetof(VP8_COMP, common));
+DEFINE(vp8_comp_bc2,                            offsetof(VP8_COMP, bc2));
+
+DEFINE(tokenlist_start,                         offsetof(TOKENLIST, start));
+DEFINE(tokenlist_stop,                          offsetof(TOKENLIST, stop));
+DEFINE(TOKENLIST_SZ,                            sizeof(TOKENLIST));
+
+DEFINE(vp8_common_mb_rows,                      offsetof(VP8_COMMON, mb_rows));
+
+END
+
+/* Add asserts for any offset that is not supported by assembly code.
+ * Add asserts for any size that is not supported by assembly code.
+ *
+ * These are used in vp8cx_pack_tokens.  They are hard coded, so if their
+ * sizes change they will have to be adjusted.
+ */
+
+#if HAVE_ARMV5TE
+ct_assert(TOKENEXTRA_SZ, sizeof(TOKENEXTRA) == 8)
+ct_assert(vp8_extra_bit_struct_sz, sizeof(vp8_extra_bit_struct) == 16)
+#endif
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/encoder/bitstream.c
@@ -0,0 +1,1913 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/common/header.h"
+#include "encodemv.h"
+#include "vp8/common/entropymode.h"
+#include "vp8/common/findnearmv.h"
+#include "mcomp.h"
+#include "vp8/common/systemdependent.h"
+#include <assert.h>
+#include <stdio.h>
+#include <limits.h>
+#include "vp8/common/pragmas.h"
+#include "vpx/vpx_encoder.h"
+#include "vpx_mem/vpx_mem.h"
+#include "bitstream.h"
+#include "vp8/common/defaultcoefcounts.h"
+
+const int vp8cx_base_skip_false_prob[128] =
+{
+    255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255,
+    251, 248, 244, 240, 236, 232, 229, 225,
+    221, 217, 213, 208, 204, 199, 194, 190,
+    187, 183, 179, 175, 172, 168, 164, 160,
+    157, 153, 149, 145, 142, 138, 134, 130,
+    127, 124, 120, 117, 114, 110, 107, 104,
+    101, 98,  95,  92,  89,  86,  83, 80,
+    77,  74,  71,  68,  65,  62,  59, 56,
+    53,  50,  47,  44,  41,  38,  35, 32,
+    30,  28,  26,  24,  22,  20,  18, 16,
+};
+
+#if defined(SECTIONBITS_OUTPUT)
+unsigned __int64 Sectionbits[500];
+#endif
+
+#ifdef ENTROPY_STATS
+int intra_mode_stats[10][10][10];
+static unsigned int tree_update_hist [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES] [2];
+extern unsigned int active_section;
+#endif
+
+#ifdef MODE_STATS
+int count_mb_seg[4] = { 0, 0, 0, 0 };
+#endif
+
+
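+/* A probability update is only written when it pays for itself: the estimated
+ * cost of coding the observed events with the new probabilities, plus a fixed
+ * per-branch penalty (the n << 8 term, in the scaled units returned by
+ * vp8_cost_branch), must beat the cost of keeping the current probabilities. */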
+static void update_mode(
+    vp8_writer *const w,
+    int n,
+    vp8_token tok               [/* n */],
+    vp8_tree tree,
+    vp8_prob Pnew               [/* n-1 */],
+    vp8_prob Pcur               [/* n-1 */],
+    unsigned int bct            [/* n-1 */] [2],
+    const unsigned int num_events[/* n */]
+)
+{
+    unsigned int new_b = 0, old_b = 0;
+    int i = 0;
+
+    vp8_tree_probs_from_distribution(
+        n--, tok, tree,
+        Pnew, bct, num_events,
+        256, 1
+    );
+
+    do
+    {
+        new_b += vp8_cost_branch(bct[i], Pnew[i]);
+        old_b += vp8_cost_branch(bct[i], Pcur[i]);
+    }
+    while (++i < n);
+
+    if (new_b + (n << 8) < old_b)
+    {
+        int i = 0;
+
+        vp8_write_bit(w, 1);
+
+        do
+        {
+            const vp8_prob p = Pnew[i];
+
+            vp8_write_literal(w, Pcur[i] = p ? p : 1, 8);
+        }
+        while (++i < n);
+    }
+    else
+        vp8_write_bit(w, 0);
+}
+
+static void update_mbintra_mode_probs(VP8_COMP *cpi)
+{
+    VP8_COMMON *const x = & cpi->common;
+
+    vp8_writer *const w = & cpi->bc;
+
+    {
+        vp8_prob Pnew   [VP8_YMODES-1];
+        unsigned int bct [VP8_YMODES-1] [2];
+
+        update_mode(
+            w, VP8_YMODES, vp8_ymode_encodings, vp8_ymode_tree,
+            Pnew, x->fc.ymode_prob, bct, (unsigned int *)cpi->ymode_count
+        );
+    }
+    {
+        vp8_prob Pnew   [VP8_UV_MODES-1];
+        unsigned int bct [VP8_UV_MODES-1] [2];
+
+        update_mode(
+            w, VP8_UV_MODES, vp8_uv_mode_encodings, vp8_uv_mode_tree,
+            Pnew, x->fc.uv_mode_prob, bct, (unsigned int *)cpi->uv_mode_count
+        );
+    }
+}
+
+static void write_ymode(vp8_writer *bc, int m, const vp8_prob *p)
+{
+    vp8_write_token(bc, vp8_ymode_tree, p, vp8_ymode_encodings + m);
+}
+
+static void kfwrite_ymode(vp8_writer *bc, int m, const vp8_prob *p)
+{
+    vp8_write_token(bc, vp8_kf_ymode_tree, p, vp8_kf_ymode_encodings + m);
+}
+
+static void write_uv_mode(vp8_writer *bc, int m, const vp8_prob *p)
+{
+    vp8_write_token(bc, vp8_uv_mode_tree, p, vp8_uv_mode_encodings + m);
+}
+
+
+static void write_bmode(vp8_writer *bc, int m, const vp8_prob *p)
+{
+    vp8_write_token(bc, vp8_bmode_tree, p, vp8_bmode_encodings + m);
+}
+
+static void write_split(vp8_writer *bc, int x)
+{
+    vp8_write_token(
+        bc, vp8_mbsplit_tree, vp8_mbsplit_probs, vp8_mbsplit_encodings + x
+    );
+}
+
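+/* The loops below hand-inline the boolean arithmetic encoder: each coded bit
+ * narrows the interval [lowvalue, lowvalue + range) around the chosen branch,
+ * vp8_norm[] supplies the renormalisation shift that keeps range at least 128,
+ * and the walk back over 0xff bytes propagates carries into output already
+ * written to w->buffer. */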
+static void pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount)
+{
+    const TOKENEXTRA *const stop = p + xcount;
+    unsigned int split;
+    unsigned int shift;
+    int count = w->count;
+    unsigned int range = w->range;
+    unsigned int lowvalue = w->lowvalue;
+
+    while (p < stop)
+    {
+        const int t = p->Token;
+        vp8_token *const a = vp8_coef_encodings + t;
+        const vp8_extra_bit_struct *const b = vp8_extra_bits + t;
+        int i = 0;
+        const unsigned char *pp = p->context_tree;
+        int v = a->value;
+        int n = a->Len;
+
+        if (p->skip_eob_node)
+        {
+            n--;
+            i = 2;
+        }
+
+        do
+        {
+            const int bb = (v >> --n) & 1;
+            split = 1 + (((range - 1) * pp[i>>1]) >> 8);
+            i = vp8_coef_tree[i+bb];
+
+            if (bb)
+            {
+                lowvalue += split;
+                range = range - split;
+            }
+            else
+            {
+                range = split;
+            }
+
+            shift = vp8_norm[range];
+            range <<= shift;
+            count += shift;
+
+            if (count >= 0)
+            {
+                int offset = shift - count;
+
+                if ((lowvalue << (offset - 1)) & 0x80000000)
+                {
+                    int x = w->pos - 1;
+
+                    while (x >= 0 && w->buffer[x] == 0xff)
+                    {
+                        w->buffer[x] = (unsigned char)0;
+                        x--;
+                    }
+
+                    w->buffer[x] += 1;
+                }
+
+                w->buffer[w->pos++] = (lowvalue >> (24 - offset));
+                lowvalue <<= offset;
+                shift = count;
+                lowvalue &= 0xffffff;
+                count -= 8 ;
+            }
+
+            lowvalue <<= shift;
+        }
+        while (n);
+
+
+        if (b->base_val)
+        {
+            const int e = p->Extra, L = b->Len;
+
+            if (L)
+            {
+                const unsigned char *pp = b->prob;
+                int v = e >> 1;
+                int n = L;              /* number of bits in v, assumed nonzero */
+                int i = 0;
+
+                do
+                {
+                    const int bb = (v >> --n) & 1;
+                    split = 1 + (((range - 1) * pp[i>>1]) >> 8);
+                    i = b->tree[i+bb];
+
+                    if (bb)
+                    {
+                        lowvalue += split;
+                        range = range - split;
+                    }
+                    else
+                    {
+                        range = split;
+                    }
+
+                    shift = vp8_norm[range];
+                    range <<= shift;
+                    count += shift;
+
+                    if (count >= 0)
+                    {
+                        int offset = shift - count;
+
+                        if ((lowvalue << (offset - 1)) & 0x80000000)
+                        {
+                            int x = w->pos - 1;
+
+                            while (x >= 0 && w->buffer[x] == 0xff)
+                            {
+                                w->buffer[x] = (unsigned char)0;
+                                x--;
+                            }
+
+                            w->buffer[x] += 1;
+                        }
+
+                        w->buffer[w->pos++] = (lowvalue >> (24 - offset));
+                        lowvalue <<= offset;
+                        shift = count;
+                        lowvalue &= 0xffffff;
+                        count -= 8 ;
+                    }
+
+                    lowvalue <<= shift;
+                }
+                while (n);
+            }
+
+
+            {
+
+                split = (range + 1) >> 1;
+
+                if (e & 1)
+                {
+                    lowvalue += split;
+                    range = range - split;
+                }
+                else
+                {
+                    range = split;
+                }
+
+                range <<= 1;
+
+                if ((lowvalue & 0x80000000))
+                {
+                    int x = w->pos - 1;
+
+                    while (x >= 0 && w->buffer[x] == 0xff)
+                    {
+                        w->buffer[x] = (unsigned char)0;
+                        x--;
+                    }
+
+                    w->buffer[x] += 1;
+
+                }
+
+                lowvalue  <<= 1;
+
+                if (!++count)
+                {
+                    count = -8;
+                    w->buffer[w->pos++] = (lowvalue >> 24);
+                    lowvalue &= 0xffffff;
+                }
+            }
+
+        }
+
+        ++p;
+    }
+
+    w->count = count;
+    w->lowvalue = lowvalue;
+    w->range = range;
+
+}
+
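+/* Emit a partition length as three little-endian bytes, the layout the decoder
+ * expects for the per-partition size fields. */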
+static void write_partition_size(unsigned char *cx_data, int size)
+{
+    signed char csize;
+
+    csize = size & 0xff;
+    *cx_data = csize;
+    csize = (size >> 8) & 0xff;
+    *(cx_data + 1) = csize;
+    csize = (size >> 16) & 0xff;
+    *(cx_data + 2) = csize;
+
+}
+
+static void pack_tokens_into_partitions_c(VP8_COMP *cpi, unsigned char *cx_data, int num_part, int *size)
+{
+
+    int i;
+    unsigned char *ptr = cx_data;
+    unsigned int shift;
+    vp8_writer *w = &cpi->bc2;
+    *size = 3 * (num_part - 1);
+    cpi->partition_sz[0] += *size;
+    ptr = cx_data + (*size);
+
+    for (i = 0; i < num_part; i++)
+    {
+        vp8_start_encode(w, ptr);
+        {
+            unsigned int split;
+            int count = w->count;
+            unsigned int range = w->range;
+            unsigned int lowvalue = w->lowvalue;
+            int mb_row;
+
+            for (mb_row = i; mb_row < cpi->common.mb_rows; mb_row += num_part)
+            {
+                TOKENEXTRA *p    = cpi->tplist[mb_row].start;
+                TOKENEXTRA *stop = cpi->tplist[mb_row].stop;
+
+                while (p < stop)
+                {
+                    const int t = p->Token;
+                    vp8_token *const a = vp8_coef_encodings + t;
+                    const vp8_extra_bit_struct *const b = vp8_extra_bits + t;
+                    int i = 0;
+                    const unsigned char *pp = p->context_tree;
+                    int v = a->value;
+                    int n = a->Len;
+
+                    if (p->skip_eob_node)
+                    {
+                        n--;
+                        i = 2;
+                    }
+
+                    do
+                    {
+                        const int bb = (v >> --n) & 1;
+                        split = 1 + (((range - 1) * pp[i>>1]) >> 8);
+                        i = vp8_coef_tree[i+bb];
+
+                        if (bb)
+                        {
+                            lowvalue += split;
+                            range = range - split;
+                        }
+                        else
+                        {
+                            range = split;
+                        }
+
+                        shift = vp8_norm[range];
+                        range <<= shift;
+                        count += shift;
+
+                        if (count >= 0)
+                        {
+                            int offset = shift - count;
+
+                            if ((lowvalue << (offset - 1)) & 0x80000000)
+                            {
+                                int x = w->pos - 1;
+
+                                while (x >= 0 && w->buffer[x] == 0xff)
+                                {
+                                    w->buffer[x] = (unsigned char)0;
+                                    x--;
+                                }
+
+                                w->buffer[x] += 1;
+                            }
+
+                            w->buffer[w->pos++] = (lowvalue >> (24 - offset));
+                            lowvalue <<= offset;
+                            shift = count;
+                            lowvalue &= 0xffffff;
+                            count -= 8 ;
+                        }
+
+                        lowvalue <<= shift;
+                    }
+                    while (n);
+
+
+                    if (b->base_val)
+                    {
+                        const int e = p->Extra, L = b->Len;
+
+                        if (L)
+                        {
+                            const unsigned char *pp = b->prob;
+                            int v = e >> 1;
+                            int n = L;              /* number of bits in v, assumed nonzero */
+                            int i = 0;
+
+                            do
+                            {
+                                const int bb = (v >> --n) & 1;
+                                split = 1 + (((range - 1) * pp[i>>1]) >> 8);
+                                i = b->tree[i+bb];
+
+                                if (bb)
+                                {
+                                    lowvalue += split;
+                                    range = range - split;
+                                }
+                                else
+                                {
+                                    range = split;
+                                }
+
+                                shift = vp8_norm[range];
+                                range <<= shift;
+                                count += shift;
+
+                                if (count >= 0)
+                                {
+                                    int offset = shift - count;
+
+                                    if ((lowvalue << (offset - 1)) & 0x80000000)
+                                    {
+                                        int x = w->pos - 1;
+
+                                        while (x >= 0 && w->buffer[x] == 0xff)
+                                        {
+                                            w->buffer[x] = (unsigned char)0;
+                                            x--;
+                                        }
+
+                                        w->buffer[x] += 1;
+                                    }
+
+                                    w->buffer[w->pos++] = (lowvalue >> (24 - offset));
+                                    lowvalue <<= offset;
+                                    shift = count;
+                                    lowvalue &= 0xffffff;
+                                    count -= 8 ;
+                                }
+
+                                lowvalue <<= shift;
+                            }
+                            while (n);
+                        }
+
+                        {
+                            split = (range + 1) >> 1;
+
+                            if (e & 1)
+                            {
+                                lowvalue += split;
+                                range = range - split;
+                            }
+                            else
+                            {
+                                range = split;
+                            }
+
+                            range <<= 1;
+
+                            if ((lowvalue & 0x80000000))
+                            {
+                                int x = w->pos - 1;
+
+                                while (x >= 0 && w->buffer[x] == 0xff)
+                                {
+                                    w->buffer[x] = (unsigned char)0;
+                                    x--;
+                                }
+
+                                w->buffer[x] += 1;
+
+                            }
+
+                            lowvalue  <<= 1;
+
+                            if (!++count)
+                            {
+                                count = -8;
+                                w->buffer[w->pos++] = (lowvalue >> 24);
+                                lowvalue &= 0xffffff;
+                            }
+                        }
+
+                    }
+
+                    ++p;
+                }
+            }
+
+            w->count    = count;
+            w->lowvalue = lowvalue;
+            w->range    = range;
+
+        }
+
+        vp8_stop_encode(w);
+        *size +=   w->pos;
+
+        /* The first partition size is set earlier */
+        cpi->partition_sz[i + 1] = w->pos;
+
+        if (i < (num_part - 1))
+        {
+            write_partition_size(cx_data, w->pos);
+            cx_data += 3;
+            ptr += w->pos;
+        }
+    }
+}
+
+
+static void pack_mb_row_tokens_c(VP8_COMP *cpi, vp8_writer *w)
+{
+
+    unsigned int split;
+    int count = w->count;
+    unsigned int range = w->range;
+    unsigned int lowvalue = w->lowvalue;
+    unsigned int shift;
+    int mb_row;
+
+    for (mb_row = 0; mb_row < cpi->common.mb_rows; mb_row++)
+    {
+        TOKENEXTRA *p    = cpi->tplist[mb_row].start;
+        TOKENEXTRA *stop = cpi->tplist[mb_row].stop;
+
+        while (p < stop)
+        {
+            const int t = p->Token;
+            vp8_token *const a = vp8_coef_encodings + t;
+            const vp8_extra_bit_struct *const b = vp8_extra_bits + t;
+            int i = 0;
+            const unsigned char *pp = p->context_tree;
+            int v = a->value;
+            int n = a->Len;
+
+            if (p->skip_eob_node)
+            {
+                n--;
+                i = 2;
+            }
+
+            do
+            {
+                const int bb = (v >> --n) & 1;
+                split = 1 + (((range - 1) * pp[i>>1]) >> 8);
+                i = vp8_coef_tree[i+bb];
+
+                if (bb)
+                {
+                    lowvalue += split;
+                    range = range - split;
+                }
+                else
+                {
+                    range = split;
+                }
+
+                shift = vp8_norm[range];
+                range <<= shift;
+                count += shift;
+
+                if (count >= 0)
+                {
+                    int offset = shift - count;
+
+                    if ((lowvalue << (offset - 1)) & 0x80000000)
+                    {
+                        int x = w->pos - 1;
+
+                        while (x >= 0 && w->buffer[x] == 0xff)
+                        {
+                            w->buffer[x] = (unsigned char)0;
+                            x--;
+                        }
+
+                        w->buffer[x] += 1;
+                    }
+
+                    w->buffer[w->pos++] = (lowvalue >> (24 - offset));
+                    lowvalue <<= offset;
+                    shift = count;
+                    lowvalue &= 0xffffff;
+                    count -= 8;
+                }
+
+                lowvalue <<= shift;
+            }
+            while (n);
+
+
+            if (b->base_val)
+            {
+                const int e = p->Extra, L = b->Len;
+
+                if (L)
+                {
+                    const unsigned char *pp = b->prob;
+                    int v = e >> 1;
+                    int n = L;              /* number of bits in v, assumed nonzero */
+                    int i = 0;
+
+                    do
+                    {
+                        const int bb = (v >> --n) & 1;
+                        split = 1 + (((range - 1) * pp[i>>1]) >> 8);
+                        i = b->tree[i+bb];
+
+                        if (bb)
+                        {
+                            lowvalue += split;
+                            range = range - split;
+                        }
+                        else
+                        {
+                            range = split;
+                        }
+
+                        shift = vp8_norm[range];
+                        range <<= shift;
+                        count += shift;
+
+                        if (count >= 0)
+                        {
+                            int offset = shift - count;
+
+                            if ((lowvalue << (offset - 1)) & 0x80000000)
+                            {
+                                int x = w->pos - 1;
+
+                                while (x >= 0 && w->buffer[x] == 0xff)
+                                {
+                                    w->buffer[x] = (unsigned char)0;
+                                    x--;
+                                }
+
+                                w->buffer[x] += 1;
+                            }
+
+                            w->buffer[w->pos++] = (lowvalue >> (24 - offset));
+                            lowvalue <<= offset;
+                            shift = count;
+                            lowvalue &= 0xffffff;
+                            count -= 8;
+                        }
+
+                        lowvalue <<= shift;
+                    }
+                    while (n);
+                }
+
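+                /* Encode the final bit of the extra value (the sign of the
+                 * coefficient) with probability one half: split at the
+                 * midpoint of the range.
+                 */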
+                {
+                    split = (range + 1) >> 1;
+
+                    if (e & 1)
+                    {
+                        lowvalue += split;
+                        range = range - split;
+                    }
+                    else
+                    {
+                        range = split;
+                    }
+
+                    range <<= 1;
+
+                    if ((lowvalue & 0x80000000))
+                    {
+                        int x = w->pos - 1;
+
+                        while (x >= 0 && w->buffer[x] == 0xff)
+                        {
+                            w->buffer[x] = (unsigned char)0;
+                            x--;
+                        }
+
+                        w->buffer[x] += 1;
+
+                    }
+
+                    lowvalue  <<= 1;
+
+                    if (!++count)
+                    {
+                        count = -8;
+                        w->buffer[w->pos++] = (lowvalue >> 24);
+                        lowvalue &= 0xffffff;
+                    }
+                }
+
+            }
+
+            ++p;
+        }
+    }
+
+    w->count = count;
+    w->lowvalue = lowvalue;
+    w->range = range;
+
+}
+
+static void write_mv_ref
+(
+    vp8_writer *w, MB_PREDICTION_MODE m, const vp8_prob *p
+)
+{
+#if CONFIG_DEBUG
+    assert(NEARESTMV <= m  &&  m <= SPLITMV);
+#endif
+    vp8_write_token(w, vp8_mv_ref_tree, p,
+                    vp8_mv_ref_encoding_array - NEARESTMV + m);
+}
+
+static void write_sub_mv_ref
+(
+    vp8_writer *w, B_PREDICTION_MODE m, const vp8_prob *p
+)
+{
+#if CONFIG_DEBUG
+    assert(LEFT4X4 <= m  &&  m <= NEW4X4);
+#endif
+    vp8_write_token(w, vp8_sub_mv_ref_tree, p,
+                    vp8_sub_mv_ref_encoding_array - LEFT4X4 + m);
+}
+
+static void write_mv
+(
+    vp8_writer *w, const MV *mv, const int_mv *ref, const MV_CONTEXT *mvc
+)
+{
+    MV e;
+    e.row = mv->row - ref->as_mv.row;
+    e.col = mv->col - ref->as_mv.col;
+
+    vp8_encode_motion_vector(w, &e, mvc);
+}
+
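+/* Encode the macroblock's segment id as two bits of a small binary tree: the
+ * first bit (probs[0]) selects between segments {0,1} and {2,3}, the second
+ * bit (probs[1] or probs[2]) selects within that pair.
+ */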
+static void write_mb_features(vp8_writer *w, const MB_MODE_INFO *mi, const MACROBLOCKD *x)
+{
+    // Encode the MB segment id.
+    if (x->segmentation_enabled && x->update_mb_segmentation_map)
+    {
+        switch (mi->segment_id)
+        {
+        case 0:
+            vp8_write(w, 0, x->mb_segment_tree_probs[0]);
+            vp8_write(w, 0, x->mb_segment_tree_probs[1]);
+            break;
+        case 1:
+            vp8_write(w, 0, x->mb_segment_tree_probs[0]);
+            vp8_write(w, 1, x->mb_segment_tree_probs[1]);
+            break;
+        case 2:
+            vp8_write(w, 1, x->mb_segment_tree_probs[0]);
+            vp8_write(w, 0, x->mb_segment_tree_probs[2]);
+            break;
+        case 3:
+            vp8_write(w, 1, x->mb_segment_tree_probs[0]);
+            vp8_write(w, 1, x->mb_segment_tree_probs[2]);
+            break;
+
+            // This should not happen: fall back to coding segment 0.
+        default:
+            vp8_write(w, 0, x->mb_segment_tree_probs[0]);
+            vp8_write(w, 0, x->mb_segment_tree_probs[1]);
+            break;
+        }
+    }
+}
+
+
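+/* Write the per-macroblock prediction modes, reference frames, segment ids,
+ * skip flags and motion vectors for an inter frame. The reference-frame and
+ * skip probabilities are derived from this frame's counts and written to the
+ * header first.
+ */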
+static void pack_inter_mode_mvs(VP8_COMP *const cpi)
+{
+    VP8_COMMON *const pc = & cpi->common;
+    vp8_writer *const w = & cpi->bc;
+    const MV_CONTEXT *mvc = pc->fc.mvc;
+
+    const int *const rfct = cpi->count_mb_ref_frame_usage;
+    const int rf_intra = rfct[INTRA_FRAME];
+    const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
+
+    MODE_INFO *m = pc->mi, *ms;
+    const int mis = pc->mode_info_stride;
+    int mb_row = -1;
+
+    int prob_last_coded;
+    int prob_gf_coded;
+    int prob_skip_false = 0;
+    ms = pc->mi - 1;
+
+    cpi->mb.partition_info = cpi->mb.pi;
+
+    // Calculate the probabilities used to code the reference frame, based on its actual usage in this frame.
+    if (!(cpi->prob_intra_coded = rf_intra * 255 / (rf_intra + rf_inter)))
+        cpi->prob_intra_coded = 1;
+
+    prob_last_coded = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128;
+
+    if (!prob_last_coded)
+        prob_last_coded = 1;
+
+    prob_gf_coded = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME])
+                    ? (rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128;
+
+    if (!prob_gf_coded)
+        prob_gf_coded = 1;
+
+
+#ifdef ENTROPY_STATS
+    active_section = 1;
+#endif
+
+    if (pc->mb_no_coeff_skip)
+    {
+        prob_skip_false = cpi->skip_false_count * 256 / (cpi->skip_false_count + cpi->skip_true_count);
+
+        if (prob_skip_false <= 1)
+            prob_skip_false = 1;
+
+        if (prob_skip_false > 255)
+            prob_skip_false = 255;
+
+        cpi->prob_skip_false = prob_skip_false;
+        vp8_write_literal(w, prob_skip_false, 8);
+    }
+
+    vp8_write_literal(w, cpi->prob_intra_coded, 8);
+    vp8_write_literal(w, prob_last_coded, 8);
+    vp8_write_literal(w, prob_gf_coded, 8);
+
+    update_mbintra_mode_probs(cpi);
+
+    vp8_write_mvprobs(cpi);
+
+    while (++mb_row < pc->mb_rows)
+    {
+        int mb_col = -1;
+
+        while (++mb_col < pc->mb_cols)
+        {
+            const MB_MODE_INFO *const mi = & m->mbmi;
+            const MV_REFERENCE_FRAME rf = mi->ref_frame;
+            const MB_PREDICTION_MODE mode = mi->mode;
+
+            MACROBLOCKD *xd = &cpi->mb.e_mbd;
+
+            // Distance of the MB from the various image edges.
+            // These are specified in 1/8th-pel units because they are always compared against MV values in 1/8th-pel units.
+            xd->mb_to_left_edge = -((mb_col * 16) << 3);
+            xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
+            xd->mb_to_top_edge = -((mb_row * 16) << 3);
+            xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
+
+#ifdef ENTROPY_STATS
+            active_section = 9;
+#endif
+
+            if (cpi->mb.e_mbd.update_mb_segmentation_map)
+                write_mb_features(w, mi, &cpi->mb.e_mbd);
+
+            if (pc->mb_no_coeff_skip)
+                vp8_encode_bool(w, m->mbmi.mb_skip_coeff, prob_skip_false);
+
+            if (rf == INTRA_FRAME)
+            {
+                vp8_write(w, 0, cpi->prob_intra_coded);
+#ifdef ENTROPY_STATS
+                active_section = 6;
+#endif
+                write_ymode(w, mode, pc->fc.ymode_prob);
+
+                if (mode == B_PRED)
+                {
+                    int j = 0;
+
+                    do
+                        write_bmode(w, m->bmi[j].as_mode, pc->fc.bmode_prob);
+                    while (++j < 16);
+                }
+
+                write_uv_mode(w, mi->uv_mode, pc->fc.uv_mode_prob);
+            }
+            else    /* inter coded */
+            {
+                int_mv best_mv;
+                vp8_prob mv_ref_p [VP8_MVREFS-1];
+
+                vp8_write(w, 1, cpi->prob_intra_coded);
+
+                if (rf == LAST_FRAME)
+                    vp8_write(w, 0, prob_last_coded);
+                else
+                {
+                    vp8_write(w, 1, prob_last_coded);
+                    vp8_write(w, (rf == GOLDEN_FRAME) ? 0 : 1, prob_gf_coded);
+                }
+
+                {
+                    int_mv n1, n2;
+                    int ct[4];
+
+                    vp8_find_near_mvs(xd, m, &n1, &n2, &best_mv, ct, rf, cpi->common.ref_frame_sign_bias);
+                    vp8_mv_ref_probs(mv_ref_p, ct);
+
+#ifdef ENTROPY_STATS
+                    accum_mv_refs(mode, ct);
+#endif
+
+                }
+
+#ifdef ENTROPY_STATS
+                active_section = 3;
+#endif
+
+                write_mv_ref(w, mode, mv_ref_p);
+
+                switch (mode)   /* new, split require MVs */
+                {
+                case NEWMV:
+
+#ifdef ENTROPY_STATS
+                    active_section = 5;
+#endif
+
+                    write_mv(w, &mi->mv.as_mv, &best_mv, mvc);
+                    break;
+
+                case SPLITMV:
+                {
+                    int j = 0;
+
+#ifdef MODE_STATS
+                    ++count_mb_seg [mi->partitioning];
+#endif
+
+                    write_split(w, mi->partitioning);
+
+                    do
+                    {
+                        B_PREDICTION_MODE blockmode;
+                        int_mv blockmv;
+                        const int *const  L = vp8_mbsplits [mi->partitioning];
+                        int k = -1;  /* first block in subset j */
+                        int mv_contz;
+                        int_mv leftmv, abovemv;
+
+                        blockmode =  cpi->mb.partition_info->bmi[j].mode;
+                        blockmv =  cpi->mb.partition_info->bmi[j].mv;
+#if CONFIG_DEBUG
+                        while (j != L[++k])
+                            if (k >= 16)
+                                assert(0);
+#else
+                        while (j != L[++k]);
+#endif
+                        leftmv.as_int = left_block_mv(m, k);
+                        abovemv.as_int = above_block_mv(m, k, mis);
+                        mv_contz = vp8_mv_cont(&leftmv, &abovemv);
+
+                        write_sub_mv_ref(w, blockmode, vp8_sub_mv_ref_prob2 [mv_contz]);
+
+                        if (blockmode == NEW4X4)
+                        {
+#ifdef ENTROPY_STATS
+                            active_section = 11;
+#endif
+                            write_mv(w, &blockmv.as_mv, &best_mv, (const MV_CONTEXT *) mvc);
+                        }
+                    }
+                    while (++j < cpi->mb.partition_info->count);
+                }
+                break;
+                default:
+                    break;
+                }
+            }
+
+            ++m;
+            cpi->mb.partition_info++;
+        }
+
+        ++m;  /* skip L prediction border */
+        cpi->mb.partition_info++;
+    }
+}
+
+
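+/* Write the per-macroblock intra modes for a key frame, using the fixed
+ * key-frame probability tables (kf_ymode_prob, kf_bmode_prob and
+ * kf_uv_mode_prob).
+ */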
+static void write_kfmodes(VP8_COMP *cpi)
+{
+    vp8_writer *const bc = & cpi->bc;
+    const VP8_COMMON *const c = & cpi->common;
+    /* const */
+    MODE_INFO *m = c->mi;
+
+    int mb_row = -1;
+    int prob_skip_false = 0;
+
+    if (c->mb_no_coeff_skip)
+    {
+        prob_skip_false = cpi->skip_false_count * 256 / (cpi->skip_false_count + cpi->skip_true_count);
+
+        if (prob_skip_false <= 1)
+            prob_skip_false = 1;
+
+        if (prob_skip_false >= 255)
+            prob_skip_false = 255;
+
+        cpi->prob_skip_false = prob_skip_false;
+        vp8_write_literal(bc, prob_skip_false, 8);
+    }
+
+    while (++mb_row < c->mb_rows)
+    {
+        int mb_col = -1;
+
+        while (++mb_col < c->mb_cols)
+        {
+            const int ym = m->mbmi.mode;
+
+            if (cpi->mb.e_mbd.update_mb_segmentation_map)
+                write_mb_features(bc, &m->mbmi, &cpi->mb.e_mbd);
+
+            if (c->mb_no_coeff_skip)
+                vp8_encode_bool(bc, m->mbmi.mb_skip_coeff, prob_skip_false);
+
+            kfwrite_ymode(bc, ym, c->kf_ymode_prob);
+
+            if (ym == B_PRED)
+            {
+                const int mis = c->mode_info_stride;
+                int i = 0;
+
+                do
+                {
+                    const B_PREDICTION_MODE A = above_block_mode(m, i, mis);
+                    const B_PREDICTION_MODE L = left_block_mode(m, i);
+                    const int bm = m->bmi[i].as_mode;
+
+#ifdef ENTROPY_STATS
+                    ++intra_mode_stats [A] [L] [bm];
+#endif
+
+                    write_bmode(bc, bm, c->kf_bmode_prob [A] [L]);
+                }
+                while (++i < 16);
+            }
+
+            write_uv_mode(bc, (m++)->mbmi.uv_mode, c->kf_uv_mode_prob);
+        }
+
+        m++;    // skip L prediction border
+    }
+}
+
+/* This function is used for debugging probability trees. */
+static void print_prob_tree(vp8_prob
+     coef_probs[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES])
+{
+    /* print coef probability tree */
+    int i,j,k,l;
+    FILE* f = fopen("enc_tree_probs.txt", "a");
+
+    if (!f)
+        return;
+
+    fprintf(f, "{\n");
+    for (i = 0; i < BLOCK_TYPES; i++)
+    {
+        fprintf(f, "  {\n");
+        for (j = 0; j < COEF_BANDS; j++)
+        {
+            fprintf(f, "    {\n");
+            for (k = 0; k < PREV_COEF_CONTEXTS; k++)
+            {
+                fprintf(f, "      {");
+                for (l = 0; l < ENTROPY_NODES; l++)
+                {
+                    fprintf(f, "%3u, ",
+                            (unsigned int)(coef_probs [i][j][k][l]));
+                }
+                fprintf(f, " }\n");
+            }
+            fprintf(f, "    }\n");
+        }
+        fprintf(f, "  }\n");
+    }
+    fprintf(f, "}\n");
+    fclose(f);
+}
+
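+/* Sum the token counts over the previous-coefficient-context dimension,
+ * saturating at UINT_MAX on overflow.
+ */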
+static void sum_probs_over_prev_coef_context(
+        const unsigned int probs[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS],
+        unsigned int* out)
+{
+    int i, j;
+    for (i=0; i < MAX_ENTROPY_TOKENS; ++i)
+    {
+        for (j=0; j < PREV_COEF_CONTEXTS; ++j)
+        {
+            const int tmp = out[i];
+            out[i] += probs[j][i];
+            /* check for wrap */
+            if (out[i] < tmp)
+                out[i] = UINT_MAX;
+        }
+    }
+}
+
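+/* Return the estimated bit savings from coding a branch with newp instead of
+ * oldp, net of the cost of signalling the update (the update flag plus the
+ * 8-bit replacement probability).
+ */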
+static int prob_update_savings(const unsigned int *ct,
+                                   const vp8_prob oldp, const vp8_prob newp,
+                                   const vp8_prob upd)
+{
+    const int old_b = vp8_cost_branch(ct, oldp);
+    const int new_b = vp8_cost_branch(ct, newp);
+    const int update_b = 8 +
+                         ((vp8_cost_one(upd) - vp8_cost_zero(upd)) >> 8);
+
+    return old_b - new_b - update_b;
+}
+
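+/* Estimate the bit savings available when the updated coefficient
+ * probabilities are constrained to be equal across all previous-coefficient
+ * contexts: the counts are summed over that dimension before the candidate
+ * probabilities are derived.
+ */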
+static int independent_coef_context_savings(VP8_COMP *cpi)
+{
+    int savings = 0;
+    int i = 0;
+    do
+    {
+        int j = 0;
+        do
+        {
+            int k = 0;
+            unsigned int prev_coef_count_sum[MAX_ENTROPY_TOKENS] = {0};
+            int prev_coef_savings[MAX_ENTROPY_TOKENS] = {0};
+            /* Calculate new probabilities given the constraint that
+             * they must be equal over the prev coef contexts
+             */
+            if (cpi->common.frame_type == KEY_FRAME)
+            {
+                /* Reset to default probabilities at key frames */
+                sum_probs_over_prev_coef_context(vp8_default_coef_counts[i][j],
+                                                 prev_coef_count_sum);
+            }
+            else
+            {
+                sum_probs_over_prev_coef_context(cpi->coef_counts[i][j],
+                                                 prev_coef_count_sum);
+            }
+            do
+            {
+                /* at every context */
+
+                /* calc probs and branch cts for this frame only */
+                //vp8_prob new_p           [ENTROPY_NODES];
+                //unsigned int branch_ct   [ENTROPY_NODES] [2];
+
+                int t = 0;      /* token/prob index */
+
+                vp8_tree_probs_from_distribution(
+                    MAX_ENTROPY_TOKENS, vp8_coef_encodings, vp8_coef_tree,
+                    cpi->frame_coef_probs[i][j][k],
+                    cpi->frame_branch_ct [i][j][k],
+                    prev_coef_count_sum,
+                    256, 1);
+
+                do
+                {
+                    const unsigned int *ct  = cpi->frame_branch_ct [i][j][k][t];
+                    const vp8_prob newp = cpi->frame_coef_probs [i][j][k][t];
+                    const vp8_prob oldp = cpi->common.fc.coef_probs [i][j][k][t];
+                    const vp8_prob upd = vp8_coef_update_probs [i][j][k][t];
+                    const int s = prob_update_savings(ct, oldp, newp, upd);
+
+                    if (cpi->common.frame_type != KEY_FRAME || newp != oldp)
+                        prev_coef_savings[t] += s;
+                }
+                while (++t < ENTROPY_NODES);
+            }
+            while (++k < PREV_COEF_CONTEXTS);
+            k = 0;
+            do
+            {
+                /* We only update probabilities if we can save bits, except
+                 * for key frames where we have to update all probabilities
+                 * to get the equal probabilities across the prev coef
+                 * contexts.
+                 */
+                if (prev_coef_savings[k] > 0 ||
+                    cpi->common.frame_type == KEY_FRAME)
+                    savings += prev_coef_savings[k];
+            }
+            while (++k < ENTROPY_NODES);
+        }
+        while (++j < COEF_BANDS);
+    }
+    while (++i < BLOCK_TYPES);
+    return savings;
+}
+
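+/* Estimate the bit savings from updating each coefficient probability
+ * independently, counting only those updates that actually save bits.
+ */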
+static int default_coef_context_savings(VP8_COMP *cpi)
+{
+    int savings = 0;
+    int i = 0;
+    do
+    {
+        int j = 0;
+        do
+        {
+            int k = 0;
+            do
+            {
+                /* at every context */
+
+                /* calc probs and branch cts for this frame only */
+                //vp8_prob new_p           [ENTROPY_NODES];
+                //unsigned int branch_ct   [ENTROPY_NODES] [2];
+
+                int t = 0;      /* token/prob index */
+
+
+                vp8_tree_probs_from_distribution(
+                    MAX_ENTROPY_TOKENS, vp8_coef_encodings, vp8_coef_tree,
+                    cpi->frame_coef_probs [i][j][k],
+                    cpi->frame_branch_ct [i][j][k],
+                    cpi->coef_counts [i][j][k],
+                    256, 1
+                );
+
+                do
+                {
+                    const unsigned int *ct  = cpi->frame_branch_ct [i][j][k][t];
+                    const vp8_prob newp = cpi->frame_coef_probs [i][j][k][t];
+                    const vp8_prob oldp = cpi->common.fc.coef_probs [i][j][k][t];
+                    const vp8_prob upd = vp8_coef_update_probs [i][j][k][t];
+                    const int s = prob_update_savings(ct, oldp, newp, upd);
+
+                    if (s > 0)
+                    {
+                        savings += s;
+                    }
+                }
+                while (++t < ENTROPY_NODES);
+            }
+            while (++k < PREV_COEF_CONTEXTS);
+        }
+        while (++j < COEF_BANDS);
+    }
+    while (++i < BLOCK_TYPES);
+    return savings;
+}
+
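+/* Estimate the total header-bit savings available from adapting the entropy
+ * coding probabilities to this frame's statistics; the reference-frame
+ * signalling costs are evaluated first.
+ */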
+int vp8_estimate_entropy_savings(VP8_COMP *cpi)
+{
+    int savings = 0;
+
+    const int *const rfct = cpi->count_mb_ref_frame_usage;
+    const int rf_intra = rfct[INTRA_FRAME];
+    const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
+    int new_intra, new_last, gf_last, oldtotal, newtotal;
+    int ref_frame_cost[MAX_REF_FRAMES];
+
+    vp8_clear_system_state(); //__asm emms;
+
+    if (cpi->common.frame_type != KEY_FRAME)
+    {
+        if (!(new_intra = rf_intra * 255 / (rf_intra + rf_inter)))
+            new_intra = 1;
+
+        new_last = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128;
+
+        gf_last = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME])
+                  ? (rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128;
+
+        // new costs
+        ref_frame_cost[INTRA_FRAME]   = vp8_cost_zero(new_intra);
+        ref_frame_cost[LAST_FRAME]    = vp8_cost_one(new_intra)
+                                        + vp8_cost_zero(new_last);
+        ref_frame_cost[GOLDEN_FRAME]  = vp8_cost_one(new_intra)
+                                        + vp8_cost_one(new_last)
+                                        + vp8_cost_zero(gf_last);
+        ref_frame_cost[ALTREF_FRAME]  = vp8_cost_one(new_intra)
+                                        + vp8_cost_one(new_last)
+                                        + vp8_cost_one(gf_last);
+
+        newtotal =
+            rfct[INTRA_FRAME] * ref_frame_cost[INTRA_FRAME] +
+            rfct[LAST_FRAME] * ref_frame_cost[LAST_FRAME] +
+            rfct[GOLDEN_FRAME] * ref_frame_cost[GOLDEN_FRAME] +
+            rfct[ALTREF_FRAME] * ref_frame_cost[ALTREF_FRAME];
+
+
+        // old costs
+        ref_frame_cost[INTRA_FRAME]   = vp8_cost_zero(cpi->prob_intra_coded);
+        ref_frame_cost[LAST_FRAME]    = vp8_cost_one(cpi->prob_intra_coded)